def cache_data(
    self,
    extracted_metadata: dict,
    cache_manager: CacheManager,
    allow_list: dict,
):
    for feature, meta_data in extracted_metadata.items():
        # Only cache features that are enabled and were not themselves served from the cache.
        if (
            allow_list[feature]
            and Explanation.Cached not in meta_data[EXPLANATION]
        ):
            # Only the accessibility extractor persists its raw values; all other
            # features cache an empty value list.
            values = []
            if feature == ACCESSIBILITY:
                values = meta_data[VALUES]
            data_to_be_cached = {
                VALUES: values,
                STAR_CASE: meta_data[STAR_CASE],
                TIMESTAMP: get_utc_now(),
                EXPLANATION: meta_data[EXPLANATION],
            }
            create_cache_entry(
                cache_manager.get_domain(),
                feature,
                data_to_be_cached,
                self._logger,
            )

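# Expected input shape, for reference (values below are made up): cache_data receives
# the per-feature results produced by the extractors, e.g.
#
#     extracted_metadata = {
#         ACCESSIBILITY: {
#             VALUES: [0.87],
#             STAR_CASE: 4,
#             EXPLANATION: [],  # Explanation members; anything containing
#         },                    # Explanation.Cached is skipped above
#     }
#
# Only features enabled in the allow_list and not already served from the cache are
# written back through create_cache_entry.
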
def print_exceptions(maximum_age_in_seconds: int):
    database: Session = ProfilerSession()
    query = database.query(
        db_models.Record.exception,
        db_models.Record.timestamp,
        db_models.Record.url,
    )
    meta_rows = database.execute(query)

    failure_urls = []
    print_data = {}
    for meta_row in meta_rows:
        timestamp = meta_row[1]
        # Skip records that are older than the requested time window.
        if timestamp < (get_utc_now() - maximum_age_in_seconds):
            continue
        exception = meta_row[0]
        if exception == "":
            continue
        if exception not in print_data:
            print_data[exception] = 0
        if "Empty html. Potentially, splash failed." in exception:
            print(f"Splash failed for url: '{meta_row[2]}'")
            failure_urls.append(meta_row[2])
        print_data[exception] += 1

    print(
        f"----------------- Found exceptions of the last "
        f"{round(maximum_age_in_seconds / SECONDS_PER_DAY, 2)} days."
    )
    print(f"All urls which caused exceptions: {get_unique_list(failure_urls)}")
    print(f"Number of distinct exceptions found: {len(print_data)}")
    for exception, count in print_data.items():
        print(exception, count)
        print("-------")

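# Usage sketch (assuming SECONDS_PER_DAY as used above): report everything recorded in
# the profiler database during the last seven days.
#
#     print_exceptions(maximum_age_in_seconds=7 * SECONDS_PER_DAY)
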
def _processing_values(
    self, values: list[str], website_data: WebsiteData, before: float
) -> dict:
    website_data.values = values
    star_case, explanation = self._decide(website_data=website_data)

    data = {
        self.key: {
            TIME_REQUIRED: get_utc_now() - before,
            VALUES: values,
            STAR_CASE: star_case,
            EXPLANATION: explanation,
        }
    }
    if self.tag_list_last_modified != "":
        data[self.key].update(
            {
                "tag_list_last_modified": self.tag_list_last_modified,
                "tag_list_expires": self.tag_list_expires,
            }
        )
    return data

def extract_meta(input_data: Input):
    starting_extraction = get_utc_now()
    allowance = _convert_allow_list_to_dict(input_data.allow_list)

    database_exception = ""
    try:
        create_request_record(
            starting_extraction, input_data=input_data, allowance=allowance
        )
    except OperationalError as err:
        database_exception += (
            "\nDatabase exception: "
            + str(err.args)
            + "".join(traceback.format_exception(None, err, err.__traceback__))
        )

    # Hand the extraction job over to the metadata extractor process and wait for its reply.
    uuid = app.communicator.send_message(
        {
            MESSAGE_URL: input_data.url,
            MESSAGE_HTML: input_data.html,
            MESSAGE_HEADERS: input_data.headers,
            MESSAGE_HAR: input_data.har,
            MESSAGE_ALLOW_LIST: allowance,
            MESSAGE_SHARED_MEMORY_NAME: shared_status.shm.name,
            MESSAGE_BYPASS_CACHE: input_data.bypass_cache,
        }
    )
    meta_data: dict = app.communicator.get_message(uuid)

    if meta_data:
        extractor_tags = _convert_dict_to_output_model(meta_data, input_data.debug)
        exception = meta_data.get(MESSAGE_EXCEPTION, "")
    else:
        extractor_tags = None
        exception = f"No response from {METADATA_EXTRACTOR}."

    end_time = get_utc_now()
    out = Output(
        url=input_data.url,
        meta=extractor_tags,
        exception=exception + database_exception,
        time_until_complete=end_time - starting_extraction,
    )

    try:
        create_response_record(
            starting_extraction,
            end_time,
            input_data=input_data,
            allowance=allowance,
            output=out,
        )
    except OperationalError as err:
        # Append only the newly raised database error so an earlier one is not duplicated.
        response_database_exception = (
            "\nDatabase exception: "
            + str(err.args)
            + "".join(traceback.format_exception(None, err, err.__traceback__))
        )
        out.exception += response_database_exception

    if exception != "":
        raise HTTPException(
            status_code=400,
            detail={
                MESSAGE_URL: input_data.url,
                "meta": meta_data,
                MESSAGE_EXCEPTION: exception,
                "time_until_complete": end_time - starting_extraction,
            },
        )
    return out

def is_cached_value_recent(timestamp: float) -> bool:
    return timestamp >= (get_utc_now() - CACHE_RETENTION_TIME_DAYS * SECONDS_PER_DAY)

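# Usage sketch (hypothetical timestamps; CACHE_RETENTION_TIME_DAYS and SECONDS_PER_DAY
# as defined in this repo's constants):
#
#     two_days_old = get_utc_now() - 2 * SECONDS_PER_DAY
#     is_cached_value_recent(two_days_old)   # True while the retention window is >= 2 days
#     a_year_old = get_utc_now() - 365 * SECONDS_PER_DAY
#     is_cached_value_recent(a_year_old)     # False for any retention window under a year
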
def rester():
    allow_list = {
        "advertisement": True,
        "easy_privacy": True,
        "malicious_extensions": True,
        "extracted_links": True,
        "extract_from_files": True,
        "fanboy_annoyance": True,
        "fanboy_notification": True,
        "fanboy_social_media": True,
        "anti_adblock": True,
        "easylist_germany": True,
        "easylist_adult": True,
        "paywall": True,
        "security": True,
        "iframe_embeddable": True,
        "pop_up": True,
        "reg_wall": True,
        "log_in_out": True,
        "accessibility": True,
        "cookies": True,
        "g_d_p_r": True,
        "javascript": True,
    }
    extractor_url = "http://0.0.0.0:5057/extract_meta"

    result = {}
    # Start from a clean result file.
    try:
        os.remove(RESULT_FILE_PATH)
    except FileNotFoundError:
        pass

    logs, file_path = load_file_list()
    for counter, raw in enumerate(load_scraped_data(logs, file_path)):
        before = time.perf_counter()
        print(f"Working file {counter + 1} of {len(logs)}".center(80, "-"))
        print(raw["url"])

        starting_extraction = get_utc_now()
        headers = {"Content-Type": "application/json"}
        payload = {
            MESSAGE_HTML: raw["html"],
            MESSAGE_HEADERS: raw["headers"],
            MESSAGE_URL: raw["url"],
            MESSAGE_ALLOW_LIST: allow_list,
            MESSAGE_HAR: raw["har"],
            "debug": True,
        }
        response = requests.post(
            extractor_url, headers=headers, data=json.dumps(payload)
        )
        try:
            output = json.loads(response.content)
        except JSONDecodeError as e:
            print(response.content)
            print(f"Exception: {e}, {e.args}")
            output = {}

        output["time_for_extraction"] = get_utc_now() - starting_extraction
        result[raw["url"]] = output

        # Persist intermediate results after every url so a crash loses at most one entry.
        with open(RESULT_FILE_PATH, "w") as fp:
            json.dump(result, fp)

        print(output)
        after = time.perf_counter()
        print(f"Total time needed in series: {after - before}")

def _prepare_start(self) -> tuple[float, WebsiteData]:
    self._logger.info(f"Starting {self.__class__.__name__}.")
    before = get_utc_now()
    website_data = self._prepare_website_data()
    return before, website_data

def start(self, message: dict) -> dict:
    self._logger.debug(
        f"Start metadata_manager at {time.perf_counter() - global_start} since start"
    )
    shared_status = shared_memory.ShareableList(
        name=message[MESSAGE_SHARED_MEMORY_NAME]
    )
    url = message[MESSAGE_URL]
    # ShareableList slots have a fixed size, so overly long urls are truncated.
    if len(url) > 1024:
        url = url[0:1024]
    shared_status[1] = url

    website_manager = WebsiteManager.get_instance()
    self._logger.debug(
        f"WebsiteManager initialized at {time.perf_counter() - global_start} since start"
    )
    website_manager.load_website_data(message=message)
    self._logger.debug(
        f"WebsiteManager loaded at {time.perf_counter() - global_start} since start"
    )

    cache_manager = CacheManager.get_instance()
    cache_manager.update_to_current_domain(
        website_manager.website_data.domain,
        bypass=message[MESSAGE_BYPASS_CACHE],
    )

    now = time.perf_counter()
    self._logger.debug(f"starting_extraction at {now - global_start} since start")
    starting_extraction = get_utc_now()

    if website_manager.website_data.html == "":
        exception = "Empty html. Potentially, splash failed."
        extracted_meta_data = {MESSAGE_EXCEPTION: exception}
    else:
        try:
            extracted_meta_data = asyncio.run(
                self._extract_meta_data(
                    allow_list=message[MESSAGE_ALLOW_LIST],
                    cache_manager=cache_manager,
                    shared_memory_name=message[MESSAGE_SHARED_MEMORY_NAME],
                )
            )
            self.cache_data(
                extracted_meta_data,
                cache_manager,
                allow_list=message[MESSAGE_ALLOW_LIST],
            )
        except ConnectionError as e:
            exception = f"Connection error extracting metadata: '{e.args}'"
            self._logger.exception(exception, exc_info=True)
            extracted_meta_data = {MESSAGE_EXCEPTION: exception}
        except Exception as e:
            exception = (
                f"Unknown exception from extracting metadata: '{e.args}'. "
                f"{''.join(traceback.format_exception(None, e, e.__traceback__))}"
            )
            self._logger.exception(exception, exc_info=True)
            extracted_meta_data = {MESSAGE_EXCEPTION: exception}

    self._logger.debug(
        f"extracted_meta_data at {time.perf_counter() - global_start} since start"
    )
    extracted_meta_data.update(
        {
            "time_for_extraction": get_utc_now() - starting_extraction,
            **website_manager.get_website_data_to_log(),
        }
    )

    # Release per-request state and clear the url slot in shared memory.
    website_manager.reset()
    cache_manager.reset()
    shared_status[1] = ""
    self._logger.debug(
        f"website_manager.reset() at {time.perf_counter() - global_start} since start"
    )
    return extracted_meta_data

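# For reference, the message consumed by start() mirrors what extract_meta() sends via
# the communicator (illustrative values; the manager instance below is hypothetical):
#
#     message = {
#         MESSAGE_URL: "https://example.com",
#         MESSAGE_HTML: "<html>...</html>",
#         MESSAGE_HEADERS: "{}",
#         MESSAGE_HAR: "",
#         MESSAGE_ALLOW_LIST: {"accessibility": True},
#         MESSAGE_SHARED_MEMORY_NAME: shared_status.shm.name,
#         MESSAGE_BYPASS_CACHE: False,
#     }
#     extracted_meta_data = manager.start(message)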