def test_get_status_code_summary(db_interface):
    request = Request(db_interface)
    timestamp = []
    now = datetime.now(tz=timezone.utc)
    urls = URLs(db_interface)
    url_id = urls.save_url(None, None, "www.internet.de")
    for i in range(-10, 10):
        ts = now + timedelta(seconds=i)
        timestamp.append(ts)
        request.mark_as_requested(
            url_id=url_id,
            status_code=200,
            redirected_url="www.internet1.de",
            requested_at=ts,
        )
    result = request.get_status_code_summary(timestamp[0], timestamp[-1])
    assert result[200] == 20
    result = request.get_status_code_summary(timestamp[1], timestamp[-2])
    assert result[200] == 18
    result = request.get_status_code_summary(
        timestamp[-1], timestamp[-1] + timedelta(seconds=60)
    )
    assert result[200] == 1
    result = request.get_status_code_summary(
        timestamp[0] - timedelta(seconds=60), timestamp[0]
    )
    assert result[200] == 1
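# Hedged sketch, not part of the original code base: an equivalent of
# Request.get_status_code_summary() as exercised by the test above. The
# table and column names ("requests", "status_code", "requested_at") are
# assumptions inferred from the tests; the real schema may differ. BETWEEN
# is inclusive at both ends, which matches the boundary assertions above.
from collections import Counter


def status_code_summary(cur, start, end):
    """Count logged requests per status code between start and end (inclusive)."""
    cur.execute(
        "SELECT status_code, COUNT(*) FROM requests "
        "WHERE requested_at BETWEEN %s AND %s "
        "GROUP BY status_code",
        (start, end),
    )
    return Counter(dict(cur.fetchall()))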
def stop_procs(self):
    super(Context, self).stop_procs()
    temp_db = DBInterface(config=self.config["General"])
    urls = URLs(temp_db)
    # drop uncrawled urls last to prevent race conditions
    self.logger.info("Dropping uncrawled urls")
    urls.drop_uncrawled_urls()
def test_table_not_exists(db_interface):
    with db_interface.cursor() as db:
        db.cur.execute(
            sql.SQL("drop table {table} cascade").format(
                table=sql.Identifier(URLs.table_name)))
    urls = URLs(db_interface)
    assert urls.table_exists() is False
def test_drop_uncrawled_urls_drop_all_but_one(db_interface, url_ids):
    u = URLs(db_interface)
    r = Request(db_interface)
    r.mark_as_requested(url_ids[0], 200, "wwww.internet.de")
    u.drop_uncrawled_urls()
    assert count_urls(db_interface) == 1
def test_save_url_completly_unique(db_interface, sessionDays, rulesFix):
    u = URLs(db_interface)
    result = []
    for day_id, rule_id in zip(sessionDays, rulesFix):
        result.append(
            u.save_url(date_id=day_id,
                       rule_id=rule_id,
                       url="www.internet.de" + str(day_id)))
    entries = list(zip(sessionDays, rulesFix))
    assert len(entries) == len(result)
def url_ids(db_interface):
    u = URLs(db_interface)
    url_ids = []
    amount = 10
    for i in range(amount):
        url_ids.append(u.save_url(None, None, str(i)))
    assert count_urls(db_interface) == amount
    return url_ids
def test_get_todo_rule_and_date_combos_one_rule(db_interface, todo_setup):
    # valid session url is found and one rule is activated
    u = URLs(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)
    ru.update_rule_state(id=todo_setup["rule_ids"][1], active=True)
    ret = u.get_todo_rule_and_date_combos(limit=100)
    assert len(ret) == 1
    assert ret[0]["date"] == s.get_date(todo_setup["day_id"])[1]
    id, name, active = ru.get_rule(todo_setup["rule_ids"][1])
    assert ret[0]["rulename"] == name
def test_save_url_rule_unique(db_interface, sessionDays, rulesFix):
    """
    Only the combination of url and responsible rule should be unique.
    This allows for the same url being generated by the session_day_checker
    and by the date_url_generator, given that the session_day-rule wraps the
    """
    u = URLs(db_interface)
    result = []
    for day_id, rule_id in zip(sessionDays, rulesFix):
        result.append(
            u.save_url(date_id=day_id,
                       rule_id=rule_id,
                       url="www.internet.de"))
    entries = list(zip(sessionDays, rulesFix))
    assert len(entries) == len(result)
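# Hedged sketch, not part of the original schema definition: the behaviour
# verified by test_save_url_completly_unique and test_save_url_rule_unique
# suggests a composite unique constraint on (url, rule_id) rather than a
# unique url column. The table and column names below are assumptions made
# for illustration only.
ADD_URL_RULE_UNIQUE = """
ALTER TABLE urls
    ADD CONSTRAINT urls_url_rule_unique UNIQUE (url, rule_id);
"""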
def test_Request_mark_as_requested_get_request_log(
    db_interface,
    url,
    status_code,
    redirected_url,
    document_id,
    requested_at,
):
    request = Request(db_interface)
    urls = URLs(db_interface)
    url_id = None
    if url:
        url_id = urls.save_url(None, None, url)
    # TODO: Test document_id
    id = request.mark_as_requested(
        url_id=url_id,
        status_code=status_code,
        redirected_url=redirected_url,
        requested_at=requested_at,
        document_id=document_id,
    )
    assert type(id) == int
    if url:
        assert type(url_id) == int
    else:
        assert url_id is None
    row = request.get_request_log(id)
    assert row[0] == id
    assert row[1] == url_id
    assert row[2] == document_id
    assert row[3] == requested_at
    assert row[4] == status_code
    assert row[5] == redirected_url
def todo_setup(db_interface):
    u = URLs(db_interface)
    r = Request(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)
    day_id = s.insert_date(date.today())
    rule_ids = ru.register_rules(rule_registry.all)
    session_day_id, name, active = ru.get_rule(rulename="session_day")
    session_url_id = u.save_url(date_id=day_id,
                                rule_id=session_day_id,
                                url="www.internet.de")
    # exclude the session_day rule itself; the returned rule_ids should only
    # contain rules that still need urls for this day
    rule_ids.remove(session_day_id)
    r.mark_as_requested(url_id=session_url_id,
                        status_code=200,
                        redirected_url="www.internet1.de")
    return {
        "day_id": day_id,
        "rule_ids": rule_ids,
        "session_url_id": session_url_id
    }
class DateUrlGenerator(ProcWorker):

    def init_args(self, args):
        (self.url_q, ) = args

    def startup(self):
        super().startup()
        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])
        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name
        self.urls = URLs(self.db)
        self.rules = Rules(self.db)
        self.todo_date_rule_combos = []
        self.url_id = None
        self.url_string = None
        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        super().shutdown()

    def get_new_combos(self, limit):
        """
        Get a list of new rule and date combinations.

        Args:
            limit (int): amount of combinations that should be retrieved

        Returns:
            list: list of combination dictionaries
        """
        self.logger.debug("Getting new date/rule-combinations")
        combos = self.urls.get_todo_rule_and_date_combos(limit=limit)
        # got no new combinations from db; sleep for the polling timeout before retrying
        if len(combos) == 0:
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
        else:
            self.logger.info("Got {} new combinations from database".format(
                len(combos)))
        return combos

    def create_url(self, combo):
        """
        Create a url based upon a rule and date combination.

        Args:
            combo (dict): rule and date combination dictionary

        Returns:
            tuple: url_id and url_string
        """
        self.logger.debug("Applying rule: {} to date: {}".format(
            combo["rulename"], combo["date"]))
        url_id, url_string = self.rules.apply_rule(date_id=combo["date_id"],
                                                   rule_id=combo["rule_id"])
        self.logger.debug("Result: {}".format(url_string))
        return url_id, url_string

    def enqueue_url(self, url_id, url_string):
        """
        Queue up a URL.

        Args:
            url_id (int): id of the url
            url_string (str): url string

        Returns:
            tuple: url_id and url_string; both are None if the url was
            enqueued successfully, otherwise the old values are kept so the
            enqueue can be retried
        """
        try:
            self.logger.debug("Queueing up URL with id: {}".format(url_id))
            self.url_q.put(url_id, timeout=self.DEFAULT_POLLING_TIMEOUT)
            self.logger.info("Queued up URL: {} with id: {}".format(
                url_string, url_id))
            url_string, url_id = None, None
        except Full:
            pass
        return url_id, url_string

    def main_func(self):
        """
        Continuously enqueue new urls.

        The first block fills a buffer of date and rule combinations. This
        buffer is then consumed one combination per iteration: a url is
        created, stored and enqueued.
        """
        if len(self.todo_date_rule_combos) == 0:
            self.todo_date_rule_combos = self.get_new_combos(
                limit=self.PREFETCH_LIMIT)
        if len(self.todo_date_rule_combos) == 0:
            time.sleep(self.DEFAULT_POLLING_TIMEOUT * 10)
            return

        if self.url_id is None:
            self.url_id, self.url_string = self.create_url(
                combo=self.todo_date_rule_combos.pop())

        self.url_id, self.url_string = self.enqueue_url(
            url_id=self.url_id, url_string=self.url_string)
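# Hedged sketch, not part of the original code base: the retry contract that
# DateUrlGenerator.main_func() relies on, shown in isolation with a plain
# queue.Queue. try_enqueue() and the example values are illustrative only;
# the worker itself uses the url_q handed in via init_args().
import queue


def try_enqueue(url_q, url_id, url_string, timeout=0.1):
    """Return (None, None) once the id is enqueued, else keep the old values."""
    try:
        url_q.put(url_id, timeout=timeout)
        return None, None
    except queue.Full:
        return url_id, url_string


url_q = queue.Queue(maxsize=1)
pending = (42, "https://example.org/doc?date=2021-01-01")
while pending != (None, None):
    # retried on every call until the queue accepts the id
    pending = try_enqueue(url_q, *pending)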
def test_drop_uncrawled_urls_drop_all(db_interface, url_ids):
    u = URLs(db_interface)
    u.drop_uncrawled_urls()
    assert count_urls(db_interface) == 0
class DocumentDownloader(QueueProcWorker):
    """
    Worker responsible for downloading documents
    """

    DATAPATH = "../data/"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def init_args(self, args):
        (
            self.work_q,
            self.url_q,
        ) = args

    def startup(self):
        """Set up database handles, request headers and download settings."""
        super().startup()
        self.DATAPATH = self.config["Path"]
        self.REQUEST_TIMEOUT = float(
            self.config["RequestTimeoutFactor"]) * float(
                self.config["StopWaitSecs"])
        self.ua = UserAgent()
        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name
        self.request = Request(self.db)
        self.url = URLs(self.db)
        self.docs = Documents(self.db)
        self.logger.info("{} started".format(self.name))
        self.url_id, self.url_str = None, None
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
            "Dnt": "1",
            "Referer": "https://www.google.com",
        }

    def shutdown(self):
        """Shut down the worker."""
        super().shutdown()

    def main_func(self, token):
        """
        Download documents.

        This method gets called whenever a new request token is provided by
        the throttling mechanism. It then tries to get a new URL from the url
        work queue. The token is returned if no work is available. Otherwise
        a new session with a random user agent is created, the download is
        triggered, the downloaded file is stored and the request is logged.

        Args:
            token (str): request throttling token provided by the token bucket
        """
        # get url
        if not self.url_id:
            self.logger.debug("Getting new URL")
            self.url_id = self.url_q.safe_get()
            if self.url_id is None:
                self.work_q.safe_put(token)
                time.sleep(self.DEFAULT_POLLING_TIMEOUT)
                self.logger.debug("No work - returning")
                return
            url = self.url.get_url(id=self.url_id)
            self.url_str = url["url"]
            self.filetype = url["filetype"]

        try:
            self.logger.debug("Downloading: {}".format(self.url_str))
            with requests.Session() as ses:
                ses.headers = self.headers
                ses.headers["User-Agent"] = self.ua.random
                resp = ses.get(
                    self.url_str,
                    allow_redirects=True,
                    timeout=self.REQUEST_TIMEOUT,
                )
            self.logger.debug("Response for: {} is {}".format(
                self.url_str, resp.status_code))

            doc_id = None
            # if successful, store the file
            if resp.status_code == 200:
                self.logger.debug("Storing file for {}".format(self.url_str))
                file_uuid = str(uuid.uuid4())
                filename = file_uuid + self.filetype
                abspath = os.path.abspath(self.DATAPATH)
                filepath = abspath + "/" + filename
                with open(filepath, "wb") as f:
                    f.write(resp.content)
                doc_id = self.docs.register_document(filepath=filepath,
                                                     filename=file_uuid)

            self.request.mark_as_requested(
                self.url_id,
                status_code=resp.status_code,
                redirected_url=resp.url,
                document_id=doc_id,
            )
            self.logger.info("Crawled: {}".format(self.url_str))
            self.url_id, self.url_str, self.filetype = None, None, None

        except requests.ReadTimeout as e:
            self.logger.warning("Timeout for url: {}".format(self.url_str))
            self.logger.warning("Exception Message: {}".format(e))
            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=408,
                                           redirected_url=self.url_str)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

        except requests.RequestException as e:
            self.logger.warning("Request exception for url: {}".format(
                self.url_str))
            self.logger.warning("Exception Message: {}".format(e))
            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=460,
                                           redirected_url=self.url_str)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return
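# Hedged sketch, not part of the original code base: the download-and-store
# step of DocumentDownloader.main_func() reduced to a standalone function.
# The uuid-based file naming mirrors the class above; the function name and
# parameters are illustrative assumptions, not part of the worker's API.
import os
import uuid

import requests


def download_document(url, datapath, filetype, headers, timeout=10.0):
    """Fetch url and store the body under a uuid-based filename; return the path or None."""
    with requests.Session() as ses:
        ses.headers.update(headers)
        resp = ses.get(url, allow_redirects=True, timeout=timeout)
    if resp.status_code != 200:
        return None
    filename = str(uuid.uuid4()) + filetype
    filepath = os.path.join(os.path.abspath(datapath), filename)
    with open(filepath, "wb") as f:
        f.write(resp.content)
    return filepath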
def test_table_exists(db_interface):
    urls = URLs(db_interface)
    assert urls.table_exists()
def test_get_todo_rule_and_date_combos_nothing(db_interface, todo_setup):
    # valid session url is found but no rule is activated
    u = URLs(db_interface)
    ret = u.get_todo_rule_and_date_combos(limit=100)
    assert len(ret) == 0