    def startup(self):
        """Set up database access, helper objects, and default request headers."""
        super().startup()

        self.DATAPATH = self.config["Path"]
        self.REQUEST_TIMEOUT = float(
            self.config["RequestTimeoutFactor"]) * float(
                self.config["StopWaitSecs"])

        self.ua = UserAgent()

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.request = Request(self.db)
        self.url = URLs(self.db)
        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))

        self.url_id, self.url_str = None, None

        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
            "Dnt": "1",
            "Referer": "https://www.google.com",
        }
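
# Hedged note (not from the source): startup() above reads three settings
# from self.config, so a matching config section plausibly looks like the
# following. The key names come from the code; the values are illustrative
# assumptions only.
#
#   Path = ../data/
#   RequestTimeoutFactor = 0.8
#   StopWaitSecs = 10.0
#
# With those values, REQUEST_TIMEOUT = 0.8 * 10.0 = 8.0 seconds.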
Example #2
def test_get_status_code_summary(db_interface):
    request = Request(db_interface)
    timestamp = []
    now = datetime.now(tz=timezone.utc)

    urls = URLs(db_interface)
    url_id = urls.save_url(None, None, "www.internet.de")

    for i in range(-10, 10):
        ts = now + timedelta(seconds=i)
        timestamp.append(ts)
        request.mark_as_requested(
            url_id=url_id,
            status_code=200,
            redirected_url="www.internet1.de",
            requested_at=ts,
        )

    result = request.get_status_code_summary(timestamp[0], timestamp[-1])
    assert result[200] == 20
    result = request.get_status_code_summary(timestamp[1], timestamp[-2])
    assert result[200] == 18
    result = request.get_status_code_summary(
        timestamp[-1], timestamp[-1] + timedelta(seconds=60)
    )
    assert result[200] == 1
    result = request.get_status_code_summary(
        timestamp[0] - timedelta(seconds=60), timestamp[0]
    )
    assert result[200] == 1
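
# Hedged sketch (an assumption, not the project's actual implementation):
# get_status_code_summary presumably groups logged requests by status code
# over a closed time interval, which matches the inclusive boundary
# assertions above. Using the cursor pattern seen in this listing
# (db_interface.cursor() yielding an object with a .cur attribute), a
# minimal version could look like this; the table and column names
# ("requests", "requested_at", "status_code") are guesses.
def get_status_code_summary_sketch(db_interface, start, end):
    from psycopg2 import sql

    with db_interface.cursor() as db:
        db.cur.execute(
            sql.SQL("select status_code, count(*) from {table}"
                    " where requested_at between %s and %s"
                    " group by status_code").format(
                        table=sql.Identifier("requests")),
            (start, end),
        )
        return dict(db.cur.fetchall())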
Example #3
    def stop_procs(self):
        super(Context, self).stop_procs()
        temp_db = DBInterface(config=self.config["General"])
        urls = URLs(temp_db)
        # drop uncrawled urls last to prevent race conditions
        self.logger.info("Dropping uncrawled urls")
        urls.drop_uncrawled_urls()
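
# Hedged sketch: drop_uncrawled_urls likely deletes every url row without a
# corresponding request-log entry, which is consistent with the tests in
# this listing. The table and column names ("urls", "requests", "url_id")
# are assumptions, not the project's confirmed schema.
def drop_uncrawled_urls_sketch(db_interface):
    from psycopg2 import sql

    with db_interface.cursor() as db:
        db.cur.execute(
            sql.SQL("delete from {urls} where id not in"
                    " (select url_id from {requests}"
                    " where url_id is not null)").format(
                        urls=sql.Identifier("urls"),
                        requests=sql.Identifier("requests")))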
Example #4
def test_table_not_exists(db_interface):
    with db_interface.cursor() as db:
        db.cur.execute(
            sql.SQL("drop table {table} cascade").format(
                table=sql.Identifier(URLs.table_name)))

    urls = URLs(db_interface)
    assert urls.table_exists() is False
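
# Hedged sketch: table_exists() can be implemented as a catalog lookup.
# PostgreSQL is assumed (the psycopg2 sql module above implies it);
# to_regclass returns NULL for relations that do not exist.
def table_exists_sketch(db_interface, table_name):
    with db_interface.cursor() as db:
        db.cur.execute("select to_regclass(%s)", (table_name,))
        return db.cur.fetchone()[0] is not None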
Example #5
def test_drop_uncrawled_urls_drop_all_but_one(db_interface, url_ids):
    u = URLs(db_interface)
    r = Request(db_interface)

    r.mark_as_requested(url_ids[0], 200, "www.internet.de")

    u.drop_uncrawled_urls()

    assert count_urls(db_interface) == 1
Example #6
def test_save_url_completely_unique(db_interface, sessionDays, rulesFix):
    u = URLs(db_interface)
    result = []
    for day_id, rule_id in zip(sessionDays, rulesFix):
        result.append(
            u.save_url(date_id=day_id,
                       rule_id=rule_id,
                       url="www.internet.de" + str(day_id)))
    entries = list(zip(sessionDays, rulesFix))
    assert len(entries) == len(result)
Example #7
def url_ids(db_interface):
    u = URLs(db_interface)

    url_ids = []
    amount = 10
    for i in range(amount):
        url_ids.append(u.save_url(None, None, str(i)))

    assert count_urls(db_interface) == amount
    return url_ids
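
# Hedged sketch: count_urls is used by the fixtures and tests above but not
# shown in this listing. A minimal version, assuming the table name is
# exposed as URLs.table_name (as used in test_table_not_exists), could be:
def count_urls_sketch(db_interface):
    from psycopg2 import sql

    with db_interface.cursor() as db:
        db.cur.execute(
            sql.SQL("select count(*) from {table}").format(
                table=sql.Identifier(URLs.table_name)))
        return db.cur.fetchone()[0]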
Example #8
def test_get_todo_rule_and_date_combos_one_rule(db_interface, todo_setup):
    # valid session url is found and one rule is activated
    u = URLs(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)
    ru.update_rule_state(id=todo_setup["rule_ids"][1], active=True)
    ret = u.get_todo_rule_and_date_combos(limit=100)
    assert len(ret) == 1
    assert ret[0]["date"] == s.get_date(todo_setup["day_id"])[1]
    id, name, active = ru.get_rule(todo_setup["rule_ids"][1])
    assert ret[0]["rulename"] == name
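
# Hedged sketch of what get_todo_rule_and_date_combos appears to return,
# judging from this test and from DateUrlGenerator.create_url further down:
# a list of dicts with "date_id", "date", "rule_id" and "rulename" keys,
# pairing crawled session days with active rules that have no stored url
# yet. The query below is a schema guess, not the project's actual SQL:
#
#   select d.id as date_id, d.date, r.id as rule_id, r.rulename
#   from session_days d cross join rules r
#   where r.active
#     and not exists (select 1 from urls u
#                     where u.date_id = d.id and u.rule_id = r.id)
#   limit %s;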
Example #9
def test_save_url_rule_unique(db_interface, sessionDays, rulesFix):
    """
    Only the combination of url and responsible rule should be unique. This allows for the same url being generated by the session_day_checker and by the date_url_generator, given that the session_day-rule wraps the
    """
    u = URLs(db_interface)
    result = []
    for day_id, rule_id in zip(sessionDays, rulesFix):
        result.append(
            u.save_url(date_id=day_id, rule_id=rule_id, url="www.internet.de"))
    entries = list(zip(sessionDays, rulesFix))
    assert len(entries) == len(result)
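
# Hedged note: the docstring above implies a composite uniqueness
# constraint on (rule_id, url) rather than on url alone. In PostgreSQL DDL
# (table and column names assumed) that would look roughly like:
#
#   alter table urls
#       add constraint urls_rule_url_unique unique (rule_id, url);
#
# so save_url can store the same url string twice as long as a different
# rule is responsible for it, which is exactly what this test exercises.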
Example #10
    def startup(self):
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.urls = URLs(self.db)
        self.rules = Rules(self.db)

        self.todo_date_rule_combos = []
        self.url_id = None
        self.url_string = None
        self.logger.info("{} started".format(self.name))
Example #11
def test_Request_mark_as_requested_get_request_log(
    db_interface,
    url,
    status_code,
    redirected_url,
    document_id,
    requested_at,
):
    request = Request(db_interface)
    urls = URLs(db_interface)

    url_id = None
    if url:
        url_id = urls.save_url(None, None, url)

    # TODO: Test document_id

    id = request.mark_as_requested(
        url_id=url_id,
        status_code=status_code,
        redirected_url=redirected_url,
        requested_at=requested_at,
        document_id=document_id,
    )

    assert type(id) == int

    if url:
        assert type(url_id) == int
    else:
        assert url_id is None

    row = request.get_request_log(id)
    assert row[0] == id
    assert row[1] == url_id
    assert row[2] == document_id
    assert row[3] == requested_at
    assert row[4] == status_code
    assert row[5] == redirected_url
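
# Hedged sketch: mark_as_requested appears to insert one request-log row
# and return its id, and get_request_log to fetch that row in the column
# order asserted above (id, url_id, document_id, requested_at, status_code,
# redirected_url). A minimal insert, with the table name assumed, could be:
def mark_as_requested_sketch(db_interface, url_id, status_code,
                             redirected_url, requested_at=None,
                             document_id=None):
    with db_interface.cursor() as db:
        db.cur.execute(
            "insert into requests"
            " (url_id, document_id, requested_at, status_code,"
            " redirected_url)"
            " values (%s, %s, coalesce(%s, now()), %s, %s)"
            " returning id",
            (url_id, document_id, requested_at, status_code,
             redirected_url),
        )
        return db.cur.fetchone()[0]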
Example #12
def todo_setup(db_interface):
    u = URLs(db_interface)
    r = Request(db_interface)
    ru = Rules(db_interface)
    s = SessionDay(db_interface)

    day_id = s.insert_date(date.today())
    rule_ids = ru.register_rules(rule_registry.all)
    session_day_id, name, active = ru.get_rule(rulename="session_day")
    session_url_id = u.save_url(date_id=day_id,
                                rule_id=session_day_id,
                                url="www.internet.de")
    # exclude the session_day rule itself; the remaining ids are todo rules
    rule_ids.remove(session_day_id)

    r.mark_as_requested(url_id=session_url_id,
                        status_code=200,
                        redirected_url="www.internet1.de")

    return {
        "day_id": day_id,
        "rule_ids": rule_ids,
        "session_url_id": session_url_id
    }
Example #13
class DateUrlGenerator(ProcWorker):
    def init_args(self, args):
        (self.url_q, ) = args

    def startup(self):
        super().startup()

        self.PREFETCH_LIMIT = int(self.config["PrefetchLimit"])

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.urls = URLs(self.db)
        self.rules = Rules(self.db)

        self.todo_date_rule_combos = []
        self.url_id = None
        self.url_string = None
        self.logger.info("{} started".format(self.name))

    def shutdown(self):
        super().shutdown()

    def get_new_combos(self, limit):
        """
        Get a list of new rule and date combinations

        Args:
            limit (int): number of combinations to retrieve

        Returns:
            list: list of combination dictionaries
        """
        self.logger.debug("Getting new date/rule-combinations")

        combos = self.urls.get_todo_rule_and_date_combos(limit=limit)

        # got no new combinations from db. sleep for the polling timeout before retrying
        if len(combos) == 0:
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
        else:
            self.logger.info("Got {} new combinations from database".format(
                len(combos)))

        return combos

    def create_url(self, combo):
        """
        Creates a url based upon a rule and date combination

        Args:
            combo (dict): rule and date combination dictionary

        Returns:
            tuple: url_id and url_string
        """
        self.logger.debug("Applying rule: {} to date: {}".format(
            combo["rulename"], combo["date"]))
        url_id, url_string = self.rules.apply_rule(date_id=combo["date_id"],
                                                   rule_id=combo["rule_id"])
        self.logger.debug("Result: {}".format(url_string))
        return url_id, url_string

    def enqueue_url(self, url_id, url_string):
        """
        Queues up a URl

        Args:
            url_id (int): id of the url
            url_string (id): url string

        Returns:
            tuple of url_id and url_string: values are None if the value was enqueued successfully, the old values stay if this isn't the case
        """
        try:
            self.logger.debug("Queueing up URL with id: {}".format(url_id))
            self.url_q.put(url_id, timeout=self.DEFAULT_POLLING_TIMEOUT)
            self.logger.info("Queued up URL: {} with id: {}".format(
                url_string, url_id))
            url_string, url_id = None, None
        except Full:
            pass

        return url_id, url_string

    def main_func(self):
        """
        Continuously enqueue new urls.
        First block sets up a buffer of date and rule combinations.
        This buffer is then iteratively consumed with every iteration, urls created, stored and enqueued
        """

        if len(self.todo_date_rule_combos) == 0:
            self.todo_date_rule_combos = self.get_new_combos(
                limit=self.PREFETCH_LIMIT)
            if len(self.todo_date_rule_combos) == 0:
                time.sleep(self.DEFAULT_POLLING_TIMEOUT * 10)

            return

        if self.url_id is None:
            self.url_id, self.url_string = self.create_url(
                combo=self.todo_date_rule_combos.pop())

        self.url_id, self.url_string = self.enqueue_url(
            url_id=self.url_id, url_string=self.url_string)
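
# Hedged usage note: DateUrlGenerator relies on a ProcWorker framework that
# is not shown here. Assuming the framework calls main_func in a loop, the
# worker's handshake is:
#   - refill self.todo_date_rule_combos from the database in batches,
#   - turn one combo into a (url_id, url_string) pair,
#   - keep retrying the queue put: enqueue_url returns the old pair when
#     the queue is Full and (None, None) on success, so the next call
#     either retries the same url or creates a new one.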
Example #14
def test_drop_uncrawled_urls_drop_all(db_interface, url_ids):
    u = URLs(db_interface)

    u.drop_uncrawled_urls()

    assert count_urls(db_interface) == 0
Example #15
class DocumentDownloader(QueueProcWorker):
    """
    Worker responsible for downloading documents
    """

    DATAPATH = "../data/"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def init_args(self, args):
        (
            self.work_q,
            self.url_q,
        ) = args

    def startup(self):
        """"""
        super().startup()

        self.DATAPATH = self.config["Path"]
        self.REQUEST_TIMEOUT = float(
            self.config["RequestTimeoutFactor"]) * float(
                self.config["StopWaitSecs"])

        self.ua = UserAgent()

        self.db = DBInterface(config=self.config)
        self.db.connection_name = self.name

        self.request = Request(self.db)
        self.url = URLs(self.db)
        self.docs = Documents(self.db)

        self.logger.info("{} started".format(self.name))

        self.url_id, self.url_str = None, None

        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
            "Dnt": "1",
            "Referer": "https://www.google.com",
        }

    def shutdown(self):
        """"""
        super().shutdown()

    def main_func(self, token):
        """
        This method downloads documents.
        It gets called whenever a new request token is provided by the throttling mechanism. It then tries to get a new URL from the url work queue. The token is returned if no work is available.
        Otherwise a new session with a random user agent is generated, a download is triggered, a downloaded file is stored and the request logged.

        Args:
            token (str): Request throttling token that is provided by the token bucket
        """
        # get url
        if not self.url_id:
            self.logger.debug("Getting new URL")
            self.url_id = self.url_q.safe_get()

            if self.url_id is None:
                self.work_q.safe_put(token)
                time.sleep(self.DEFAULT_POLLING_TIMEOUT)
                self.logger.debug("No work - returning")
                return

            url = self.url.get_url(id=self.url_id)
            self.url_str = url["url"]
            self.filetype = url["filetype"]

        try:
            self.logger.debug("Downloading: {}".format(self.url_str))

            with requests.Session() as ses:
                # update() keeps the session's case-insensitive header dict
                # and avoids rebinding/mutating the shared self.headers
                ses.headers.update(self.headers)
                ses.headers["User-Agent"] = self.ua.random
                resp = ses.get(
                    self.url_str,
                    allow_redirects=True,
                    timeout=self.REQUEST_TIMEOUT,
                )
            self.logger.debug("Response for: {} is {}".format(
                self.url_str, resp.status_code))

            doc_id = None
            # if successful, store the file
            if resp.status_code == 200:
                self.logger.debug("Storing file for {}".format(self.url_str))
                file_uuid = str(uuid.uuid4())
                filename = file_uuid + self.filetype
                filepath = os.path.join(os.path.abspath(self.DATAPATH),
                                        filename)

                # the context manager ensures the file handle is closed
                with open(filepath, "wb") as f:
                    f.write(resp.content)

                doc_id = self.docs.register_document(filepath=filepath,
                                                     filename=file_uuid)

            self.request.mark_as_requested(
                self.url_id,
                status_code=resp.status_code,
                redirected_url=resp.url,
                document_id=doc_id,
            )

            self.logger.info("Crawled: {}".format(self.url_str))

            self.url_id, self.url_str, self.filetype = None, None, None

        except requests.ReadTimeout as e:
            self.logger.warning("Timeout for url: {}".format(self.url_str))
            self.logger.warning("Exception Message: {}".format(e))

            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=408,
                                           redirected_url=self.url_str)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return

        except requests.RequestException as e:
            self.logger.warning("Request exception for url: {}".format(
                self.url_str))
            self.logger.warning("Exception Message: {}".format(e))
            self.request.mark_as_requested(url_id=self.url_id,
                                           status_code=460,
                                           redirected_url=self.url_str)
            time.sleep(self.DEFAULT_POLLING_TIMEOUT)
            return
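
# Hedged sketch: main_func above consumes request tokens from work_q and
# returns them when no work is available, which implies an external token
# bucket refilling that queue. The framework's actual bucket is not shown;
# a minimal illustrative stand-in (names and rate are assumptions):
def token_bucket_sketch(work_q, rate_per_sec=1.0, tokens=100):
    """Feed `tokens` throttling tokens into work_q, one per 1/rate second."""
    import time

    for i in range(tokens):
        work_q.put("token-{}".format(i))
        time.sleep(1.0 / rate_per_sec)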
Example #16
def test_table_exists(db_interface):
    urls = URLs(db_interface)
    assert urls.table_exists()
Example #17
def test_get_todo_rule_and_date_combos_nothing(db_interface, todo_setup):
    # valid session url is found but no rule is activated
    u = URLs(db_interface)
    ret = u.get_todo_rule_and_date_combos(limit=100)
    assert len(ret) == 0