Exemplo n.º 1
0
    def _uri_from_curi(self, curi):
        """
        Build the uri tuple for a :class:`CrawlUri` and determine its
        priority.

        The entity tag and the modification date are read from the response
        headers of `curi`. The `Date` header is used as fallback when
        `Last-Modified` yields no usable value. A uri with a modification date
        is handed to `_reschedule_uri`; any other uri gets priority `1` and
        the current time as its next crawl date.

        Returns the tuple `(url, etag, mod_date, next_crawl_date, prio)`.

        Overwrite this method in more specific frontiers.
        """
        def to_timestamp(header_value):
            # header date string -> seconds as computed by `time.mktime`
            parsed = deserialize_date_time(header_value)
            return time.mktime(parsed.timetuple())

        etag = None
        mod_date = None
        headers = curi.rep_header
        if headers:
            if "Etag" in headers:
                etag = headers["Etag"]
            if "Last-Modified" in headers:
                mod_date = to_timestamp(headers["Last-Modified"])
            if not mod_date and 'Date' in headers:
                mod_date = to_timestamp(headers["Date"])

        if not mod_date:
            # no modification date known: default priority, crawl date is now
            now = datetime.now(self._timezone)
            next_crawl_date = time.mktime(now.timetuple())
            return (curi.url, etag, mod_date, next_crawl_date, 1)

        # only reschedule if it has been crawled before
        (prio, next_crawl_date) = self._reschedule_uri(curi)
        return (curi.url, etag, mod_date, next_crawl_date, prio)
Exemplo n.º 2
0
    def _uri_from_curi(self, curi):
        """
        Build the uri tuple for a :class:`CrawlUri` and determine its
        priority.

        The entity tag and the modification date are read from the response
        headers of `curi`. The `Date` header is used as fallback when
        `Last-Modified` yields no usable value. A uri with a modification date
        is handed to `_reschedule_uri`; any other uri gets priority `1` and
        the current time as its next crawl date.

        Returns the tuple `(url, etag, mod_date, next_crawl_date, prio)`.

        Overwrite this method in more specific frontiers.
        """
        def to_timestamp(header_value):
            # header date string -> seconds as computed by `time.mktime`
            parsed = deserialize_date_time(header_value)
            return time.mktime(parsed.timetuple())

        etag = None
        mod_date = None
        headers = curi.rep_header
        if headers:
            if "Etag" in headers:
                etag = headers["Etag"]
            if "Last-Modified" in headers:
                mod_date = to_timestamp(headers["Last-Modified"])
            if not mod_date and 'Date' in headers:
                mod_date = to_timestamp(headers["Date"])

        if not mod_date:
            # no modification date known: default priority, crawl date is now
            now = datetime.now(self._timezone)
            next_crawl_date = time.mktime(now.timetuple())
            return (curi.url, etag, mod_date, next_crawl_date, 1)

        # only reschedule if it has been crawled before
        (prio, next_crawl_date) = self._reschedule_uri(curi)
        return (curi.url, etag, mod_date, next_crawl_date, prio)
Exemplo n.º 3
0
    def __call__(self, msg, out_stream):
        """
        Work on the current `DataMessage` and send the result to `out_stream`.

        Builds a `GET` request for `msg.curi.effective_url` and passes it to
        the client together with the callback created by `handle_response`.

        - The request is conditional (`if_modified_since`) when the request
          headers of the curi contain a `Last-Modified` value.
        - Site credentials are only sent when both the username and the
          password are present in the optional vars of the curi.
        - Proxy settings are applied when the instance has a
          `_proxy_configuration` attribute; `host` and `port` are required
          keys, `user` and `password` are optional.
        """
        curi = msg.curi

        # prepare the HTTPHeaders
        headers = prepare_headers(msg)

        # check if we have a date when the page was last crawled
        last_modified = None
        if curi.req_header and "Last-Modified" in curi.req_header:
            last_modified = deserialize_date_time(
                curi.req_header["Last-Modified"])

        # check if we have username and password present
        auth_username = None
        auth_password = None
        optional_vars = curi.optional_vars
        if (optional_vars
                and CURI_SITE_USERNAME in optional_vars
                and CURI_SITE_PASSWORD in optional_vars):
            auth_username = optional_vars[CURI_SITE_USERNAME]
            auth_password = optional_vars[CURI_SITE_PASSWORD]

        # create the request
        request = HTTPRequest(curi.effective_url,
                              method="GET",
                              headers=headers,
                              auth_username=auth_username,
                              auth_password=auth_password,
                              if_modified_since=last_modified,
                              follow_redirects=self._follow_redirects,
                              max_redirects=self._max_redirects,
                              user_agent=self._user_agent,
                              request_timeout=self._request_timeout,
                              connect_timeout=self._connect_timeout,
                              validate_cert=self._validate_cert)

        if hasattr(self, '_proxy_configuration'):
            proxy = self._proxy_configuration
            request.proxy_host = proxy['host']
            request.proxy_port = proxy['port']
            request.proxy_username = proxy.get('user')
            request.proxy_password = proxy.get('password')

        # pass the url as a logging argument so the message is only
        # formatted when the record is actually emitted
        LOG.info("proc.fetch::request for %s", curi.url)
        self._client.fetch(request, handle_response(msg, out_stream))
Exemplo n.º 4
0
    def __call__(self, msg, out_stream):
        """
        Work on the current `DataMessage` and send the result to `out_stream`.

        Builds a `GET` request for `msg.curi.effective_url` and passes it to
        the client together with the callback created by `handle_response`.

        - The request is conditional (`if_modified_since`) when the request
          headers of the curi contain a `Last-Modified` value.
        - Site credentials are only sent when both the username and the
          password are present in the optional vars of the curi.
        - Proxy settings are applied when the instance has a
          `_proxy_configuration` attribute; `host` and `port` are required
          keys, `user` and `password` are optional.
        """
        curi = msg.curi

        # prepare the HTTPHeaders
        headers = prepare_headers(msg)

        # check if we have a date when the page was last crawled
        last_modified = None
        if curi.req_header and "Last-Modified" in curi.req_header:
            last_modified = deserialize_date_time(
                curi.req_header["Last-Modified"])

        # check if we have username and password present
        auth_username = None
        auth_password = None
        optional_vars = curi.optional_vars
        if (optional_vars
                and CURI_SITE_USERNAME in optional_vars
                and CURI_SITE_PASSWORD in optional_vars):
            auth_username = optional_vars[CURI_SITE_USERNAME]
            auth_password = optional_vars[CURI_SITE_PASSWORD]

        # create the request
        request = HTTPRequest(curi.effective_url,
                              method="GET",
                              headers=headers,
                              auth_username=auth_username,
                              auth_password=auth_password,
                              if_modified_since=last_modified,
                              follow_redirects=self._follow_redirects,
                              max_redirects=self._max_redirects,
                              user_agent=self._user_agent,
                              request_timeout=self._request_timeout,
                              connect_timeout=self._connect_timeout,
                              validate_cert=self._validate_cert)

        if hasattr(self, '_proxy_configuration'):
            proxy = self._proxy_configuration
            request.proxy_host = proxy['host']
            request.proxy_port = proxy['port']
            request.proxy_username = proxy.get('user')
            request.proxy_password = proxy.get('password')

        # pass the url as a logging argument so the message is only
        # formatted when the record is actually emitted
        LOG.info("proc.fetch::request for %s", curi.url)
        self._client.fetch(request, handle_response(msg, out_stream))