def crawl_url(self, async_mode, url, input_message, user_agent, referer, page_last_modified):
        """Build the request header (defaults plus optional per-request
        overrides) and hand *url* to the downloader.

        Args:
            async_mode: truthy to crawl asynchronously; forwarded to the
                downloader and to ``misc.postprocess``.
            url: the URL to fetch.
            input_message: dict of crawl metadata; copied, not mutated.
            user_agent / referer / page_last_modified: optional header
                overrides; each is only applied when not ``None``.

        Returns:
            Whatever ``misc.postprocess`` yields for the downloader result
            processed through ``self.process_crawler_response``.
        """
        # Deep-copy so per-request header tweaks never bleed into the
        # shared default header dict.
        request_header = copy.deepcopy(settings.default_headers)

        # Optional per-request overrides; only set when provided.
        if user_agent is not None:
            request_header['User-Agent'] = user_agent
        if referer is not None:
            request_header["Referer"] = referer
        if page_last_modified is not None:
            request_header['If-Modified-Since'] = page_last_modified

        # Shallow copy: the settings flags added below must not leak back
        # into the caller's message object (the old code aliased it).
        meta = dict(input_message)
        meta["dns_cache_enabled"] = settings.dns_cache_enabled
        meta["chunked_transfer_decoding"] = settings.chunked_transfer_decoding

        # Start crawling.
        result = self._downloader.crawl(
            async_mode,
            url,
            settings.timeout,
            request_header,
            settings.robotstxt_enabled,
            meta)

        # NOTE(review): was self._async_mode — using the explicit argument
        # keeps sync/async handling consistent with the downloader call
        # above and with Downloader.crawl's use of its own parameter.
        return misc.postprocess(async_mode, result, self.process_crawler_response)
# Example #2 (paste-site separator — commented out so the file parses)
    def crawl(self, async_mode, url, timeout, request_header, robotstxt_enabled, meta):
        """Crawl *url* after the class-level preprocess check.

        When preprocess rejects the URL, the failure result is wrapped with
        ``defer_fail`` in async mode (left as-is in sync mode); otherwise the
        fetch is dispatched to the async or sync implementation. Either way
        the outcome is funneled through ``misc.postprocess``.
        """
        ok, outcome = Downloader.preprocess(url, robotstxt_enabled)
        if ok:
            # Pick the matching fetch implementation once, then invoke it.
            fetch = self._crawl_async if async_mode else self._crawl_sync
            outcome = fetch(url, timeout, request_header, meta)
        elif async_mode:
            # Async callers expect a deferred even for early failures.
            outcome = defer_fail(outcome)
        return misc.postprocess(async_mode, outcome, Downloader.postprocess)