Example #1
    def __init__(self, cache=None, cache_file=None, read_cache=True, write_cache=True, use_network=True, 
            user_agent=None, timeout=30, delay=5, proxies=None, proxy_file=None, max_proxy_errors=5,
            opener=None, headers=None, data=None, num_retries=0, num_redirects=1,
            force_html=False, force_ascii=False, max_size=None, default='', pattern=None):
        """
        `cache' is a pdict object to use for the cache
        `cache_file' sets filename to store cached data
        `read_cache' sets whether to read from the cache
        `write_cache' sets whether to write to the cache
        `use_network' sets whether to download content not in the cache
        `user_agent' sets the User Agent to download content with
        `timeout' is the maximum amount of time to wait for an HTTP response
        `delay' is the minimum amount of time (in seconds) to wait after downloading content from a domain per proxy
        `proxies' is a list of proxies to cycle through when downloading content
        `proxy_file' is a filename to read proxies from
        `max_proxy_errors' is the maximum number of consecutive errors allowed per proxy before discarding
            an error is only counted if another proxy is able to successfully download the URL
            set to None to disable
        `opener' sets an optional opener to use instead of using urllib2 directly
        `headers' are the headers to include in the request
        `data' is what to post at the URL
        `num_retries' sets how many times to retry downloading a URL after an error
        `num_redirects' sets how many times the URL is allowed to redirect, to avoid an infinite loop
        `force_html' sets whether to download non-text data
        `force_ascii' sets whether to only return ascii characters
        `max_size' is the maximum number of bytes that will be downloaded, or None to disable the limit
        `default' is what to return when no content can be downloaded
        `pattern' is a regular expression that the downloaded HTML has to match to be considered a valid download
        """
        socket.setdefaulttimeout(timeout)
        need_cache = read_cache or write_cache
        if pdict and need_cache:
            cache_file = cache_file or settings.cache_file
            self.cache = cache or pdict.PersistentDict(cache_file)
        else:
            self.cache = None
            if need_cache:
                common.logger.info('Cache disabled because could not import pdict')

        self.settings = adt.Bag(
            read_cache = read_cache,
            write_cache = write_cache,
            use_network = use_network,
            delay = delay,
            proxies = collections.deque((common.read_list(proxy_file) if proxy_file else []) or proxies or []),
            proxy_file = proxy_file,
            max_proxy_errors = max_proxy_errors,
            user_agent = user_agent,
            opener = opener,
            headers = headers,
            data = data,
            num_retries = num_retries,
            num_redirects = num_redirects,
            force_html = force_html,
            force_ascii = force_ascii,
            max_size = max_size,
            default = default,
            pattern = pattern
        )
        self.last_load_time = self.last_mtime = time.time()
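
A minimal usage sketch for the constructor documented above (not from the original project): it assumes the webscraping package layout implied by these examples (a Download class in a download module), and the cache filename, proxies, and URL are placeholders.

# Hedged sketch: construct a downloader that caches pages to a local file,
# rotates through two placeholder proxies, and retries failed requests twice.
from webscraping import download   # assumed package/module name

D = download.Download(
    cache_file='cache.db',                         # pdict cache location (placeholder)
    proxies=['127.0.0.1:8118', '127.0.0.1:8119'],  # placeholder proxy list
    delay=5,                                       # wait at least 5s per domain per proxy
    num_retries=2,                                 # retry a failed download twice
    user_agent='Mozilla/5.0')                      # override the default User-Agent
html = D.get('http://example.com/')                # cached after the first call
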
Example #2
def parse_proxy(proxy):
    """Parse a proxy into its fragments
    Returns a dict with username, password, host, and port

    >>> f = parse_proxy('login:[email protected]:8080')
    >>> f.username
    'login'
    >>> f.password
    'pw'
    >>> f.host
    '66.197.208.200'
    >>> f.port
    '8080'
    >>> f = parse_proxy('66.197.208.200')
    >>> f.username == f.password == f.port == ''
    True
    >>> f.host
    '66.197.208.200'
    """
    fragments = adt.Bag()
    if isinstance(proxy, basestring):
        match = re.match(
            r'((?P<username>\w+):(?P<password>\w+)@)?(?P<host>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(:(?P<port>\d+))?',
            proxy)
        if match:
            groups = match.groupdict()
            fragments.username = groups.get('username') or ''
            fragments.password = groups.get('password') or ''
            fragments.host = groups.get('host')
            fragments.port = groups.get('port') or ''
    return fragments
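
The doctests above show the parsing itself; as a follow-on illustration (not part of this module), the returned Bag can be reassembled into a proxy URL for a urllib2 opener:

# Illustration only: rebuild a proxy URL from the parsed fragments and
# install it in a urllib2 opener. The proxy address is a placeholder.
import urllib2

fragments = parse_proxy('login:[email protected]:8080')
auth = '%s:%s@' % (fragments.username, fragments.password) if fragments.username else ''
port = ':' + fragments.port if fragments.port else ''
proxy_url = 'http://%s%s%s' % (auth, fragments.host, port)
opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy_url}))
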
Example #3
    def __init__(self,
                 cache=None,
                 cache_file=None,
                 read_cache=True,
                 write_cache=True,
                 use_network=True,
                 user_agent=None,
                 timeout=30,
                 delay=5,
                 proxies=None,
                 proxy_file=None,
                 max_proxy_errors=5,
                 opener=None,
                 headers=None,
                 data=None,
                 num_retries=0,
                 num_redirects=1,
                 force_html=False,
                 force_ascii=False,
                 max_size=None,
                 default='',
                 pattern=None,
                 acceptable_errors=None):
        socket.setdefaulttimeout(timeout)
        need_cache = read_cache or write_cache
        if pdict and need_cache:
            cache_file = cache_file or settings.cache_file
            self.cache = cache or pdict.PersistentDict(cache_file)
        else:
            self.cache = None
            if need_cache:
                common.logger.info(
                    'Cache disabled because could not import pdict')

        self.settings = adt.Bag(
            read_cache=read_cache,
            write_cache=write_cache,
            use_network=use_network,
            delay=delay,
            proxies=(common.read_list(proxy_file) if proxy_file else []) or proxies or [],
            proxy_file=proxy_file,
            max_proxy_errors=max_proxy_errors,
            user_agent=user_agent,
            opener=opener,
            headers=headers,
            data=data,
            num_retries=num_retries,
            num_redirects=num_redirects,
            force_html=force_html,
            force_ascii=force_ascii,
            max_size=max_size,
            default=default,
            pattern=pattern,
            acceptable_errors=acceptable_errors)
        self.last_load_time = self.last_mtime = time.time()
        self.num_downloads = self.num_errors = 0
Example #4
File: async.py  Project: yuzi3150/SeatPJ2
    def __init__(self,
                 url=None,
                 urls=None,
                 url_iter=None,
                 num_threads=20,
                 cb=None,
                 depth=True,
                 max_errors=None,
                 pattern=None,
                 **kwargs):
        self.settings = adt.Bag(read_cache=True,
                                write_cache=True,
                                num_redirects=5,
                                num_retries=2,
                                timeout=20,
                                headers={},
                                num_threads=num_threads,
                                cb=cb,
                                url_iter=url_iter,
                                depth=depth,
                                pattern=pattern)
        self.settings.update(**kwargs)
        self.D = download.Download(**kwargs)
        self.kwargs = kwargs
        # queue of html to be written to cache
        self.cache_queue = []
        # URLs that are waiting to download
        self.download_queue = collections.deque()
        if urls:
            self.download_queue.extend(urls)
        if url:
            self.download_queue.append(url)  # XXX create compressed dict data type for large in memory?
        # URLs currently downloading
        self.processing = {}
        # deferreds that are downloading
        self.downloading = []
        # URLs that have been found before
        self.found = adt.HashDict()
        for url in self.download_queue:
            self.found[url] = True
        self.state = download.State()
        self.max_errors = max_errors
        self.num_errors = 0  # counter for the number of consecutive errors
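
The **kwargs here are forwarded both to the settings Bag and to download.Download, so any constructor option from Examples #1 and #3 can be passed straight through. Below is a hedged sketch of a crawl callback; the cb(D, url, html) signature and the convention that it returns the next URLs to enqueue are assumptions drawn from how cb and depth are used, not confirmed by this snippet.

# Hedged sketch of a crawl callback; signature and return value are assumptions.
import re

def links_cb(D, url, html):
    # scrape whatever is needed from html here, then return follow-up URLs
    return re.findall(r'<a[^>]+href="(http://example\.com/[^"]+)"', html)

# hypothetical instantiation; substitute the real class defined in async.py:
# crawler = AsyncCrawler(url='http://example.com/', cb=links_cb, num_threads=10)
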
Example #5
    def get(self, url, **kwargs):
        """Download this URL and return the HTML. 
        By default HTML is cached so only have to download once.

        url:
            what to download
        kwargs:
            override any of the arguments passed to the constructor
        """
        self.reload_proxies()
        self.proxy = None  # the current proxy
        self.final_url = None  # for tracking redirects
        self.response_code = ''  # keep response code
        self.response_headers = {}  # keep response headers
        self.downloading_error = None  # keep downloading error
        self.num_downloads = self.num_errors = 0  # track the number of downloads made

        # update settings with any local overrides
        settings = adt.Bag(self.settings)
        settings.update(kwargs)
        # check cache for whether this content is already downloaded
        key = self.get_key(url, settings.data)
        if self.cache and settings.read_cache:
            try:
                html = self.cache[key]
                if self.invalid_response(html, settings.pattern):
                    # invalid result from download
                    html = None
            except KeyError:
                pass  # have not downloaded yet
            else:
                if not html and settings.num_retries > 0:
                    # try downloading again
                    common.logger.debug('Redownloading')
                    settings.num_retries -= 1
                else:
                    # return previously downloaded content
                    return html or settings.default
        if not settings.use_network:
            # only want previously cached content
            return settings.default

        html = None
        failed_proxies = set()  # record which proxies failed to download for this URL
        # attempt downloading content at URL
        while settings.num_retries >= 0 and html is None:
            settings.num_retries -= 1
            if settings.proxy:
                self.proxy = settings.proxy
            else:
                self.proxy = self.get_proxy(settings.proxies)
            # crawl slowly for each domain to reduce risk of being blocked
            self.throttle(url, delay=settings.delay, proxy=self.proxy)
            html = self.fetch(url,
                              headers=settings.headers,
                              data=settings.data,
                              proxy=self.proxy,
                              user_agent=settings.user_agent,
                              opener=settings.opener,
                              pattern=settings.pattern,
                              max_size=settings.max_size)

            if html:
                # successfully downloaded
                self.num_downloads += 1
                if settings.max_proxy_errors is not None:
                    Download.proxy_performance.success(self.proxy)
                    # record which proxies failed for this download
                    for proxy in failed_proxies:
                        if Download.proxy_performance.error(proxy) > settings.max_proxy_errors:
                            # this proxy has had too many errors so remove it
                            common.logger.warning(
                                'Removing unstable proxy from list after %d consecutive errors: %s'
                                % (settings.max_proxy_errors, proxy))
                            settings.proxies.remove(proxy)
            else:
                # download failed - try again
                self.num_errors += 1
                failed_proxies.add(self.proxy)

        if html:
            if settings.num_redirects > 0:
                # allowed to redirect
                redirect_url = get_redirect(url=url, html=html)
                if redirect_url:
                    # found a redirection
                    common.logger.debug('%s redirecting to %s' %
                                        (url, redirect_url))
                    settings.num_redirects -= 1
                    html = self.get(redirect_url, **settings) or ''
                    # make relative links absolute so will still work after redirect
                    relative_re = re.compile(
                        '(<\s*a[^>]+href\s*=\s*["\']?)(?!http)([^"\'>]+)',
                        re.IGNORECASE)
                    try:
                        html = relative_re.sub(
                            lambda m: m.group(1) + urlparse.urljoin(
                                url, m.group(2)), html)
                    except UnicodeDecodeError:
                        pass
            html = self._clean_content(html=html,
                                       max_size=settings.max_size,
                                       force_html=settings.force_html,
                                       force_ascii=settings.force_ascii)

        if self.cache and settings.write_cache:
            # cache results
            self.cache[key] = html
            if url != self.final_url:
                # cache what URL was redirected to
                self.cache.meta(key, dict(url=self.final_url))

        # return default if no content
        return html or settings.default
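
Because kwargs override the constructor settings per call, individual requests can tighten or relax behaviour. A small sketch, assuming a Download instance D as in the earlier examples and placeholder URLs:

# Per-call overrides of the constructor settings (placeholder URLs).
html = D.get('http://example.com/page', read_cache=False, num_retries=2)

# only accept (and cache) the page if it matches a pattern
html = D.get('http://example.com/search?q=test', pattern='<div id="results">')

# POST by supplying data, and fall back to the default string on failure
html = D.get('http://example.com/login', data='user=abc&pass=xyz', default='')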