Example #1
 def __init__(self,
              delay=DEFAULT_DELAY,
              user_agent=DEFAULT_AGENT,
              proxies=None,
              num_retries=DEFAULT_RETRIES,
              timeout=DEFAULT_TIMEOUT,
              opener=None,
              cache=None):
     """
     对download 方法进行封装
     :param delay:
     :param user_agent:
     :param proxies:
     :param num_retries:
     :param timeout:
     :param opener:
     :param cache:
     """
     socket.setdefaulttimeout(timeout)
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.num_retries = num_retries
     self.opener = opener
     self.cache = cache
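
Note that the snippet depends on module-level defaults (DEFAULT_DELAY, DEFAULT_AGENT, DEFAULT_RETRIES, DEFAULT_TIMEOUT) that are not shown. A minimal sketch of what those definitions might look like; the concrete values here are assumptions, not taken from the example:

DEFAULT_DELAY = 5        # seconds between requests to the same domain (assumed value)
DEFAULT_AGENT = 'wswp'   # default User-agent string (assumed value)
DEFAULT_RETRIES = 1      # retries on 5xx server errors (assumed value)
DEFAULT_TIMEOUT = 60     # default socket timeout in seconds (assumed value)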
Example #2
 def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None, num_retries=DEFAULT_RETRIES,
              timeout=DEFAULT_TIMEOUT, opener=None, cache=None):
     """
     对download 方法进行封装
     :param delay:
     :param user_agent:
     :param proxies:
     :param num_retries:
     :param timeout:
     :param opener:
     :param cache:
     """
     socket.setdefaulttimeout(timeout)
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.num_retries = num_retries
     self.opener = opener
     self.cache = cache
Example #3
import logging
import random
import socket
import urllib2
import urlparse


class Downloader:
    def __init__(self,
                 delay=DEFAULT_DELAY,
                 user_agent=DEFAULT_AGENT,
                 proxies=None,
                 num_retries=DEFAULT_RETRIES,
                 timeout=DEFAULT_TIMEOUT,
                 opener=None,
                 cache=None):
        """
        对download 方法进行封装
        :param delay:
        :param user_agent:
        :param proxies:
        :param num_retries:
        :param timeout:
        :param opener:
        :param cache:
        """
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        # used for the multithreading demo
        # time.sleep(0.5)
        if self.cache:
            try:
                # logging.debug(self.cache)
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error so ignore result from cache and re-download
                    result = None
        if result is None:
            logging.info('URL:%s not in cache!' % url)
            # result was not loaded from cache so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url,
                                   headers,
                                   proxy=proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        else:
            logging.info('URL:%s hit cache!' % url)
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        # logging.info( 'Downloading:', url)
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy,
                                         num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}


# class Throttle:
#     """Throttle downloading by sleeping between requests to same domain
#     """
#
#     def __init__(self, delay):
#         # amount of delay between downloads for each domain
#         self.delay = delay
#         # timestamp of when a domain was last accessed
#         self.domains = {}
#
#     def wait(self, url):
#         """Delay if have accessed this domain recently
#         """
#         domain = urlparse.urlsplit(url).netloc
#         last_accessed = self.domains.get(domain)
#         if self.delay > 0 and last_accessed is not None:
#             sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
#             if sleep_secs > 0:
#                 time.sleep(sleep_secs)
#         self.domains[domain] = datetime.now()
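
For context, here is a minimal usage sketch of the Downloader class above, assuming the DEFAULT_* constants and a live Throttle class (like the commented-out one) are defined. The URL and the plain-dict cache are placeholders for illustration, not part of the original example:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    cache = {}  # any dict-like object keyed by URL will do
    download = Downloader(delay=1, num_retries=2, cache=cache)
    html = download('http://example.com')  # not cached yet: downloads
    html = download('http://example.com')  # served from the cache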
Example #4
import logging
import random
import socket
import urllib2
import urlparse


class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None, num_retries=DEFAULT_RETRIES,
                 timeout=DEFAULT_TIMEOUT, opener=None, cache=None):
        """
        对download 方法进行封装
        :param delay:
        :param user_agent:
        :param proxies:
        :param num_retries:
        :param timeout:
        :param opener:
        :param cache:
        """
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        # used for the multithreading demo
        # time.sleep(0.5)
        if self.cache:
            try:
                # logging.debug(self.cache)
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error so ignore result from cache and re-download
                    result = None
        if result is None:
            logging.info('URL:%s not in cache!' % url)
            # result was not loaded from cache so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        else:
            logging.info('URL:%s hit cache!' % url)
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        # logging.info( 'Downloading:', url)
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}


# class Throttle:
#     """Throttle downloading by sleeping between requests to same domain
#     """
#
#     def __init__(self, delay):
#         # amount of delay between downloads for each domain
#         self.delay = delay
#         # timestamp of when a domain was last accessed
#         self.domains = {}
#
#     def wait(self, url):
#         """Delay if have accessed this domain recently
#         """
#         domain = urlparse.urlsplit(url).netloc
#         last_accessed = self.domains.get(domain)
#         if self.delay > 0 and last_accessed is not None:
#             sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
#             if sleep_secs > 0:
#                 time.sleep(sleep_secs)
#         self.domains[domain] = datetime.now()
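
The examples above are Python 2 code (urllib2, urlparse, and the print statement). For reference, here is a rough Python 3 sketch of the download method using the standard-library equivalents urllib.request and urllib.parse, under the assumption that the rest of the class is ported unchanged:

import urllib.request
import urllib.parse

def download(self, url, headers, proxy, num_retries, data=None):
    request = urllib.request.Request(url, data, headers or {})
    opener = self.opener or urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()  # note: bytes in Python 3, not str
        code = response.code
    except Exception as e:
        print('Download error:', e)
        html = ''
        code = getattr(e, 'code', None)
        if code is not None and num_retries > 0 and 500 <= code < 600:
            # retry 5XX HTTP errors
            return self.download(url, headers, proxy, num_retries - 1, data)
    return {'html': html, 'code': code}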