Example #1
class Spider(object):
    """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
    """
    def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={}, plugin=[], depth=3, max_url_num=300,
                 internal_timeout=60, spider_timeout=6*3600, crawler_mode=0, same_origin=True, dynamic_parse=False,
                 spider_type='img'):
        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode             # crawler model (0: multi-thread, 1: gevent)
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)                  # size of the fetcher worker pool

        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        # self.crawler_pool = []
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num*10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num*10000)

        self.fetcher_cache = UrlCache()                     # URL caches
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = ['js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg', 'exe', 'rar', 'zip']

        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))       # union of default and user-supplied tags
        self.same_origin = same_origin
        self.depth = depth                              # crawl depth limit
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin   # plugins registered for use by the crawler
        self.custom_headers = custom_headers
        self.unspider_url_list = []
        self.spider_type = spider_type

    def _start_fetcher(self):
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)
        # self.crawler_pool = [gevent.spawn(self.crawler) for _ in xrange(self.concurrent_num)]

    def start(self):
        logging.info("spider starting...")

        if self.crawler_mode == 0:
            logging.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            logging.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()      # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)

            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()

        except Timeout:
            logging.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()

        unspider_url_list = list(set(self.unspider_url_list))          # deduplicate the unvisited URL list
        unspider_url_list.sort(key=self.unspider_url_list.index)       # keep the original ordering
        redis_key = IMG_UNSPIDER_URL_KEY         # redis_key selects which kind of data the spider stores

        try:
            for url_link in unspider_url_list:
                with global_redis.pipeline() as pipe:
                    pipe.lpush(redis_key, url_link).ltrim(redis_key, 0, 100).expire(redis_key, 72000).execute()
        except Exception:
            logging.error("failed to store unvisited urls")

        logging.info("crawler_cache:%s fetcher_cache:%s" % (len(self.crawler_cache), len(self.fetcher_cache)))
        logging.info("spider process quit.")

    def crawler(self, _dep=None):
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()         # maintain the fetcher/crawler pools
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth+1
                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]

                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:     # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if len(self.fetcher_cache) == self.max_url_num:   # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:     # dynamically invoke each plugin registered at init
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
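
A minimal usage sketch for the Spider class above; it is not part of the original example. It assumes the same module context (Spider, UrlData, to_unicode already imported). Seeding the start URL through fetcher_cache/fetcher_queue simply mirrors what crawler() does for discovered links, since this excerpt does not show a dedicated feed method.

spider = Spider(concurrent_num=10,
                crawl_tags=['img', 'link'],   # custom tags, merged with default_crawl_tags
                depth=2,                      # follow links at most two levels deep
                max_url_num=100,              # stop collecting after 100 URLs
                crawler_mode=1,               # 1: gevent model, 0: multi-thread model
                dynamic_parse=False)

seed = UrlData(to_unicode('http://example.com/'), depth=0)   # placeholder start URL
spider.fetcher_cache.insert(seed)
spider.fetcher_queue.put(seed)

spider.start()   # blocks until the stop event is set, then flushes unvisited URLs to redis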
Example #2
def url_read(url):
  """Grab HTML content from the url"""

  urlc = UrlCache(url)
  return urlc.read()
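
A one-line usage sketch, assuming the UrlCache(url).read() behaviour shown above; the URL is only a placeholder.

html = url_read('http://example.com/')   # returns the page's HTML as a string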