Example #1
    def test_override_default_params(self):
        for key, val in defaults.REDIS_PARAMS.items():
            self.expected_params[key] = self.settings['REDIS_PARAMS'][key] = object()

        server = from_settings(self.settings)
        assert server is self.redis_cls.return_value
        self.redis_cls.assert_called_with(**self.expected_params)
Example #2
 def from_settings(cls, settings):
     server = connection.from_settings(settings)
     # Create a one-time key, needed to support using this class as a
     # standalone dupefilter with Scrapy's default scheduler. If Scrapy
     # passed the spider to open(), this key wouldn't be needed.
     key = "dupefilter:%s" % int(time.time())
     return cls(server, key)
Example #3
    def from_settings(cls, settings):
        kwargs = {
            'persist': settings.getbool('SCHEDULER_PERSIST'),
            'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
            'idle_before_close':
            settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
        }

        # If these values are missing, it means we want to use the defaults.
        optional = {
            # TODO: Use custom prefixes for these settings to note that they
            # are specific to scrapy-redis.
            'queue_key': 'SCHEDULER_QUEUE_KEY',
            'queue_cls': 'SCHEDULER_QUEUE_CLASS',
            'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
            # We use the default setting name to keep compatibility.
            'dupefilter_cls': 'DUPEFILTER_CLASS',
            'serializer': 'SCHEDULER_SERIALIZER',
        }
        for name, setting_name in optional.items():
            val = settings.get(setting_name)
            if val:
                kwargs[name] = val

        # Support serializer as a path to a module.
        if isinstance(kwargs.get('serializer'), six.string_types):
            kwargs['serializer'] = importlib.import_module(
                kwargs['serializer'])

        # Redis Server connection
        server = connection.from_settings(settings)
        # Ensure the connection is working.
        server.ping()

        return cls(server=server, settings=settings, **kwargs)
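
The optional keys above map one-to-one onto scrapy-redis setting names. For reference, a minimal settings.py sketch (the values are illustrative placeholders, not authoritative defaults, and the class paths assume a recent scrapy-redis release):

# Illustrative settings fragment for the Scheduler.from_settings() above.
# Values are placeholders; class paths depend on the installed scrapy-redis.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_PERSIST = True
SCHEDULER_FLUSH_ON_START = False
SCHEDULER_IDLE_BEFORE_CLOSE = 0
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER_SERIALIZER = 'scrapy_redis.picklecompat'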
Example #4
class GxzfcgFactorySpider(RedisSpider):
    name = "gxzfcg_spider"
    name_pre = 'gxzfcg'
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        node_name_pre = settings['NODE_NAME']
        website_pre = '广西壮族自治区政府采购网'
        level_pre = response.xpath('//*[@id="channelBody"]/div[1]/a[2]/text()').extract()[0]
        type_pre = response.xpath('//*[@id="channelBody"]/div[1]/a[3]/text()').extract()[0]
        ul = response.xpath("//*[@id=\"channelBody\"]/div[2]/ul/li")
        for li in ul:
            item = TenderItem()
            item['node_name'] = node_name_pre
            item['website'] = website_pre
            item['level'] = level_pre
            item['type'] = type_pre
            item['title'] = li.xpath("a/@title").extract()[0]
            item['date'] = li.xpath("span[@class=\"date\"]/text()").extract()[0]
            item['url'] = 'http://www.gxzfcg.gov.cn' + li.xpath("a/@href").extract()[0]
            article = newspaper.Article(
                'http://www.gxzfcg.gov.cn' + li.xpath("a/@href").extract()[0],
                language='zh', fetch_images=False)
            article.download()
            article.parse()
            item['content'] = article.text
            # Generation timestamp
            now_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            item['add_time'] = now_time
            item['update_time'] = now_time
            # print(item)
            yield item
Example #5
    def test_override_default_params(self):
        for key, val in DEFAULT_PARAMS.items():
            self.expected_params[key] = self.settings['REDIS_PARAMS'][key] = object()

        server = from_settings(self.settings)
        assert server is self.redis_cls.return_value
        self.redis_cls.assert_called_with(**self.expected_params)
Example #6
    def setup_redis(self):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # The idle signal is fired when the spider has no requests left;
        # that's when we schedule new requests from the redis queue.
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.schedule_rest_requests,
                                     signal=signals.item_scraped)
        self.crawler.signals.connect(self.schedule_rest_requests,
                                     signal=signals.item_dropped)
        self.crawler.signals.connect(self.schedule_rest_requests,
                                     signal=signals.request_scheduled)
        self.crawler.signals.connect(self.schedule_rest_requests,
                                     signal=signals.response_received)
        self.crawler.signals.connect(self.schedule_rest_requests,
                                     signal=signals.response_downloaded)

        self.crawler.signals.connect(self.__start_loop,
                                     signal=signals.spider_opened)
        self.crawler.signals.connect(self.__stop_loop,
                                     signal=signals.spider_closed)

        log.msg("Reading URLs from redis list '%s'" % self.redis_key,
                level=log.INFO)
        self.touch()
Example #7
 def __init__(self, settings, crawler):
     RetryMiddleware.__init__(self, settings)
     self.redis = connection.from_settings(settings)
     # Ensure the connection is working.
     self.redis.ping()
     #self.redis = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'), crawler.settings.get('REDIS_PORT', 6379)))
     initCookie(self.redis, crawler.spider.name)
Example #8
 def from_settings(cls, settings):
     kwargs = {
         'persist': settings.getbool('SCHEDULER_PERSIST'),
         'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
         'idle_before_close':
         settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
     }
     # If these values are missing, it means we want to use the defaults.
     optional = {
         # TODO: Use custom prefixes for these settings to note that they
         # are specific to scrapy-redis.
         'queue_key': 'SCHEDULER_QUEUE_KEY',
         'queue_cls': 'SCHEDULER_QUEUE_CLASS',
         'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
         # We use the default setting name to keep compatibility.
         'dupefilter_cls': 'DUPEFILTER_CLASS',
         'serializer': 'SCHEDULER_SERIALIZER',
     }
     for name, setting_name in optional.items():
         val = settings.get(setting_name)
         if val:
             kwargs[name] = val
     # Support serializer as a path to a module.
     if isinstance(kwargs.get('serializer'), six.string_types):
         kwargs['serializer'] = importlib.import_module(
             kwargs['serializer'])
     server = connection.from_settings(settings)
     # Ensure the connection is working.
     server.ping()
     return cls(server=server, **kwargs)
Example #9
    def from_settings(cls, settings):
        params = {}
        params['client'] = connection.from_settings(settings)  # connection.from_settings returns a redis client built from settings
        if settings.get("REDIS_SIMHASH_KEY"):
            params['key'] = settings["REDIS_SIMHASH_KEY"]

        return cls(**params)
Example #10
 def __init__(self):
     settings = get_project_settings()
     self.queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
     self.server = connection.from_settings(settings)
     self.headers = {
         "Host": "www.zhihu.com",
         "Connection": "keep-alive",
         "Cache-Control": "max-age=0",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
         "Referer": "http://www.zhihu.com/people/raymond-wang",
         "Accept-Encoding": "gzip,deflate,sdch",
         "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
     }
     self.cookies = {
         '_za': r'bda5810c-88f0-40a8-8d2b-d9be0e0c58a9',
         'q_c1': r'28f9a453b53a482486644378553c3a10|1447162001000|1447162001000',
         '_xsrf': r'4307a4b2977f25efbdacbd89edf2e789',
         'cap_id': r'"OThkOGIwMDVkMDllNGZmMzkzN2JkY2MzNzhhMmZjZWQ=|1448186640|774a87a7e0bd5ecec150a0d4bed38b570859c822"',
         'z_c0': r'"QUFBQUF1VWdBQUFYQUFBQVlRSlZUUjBnZVZZM0ptcEVROU9YSzZ3bXpUUEJXQm0zSUkxSFl3PT0=|1448186653|1eb9dfd0eff895cab5c818fd97d103a17d557dfe"',
         'unlock_ticket': r'"QUFBQUF1VWdBQUFYQUFBQVlRSlZUU1dhVVZhcmFDck02VUROeVV3c1oyRHQ1aWduQmVLYWdRPT0=|1448186653|c734f11184740390f0b34536e218952aabdcff46"',
         '__utmt': r'1',
         '__utma': r'51854390.16347795.1448186642.1448186642.1448186642.1',
         '__utmb': r'51854390.18.10.1448186642',
         '__utmc': r'51854390',
         '__utmz': r'51854390.1448186642.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
         '__utmv': r'51854390.100-1|2=registration_date=20131118=1^3=entry_date=20131118=1'
     }
     super(ZhihuNotGenRequestSpider, self).__init__()
Example #11
 def from_settings(cls, settings):
     server = connection.from_settings(settings)
     # Create a one-time key, needed to support using this class as a
     # standalone dupefilter with Scrapy's default scheduler. If Scrapy
     # passed the spider to open(), this key wouldn't be needed.
     key = "dupefilter:%s" % int(time.time())
     return cls(server, key)
Example #12
    def test_redis_default(self):
        settings = Settings()

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 6379)
Example #13
    def test_redis_default(self):
        settings = Settings()

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 6379)
Example #14
    def from_settings(cls, settings):
        params = {
            'client': connection.from_settings(settings),
        }
        if settings.get('REDIS_SIMHASH_KEY'):
            params['key'] = settings['REDIS_SIMHASH_KEY']

        return cls(**params)
Example #15
    def from_crawler(cls, crawler):
        settings = crawler.settings
        server = connection.from_settings(settings)

        s = cls(server, settings)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s
Example #16
 def from_crawler(cls, crawler):
     settings = crawler.settings
     slave_key = settings.get('REDIS_START_URLS_KEY')
     master_key = settings.get('REDIS_START_URLS_MASTER_KEY')
     judge_key = settings.get('REDIS_JUDGE_KEY')
     scan_page = settings.get('SCAN_PAGE')
     server = connection.from_settings(settings)
     s = cls(server, slave_key, master_key, judge_key, scan_page)
     return s
Example #17
    def test_redis_host_port_fallback(self):
        settings = Settings(
            dict(REDIS_HOST='baz', REDIS_PORT=1337, REDIS_URL=None))

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'baz')
        self.assertEqual(connect_args['port'], 1337)
Example #18
    def from_settings(cls, settings):
        params = {
            'client': connection.from_settings(settings),
        }
        if settings.get('REDIS_START_URLS_KEY'):
            params['start_url_key'] = settings['REDIS_START_URLS_KEY']
        if settings.get('REDIS_START_URLS_AS_SET'):
            params['start_url_as_set'] = settings['REDIS_START_URLS_AS_SET']

        return cls(**params)
Example #19
    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY',
                defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING',
                                               defaults.REDIS_ENCODING)

        self.logger.info(
            "Reading start URLs from redis key '%(redis_key)s' "
            "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
            self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is fired when the spider has no requests left;
        # that's when we schedule new requests from the redis queue.
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
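
The setup_redis() above reads three spider-related settings. A minimal settings.py sketch (setting names are taken from the example; the values shown are placeholders):

# Illustrative settings fragment for the setup_redis() above.
REDIS_START_URLS_KEY = '%(name)s:start_urls'   # expanded with the spider name
REDIS_START_URLS_BATCH_SIZE = 16               # falls back to CONCURRENT_REQUESTS
REDIS_ENCODING = 'utf-8'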
Example #20
    def from_settings(cls, settings):
        params = {
            'server': connection.from_settings(settings),
        }
        if settings.get('REDIS_ITEMS_KEY'):
            params['key'] = settings['REDIS_ITEMS_KEY']
        if settings.get('REDIS_ITEMS_SERIALIZER'):
            params['serialize_func'] = load_object(
                settings['REDIS_ITEMS_SERIALIZER'])

        return cls(**params)
Example #21
    def test_redis_host_port(self):
        settings = Settings({
            'REDIS_HOST': 'localhost',
            'REDIS_PORT': 9001,
        })

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 9001)
Example #22
    def test_redis_host_port(self):
        settings = Settings({
            'REDIS_HOST': 'localhost',
            'REDIS_PORT': 9001,
        })

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 9001)
Example #23
    def test_redis_url(self):
        settings = Settings({
            'REDIS_URL': 'redis://*****:*****@localhost:9001/42',
        })

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 9001)
        self.assertEqual(connect_args['password'], 'bar')
        self.assertEqual(connect_args['db'], 42)
Example #24
    def test_redis_host_port_fallback(self):
        settings = Settings(dict(
            REDIS_HOST='baz',
            REDIS_PORT=1337,
            REDIS_URL=None
        ))

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'baz')
        self.assertEqual(connect_args['port'], 1337)
Example #25
    def test_redis_url(self):
        settings = Settings({
            'REDIS_URL': 'redis://*****:*****@localhost:9001/42',
        })

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 9001)
        self.assertEqual(connect_args['password'], 'bar')
        self.assertEqual(connect_args['db'], 42)
Example #26
    def test_redis_url_precedence(self):
        settings = Settings(
            dict(REDIS_HOST='baz',
                 REDIS_PORT=1337,
                 REDIS_URL='redis://*****:*****@localhost:9001/42'))

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 9001)
        self.assertEqual(connect_args['password'], 'bar')
        self.assertEqual(connect_args['db'], 42)
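
The connection tests above cover the main ways from_settings() can be configured. A hedged settings sketch mirroring those fixtures (the credentials are masked in the source and kept as placeholders here):

# Host/port pair, used when no REDIS_URL is set.
REDIS_HOST = 'localhost'
REDIS_PORT = 9001
# A full URL takes precedence over REDIS_HOST/REDIS_PORT when both are given.
# REDIS_URL = 'redis://<user>:<password>@localhost:9001/42'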
Example #27
class IsvServiceInfoFactorySpider(RedisSpider):
    name = "isv_service_info_factory"
    start_urls = [
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1'
    ]
    redis_server = connection.from_settings(settings)
    count = 0

    def parse(self, response):
        self.count = 0
        self.redis_server.delete('isv_service_info:items')
        self.redis_server.delete('isv_service_info:dupefilter')
        self.redis_server.delete('isv_service_info:start_urls')
        f = codecs.open('service-code-list.csv', 'r', 'utf-8')
        for datas in f.readlines():
            data = datas[:-1].split(',')
            print data[1]
            self.generate_url(data[0])
        f.close()

    def generate_url(self, service_code):
        # Sleep for a random 0-3 seconds
        time.sleep(random.random() * 3)
        url = 'https://fuwu.taobao.com/ser/detail.html?service_code='
        url += service_code
        html = requests.get(url).text
        selector = etree.HTML(html)
        company_url = selector.xpath(
            '//*[@id="apc-detail"]/div[1]/div/div/p[1]/a/@href')
        # Handle the case where there is no company service list, only a service detail page
        if not company_url:
            self.redis_server.lpush('isv_service_info:start_urls', url)
            return
        company_url = company_url[0]
        isv_id = re.search('isv_id=(.*?)&', company_url + '&').group(1)
        company_url = 'https://fuwu.taobao.com/serv/shop_index.htm?isv_id='
        company_url += isv_id
        html = requests.get(company_url).text
        selector = etree.HTML(html)
        ul = selector.xpath('//*[@id="seller-header"]/div[2]/div[2]/div/ul/li')
        for li in ul:
            tab_type = li.xpath('span/b/a/text()')[0]
            if '服务列表' == tab_type:
                service_urls = li.xpath('span/b/a/@href')[0]
                service_urls = 'https://fuwu.taobao.com/serv/' + service_urls
                self.redis_server.lpush('isv_service_info:start_urls',
                                        service_urls)
                print service_urls
                self.count += 1
                print self.count
                break
Example #28
    def setup_redis(self):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # The idle signal is fired when the spider has no requests left;
        # that's when we schedule new requests from the redis queue.
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        log.msg("Reading URLs from redis list '%s'" % self.redis_key, level=log.INFO)
Example #29
    def test_redis_url_precedence(self):
        settings = Settings(dict(
            REDIS_HOST='baz',
            REDIS_PORT=1337,
            REDIS_URL='redis://*****:*****@localhost:9001/42'
        ))

        server = connection.from_settings(settings)
        connect_args = server.connection_pool.connection_kwargs

        self.assertEqual(connect_args['host'], 'localhost')
        self.assertEqual(connect_args['port'], 9001)
        self.assertEqual(connect_args['password'], 'bar')
        self.assertEqual(connect_args['db'], 42)
Example #30
 def __init__(self, settings):
     self.request_count = settings.getint('MYEXT_ITEMCOUNT', 1000)
     self.request_num = 0
     # self.scheduler = scheduler
     self.request = None
     self.no_meet = True  # whether a seed request has been encountered yet
     self.path_base = settings.get("SEED_FILE_PATH")
     self.server = connection.from_settings(settings)
     use_set = settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
     request_set = settings.get("SCHEDULER_QUEUE_CLASS")
     self.fetch_one = self.server.spop if use_set else self.server.lpop
     self.add_one = self.server.sadd if use_set else self.server.lpush
     self.get_num = self.server.llen if "LifoQueue" in request_set or "FifoQueue" in request_set else self.server.zcard
     self.get_startnum = self.server.scard if use_set else self.server.llen
     self.split_num = settings.get("SPLIT_NUM")
     self.path_split = None
Example #31
class IsvServiceInfoFactorySpider(RedisSpider):
    name = "cycle_run"
    start_urls = [
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1'
    ]
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        self.runTask(self.work, hour=4)

    def work(self):
        self.redis_server.lpush(
            'isv_service_info_factory:start_urls',
            'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1'
        )
        print "更新周期到"

    def runTask(self, func, day=0, hour=0, min=0, second=0):
        # Init time
        now = datetime.now()
        strnow = now.strftime('%Y-%m-%d %H')
        # print "now:", strnow
        # First next run time
        period = timedelta(days=day, hours=hour, minutes=min, seconds=second)
        next_time = now + period
        strnext_time = next_time.strftime('%Y-%m-%d %H')
        # print "next run:", strnext_time
        while True:
            # Get system current time
            iter_now = datetime.now()
            iter_now_time = iter_now.strftime('%Y-%m-%d %H')
            if str(iter_now_time) == str(strnext_time):
                # Get every start work time
                # print "start work: %s" % iter_now_time
                # Call task func
                func()
                # print "task done."
                # Get next iteration time
                iter_time = iter_now + period
                strnext_time = iter_time.strftime('%Y-%m-%d %H')
                # print "next_iter: %s" % strnext_time
                # Continue next iteration
                continue
            # Check once every 10 minutes (600 seconds)
            time.sleep(600)
Example #32
    def setup_redis(self):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # The idle signal is fired when the spider has no requests left;
        # that's when we schedule new requests from the redis queue.
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped,
                                     signal=signals.item_scraped)
        #         self.crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
        #         self.crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

        self.log("Reading URLs from redis list '%s'" % self.redis_key)
Example #33
class GxzfcgFactorySpider(RedisSpider):
    name = "gxzfcg_factory"
    name_pre = 'gxzfcg'
    # allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.gxzfcg.gov.cn/CmsNewsController/recommendBulletinList/channelCode-cgxx/20/page_1.html"
    ]
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        level_xpaths = [
            "//*[@id=\"bodyMain\"]/div/aside/div/nav/ul/li[1]/ul/li",
            "//*[@id=\"bodyMain\"]/div/aside/div/nav/ul/li[2]/ul/li"
        ]
        level_names = ["区本级采购", "市(县)级采购"]
        i = 0
        for level_xpath in level_xpaths:
            level_name = level_names[i]
            i += 1
            ul = response.xpath(level_xpath)
            for li in ul:
                item = TenderItem()
                item['node_name'] = settings['NODE_NAME']
                item['website'] = '广西壮族自治区政府采购网'
                item['level'] = level_name
                item['type'] = li.xpath("a/text()").extract()[0]
                next_page_url = 'http://www.gxzfcg.gov.cn' + li.xpath(
                    "a/@href").extract()[0]
                yield scrapy.Request(next_page_url,
                                     callback=self.parse_news,
                                     meta={'item': item})

    def parse_news(self, response):
        page_nums = re.search(
            u'页次:1/(.*?)页',
            response.xpath("//*[@id=\"QuotaList_paginate\"]/span[1]/text()").
            extract()[0]).group(1)
        for page_num in range(1, int(page_nums) + 1):
            next_page_url = re.sub('page_(.*?).html',
                                   'page_' + str(page_num) + '.html',
                                   response.url)
            self.redis_server.lpush('%s_spider:start_urls' % self.name_pre,
                                    next_page_url)
            print next_page_url
Example #34
    def from_settings(cls, settings):
        if os.environ.get('spider_set_persist'):
            persist = (os.environ.get('spider_set_persist') != 'False')
        else:
            persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)

        if os.environ.get('need_dupefilter'):
            need_dupefilter = (os.environ.get('need_dupefilter') != 'False')
        else:
            need_dupefilter = True

        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(
            settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE',
                                         IDLE_BEFORE_CLOSE)
        server = connection.from_settings(settings)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key,
                   idle_before_close, need_dupefilter)
Example #35
    def setup_redis(self):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if not self.redis_key:
            self.redis_key = "%s:start_urls" % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # The idle signal is fired when the spider has no requests left;
        # that's when we schedule new requests from the redis queue.
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.item_scraped)
        self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.item_dropped)
        self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.request_scheduled)
        self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.response_received)
        self.crawler.signals.connect(self.schedule_rest_requests, signal=signals.response_downloaded)

        self.crawler.signals.connect(self.__start_loop, signal=signals.spider_opened)
        self.crawler.signals.connect(self.__stop_loop, signal=signals.spider_closed)

        log.msg("Reading URLs from redis list '%s'" % self.redis_key, level=log.INFO)
        self.touch()
Example #36
 def from_settings(cls, settings):
     params = {
         'redis_conn': connection.from_settings(settings)
     }
     return cls(**params)
Example #37
 def test_default_params(self):
     server = from_settings(self.settings)
     assert server is self.redis_cls.return_value
     self.redis_cls.assert_called_with(**dict(defaults.REDIS_PARAMS, **self.expected_params))
Example #38
 def test_redis_cls_custom_path(self):
     self.settings['REDIS_PARAMS']['redis_cls'] = 'mock.Mock'
     server = from_settings(self.settings)
     assert isinstance(server, mock.Mock)
Example #39
 def test_redis_cls_default(self):
     server = from_settings(Settings())
     assert isinstance(server, defaults.REDIS_CLS)
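
The REDIS_PARAMS tests above override individual client keyword arguments, and test_redis_cls_custom_path shows that redis_cls may be given as a dotted path. A hedged sketch (keys mirror scrapy_redis.defaults.REDIS_PARAMS; the values and the custom class path are placeholders):

# Illustrative REDIS_PARAMS override; values are placeholders.
REDIS_PARAMS = {
    'socket_timeout': 30,
    'retry_on_timeout': True,
    # 'redis_cls': 'myproject.clients.CustomRedis',  # hypothetical custom client
}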
Example #40
from scrapy_redis import connection
from scrapy.conf import settings
import time

redis_server = connection.from_settings(settings)
redis_server.lpush('isv_service_info_factory:start_urls',
                        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1')
print "更新周期到"
time.sleep(2)
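
scrapy.conf was removed in later Scrapy releases. A minimal sketch of the same one-off push that loads the settings via get_project_settings() instead (it assumes the script runs inside a Scrapy project so the project settings can be located):

# Same push as above, but without the deprecated scrapy.conf import.
from scrapy.utils.project import get_project_settings
from scrapy_redis import connection

redis_server = connection.from_settings(get_project_settings())
redis_server.lpush('isv_service_info_factory:start_urls',
                   'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1')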
Example #41
def get_server(settings):
    redis_server = connection.from_settings(settings)
    return redis_server
Example #42
 def from_settings(cls, settings):
     server = connection.from_settings(settings)
     return cls(server)
Example #43
File: home.py  Project: ICCV/chaos
 def _set_crawler(self, crawler):
     super(HomepageSpider, self)._set_crawler(crawler)
     self.server = connection.from_settings(self.crawler.settings)
Example #44
 def test_default_params(self):
     server = from_settings(self.settings)
     assert server is self.redis_cls.return_value
     self.redis_cls.assert_called_with(**dict(DEFAULT_PARAMS, **self.expected_params))
Example #45
class IsvServiceInfoSpider(RedisSpider):
    name = "isv_service_info"
    start_urls = [
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.CZ3Xrj&page_id=2489&isv_id=45632667&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.Oquk72&page_id=678230&isv_id=877021141&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.mSxKHl&page_id=25995&isv_id=305442977&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.lH8xCC&page_id=172044&isv_id=570102268&page_rank=2&tab_type=1',
        'https://fuwu.taobao.com/serv/shop_index.htm?spm=0.0.0.0.OzhuHM&page_id=690262&isv_id=897211958&page_rank=2&tab_type=1'
    ]
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        print response.url
        item = IsvServiceInfoItem()
        isv_id = re.search('isv_id=(.*?)&', response.url + '&').group(1)
        company_name = response.xpath(
            '//*[@id="seller-header"]/div[1]/div/a/text()').extract()[0]
        servers = response.xpath('//*[@id="searchForm"]/div[2]/table/tbody/tr')
        for server in servers:
            user_number = server.xpath('td[4]/text()').extract()[0]
            browser_number = server.xpath('td[5]/text()').extract()[0]
            item['isv_id'] = isv_id
            item['company_name'] = company_name
            item['user_number'] = user_number
            item['browser_number'] = browser_number
            detail_url = re.sub(
                'service/service.htm', 'ser/detail.html',
                'https:' + server.xpath('td[2]/dl/dt/a/@href').extract()[0])
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})
            print detail_url

    def parse_detail(self, response):
        detail_url = response.url
        service_code = re.search('service_code=(.*?)&',
                                 detail_url + '&').group(1)
        content = response.xpath('//*[@id="J_SKUForm"]/div[2]/text()')
        # print(content)
        # Check whether the page says "This service does not support online ordering; please contact the service provider directly"
        if not content:
            service_name = response.xpath(
                '//*[@id="J_SKUForm"]/div[1]/h2/text()').extract()[0].replace(
                    '\t', '').replace('\n', '')
            score = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/span[2]/text()'
            ).extract()[0]
            usability = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[1]/span[2]/@class'
            ).extract()[0]
            usability_compare = \
            response.xpath('//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[1]/span[2]/text()').extract()[0]
            # Determine whether it is above or below the average
            if usability == 'low per':
                usability_compare = '-' + usability_compare
            attitude = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[2]/span[2]/@class'
            ).extract()[0]
            attitude_compare = \
            response.xpath('//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[2]/span[2]/text()').extract()[0]
            if attitude == 'low per':
                attitude_compare = '-' + attitude_compare
            stability = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[3]/span[2]/@class'
            ).extract()[0]
            stability_compare = \
            response.xpath('//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[1]/li[3]/span[2]/text()').extract()[0]
            if stability == 'low per':
                stability_compare = '-' + stability_compare

            secure_score = str(
                response.xpath(
                    '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[1]/span[2]/text()'
                ).extract()[0]).replace('\t', '').replace('\n', '')
            payer_number = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[2]/span[2]/text()'
            ).extract()[0]
            nearly_payer_number = \
            response.xpath('//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[2]/span[3]/text()').extract()[0]
            continue_rate = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[3]/span[2]/text()'
            ).extract()[0]
            refund_rate = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[4]/span[2]/text()'
            ).extract()[0]
            open_rate = response.xpath(
                '//*[@id="apc-detail"]/div[2]/div[1]/div[2]/div/ul[2]/li[5]/span[2]/text()'
            )
            # The open rate is a special case and may be absent
            if open_rate:
                open_rate = open_rate.extract()[0]
            else:
                open_rate = None
            score_times = re.search(
                '(\d+)',
                response.xpath(
                    '//*[@id="reviews"]/div[1]/div/div/div[2]/span/text()').
                extract()[0]).group(1)
            five_score_rate = \
            response.xpath('//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[1]/span[@class="tb-r-pecent"]/text()').extract()[0].replace('\t', '').replace('\n', '')
            four_score_rate = \
            response.xpath('//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[2]/span[@class="tb-r-pecent"]/text()').extract()[0].replace('\t', '').replace('\n', '')
            three_score_rate = \
            response.xpath('//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[3]/span[@class="tb-r-pecent"]/text()').extract()[0].replace('\t', '').replace('\n', '')
            two_score_rate = \
            response.xpath('//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[4]/span[@class="tb-r-pecent"]/text()').extract()[0].replace('\t', '').replace('\n', '')
            one_score_rate = \
            response.xpath('//*[@id="reviews"]/div[1]/div/div/div[3]/ul/li[5]/span[@class="tb-r-pecent"]/text()').extract()[0].replace('\t', '').replace('\n', '')
            seller_rank_percent_url = 'https://fuwu.taobao.com' + \
                                      response.xpath('//*[@id="desc-log"]/div/div[1]/div[1]/h5/a/@href').extract()[0]
            seller_industry_percent_url = 'https://fuwu.taobao.com' + \
                                          response.xpath('//*[@id="desc-log"]/div/div[1]/div[2]/h5/a/@href').extract()[0]
            # Crawl the Taobao buyer rank distribution
            html = requests.get(seller_rank_percent_url).text
            selector = etree.HTML(html)
            seller_rank_percent_trs = selector.xpath(
                '//*[@id="apc-detail"]/div[2]/table/tbody/tr')
            seller_rank_percent = '['
            for seller_rank_percent_tr in seller_rank_percent_trs:
                seller_rank_percent_tds = seller_rank_percent_tr.xpath('td')
                index = 0
                for seller_rank_percent_td in seller_rank_percent_tds:
                    index += 1
                    img = seller_rank_percent_td.xpath('img/@src')
                    if img:
                        seller_rank = re.search('rank/(.*?).gif',
                                                img[0]).group(1)
                    else:
                        seller_rank = seller_rank_percent_td.xpath('text()')
                        if seller_rank:
                            seller_rank = str(seller_rank[0]).replace(
                                '\t', '').replace('\n', '').replace(' ', '')
                    if seller_rank:
                        if index % 2 == 1:
                            seller_rank_percent = seller_rank_percent + '{\"rank\":\"' + str(
                                seller_rank).replace('\r', '') + '\",'
                        else:
                            seller_rank_percent = seller_rank_percent + '\"percent\":\"' + seller_rank + '\"},'

            seller_rank_percent = seller_rank_percent[:-1] + ']'
            # Crawl the seller industry distribution
            html = requests.get(seller_industry_percent_url).text
            selector = etree.HTML(html)
            seller_industry_percent_trs = selector.xpath(
                '//*[@id="apc-detail"]/div[2]/table/tbody/tr')
            seller_industry_percent = '['
            for seller_industry_percent_tr in seller_industry_percent_trs:
                seller_industry_percent_tds = seller_industry_percent_tr.xpath(
                    'td')
                index = 0
                for seller_industry_percent_td in seller_industry_percent_tds:
                    index += 1
                    img = seller_industry_percent_td.xpath('img/@src')
                    if img:
                        seller_rank = re.search('rank/(.*?).gif',
                                                img[0]).group(1)
                    else:
                        seller_rank = seller_industry_percent_td.xpath(
                            'text()')
                        if seller_rank:
                            seller_rank = str(seller_rank[0]).replace(
                                '\t', '').replace('\n', '').replace(' ', '')
                    if seller_rank:
                        if index % 2 == 1:
                            seller_industry_percent = seller_industry_percent + '{\"industry\":\"' + str(
                                seller_rank).replace('\r', '') + '\",'
                        else:
                            seller_industry_percent = seller_industry_percent + '\"percent\":\"' + seller_rank + '\"},'

            seller_industry_percent = seller_industry_percent[:-1] + ']'

            # print(company_name)
            print(service_name)
            print(service_code)
            print(score)
            print(usability)
            print(usability_compare)
            print(attitude)
            print(attitude_compare)
            print(stability)
            print(stability_compare)
            print(secure_score)
            print(payer_number)
            print(nearly_payer_number)
            print(continue_rate)
            print(refund_rate)
            print(open_rate)
            print(score_times)
            print(five_score_rate)
            print(four_score_rate)
            print(three_score_rate)
            print(two_score_rate)
            print(one_score_rate)
            # print(seller_rank_percent_url)
            # print(seller_industry_percent_url)
            print(seller_rank_percent)
            print(seller_industry_percent)
            # print(user_number)
            # print(browser_number)

            now_time = datetime.datetime.today()
            item = response.meta['item']
            item['add_time'] = now_time
            item['modify_time'] = now_time
            # item['isv_id'] = isv_id
            # item['company_name'] = company_name
            item['service_name'] = service_name
            item['service_code'] = service_code
            item['score'] = score
            # item['usability'] = usability
            item['usability_compare'] = usability_compare
            # item['attitude'] = attitude
            item['attitude_compare'] = attitude_compare
            # item['stability'] = stability
            item['stability_compare'] = stability_compare
            item['secure_score'] = secure_score
            item['payer_number'] = payer_number
            item['nearly_payer_number'] = nearly_payer_number
            item['continue_rate'] = continue_rate
            item['refund_rate'] = refund_rate
            item['open_rate'] = open_rate
            item['score_times'] = score_times
            item['five_score_rate'] = five_score_rate
            item['four_score_rate'] = four_score_rate
            item['three_score_rate'] = three_score_rate
            item['two_score_rate'] = two_score_rate
            item['one_score_rate'] = one_score_rate
            item['seller_rank_percent'] = seller_rank_percent
            item['seller_industry_percent'] = seller_industry_percent
            # item['user_number'] = user_number
            # item['browser_number'] = browser_number
            yield item
Example #46
 def from_settings(cls, settings):
     params = {
         'server': connection.from_settings(settings),
     }
     return cls(**params)
Example #47
class ShopBasicInfoSpider(RedisSpider):
    name = "shop_basic_info"
    start_urls = []
    redis_server = connection.from_settings(settings)

    def parse(self, response):
        head = {'User-Agent': \
                    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
                'cookie': 'cna=9dmhD/z0ODgCATHd1PgOvAEo; ali_ab=49.221.212.248.1461498199870.2; hng=CN%7Czh-cn%7CCNY; thw=cn; isg=0402F5E80881A216E6813A6676800CB8; v=0; _tb_token_=ku02IolX76m3jJ; uc1=cookie14=UoWxMP74ys3MPA%3D%3D&existShop=false&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie21=VFC%2FuZ9aiKCaj7AzN6nc&tag=1&cookie15=V32FPkk%2Fw0dUvg%3D%3D&pas=0; uc3=sg2=UIS5OL%2BOEDgy%2FIeQ7IgTu7dSOvuG0LEay5288ZRYw64%3D&nk2=pbEaPGpOBJk%3D&id2=UoYfobtYhLxhEw%3D%3D&vt3=F8dASmgu7PcOeAskyes%3D&lg2=UtASsssmOIJ0bQ%3D%3D; existShop=MTQ2MTkyMDU3MA%3D%3D; uss=UIIpyK78%2BArm1rQcpUrk%2FRwXHQDc93OpxAQgdlu7DWHJVDuJuqKSxy5hBg%3D%3D; lgc=%5Cu4F01%5Cu513F%5Cu8469%5Cu8469; tracknick=%5Cu4F01%5Cu513F%5Cu8469%5Cu8469; cookie2=1cd8c28594101548f03793d63c556c34; sg=%E8%91%A937; mt=np=&ci=9_1&cyk=-1_-1; cookie1=Vvkh3e1O2MWb%2FWoyF7KMYkHR3r9XP1ItH8ivkdLbbCM%3D; unb=1710468073; skt=6e1a6a704a8e0ddd; t=a74cdfe08cedf1981150320f20e9a793; _cc_=U%2BGCWk%2F7og%3D%3D; tg=0; _l_g_=Ug%3D%3D; _nk_=%5Cu4F01%5Cu513F%5Cu8469%5Cu8469; cookie17=UoYfobtYhLxhEw%3D%3D; l=Av7-AatB1kluKMPP1QNNkk-Bzh5BL8LA'
                }
        seller_nick = urllib.unquote(
            re.search('&q=(.*?)&',
                      response.url + '&').group(1)).decode('utf-8')
        print seller_nick
        html = response.body
        content = re.search('g_page_config = (.*?);\n', html, re.S)
        while not content:
            html = requests.get(response.url, headers=head).text
            content = re.search('g_page_config = (.*?);\n', html, re.S)
        content = content.group(1)
        # Parse the JSON containing the full shop list information
        data = json.loads(content).get('mods').get('shoplist').get('data')
        # Check whether the search returned any results
        if data is not None:
            the_shop_data = data.get('shopItems')[0]
            nick = the_shop_data.get('nick')
            # The first shop's nick must match the given nick
            if nick == seller_nick:
                shop_type = the_shop_data.get('shopIcon').get(
                    'iconClass').strip()
                shop_name = the_shop_data.get('rawTitle').strip()
                shop_id = the_shop_data.get('nid').strip()
                shop_address = the_shop_data.get('provcity').strip()
                total_sold = int(the_shop_data.get('totalsold'))
                goods_number = int(the_shop_data.get('procnt'))
                shop_label = ''
                icons = the_shop_data.get('icons')
                for i in icons:
                    shop_label = shop_label + i.get('title') + ','
                if len(shop_label) > 0:
                    shop_label = shop_label[:-1]
                good_rate_percent = float(
                    self.delete_the_percent(
                        the_shop_data.get('goodratePercent')))
                shop_img_url = the_shop_data.get('picUrl').strip()
                shop_rate_url = the_shop_data.get('userRateUrl').strip()
                dsrStr = json.loads(the_shop_data.get('dsrInfo').get('dsrStr'))
                main_business = dsrStr.get('ind').strip()
                if main_business == '':
                    main_business = None

                describe_score_industry = self.delete_the_percent(
                    dsrStr.get('mg'))
                service_score_industry = self.delete_the_percent(
                    dsrStr.get('sg'))
                logistics_score_industry = self.delete_the_percent(
                    dsrStr.get('cg'))

                # Check whether the shop rank is 0
                if shop_type != 'rank seller-rank-0':
                    url = 'https:' + shop_rate_url
                    html = requests.get(url, headers=head)
                    add_time = datetime.datetime.today()
                    modify_time = add_time

                    is_exist = True
                    deposit = None
                    seller_rank = None
                    buyer_rank = None
                    main_rate = None
                    # Split the address into province and city
                    if shop_address != '':
                        shop_address_s = str(shop_address).split(' ')
                        if len(shop_address_s) == 2:
                            shop_address_province = shop_address_s[0]
                            shop_address_city = shop_address_s[1]
                        elif len(shop_address_s) == 1:
                            shop_address_province = shop_address_s[0]
                            shop_address_city = None
                    else:
                        shop_address_province = None
                        shop_address_city = None
                    if shop_type != 'icon-service-tianmao-large':
                        if shop_type == 'icon-service-qiye-large':
                            shop_type = '企业店铺'
                        else:
                            shop_type = '普通店铺'
                    else:
                        shop_type = '天猫店铺'

                    item = ShopbasicinfoItem()
                    item['add_time'] = add_time
                    item['modify_time'] = modify_time
                    item['nick'] = nick
                    item['shop_type'] = shop_type
                    item['shop_name'] = shop_name
                    item['shop_id'] = shop_id
                    item['shop_address_province'] = shop_address_province
                    item['shop_address_city'] = shop_address_city
                    item['total_sold'] = total_sold
                    item['goods_number'] = goods_number
                    item['good_rate_percent'] = good_rate_percent
                    item['shop_img_url'] = shop_img_url
                    item['shop_rate_url'] = shop_rate_url
                    item['main_business'] = main_business
                    item['deposit'] = deposit
                    item['seller_rank'] = seller_rank
                    item['buyer_rank'] = buyer_rank
                    item['main_rate'] = main_rate
                    item['is_exist'] = is_exist
                    item['shop_label'] = shop_label

                    yield item

    def getContent(self, content):
        if content:
            return content[0]
        return None

    def getNumber(self, content):
        if content:
            result = re.search('(\d+)', content[0])
            if result:
                return float(result.group(1))
        return 0

    def getUserId(self, content):
        content = re.search('"userID": "(.*?)"', content)
        if content:
            return content.group(1)
        return None

    def getCharge(self, content):
        content = re.search('¥(.*)', self.getContent(content))
        if content:
            return float(content.group(1).replace(',', ''))
        return 0

    def delete_the_percent(self, content):
        if content:
            content = re.search('(.*)%', content)
        if content:
            print('###' + content.group(1))
            return float(content.group(1))
        return 0

    def delete_the_fen(self, content):
        if content:
            content = re.search('(.*)分', content)
        if content:
            print('fen' + content.group(1))
            return float(content.group(1))
        return 0

    def delete_the_tian(self, content):
        if content:
            content = re.search('(.*)天', content)
        if content:
            print('天' + content.group(1))
            return float(content.group(1))
        return 0