예제 #1
0
class TestDepthMiddleware(TestCase):
    """Tests for DepthMiddleware: depth stat collection and max-depth filtering."""

    def setUp(self):
        # Build a minimal crawler/spider pair and a live stats collector.
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider('scrapytest.org')

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        # maxdepth=1, verbose depth stats enabled.
        self.mw = DepthMiddleware(1, self.stats, True)

    def test_process_spider_output(self):
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        # Depth 0 response: the yielded request is within maxdepth and passes.
        out = list(self.mw.process_spider_output(resp, result, self.spider))
        # FIX: assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual throughout.
        self.assertEqual(out, result)

        rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEqual(rdc, 1)

        # Simulate a response already at maxdepth: child requests are dropped.
        req.meta['depth'] = 1

        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value('request_depth_max', spider=self.spider)
        self.assertEqual(rdm, 1)

    def tearDown(self):
        self.stats.close_spider(self.spider, '')
예제 #2
0
class TestDepthMiddleware(TestCase):
    """Tests for DepthMiddleware: depth stat collection and max-depth filtering."""

    def setUp(self):
        # Build a minimal crawler/spider pair and a live stats collector.
        crawler = get_crawler(Spider)
        self.spider = crawler._create_spider("scrapytest.org")

        self.stats = StatsCollector(crawler)
        self.stats.open_spider(self.spider)

        # maxdepth=1, verbose depth stats enabled.
        self.mw = DepthMiddleware(1, self.stats, True)

    def test_process_spider_output(self):
        req = Request("http://scrapytest.org")
        resp = Response("http://scrapytest.org")
        resp.request = req
        result = [Request("http://scrapytest.org")]

        # Depth 0 response: the yielded request is within maxdepth and passes.
        out = list(self.mw.process_spider_output(resp, result, self.spider))
        # FIX: assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual throughout.
        self.assertEqual(out, result)

        rdc = self.stats.get_value("request_depth_count/1", spider=self.spider)
        self.assertEqual(rdc, 1)

        # Simulate a response already at maxdepth: child requests are dropped.
        req.meta["depth"] = 1

        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value("request_depth_max", spider=self.spider)
        self.assertEqual(rdm, 1)

    def tearDown(self):
        self.stats.close_spider(self.spider, "")
예제 #3
0
    def parse(self, response):
        """Parse one page of question blocks from the JSON-wrapped API response.

        Yields one ZhihuapiItem per question block found, then a POST
        FormRequest for the next page keyed on the last block's data-score.
        Returns early (stopping pagination) when no blocks are found.
        """
        # The API returns JSON whose msg[1] field contains an HTML fragment;
        # parse that fragment and select the schema.org Question blocks.
        question_blocks = Selector(
            text=json.loads(response.body.decode("utf-8"))['msg'][1]).xpath(
                '//div[contains(@itemtype, "http://schema.org/Question")]')

        for question_block in question_blocks:
            item = ZhihuapiItem()
            item['question_name'] = question_block.xpath(
                './/div/div/h2/a/text()').extract_first()
            item['question_url'] = question_block.xpath(
                './/div/div/h2/a/@href').extract_first()
            item['question_answer'] = question_block.xpath(
                './/div/div/div[1]/div[5]/div/a/@href').extract_first()
            item['question_answer_author_profile'] = question_block.xpath(
                './/div/div/div[1]/div[3]/span/span[1]/a/@href').extract_first(
                )
            item['question_answer_author'] = question_block.xpath(
                './/div/div/div[1]/div[3]/span/span[1]/a/text()'
            ).extract_first()

            self.logger.info(
                'Question info: question name - {}, question answer - {}, question url - {}, question answer author profile - {}, question answer author - {}'
                .format(item['question_name'], item['question_answer'],
                        item['question_url'],
                        item['question_answer_author_profile'],
                        item['question_answer_author']))

            yield item

        if not question_blocks:
            self.logger.info("No more new questions, waiting to stop...")
            # NOTE(review): StatsCollector.close_spider is called unbound with
            # the spider instance as `self` and a spider *class* as `spider`;
            # it normally expects a stats-collector instance. Kept as-is —
            # confirm the intended shutdown mechanism (e.g. raising
            # CloseSpider may be what was meant).
            StatsCollector.close_spider(self,
                                        spider=zhihuSpider,
                                        reason="No more questions...")
            # FIX: must stop here — previously execution fell through and
            # raised NameError on `last_data_score` below.
            return

        # FIX: negative indexing instead of question_blocks[len(...) - 1].
        last_data_score = question_blocks[-1].xpath(
            '@data-score').extract_first()

        self.logger.info('Last Data Score is - {}'.format(last_data_score))
        # Request the next page; the API pages on the last seen data-score.
        yield scrapy.http.FormRequest(self.topic_url,
                                      method='POST',
                                      headers=self.headers,
                                      formdata={
                                          'start': '0',
                                          'offset': str(last_data_score)
                                      },
                                      callback=self.parse)