Example #1
 def parse(self, response):
     selector = Selector(response)
     questions = selector.xpath('//tr[td[@class="f14 yh"]]')
     for question in questions:
         item = QuestionItem()
         tag = question.xpath('td[1]/text()').extract()[0]
         title = question.xpath('td[2]/a/text()').extract()[0]
         url = question.xpath('td[2]/a/@href').extract()[0]
         item['parent'] = response.url
         item['url'] = str(url)
         item['tag'] = tag
         item['title'] = title
         request = scrapy.Request(str(url),
                                  headers=self.headers,
                                  callback=self.parse_sub_page,
                                  dont_filter=True)
         request.meta['item'] = item
         yield request
     word = u'下一页'  # text of the "next page" link
     next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
     if next_pages:
         next_page = next_pages[0]
         logger.info('parse next page============ %s', str(next_page))
         yield scrapy.Request(str(next_page),
                              headers=self.headers,
                              callback=self.parse,
                              dont_filter=True)
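
Every example hands a half-filled item to a detail-page callback through request.meta. The callback itself is never shown; a minimal sketch of what such a parse_sub_page might look like (the XPaths and field values here are illustrative assumptions, not taken from the original spiders):

 def parse_sub_page(self, response):
     # pick up the item that the listing page attached to the request
     item = response.meta['item']
     # hypothetical detail-page extraction; the real XPaths depend on the site
     item['question'] = ''.join(
         response.xpath('//div[@class="question"]//text()').extract()).strip()
     item['answers'] = response.xpath('//div[@class="answer"]/text()').extract()
     yield item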
Example #2
    def parse(self, response):
        selector = Selector(response)
        questions = selector.xpath(
            '//div[@id="messageList"]//tr[contains(@class,"lia-list-row")]')
        print(len(questions))
        for question in questions:
            replies_num = question.xpath(
                './/div[@class="lia-component-messages-column-message-replies-count"]/span/text()'
            ).extract_first()
            # extract_first() returns None when the node is missing, so the
            # guard below actually works (extract()[0] would raise IndexError)
            if replies_num is None or int(str(replies_num).strip()) == 0:
                continue
            item = QuestionItem()
            item['replies'] = str(replies_num).strip()

            views_num = question.xpath(
                './/div[@class="lia-component-messages-column-message-views-count"]/span/text()'
            ).extract()[0]
            item['views'] = str(views_num).strip()
            url = question.xpath(
                './/a[@class="page-link lia-link-navigation lia-custom-event"]/@href'
            ).extract()[0]
            if url:
                item['subject'] = str(url).split('/')[2]
            title = question.xpath(
                './/a[@class="page-link lia-link-navigation lia-custom-event"]/text()'
            ).extract()[0]
            item['title'] = title.replace('\t', '').replace('\n', '').replace(
                u'\xa0', ' ')
            item['answers'] = []
            request = scrapy.Request(self.site_url + str(url),
                                     headers=self.headers,
                                     callback=self.parse_sub_page,
                                     dont_filter=True)
            request.meta['item'] = item
            yield request
        word = u'next'
        next_pages = selector.xpath('//link[@rel="%s"]/@href' % word).extract()
        if next_pages and next_pages[0]:
            next_page = next_pages[0]
            logger.info('parse next page============ %s', str(next_page))
            yield scrapy.Request(str(next_page),
                                 headers=self.headers,
                                 callback=self.parse,
                                 dont_filter=True)
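
None of the snippets define QuestionItem. Judging by the fields assigned across the examples, a declaration consistent with them would look roughly like this (the field list is inferred, not authoritative):

import scrapy

class QuestionItem(scrapy.Item):
    # fields inferred from the assignments in the examples
    parent = scrapy.Field()
    url = scrapy.Field()
    tag = scrapy.Field()
    title = scrapy.Field()
    question = scrapy.Field()
    answers = scrapy.Field()
    replies = scrapy.Field()
    views = scrapy.Field()
    subject = scrapy.Field()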
Example #3
 def parse(self, response):
     selector = Selector(response)
     questions = selector.xpath('//td[@class="zx_lb_bt"]')
     for question in questions:
         item = QuestionItem()
         tag = question.xpath('a[@class="zx_fl"]/text()').extract()[0]
         title = question.xpath('a[@class="zx_tm"]/@title').extract()[0]
         url = question.xpath('a[@class="zx_tm"]/@href').extract()[0]
         full_url = self.site_url + str(url)
         item['parent'] = response.url
         item['url'] = full_url
         item['tag'] = tag
         item['title'] = title
         request = scrapy.Request(full_url,
                                  headers=self.headers,
                                  callback=self.parse_sub_page,
                                  dont_filter=True)
         request.meta['item'] = item
         yield request
     next_pages = selector.xpath('//a[@class="nextprev"]/@href').extract()
     if next_pages:
         next_page = None
         if len(next_pages) == 2:
             # both "previous" and "next" links exist; the second is "next"
             next_page = next_pages[1]
         else:
             text = selector.xpath(
                 '//span[@class="nextprev"]/text()').extract()
             if text:
                 # a bare u'下一页' ("next page") span means the link is
                 # disabled, i.e. this is the last page; compare the unicode
                 # text directly (str() on it would raise UnicodeEncodeError)
                 if text[0].startswith(u'下一页'):
                     logger.info(
                         '===============yield all requests============')
                 else:
                     next_page = next_pages[0]
         if next_page:
             logger.info('parse next page============ %s',
                         self.site_url + str(next_page))
             yield scrapy.Request(self.site_url + str(next_page),
                                  headers=self.headers,
                                  callback=self.parse,
                                  dont_filter=True)
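
Examples #3, #6 and #7 build absolute URLs by concatenating self.site_url with the extracted href. A sketch of a more robust alternative is Scrapy's response.urljoin, which resolves the href against the page that was actually fetched and also tolerates already-absolute hrefs:

     # equivalent to self.site_url + str(next_page) for relative hrefs,
     # but still correct if the site emits absolute links
     yield scrapy.Request(response.urljoin(next_page),
                          headers=self.headers,
                          callback=self.parse,
                          dont_filter=True)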
Example #4
 def parse(self, response):
     # parse response and populate item as required
     item = QuestionItem()
     selector = Selector(response)
     text = selector.xpath('//div[@class="xwz"]').extract()[0]
     # re-parse the extracted HTML with BeautifulSoup to collect its text nodes
     soup = BeautifulSoup(text, 'lxml')
     question = ''.join(soup.find_all(text=True)).replace(' ', '').replace(
         '\n', '')
     answers = selector.xpath('//div[@class="zjdanr"]/text()').extract()
     answer_list = []
     item['parent'] = response.url
     item['url'] = response.url
     item['tag'] = "test"
     item['title'] = "test"
     for answer in answers:
         answer_list.append(answer)
         print(answer)
     item['question'] = question
     item['answers'] = '|'.join(answer_list).replace(' ',
                                                     '').replace('\n', '')
     return item
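
Example #4 re-parses the extracted HTML with BeautifulSoup just to collect its text. The same result can be had from the selector alone; a sketch under the same //div[@class="xwz"] assumption:

     # //text() collects all descendant text nodes, like find_all(text=True)
     texts = selector.xpath('//div[@class="xwz"]//text()').extract()
     question = ''.join(texts).replace(' ', '').replace('\n', '')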
Example #5
 def parse(self, response):
     selector = Selector(response)
     questions = selector.xpath(
         '//ul[@class="result-list"]/li[@class="list-item"]')
     print(len(questions))
     for question in questions:
         item_num = question.xpath(
             'div/span[@class="rli-item item-num"]/text()').extract()[0]
         # u'0个' reads "0 items"; skip questions whose count is zero
         if item_num and item_num == "0个":
             continue
         item = QuestionItem()
         url = question.xpath(
             'div/a[@class="rli-item item-link"]/@href').extract()[0]
         title = question.xpath(
             'div/a[@class="rli-item item-link"]/@title').extract()[0]
         item['parent'] = response.url
         item['url'] = url
         item['tag'] = question.xpath(
             'div/span[@class="rli-item item-classify"]/text()').extract(
             )[0]
         item['title'] = title
         request = scrapy.Request(url,
                                  headers=self.headers,
                                  callback=self.parse_sub_page,
                                  dont_filter=True)
         request.meta['item'] = item
         yield request
     word = u'下页'  # text of the "next page" link
     next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
     if next_pages and next_pages[0]:
         next_page = next_pages[0]
         logger.info('parse next page============ %s', str(next_page))
         yield scrapy.Request(str(next_page),
                              headers=self.headers,
                              callback=self.parse,
                              dont_filter=True)
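
Each snippet assumes surrounding spider state: self.headers, self.site_url and a module-level logger. A minimal skeleton consistent with the examples could look like this (all names and values are placeholders):

import logging

import scrapy

logger = logging.getLogger(__name__)

class QuestionSpider(scrapy.Spider):
    name = 'questions'                        # placeholder spider name
    site_url = 'http://example.com'           # placeholder base URL
    headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder request headers
    start_urls = [site_url]

    # the parse / parse_sub_page methods shown in the examples go here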
Example #6
 def parse(self, response):
     selector = Selector(response)
     questions = selector.xpath('//ul[@class="clearfix"]/li')
     print(len(questions))
     for question in questions:
         item = QuestionItem()
         url = question.xpath('a/@href').extract()[0]
         title = question.xpath('a/@title').extract()[0]
         item['parent'] = response.url
         item['url'] = url
         item['tag'] = question.xpath('span[@class="classItem"]/text()').extract()[0]
         item['title'] = title
         request = scrapy.Request(url, headers=self.headers,
                                  callback=self.parse_sub_page, dont_filter=True)
         request.meta['item'] = item
         yield request
     word = u'下一页'  # text of the "next page" link
     next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
     if next_pages and next_pages[0]:
         next_page = next_pages[0]
         logger.info('parse next page============ %s',
                     self.site_url + str(next_page))
         yield scrapy.Request(self.site_url + str(next_page),
                              headers=self.headers,
                              callback=self.parse,
                              dont_filter=True)
Example #7
 def parse(self, response):
     selector = Selector(response)
     questions = selector.xpath('//div[@class="tit07"]')
     for question in questions:
         item = QuestionItem()
         tag = ''  # default; avoids a NameError when no "hui" links exist
         texts = question.xpath('.//a[@class="hui"]/text()').extract()
         if texts:
             if len(texts) == 2:
                 tag = texts[1]
             elif len(texts) == 1:
                 tag = texts[0]
         title = question.xpath(
             './/a[contains(@href,"/ask/question-")]/text()').extract()[0]
         url = question.xpath(
             './/a[contains(@href,"/ask/question-")]/@href').extract()[0]
         full_url = self.site_url + str(url)
         item['parent'] = response.url
         item['url'] = full_url
         item['tag'] = tag
         item['title'] = title
         request = scrapy.Request(full_url,
                                  headers=self.headers,
                                  callback=self.parse_sub_page,
                                  dont_filter=True)
         request.meta['item'] = item
         yield request
     word = u'下一页'  # text of the "next page" link
     next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
     if next_pages:
         next_page = next_pages[0]
         logger.info('parse next page============ %s',
                     self.site_url + str(next_page))
         yield scrapy.Request(self.site_url + str(next_page),
                              headers=self.headers,
                              callback=self.parse,
                              dont_filter=True)
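
The trailing "find the next-page link and re-yield parse" block is nearly identical in every example. It could be factored into one helper; a sketch under the same assumptions (self.headers and a module-level logger exist):

 def follow_next_page(self, selector, link_text=u'下一页'):
     # look up the pagination link by its visible text and re-enter parse
     next_pages = selector.xpath('//a[text()="%s"]/@href' % link_text).extract()
     if next_pages and next_pages[0]:
         logger.info('parse next page============ %s', next_pages[0])
         return scrapy.Request(next_pages[0],
                               headers=self.headers,
                               callback=self.parse,
                               dont_filter=True)
     return None

A parse method would then end with request = self.follow_next_page(selector) and yield the request if it is not None.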