def parse(self, response):
    """Parse a question-list page: yield one detail-page Request per question
    row (with a partially-filled QuestionItem in meta), then follow the
    next-page link if present.
    """
    selector = Selector(response)
    questions = selector.xpath('//tr[td[@class="f14 yh"]]')
    for question in questions:
        # extract() returns a list; guard instead of extract()[0], which
        # raises IndexError on rows missing the expected cells.
        tags = question.xpath('td[1]/text()').extract()
        titles = question.xpath('td[2]/a/text()').extract()
        urls = question.xpath('td[2]/a/@href').extract()
        if not (tags and titles and urls):
            continue  # malformed row: skip rather than abort the whole page
        url = urls[0]
        item = QuestionItem()
        item['parent'] = response.url
        item['url'] = str(url)
        item['tag'] = tags[0]
        item['title'] = titles[0]
        request = scrapy.Request(str(url),
                                 headers=self.headers,
                                 callback=self.parse_sub_page,
                                 dont_filter=True)
        request.meta['item'] = item
        yield request
    # Follow pagination: anchor whose text is the Chinese "next page".
    word = u'下一页'
    next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
    if next_pages:
        next_page = next_pages[0]
        logger.info('parse next page============ %s', str(next_page))
        yield scrapy.Request(str(next_page),
                             headers=self.headers,
                             callback=self.parse,
                             dont_filter=True)
def parse(self, response):
    """Parse a Lithium-style forum message list: skip rows with zero replies,
    yield detail-page Requests carrying a QuestionItem, then follow the
    rel="next" pagination link.
    """
    selector = Selector(response)
    questions = selector.xpath(
        '//div[@id="messageList"]//tr[contains(@class,"lia-list-row")]')
    logger.info('found %d question rows', len(questions))
    for question in questions:
        # extract() returns a list; the old `is None` check was dead code
        # because extract()[0] raises IndexError rather than yielding None.
        replies = question.xpath(
            './/div[@class="lia-component-messages-column-message-replies-count"]/span/text()'
        ).extract()
        if not replies or int(str(replies[0]).strip()) == 0:
            continue  # unanswered or malformed row
        urls = question.xpath(
            './/a[@class="page-link lia-link-navigation lia-custom-event"]/@href'
        ).extract()
        titles = question.xpath(
            './/a[@class="page-link lia-link-navigation lia-custom-event"]/text()'
        ).extract()
        if not (urls and titles):
            continue  # no link: nothing to crawl for this row
        url = urls[0]
        item = QuestionItem()
        item['replies'] = str(replies[0]).strip()
        views = question.xpath(
            './/div[@class="lia-component-messages-column-message-views-count"]/span/text()'
        ).extract()
        item['views'] = str(views[0]).strip() if views else '0'
        # URL path is /<board>/<subject>/...; index 2 is the subject slug.
        item['subject'] = str(url).split('/')[2]
        # Normalize whitespace: tabs, newlines, non-breaking spaces.
        item['title'] = titles[0].replace('\t', '').replace('\n', '').replace(
            u'\xa0', ' ')
        item['answers'] = []
        request = scrapy.Request(self.site_url + str(url),
                                 headers=self.headers,
                                 callback=self.parse_sub_page,
                                 dont_filter=True)
        request.meta['item'] = item
        yield request
    word = u'next'
    next_pages = selector.xpath('//link[@rel="%s"]/@href' % word).extract()
    if next_pages and len(next_pages[0]) > 0:
        next_page = next_pages[0]
        logger.info('parse next page============ %s', str(next_page))
        yield scrapy.Request(str(next_page),
                             headers=self.headers,
                             callback=self.parse,
                             dont_filter=True)
def parse(self, response):
    """Parse a question-list page (zx_lb_bt cells), yield detail-page
    Requests, and follow the "nextprev" pagination anchor.

    Bug fixed: `next_page` could be referenced while unbound (NameError)
    when only one nextprev anchor existed and the span text was missing.
    """
    selector = Selector(response)
    questions = selector.xpath('//td[@class="zx_lb_bt"]')
    for question in questions:
        tags = question.xpath('a[@class="zx_fl"]/text()').extract()
        titles = question.xpath('a[@class="zx_tm"]/@title').extract()
        urls = question.xpath('a[@class="zx_tm"]/@href').extract()
        if not (tags and titles and urls):
            continue  # malformed cell: skip instead of IndexError
        full_url = self.site_url + str(urls[0])
        item = QuestionItem()
        item['parent'] = response.url
        item['url'] = full_url
        item['tag'] = tags[0]
        item['title'] = titles[0]
        request = scrapy.Request(full_url,
                                 headers=self.headers,
                                 callback=self.parse_sub_page,
                                 dont_filter=True)
        request.meta['item'] = item
        yield request
    next_pages = selector.xpath('//a[@class="nextprev"]/@href').extract()
    if next_pages:
        next_page = None  # default: no further page (fixes NameError)
        if len(next_pages) == 2:
            # Both "prev" and "next" anchors exist: second one is "next".
            next_page = next_pages[1]
        else:
            # Single anchor: a disabled span starting with "下一页" marks
            # the last page; otherwise the anchor is the next page.
            text = selector.xpath(
                '//span[@class="nextprev"]/text()').extract()
            if text:
                if str(text[0]).startswith(u'下一页'):
                    logger.info(
                        '===============yield all requests============')
                else:
                    next_page = next_pages[0]
        if next_page:
            logger.info('parse next page============ %s',
                        self.site_url + str(next_page))
            yield scrapy.Request(self.site_url + str(next_page),
                                 headers=self.headers,
                                 callback=self.parse,
                                 dont_filter=True)
def parse(self, response):
    """Parse a single question/answer page into one QuestionItem.

    Extracts the question text from div.xwz (flattened via BeautifulSoup)
    and joins all div.zjdanr answer texts with '|'.
    """
    item = QuestionItem()
    selector = Selector(response)
    # extract() returns a list; guard instead of extract()[0].
    texts = selector.xpath('//div[@class="xwz"]').extract()
    soup = BeautifulSoup(texts[0] if texts else '', 'lxml')
    # Flatten all text nodes and strip spaces/newlines.
    question = ''.join(soup.find_all(text=True)).replace(' ', '').replace(
        '\n', '')
    answers = selector.xpath('//div[@class="zjdanr"]/text()').extract()
    item['parent'] = response.url
    item['url'] = response.url
    # NOTE(review): tag/title look like placeholders — confirm upstream.
    item['tag'] = "test"
    item['title'] = "test"
    item['question'] = question
    # Replaced the bare Python-2 `print answer` debug loop with logging.
    for answer in answers:
        logger.debug('answer: %s', answer)
    item['answers'] = '|'.join(answers).replace(' ', '').replace('\n', '')
    return item
def parse(self, response):
    """Parse a result-list page: skip entries whose answer count is "0个",
    yield detail-page Requests, then follow the "下页" pagination anchor.
    """
    selector = Selector(response)
    questions = selector.xpath(
        '//ul[@class="result-list"]/li[@class="list-item"]')
    # Replaced the Python-2 `print len(questions)` debug statement.
    logger.info('found %d result items', len(questions))
    for question in questions:
        nums = question.xpath(
            'div/span[@class="rli-item item-num"]/text()').extract()
        # "0个" means zero answers: nothing worth crawling.
        if nums and "0个" == nums[0]:
            continue
        urls = question.xpath(
            'div/a[@class="rli-item item-link"]/@href').extract()
        titles = question.xpath(
            'div/a[@class="rli-item item-link"]/@title').extract()
        tags = question.xpath(
            'div/span[@class="rli-item item-classify"]/text()').extract()
        if not (urls and titles and tags):
            continue  # malformed item: skip instead of IndexError
        url = urls[0]
        item = QuestionItem()
        item['parent'] = response.url
        item['url'] = url
        item['tag'] = tags[0]
        item['title'] = titles[0]
        request = scrapy.Request(url,
                                 headers=self.headers,
                                 callback=self.parse_sub_page,
                                 dont_filter=True)
        request.meta['item'] = item
        yield request
    word = u'下页'
    next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
    if next_pages and len(next_pages[0]) > 0:
        next_page = next_pages[0]
        logger.info('parse next page============ %s', str(next_page))
        yield scrapy.Request(str(next_page),
                             headers=self.headers,
                             callback=self.parse,
                             dont_filter=True)
def parse(self, response):
    """Parse a list page (ul.clearfix items), yield detail-page Requests,
    then follow the "下一页" pagination anchor (site-relative href).
    """
    selector = Selector(response)
    questions = selector.xpath('//ul[@class="clearfix"]/li')
    # Replaced the Python-2 `print len(questions)` debug statement.
    logger.info('found %d list items', len(questions))
    for question in questions:
        urls = question.xpath('a/@href').extract()
        titles = question.xpath('a/@title').extract()
        tags = question.xpath('span[@class="classItem"]/text()').extract()
        if not (urls and titles and tags):
            continue  # malformed item: skip instead of IndexError
        url = urls[0]
        item = QuestionItem()
        item['parent'] = response.url
        item['url'] = url
        item['tag'] = tags[0]
        item['title'] = titles[0]
        request = scrapy.Request(url,
                                 headers=self.headers,
                                 callback=self.parse_sub_page,
                                 dont_filter=True)
        request.meta['item'] = item
        yield request
    word = u'下一页'
    next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
    if next_pages and len(next_pages[0]) > 0:
        next_page = next_pages[0]
        # Pagination href is site-relative: prefix with the base URL.
        logger.info('parse next page============ %s',
                    self.site_url + str(next_page))
        yield scrapy.Request(self.site_url + str(next_page),
                             headers=self.headers,
                             callback=self.parse,
                             dont_filter=True)
def parse(self, response):
    """Parse an ask-list page (div.tit07 rows), yield detail-page Requests,
    then follow the "下一页" pagination anchor.

    Bug fixed: `tag` was unbound on the first row when the a.hui texts were
    missing (NameError) and carried a stale value on later rows — it is now
    reset for every row.
    """
    selector = Selector(response)
    questions = selector.xpath('//div[@class="tit07"]')
    for question in questions:
        tag = ''  # reset per row: no stale value from the previous iteration
        texts = question.xpath('.//a[@class="hui"]/text()').extract()
        if texts:
            # With two hui anchors the second one is the tag; otherwise
            # use whichever single text is present.
            tag = texts[1] if len(texts) >= 2 else texts[0]
        titles = question.xpath(
            './/a[contains(@href,"/ask/question-")]/text()').extract()
        urls = question.xpath(
            './/a[contains(@href,"/ask/question-")]/@href').extract()
        if not (titles and urls):
            continue  # malformed row: skip instead of IndexError
        full_url = self.site_url + str(urls[0])
        item = QuestionItem()
        item['parent'] = response.url
        item['url'] = full_url
        item['tag'] = tag
        item['title'] = titles[0]
        request = scrapy.Request(full_url,
                                 headers=self.headers,
                                 callback=self.parse_sub_page,
                                 dont_filter=True)
        request.meta['item'] = item
        yield request
    word = u'下一页'
    next_pages = selector.xpath('//a[text()="%s"]/@href' % word).extract()
    if next_pages:
        next_page = next_pages[0]
        logger.info('parse next page============ %s',
                    self.site_url + str(next_page))
        yield scrapy.Request(self.site_url + str(next_page),
                             headers=self.headers,
                             callback=self.parse,
                             dont_filter=True)