def parse_category(self, response):
    """Request the asynchronous item listing that belongs to a category page.

    Reads the ``data-widgetid`` attribute of the ``J_TModule`` element whose
    ``data-title`` is the search-list title, combines it with the ``catId``
    and ``pageNo`` query values of ``response.url`` and yields a single
    request built from ``self.asyncUrl``; that request is handled by
    ``parse_nextpage``.

    If the widget id cannot be found, a warning is logged and nothing is
    yielded.
    """
    response_sel = Selector(response)
    widget_ids = response_sel.xpath(
        u'//*[@class="J_TModule" and @data-title="搜索列表"]/@data-widgetid'
    ).extract()
    if not widget_ids:
        # The async URL cannot be built without the widget id; skip this
        # page instead of failing with an IndexError.
        self.logger.warning(
            "Can not find the search list widget id on %s", response.url
        )
        return

    wid = widget_ids[0]
    mid = 'w-' + wid + '-0'
    # NOTE(review): assumes get_query returns a string for 'catId' -- confirm
    # its behaviour when the parameter is missing from the URL.
    cat_id = get_query(response.url, 'catId')
    path = "/category" + cat_id + '.htm'
    page_no = get_query(response.url, 'pageNo')

    page_url = set_query(
        self.asyncUrl,
        wid=wid,
        mid=mid,
        path=path,
        catId=cat_id,
        scid=cat_id,
        pageNo=page_no,
    )
    yield Request(url=page_url, callback=self.parse_nextpage)
def parse_nextpage(self, response):
    """Schedule the next listing page and yield one item per product entry.

    If the response contains a link whose class contains ``next``, the
    ``pageNo`` query value of its href is applied to ``self.categoryUrl`` and
    that URL is requested with ``parse_category`` as callback; otherwise a
    warning is logged.

    Afterwards every ``dl`` element found under ``/html/body/div/div[3]/div``
    yields a ``TmallCommentItem``. Each field (``ItemName``, ``ItemType``,
    ``ItemPrice``, ``ItemSales``) is only set when the corresponding text
    nodes are present in the entry.
    """
    response_sel = Selector(response)

    next_pageurl = response_sel.xpath(
        u'//a[contains(@class,"next")]/@href'
    ).extract()
    if next_pageurl:
        page_num = get_query(next_pageurl[0], 'pageNo')
        next_url = set_query(self.categoryUrl, pageNo=page_num)
        yield Request(url=next_url, callback=self.parse_category)
    else:
        self.logger.warning("Can not find the next page url ! ")

    for dl_body in response_sel.xpath(u'/html/body/div/div[3]'):
        for item_line in dl_body.xpath(u'./div/dl'):
            comment_item = TmallCommentItem()

            # NOTE(review): the entry's @data-id is not stored on the item;
            # add a field to TmallCommentItem if the id is needed.

            # All fields live under the same "detail" <dd>; select it once.
            detail = item_line.xpath(u'./dd[contains(@class,"detail")]')
            item_name = detail.xpath(u'./a/text()').extract()
            item_type = detail.xpath(u'./a/span/text()').extract()
            item_price = detail.xpath(
                u'./div/div[contains(@class,"cprice-area")]/span/text()'
            ).extract()
            item_sales = detail.xpath(
                u'./div/div[contains(@class,"sale-area")]/span/text()'
            ).extract()

            if len(item_name) > 1:
                # Join the first two text nodes of the link with a space.
                comment_item['ItemName'] = (
                    item_name[0].strip() + ' ' + item_name[1].strip()
                )
            elif item_name:
                comment_item['ItemName'] = item_name[0].strip()

            if item_type:
                comment_item['ItemType'] = item_type[0].strip()

            if len(item_price) > 1:
                # The second span text is used as the price; presumably the
                # first one is the currency symbol -- TODO confirm.
                comment_item['ItemPrice'] = item_price[1].strip()

            if item_sales:
                comment_item['ItemSales'] = item_sales[0].strip()

            yield comment_item