def parse(self, response):
    """Handle one search-result page.

    Decodes the JSON payload, assigns a running rank (``sort``) to every
    product, yields one ``KeywordGoodsList`` item for the page, and then
    schedules the next page while ``page <= self.max_page``.  When the page
    came back empty the page counter is NOT advanced, so the same page is
    re-fetched by the follow-up request.
    """
    payload = json.loads(response.body.decode('utf-8'))
    logging.debug(json.dumps(payload))

    # Guard: payload without 'items' means the fetch failed — hand the meta
    # back for retry bookkeeping and stop this callback.
    if 'items' not in payload:
        self.err_after(response.meta)
        return

    goods_rows = payload['items']
    keyword = response.meta['keyword']
    rank = response.meta['sort']      # rank of the last product on the previous page
    p_time = response.meta['p_time']
    page = response.meta['page']
    proxy = response.meta['proxy']
    collected = []

    if goods_rows:
        for row in goods_rows:
            rank += 1
            # Promoted ("ad") entries carry their mall id inside the 'ad' dict.
            if 'ad' in row:
                mall_id = row['ad']['mall_id']
                is_ad = 1
            else:
                mall_id = 0
                is_ad = 0
            # Enrich the raw goods dict in place with our tracking fields.
            row.update({
                'keyword': keyword,
                'sort': rank,
                'p_time': p_time,
                'mall_id': mall_id,
                'is_ad': is_ad,
                'suggest_keyword': '',
            })
            collected.append(row)

        # Emit every product rank gathered for this keyword/page.
        item = KeywordGoodsList()
        item['goods_list'] = collected
        item['page'] = page
        yield item
        # Data came back: advance to the next page.  (Empty page: keep the
        # same number so the follow-up request re-crawls it.)
        page += 1

    if page <= self.max_page:
        url = self.build_search_url(page, self.size, keyword, '')
        headers = self.make_headers()
        meta = {
            'proxy': proxy,
            'page': page,
            'keyword': keyword,
            'sort': rank,
            'p_time': p_time,
        }
        yield scrapy.Request(url, meta=meta, callback=self.parse,
                             headers=headers, dont_filter=True,
                             errback=self.errback_httpbin)
def parse(self, response):
    """Handle one search-result page.

    Decodes the JSON payload, builds a ranked record per product, yields a
    ``KeywordGoodsList`` item for the page, and schedules the next page
    while ``page <= self.max_page``.  When the page came back empty the
    page counter is NOT advanced, so the same page is re-fetched.
    """
    info = json.loads(response.body.decode('utf-8'))

    # FIX: the API sometimes answers without an 'items' key (blocked or
    # empty payload).  Previously this raised KeyError and killed the
    # callback; now we log and stop, matching the sibling spider's guard.
    if 'items' not in info:
        logging.debug(json.dumps(info))
        return

    item_info = info['items']
    keyword = response.meta['keyword']
    sort = response.meta['sort']  # rank of the last product on the previous page
    item_list = []
    page = response.meta['page']
    proxy = response.meta['proxy']

    if item_info:
        for value in item_info:
            sort += 1
            # Promoted ("ad") entries carry their mall id inside the 'ad' dict.
            if 'ad' in value:
                mall_id = value['ad']['mall_id']
                is_ad = 1
            else:
                mall_id = 0
                is_ad = 0
            item_list.append({
                'keyword': keyword,
                'sort': sort,
                'goods_id': value['goods_id'],
                'p_time': self.p_time,
                'mall_id': mall_id,
                'is_ad': is_ad,
                # Always empty here; kept for downstream schema compatibility.
                'suggest_keyword': '',
            })

        # Emit every product rank gathered for this keyword/page.
        item = KeywordGoodsList()
        item['page'] = page
        item['goods_list'] = item_list
        yield item
        # Data came back: advance to the next page.  (Empty page: keep the
        # same number so the follow-up request re-crawls it.)
        page = page + 1

    if page <= self.max_page:
        url = self.build_search_url(page, self.size, keyword)
        headers = self.make_headers()
        meta = {
            'proxy': proxy,
            'page': page,
            'keyword': keyword,
            'sort': sort,
        }
        yield scrapy.Request(url, meta=meta, callback=self.parse,
                             headers=headers, dont_filter=True,
                             errback=self.errback_httpbin)