def parse(self, response):
    """Accumulate paginated review data for one goods id; emit an item when exhausted.

    Expects ``response.meta`` to carry:
      - ``review_list``: reviews collected from previous pages (list)
      - ``goods_id``: the shop/goods identifier being crawled
      - ``page``: the page number that produced this response
    """
    review_list = response.meta['review_list']  # reviews accumulated so far
    goods_id = response.meta['goods_id']        # shop/goods id
    page = response.meta['page']                # page number of this response
    goods_reviews = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    if not goods_reviews.get('data'):
        # No more data: drop the pending marker for this goods id and flush
        # whatever was collected across the previous pages.
        self.ssdb.hdel(self.hash_name, goods_id)
        if review_list:
            item = GoodsSalesItem()
            item['goods_list'] = review_list
            item['mall_id'] = goods_id
            yield item
    else:
        review_list = review_list + goods_reviews['data']  # merge this page's reviews
        page += 1
        url = self.build_url(goods_id, page)
        meta = {
            'page': page,
            'goods_id': goods_id,
            'review_list': review_list
        }
        yield scrapy.Request(url, meta=meta, callback=self.parse)
def parse(self, response):
    """Emit a GoodsSalesItem when the JSON response describes a goods record."""
    goods_data = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    # Only a payload that actually carries a goods_id is worth emitting.
    if 'goods_id' in goods_data:
        item = GoodsSalesItem()
        item['goods_list'] = goods_data
        yield item
def parse(self, response):
    """Collect one page of a mall's goods list; paginate until a short page, then emit.

    Expects ``response.meta`` to carry ``goods_list`` (goods accumulated so
    far), ``mall_id`` (shop id) and ``page`` (page number of this response).
    """
    goods_list = response.meta['goods_list']  # goods accumulated so far
    mall_id = response.meta['mall_id']        # shop id
    page = response.meta['page']              # page number of this response
    mall_goods = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    mall_goods = mall_goods['goods']
    if 'goods_list' not in mall_goods:
        return
    goods_len = len(mall_goods['goods_list'])
    if goods_len > 0:
        goods_list = goods_list + mall_goods['goods_list']  # merge this page
    if goods_len > self.pageSize - 100:
        # A near-full page suggests more pages remain; request the next one,
        # carrying the merged list forward in meta.
        page += 1
        url = self.build_url(mall_id, page, 500)
        meta = {'page': page, 'mall_id': mall_id, 'goods_list': goods_list}
        headers = self.make_headers()
        yield scrapy.Request(url, meta=meta, callback=self.parse,
                             headers=headers, dont_filter=True,
                             errback=self.errback_httpbin)
    else:
        # Short (or empty) page: pagination is done; flush the accumulator.
        if goods_list:
            item = GoodsSalesItem()
            item['goods_list'] = goods_list
            item['mall_id'] = mall_id
            yield item
def parse(self, response):
    """Collect one page of a mall's goods list; paginate until a short page, then emit.

    Expects ``response.meta`` to carry ``goods_list`` (goods accumulated so
    far), ``mall_id`` (shop id) and ``page`` (page number of this response).
    """
    goods_list = response.meta['goods_list']  # goods accumulated so far
    mall_id = response.meta['mall_id']        # shop id
    page = response.meta['page']              # page number of this response
    mall_goods = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    # NOTE(review): unlike the sibling spiders, no 'goods_list' presence check
    # here — a payload without that key raises KeyError. Confirm intentional.
    goods_len = len(mall_goods['goods_list'])
    if goods_len > 0:
        goods_list = goods_list + mall_goods['goods_list']  # merge this page
    if goods_len > self.pageSize - 100:
        # A near-full page suggests more pages remain; request the next one.
        page += 1
        url = ('http://apiv4.yangkeduo.com/api/turing/mall/query_cat_goods'
               '?category_id=0&type=0&sort_type=_sales'
               f'&mall_id={mall_id}&page_no={page}&page_size=500')
        meta = {'page': page, 'mall_id': mall_id, 'goods_list': goods_list}
        headers = self.make_headers()
        yield scrapy.Request(url, meta=meta, callback=self.parse,
                             headers=headers)
    else:
        # Short (or empty) page: pagination is done; flush the accumulator.
        if goods_list:
            item = GoodsSalesItem()
            item['goods_list'] = goods_list
            item['mall_id'] = mall_id
            yield item
def parse(self, response):
    """Log the raw mall response and success counters, then paginate the goods list.

    Expects ``response.meta`` to carry ``goods_list`` (goods accumulated so
    far), ``mall_id`` (shop id), ``page`` (page number of this response) and
    ``proxy`` (reused for the follow-up request).
    """
    # Read meta once up front (the original fetched mall_id and the body twice).
    goods_list = response.meta['goods_list']  # goods accumulated so far
    mall_id = response.meta['mall_id']        # shop id
    page = response.meta['page']              # page number of this response
    proxy = response.meta['proxy']            # keep using the original proxy
    mall_goods = response.body.decode('utf-8')  # bytes -> str
    self.save_mall_log(mall_id, mall_goods)
    self.success_count += 1
    self.save_mall_success_log(
        mall_id,
        json.dumps({
            'success_count': self.success_count,
            'request_count': self.request_count,
            'error_count': self.error_count
        }))
    mall_goods = json.loads(mall_goods)
    mall_goods = mall_goods['goods']
    if 'goods_list' not in mall_goods:
        return
    goods_len = len(mall_goods['goods_list'])
    if goods_len > 0:
        goods_list = goods_list + mall_goods['goods_list']  # merge this page
    else:
        return
    if goods_len > self.pageSize - 100:
        # A near-full page suggests more pages remain; request the next one.
        page += 1
        url = self.build_url(mall_id, page, 500)
        meta = {
            'page': page,
            'mall_id': mall_id,
            'goods_list': goods_list,
            'proxy': proxy
        }
        headers = self.make_headers()
        yield scrapy.Request(url, meta=meta, callback=self.parse,
                             headers=headers, dont_filter=True,
                             errback=self.errback_httpbin)
    else:
        # Short page: pagination is done; flush the accumulator.
        if goods_list:
            item = GoodsSalesItem()
            item['goods_list'] = goods_list
            item['mall_id'] = mall_id
            yield item
def parse(self, response):
    """Extract goods and SKU details from the page's embedded ``window.rawData`` JSON.

    Expects ``response.meta`` to carry ``time`` (crawl timestamp attached to
    the item) and ``goods_id``. Calls ``self.err_after(meta)`` and bails out on
    any missing layer of the payload; passes ``True`` when a login wall is hit.
    """
    meta = response.meta
    html = response.body.decode('utf-8')  # bytes -> str
    # Raw string: the original used '\;' '\<' '\/' escapes in a non-raw
    # string, which are invalid escape sequences; semantics are unchanged.
    regex_content = re.search(r'window\.rawData= (.*);\s*</script>', html)
    if not regex_content:
        self.err_after(meta)
        return
    rawData = json.loads(regex_content.group(1))
    if 'initDataObj' not in rawData:
        self.err_after(meta)
        return
    initDataObj = rawData['initDataObj']
    if 'needLogin' in initDataObj:
        # Login wall — signal err_after with the login flag set.
        self.err_after(meta, True)
        return
    if 'goods' not in initDataObj:
        self.err_after(meta)
        return
    goods_data = initDataObj['goods']
    if 'skus' not in goods_data:
        self.err_after(meta)
        return
    time = response.meta['time']          # crawl timestamp
    goods_id = response.meta['goods_id']  # shop/goods id
    skus = []
    for sku in goods_data['skus']:
        # Concatenate every spec value into a single display string.
        spec = ''.join(str(specX['spec_value']) for specX in sku['specs'])
        skus.append({
            'sku_id': sku['skuID'],
            'thumb_url': sku['thumbUrl'],
            'quantity': sku['quantity'],
            'is_onsale': sku['isOnSale'],
            'spec': spec,
            'normal_price': sku['normalPrice'],
            'group_price': sku['groupPrice'],
            'specs': sku['specs'],
            'weight': 0,
        })
    item = GoodsSalesItem()
    goods_data['sku'] = skus
    goods_data['time'] = time
    item['goods_list'] = goods_data
    item['mall_id'] = goods_id
    yield item
def parse(self, response):
    """Emit a timestamped GoodsSalesItem when the JSON response has a goods_id.

    Expects ``response.meta`` to carry ``time`` (crawl timestamp) and
    ``goods_id`` (shop/goods id, stored as the item's mall_id).
    """
    time = response.meta['time']          # crawl timestamp
    goods_id = response.meta['goods_id']  # shop/goods id
    goods_data = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    # Only a payload that actually carries a goods_id is worth emitting.
    if 'goods_id' in goods_data:
        item = GoodsSalesItem()
        goods_data['time'] = time
        item['goods_list'] = goods_data
        item['mall_id'] = goods_id
        yield item
def parse(self, response):
    """Emit each page of a mall's goods immediately, then follow to the next page.

    Unlike the accumulating variants, this spider yields one item per page and
    only carries pagination state (not the goods) in meta. Expects
    ``response.meta`` to carry ``mall_id``, ``page`` and ``proxy``.
    """
    mall_id = response.meta['mall_id']  # shop id
    page = response.meta['page']        # page number of this response
    proxy = response.meta['proxy']      # keep using the original proxy
    mall_goods = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    if 'goods_list' not in mall_goods:
        return
    goods_len = len(mall_goods['goods_list'])
    if goods_len > 0:
        goods_list = mall_goods['goods_list']  # this page's goods
    else:
        return
    if goods_list:
        item = GoodsSalesItem()
        item['goods_list'] = goods_list
        item['mall_id'] = mall_id
        yield item
    # A page holding 30+ entries suggests more pages; stop at self.max_page.
    if goods_len >= 30 and page < self.max_page:
        page += 1
        url = self.build_url(mall_id, page, 50)
        meta = {'page': page, 'mall_id': mall_id, 'proxy': proxy}
        headers = self.make_headers()
        yield scrapy.Request(url, meta=meta, callback=self.parse,
                             headers=headers, dont_filter=True,
                             errback=self.errback_httpbin)
def parse(self, response):
    """Emit each page of a mall's goods, log it, and follow until a short page.

    Expects ``response.meta`` to carry ``mall_id`` (shop id), ``page`` (page
    number of this response) and ``proxy`` (reused for the next request).
    """
    mall_id = response.meta['mall_id']  # shop id
    page = response.meta['page']        # page number of this response
    proxy = response.meta['proxy']      # keep using the original proxy
    mall_goods = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    if 'goods_list' not in mall_goods:
        return
    mall_goods = mall_goods['goods_list']
    goods_len = len(mall_goods)
    item = GoodsSalesItem()
    item['goods_list'] = mall_goods
    item['mall_id'] = mall_id
    # NOTE(review): sibling spiders pass mall_id as the first argument to
    # save_mall_log; here it is page — confirm this is intentional.
    self.save_mall_log(page, json.dumps(mall_goods))
    yield item
    # Fewer than 50 entries means the last page was reached; otherwise follow.
    if goods_len < 50:
        return
    page += 1
    url = self.build_url(mall_id, page, 50)
    meta = {'page': page, 'mall_id': mall_id, 'proxy': proxy}
    headers = self.make_headers()
    yield scrapy.Request(url, meta=meta, callback=self.parse,
                         headers=headers, dont_filter=True,
                         errback=self.errback_httpbin)