def parse(self, response):
    """Walk an activity goods listing page by page, accumulating ranked goods.

    Each goods on the page is turned into a rank record (via
    build_goods_rank_info) and appended to the list carried through meta;
    when a page comes back empty, the accumulated list is emitted as one
    CategoryGoodsItem.
    """
    subject_info = response.meta['subject_info']
    page = int(response.meta['page'])
    goods_list = response.meta['goods_list']
    page_size = int(response.meta['page_size'])
    # Global rank of the first goods on this page.
    rank = (page - 1) * page_size + 1
    content = json.loads(response.body.decode('utf-8'))
    page_goods = content['goods_list'] if 'goods_list' in content.keys() else []
    if len(page_goods) > 0:
        for entry in page_goods:
            unit_price = float(entry['group']['price'] / 100)
            goods_list.append(self.build_goods_rank_info(
                entry['goods_id'], subject_info['subject_id'],
                subject_info['type'], rank, unit_price))
            rank += 1
        next_page = page + 1
        next_url = self.get_activity_url(subject_info, next_page, page_size)
        yield scrapy.Request(next_url, meta={
            'subject_info': subject_info,
            'page': next_page,
            'page_size': page_size,
            'goods_list': goods_list,
        }, callback=self.parse)
    elif goods_list:
        # Pages exhausted: flush everything collected so far.
        item = CategoryGoodsItem()
        item['goods_lists'] = goods_list
        yield item
def parse(self, response):
    """Collect one page of raw goods dicts into a CategoryGoodsItem and
    request the next page while full pages remain (offset capped at 900).

    Fix: guard against payloads missing 'goods_list'/'flip' (blocked or
    error responses) instead of raising KeyError — the sibling category
    parsers in this file already guard the same way. Also removes an
    unused `goods_id` local and the hand-maintained element counter.
    """
    meta = response.meta
    offset = response.meta['offset']
    cat_id = response.meta['cat_id']
    opt_type = response.meta['opt_type']
    receive_info = response.body.decode('utf-8')
    data = json.loads(receive_info)
    if 'goods_list' not in data or 'flip' not in data:
        # Error/blocked payload: nothing to emit, let the run continue.
        return
    flip = data['flip']
    page_goods = data['goods_list']
    if page_goods:
        count = len(page_goods)
        offset += count
        item = CategoryGoodsItem()
        item['goods_lists'] = list(page_goods)
        self.save_goods_log(cat_id, offset, receive_info)
        yield item
        # A full page suggests more data; stop at the 900-offset cap.
        if count >= self.pagesise and offset < 900:
            url = self.build_url(opt_type, cat_id, offset, flip)
            meta['offset'] = offset
            headers = self.make_headers()
            yield scrapy.Request(url, meta=meta, callback=self.parse,
                                 headers=headers, dont_filter=True,
                                 errback=self.errback_httpbin)
def parse(self, response):
    """Log the raw goods payload keyed by its goods_id and emit it as an item."""
    payload = response.body.decode('utf-8')  # bytes -> str
    goods_info = json.loads(payload)
    self.save_goods_log(goods_info['goods_id'], payload)
    item = CategoryGoodsItem()
    item['goods_lists'] = goods_info
    yield item
def parse(self, response):
    """Parse a gateway-proxied category page: tag each goods with its rank,
    subject_id and type, emit the batch, and POST a follow-up gateway
    FormRequest while full pages remain below the 1000-offset cap.

    Fixes: bare `return` instead of `return False` (a generator's
    StopIteration value is never read by Scrapy) and rank derived via
    enumerate instead of a hand-maintained counter.
    """
    meta = response.meta
    offset = response.meta['offset']
    cat_id = response.meta['cat_id']
    opt_type = response.meta['opt_type']
    receive_info = response.body.decode('utf-8')
    data = json.loads(receive_info)
    logging.debug(
        json.dumps({
            'cat_id': cat_id,
            'offset': offset,
            'receive_info': data
        }))
    if 'goods_list' not in data:
        # Malformed/blocked payload: route through the error handler.
        self.err_after(meta, True)
        return
    goods_lists = []
    for pos, goods_data in enumerate(data['goods_list'], start=1):
        goods_data['rank'] = offset + pos
        goods_data['subject_id'] = cat_id
        goods_data['type'] = 2
        goods_lists.append(goods_data)
    if not goods_lists:
        return
    offset += len(goods_lists)
    item = CategoryGoodsItem()
    item['goods_lists'] = goods_lists
    yield item
    # A full page suggests more data; stay under the 1000-offset cap.
    if len(goods_lists) >= self.realPagesise and offset < 1000 - self.realPagesise:
        uri = self.build_uri(opt_type, cat_id, offset, '')
        meta['offset'] = offset
        headers = self.make_headers()
        form_data = {
            'name': 'getOperationGroups',
            'method': 'GET',
            'domain': 'http://apiv4.yangkeduo.com',
            'uri': uri,
            'headers': json.dumps(headers),
        }
        logging.debug(json.dumps(form_data))
        yield scrapy.FormRequest(url=self.gateway_url, formdata=form_data,
                                 meta=meta, headers={}, callback=self.parse,
                                 dont_filter=True,
                                 errback=self.errback_httpbin)
def parse(self, response):
    """Dispatch an activity API response by subject api_type and emit the
    parsed goods list as a single CategoryGoodsItem.

    NOTE(review): `rank` is the rank of the page's first entry and is passed
    unchanged to every goods on the page — presumably the get_*_info helpers
    derive per-goods ranks from it; confirm against their implementations.
    """
    result = json.loads(response.body.decode('utf-8'))
    logging.debug(json.dumps({"result": result, "meta": response.meta}))
    subject_info = response.meta['subject_info']
    page = int(response.meta['page'])
    # Rank of the first goods on this page (500 entries per page).
    rank = (page - 1) * 500 + 1
    api_type = subject_info["api_type"]
    goods_list = []
    if api_type in [11]:
        # "items" payload: flash-sale (limited-time seckill) activity.
        # self.save_log(json.dumps({'goods_api_kill': api_type, 'goods_api_items_kill': result["items"]}))
        if "items" in result.keys() and len(result["items"]) > 0:
            for i in result["items"]:
                # Entries without a goods_id are skipped rather than failing.
                try:
                    goods_id = i["data"]["goods_id"]
                except Exception:
                    goods_id = None
                if goods_id:
                    goods_info = self.get_kill_goods_info(subject_info, rank, i["data"])
                    goods_list.append(goods_info)
                    self.save_log(json.dumps({'goods_info_kill_' + str(api_type): goods_info}))
        else:
            # Expected key absent or empty: emit nothing for this response.
            return None
    if api_type in [14]:
        # "result" payload: brand seckill sub-activity.
        # self.save_log(json.dumps({'goods_api_slow': api_type, 'goods_api_items_slow': result["result"]}))
        if "result" in result.keys() and len(result["result"]) > 0:
            for i in result["result"]:
                goods_info = self.get_goods_info_slow(subject_info, rank, i)
                goods_list.append(goods_info)
                self.save_log(json.dumps({'goods_info_slow' + str(api_type): goods_info}))
        else:
            return None
    if api_type in [21, 72]:
        # "list" payload: broken-size clearance / homepage sub-activity.
        if "list" in result.keys() and len(result["list"]) > 0:
            for i in result["list"]:
                goods_info = self.get_goods_info(subject_info, rank, i)
                goods_list.append(goods_info)
                self.save_log(json.dumps({'goods_info_short_' + str(api_type): goods_info}))
        else:
            return None
    if api_type in [31, 41]:
        # "goods_list" payload: brand hall / 9.9-yuan specials.
        # self.save_log(json.dumps({'goods_api_brand': api_type, 'goods_api_items_brand': result["goods_list"]}))
        if "goods_list" in result.keys() and len(result["goods_list"]) > 0:
            for i in result["goods_list"]:
                goods_info = self.get_goods_info(subject_info, rank, i)
                goods_list.append(goods_info)
                self.save_log(json.dumps({'goods_info_brand_' + str(api_type): goods_info}))
        else:
            return None
    item = CategoryGoodsItem()
    logging.debug(json.dumps({'goods_list': goods_list, "goods_len": len(goods_list)}))
    self.save_log(json.dumps({'goods_list_all': goods_list, "goods_len": len(goods_list)}))
    item['goods_lists'] = goods_list
    yield item
def parse(self, response):
    """Parse one page of a category listing: tag each goods with its rank
    and paginate until a short page or the 900-offset cap is reached.

    A response without a 'goods_list' key is treated as transient and the
    same URL is re-requested.
    """
    meta = response.meta
    offset = response.meta['offset']
    cat_id = response.meta['cat_id']
    opt_type = response.meta['opt_type']
    receive_info = response.body.decode('utf-8')
    data = json.loads(receive_info)
    logging.debug(
        json.dumps({
            'cat_id': cat_id,
            'offset': offset,
            'receive_info': data
        }))
    if 'goods_list' not in data.keys():
        # Likely blocked/malformed: retry the exact same request.
        yield scrapy.Request(response.url, meta=meta, callback=self.parse,
                             headers=response.headers, dont_filter=True,
                             errback=self.errback_httpbin)
        return
    flip = data['flip']
    page_goods = data['goods_list']
    if len(page_goods) > 0:
        batch = []
        for position, goods_data in enumerate(page_goods, start=1):
            goods_data['rank'] = offset + position
            goods_data['subject_id'] = cat_id
            goods_data['type'] = 2
            batch.append(goods_data)
        offset += len(batch)
        item = CategoryGoodsItem()
        item['goods_lists'] = batch
        yield item
        # Full page => likely more data; stop at the 900-offset cap.
        if len(batch) >= self.pagesise and offset < 900:
            meta['offset'] = offset
            yield scrapy.Request(self.build_url(opt_type, cat_id, offset, flip),
                                 meta=meta, callback=self.parse,
                                 headers=self.make_headers(), dont_filter=True,
                                 errback=self.errback_httpbin)
def parse(self, response):
    """Emit a goods detail item enriched with its price and sku payloads,
    bumping the spider's success counter."""
    goods_info = json.loads(response.body.decode('utf-8'))  # bytes -> str
    item = CategoryGoodsItem()
    self.success_count += 1
    detail = goods_info['goods']
    detail['price'] = goods_info['price']
    detail['sku'] = goods_info['sku']
    item['goods_lists'] = detail
    yield item
def parse(self, response):
    """Emit a compact sales snapshot (id, sales, mall, min group price)
    for a single goods.

    Fix: removed a stray dead `pass` statement left at the top of the body.
    """
    receive_info = response.body.decode('utf-8')  # bytes -> str
    goods_info = json.loads(receive_info)
    goods_id = goods_info['goods_id']
    item = CategoryGoodsItem()
    item['goods_lists'] = {
        'goods_id': goods_id,
        'goods_sales': goods_info['sales'],
        'mall_id': goods_info['mall_id'],
        'goods_price': goods_info['min_on_sale_group_price']
    }
    yield item
def parse_goods_info(self, response):
    """Extract the goods price from the window.rawData JSON embedded in a
    goods HTML page and emit the enriched goods_info as an item.

    Fixes:
    - The regex is now a raw string; the old pattern used invalid escape
      sequences (\\;, \\<, \\/) that are SyntaxWarnings on modern Python.
    - When the regex does not match, the function now returns early;
      previously `content` was still the raw HTML str and the subsequent
      `.keys()` call raised AttributeError.
    """
    goods_info = response.meta['goods_info']
    content = response.body.decode('utf-8')
    match = re.search(r'window\.rawData= (.*);\s*</script>', content)
    if not match:
        return
    content = json.loads(match.group(1))
    if 'goods' not in content.keys():
        return
    goods = content['goods']
    goods_info['price'] = goods['minOnSaleGroupPrice']
    item = CategoryGoodsItem()
    item['goods_lists'] = [goods_info]
    yield item
def parse(self, response):
    """Emit a goods detail item (goods + price + sku) and record a success
    entry in the per-proxy counter log."""
    goods_info = json.loads(response.body.decode('utf-8'))  # bytes -> str
    item = CategoryGoodsItem()
    self.success_count += 1
    self.save_count_log(json.dumps({
        'type': 'success',
        'proxy': response.meta['proxy'],
        'success_count': self.success_count,
        'error_count': self.error_count,
    }))
    detail = goods_info['goods']
    detail['price'] = goods_info['price']
    detail['sku'] = goods_info['sku']
    item['goods_lists'] = detail
    yield item
def parse(self, response):
    """Parse one category page of ranked goods, persist the raw payload to
    the goods log, and paginate while full pages remain under the
    1000-offset cap. Missing 'goods_list' is handed to the error path.
    """
    logging.debug(response.meta)
    meta = response.meta
    offset = response.meta['offset']
    cat_id = response.meta['cat_id']
    opt_type = response.meta['opt_type']
    data = json.loads(response.body.decode('utf-8'))
    self.save_goods_log(
        cat_id,
        json.dumps({
            'cat_id': cat_id,
            'offset': offset,
            'receive_info': data
        }))
    if 'goods_list' not in data.keys():
        self.err_after(meta, True)
        return False
    flip = data['flip']
    page_goods = data['goods_list']
    if len(page_goods) > 0:
        batch = []
        for position, goods_data in enumerate(page_goods, start=1):
            goods_data['rank'] = offset + position
            goods_data['subject_id'] = cat_id
            goods_data['type'] = 2
            batch.append(goods_data)
        offset += len(batch)
        item = CategoryGoodsItem()
        item['goods_lists'] = batch
        yield item
        # Full page => likely more data; stay under the 1000-offset cap.
        if len(batch) >= self.realPagesise and offset < 1000 - self.realPagesise:
            meta['offset'] = offset
            meta['flip'] = flip
            yield scrapy.Request(self.build_url(opt_type, cat_id, offset, flip),
                                 meta=meta, callback=self.parse,
                                 headers=self.make_headers(), dont_filter=True,
                                 errback=self.errback_httpbin)
def parse(self, response):
    """Page through an activity listing, collecting (goods_id, rank, price)
    records in meta; emit them all as one item once an empty page arrives.
    """
    page = int(response.meta['page'])
    subject_id = response.meta['subject_id']
    goods_lists = response.meta['goods_lists']
    activity_type = response.meta['activity_type']
    data = json.loads(response.body.decode('utf-8'))
    page_goods = data['goods_list']
    if not page_goods:
        # End of listing: flush everything accumulated across pages.
        item = CategoryGoodsItem()
        item['goods_lists'] = goods_lists
        yield item
        return
    base = (page - 1) * 500  # rank of this page's first entry, minus one
    for idx, goods_data in enumerate(page_goods, start=1):
        goods_lists.append({
            'goods_id': goods_data['goods_id'],
            'rank': base + idx,
            'subject_id': subject_id,
            'type': 1,
            'price': float(goods_data['group']['price'] / 100)
        })
    next_page = page + 1
    yield scrapy.Request(
        self.get_activity_url(subject_id, activity_type, next_page),
        meta={
            'subject_id': subject_id,
            'page': next_page,
            'goods_lists': goods_lists,
            'activity_type': activity_type
        },
        callback=self.parse,
        headers=self.make_headers())
def parse(self, response):
    """Scan activity pages for goods carrying one specific promo icon and
    record them under subject -618 with the rank taken from the goods'
    'cnt' field; flush the collected list when pages run out.
    """
    subject_info = response.meta['subject_info']
    page = int(response.meta['page'])
    goods_list = response.meta['goods_list']
    page_size = int(response.meta['page_size'])
    rank = (page - 1) * page_size + 1  # unused: rank is read from goods['cnt'] below
    content = json.loads(response.body.decode('utf-8'))
    page_goods = content['goods_list'] if 'goods_list' in content.keys() else []
    if len(page_goods) > 0:
        # Only goods tagged with this exact promo icon are of interest.
        wanted_icon = 'http://t00img.yangkeduo.com/t02img/images/2018-05-31/b0a6bc8699c92bf40d9d6d46b63dd49f.png'
        for goods in page_goods:
            if 'icon' in goods.keys() and goods['icon']['url'] == wanted_icon:
                goods_list.append(self.build_goods_rank_info(
                    goods['goods_id'], -618, 1,
                    int(goods['cnt']),
                    float(goods['group']['price'] / 100)))
        next_page = page + 1
        yield scrapy.Request(
            self.get_activity_url(subject_info, next_page, page_size),
            meta={
                'subject_info': subject_info,
                'page': next_page,
                'page_size': page_size,
                'goods_list': goods_list
            },
            callback=self.parse,
            headers=self.make_headers())
    elif goods_list:
        item = CategoryGoodsItem()
        item['goods_lists'] = goods_list
        yield item
def parse(self, response):
    """Parse a category page into rank/price records and paginate until a
    short page arrives or the 900-offset cap is reached."""
    meta = response.meta
    offset = response.meta['offset']
    cat_id = response.meta['cat_id']
    opt_type = response.meta['opt_type']
    data = json.loads(response.body.decode('utf-8'))
    flip = data['flip']
    page_goods = data['goods_list']
    if page_goods:
        records = [{
            'goods_id': goods_data['goods_id'],
            'rank': offset + position,
            'subject_id': cat_id,
            'type': 2,
            'price': float(goods_data['group']['price'] / 100)
        } for position, goods_data in enumerate(page_goods, start=1)]
        offset += len(records)
        item = CategoryGoodsItem()
        item['goods_lists'] = records
        yield item
        # Full page => likely more data; stop at the 900-offset cap.
        if len(records) >= self.pagesise and offset < 900:
            meta['offset'] = offset
            yield scrapy.Request(self.build_url(opt_type, cat_id, offset, flip),
                                 meta=meta, callback=self.parse,
                                 headers=self.make_headers(), dont_filter=True,
                                 errback=self.errback_httpbin)
def parse(self, response):
    """Drain an SSDB hash of category goods sales records, emitting each
    stored dict as a CategoryGoodsItem.

    NOTE(review): `start_key` is never used and `hscan` is always called
    with empty bounds, so every loop iteration rescans from the start of
    the hash — this presumably relies on consumed entries being deleted
    elsewhere (a pipeline?); otherwise the while-loop never terminates.
    Confirm before touching.
    NOTE(review): `hdel` is passed the json-decoded value `i`, not the hash
    key the entry was stored under — verify this actually removes anything.
    NOTE(review): a single `item` instance is mutated and re-yielded each
    iteration; Scrapy pipelines usually expect a fresh item per yield.
    """
    hash_name = 'pdd_category_goods_sales_hash'
    is_end = False
    start_key = ''
    item = CategoryGoodsItem()
    while not is_end:
        # Fetch up to 10 entries; empty result ends the loop.
        goods_list = self.ssdb.hscan(hash_name, '', '', 10)
        if not goods_list:
            is_end = True
            continue
        for i in goods_list:
            i = json.loads(i.decode('utf-8'))
            if type(i) != dict:
                # Non-dict payloads are purged from the hash.
                self.ssdb.hdel(hash_name, i)
                continue
            item['goods_lists'] = i
            yield item
def parse(self, response):
    """Build a quantity snapshot (detail / skus / galleries) for one goods
    and emit it; responses whose 'goods' payload has no goods_id are ignored.
    """
    goods_id = response.meta['goods_id']  # id carried in meta (not used below)
    goods_data = json.loads(response.body.decode('utf-8'))  # bytes -> str
    if 'goods_id' not in goods_data["goods"].keys():
        return
    detail, skus, galleries = self.make_goods_quantity_data(
        goods_data, int(time.time()))
    logging.debug(json.dumps({
        'detail': detail,
        'skus': skus,
        'galleries': galleries
    }))
    item = CategoryGoodsItem()
    item['goods_lists'] = {
        'detail': detail,
        'skus': skus,
        'galleries': galleries
    }
    yield item
def parse(self, response):
    """Dispatch a homepage/brand activity API response by api_type and emit
    the parsed goods list as a single CategoryGoodsItem.

    Fix: removed dead `null = None` / `false = None` assignments (leftovers
    from an eval-based parsing approach; json.loads needs neither) that
    appeared in the 62/63/64 and 51 branches.

    NOTE(review): `rank` is the rank of the page's first entry and is passed
    unchanged to every goods; presumably the get_* helpers derive per-goods
    ranks — confirm against their implementations.
    """
    result = json.loads(response.body.decode('utf-8'))
    subject_info = response.meta['subject_info']
    page = int(response.meta['page'])
    # Rank of the first goods on this page (500 entries per page).
    rank = (page - 1) * 500 + 1
    api_type = subject_info["api_type"]
    goods_list = []
    if api_type in [72]:
        # "list" payload: homepage sub-activity.
        if "list" in result.keys() and len(result["list"]) > 0:
            for i in result["list"]:
                goods_info = self.get_goods_info(subject_info, rank, i)
                goods_list.append(goods_info)
        else:
            return None
    if api_type in [71]:
        # "goods_list" payload: homepage goods.
        if "goods_list" in result.keys() and len(result["goods_list"]) > 0:
            for i in result["goods_list"]:
                goods_info = self.get_goods_info(subject_info, rank, i)
                goods_list.append(goods_info)
        else:
            return None
    if api_type in [31]:
        # "goods_list" payload: brand hall.
        if "goods_list" in result.keys() and len(result["goods_list"]) > 0:
            for i in result["goods_list"]:
                goods_info = self.get_goods_info_brand(
                    subject_info, rank, i)
                goods_list.append(goods_info)
        else:
            return None
    if api_type in [62, 63, 64]:
        # "result.goods_list" payload: homepage carousel.
        if "result" in result.keys() and len(
                result["result"]["goods_list"]) > 0:
            for i in result["result"]["goods_list"]:
                goods_info = self.get_goods_info_lunbo_1(
                    subject_info, rank, i)
                goods_list.append(goods_info)
        else:
            return None
    if api_type in [51]:
        # "result.goods_list" payload: window-shopping feed.
        if "result" in result.keys() and len(
                result["result"]["goods_list"]) > 0:
            for i in result["result"]["goods_list"]:
                goods_info = self.get_shopping_goods_info(
                    subject_info, rank, i)
                goods_list.append(goods_info)
        else:
            return None
    if api_type in [61]:
        # "goods_list" payload: homepage carousel (variant).
        if "goods_list" in result.keys() and len(result["goods_list"]) > 0:
            for i in result["goods_list"]:
                goods_info = self.get_goods_info_lunbo_2(
                    subject_info, rank, i)
                goods_list.append(goods_info)
        else:
            return None
    item = CategoryGoodsItem()
    item['goods_lists'] = goods_list
    yield item
def parse(self, response):
    """Dispatch an activity API response by subject api_type and emit the
    parsed goods list as a single CategoryGoodsItem.

    BUG FIX: the seckill branch (api_type 11/14/15) tested
    `"item" in result.keys()` while indexing `result["items"]`, so the
    branch could never run on a valid payload — the check now uses "items".

    NOTE(review): `rank` is the rank of the page's first entry and is passed
    unchanged to every goods; presumably the get_* helpers derive per-goods
    ranks — confirm against their implementations.
    """
    result = json.loads(response.body.decode('utf-8'))
    logging.debug(json.dumps({"result": result, "meta": response.meta}))
    subject_info = response.meta['subject_info']
    page = int(response.meta['page'])
    # Rank of the first goods on this page (500 entries per page).
    rank = (page - 1) * 500 + 1
    api_type = subject_info["api_type"]
    goods_list = []
    if api_type in [11, 14, 15]:
        # "items" payload: limited-time seckill.
        if "items" in result.keys() and len(result["items"]) > 0:
            for i in result["items"]:
                goods_info = self.get_kill_goods_info(
                    subject_info, rank, i["data"])
                goods_list.append(goods_info)
                self.save_log(json.dumps({'goods_info_kill': goods_info}))
        else:
            return None
    if api_type in [16]:
        # "result" payload.
        if "result" in result.keys() and len(result["result"]) > 0:
            for i in result["result"]:
                goods_info = self.get_goods_info(subject_info, rank, i)
                goods_list.append(goods_info)
                self.save_log(json.dumps({'goods_info_others': goods_info}))
        else:
            return None
    if api_type in [21]:
        # "list" payload: broken-size clearance.
        if "list" in result.keys() and len(result["list"]) > 0:
            for i in result["list"]:
                goods_info = self.get_goods_info(subject_info, rank, i)
                goods_list.append(goods_info)
                self.save_log(json.dumps({'goods_info_short': goods_info}))
        else:
            return None
    if api_type in [31, 41, 61]:
        # "goods_list" payload: brand hall / 9.9-yuan specials.
        if "goods_list" in result.keys() and len(result["goods_list"]) > 0:
            for i in result["goods_list"]:
                goods_info = self.get_goods_info(subject_info, rank, i)
                goods_list.append(goods_info)
                self.save_log(json.dumps({'goods_info_brand': goods_info}))
        else:
            return None
    if api_type in [51]:
        # "result.goods_list" payload: window-shopping feed.
        if "result" in result.keys() and len(
                result["result"]["goods_list"]) > 0:
            for i in result["result"]["goods_list"]:
                goods_info = self.get_shopping_goods_info(
                    subject_info, rank, i)
                goods_list.append(goods_info)
                self.save_log(
                    json.dumps({'goods_info_shopping': goods_info}))
        else:
            return None
    item = CategoryGoodsItem()
    logging.debug(
        json.dumps({
            'goods_list': goods_list,
            "goods_len": len(goods_list)
        }))
    self.save_log(
        json.dumps({
            'goods_list_all': goods_list,
            "goods_len": len(goods_list)
        }))
    item['goods_lists'] = goods_list
    yield item