def homeSubjectParse(self, response):
    """Parse the home-page subject payload into a CategoryItem.

    Builds a root "首页" entry, then walks each subject's mix children;
    children that carry a name become entries directly, children without a
    name are resolved via a follow-up request to the subject API
    (``parse_subject_info``).
    """
    content = json.loads(response.body.decode('utf-8'))
    if 'data' in content.keys():
        subject_list = []
        name = '首页'
        subject_id = 0
        path = [name]
        path_id = [0]
        # Root entry for the home page itself.
        info = self.build_subject_info(subject_id, name, path, path_id, 1, 3)
        subject_list.append(info)
        for subject in content['data']:
            subject_mix_data = self.get_child_subject_info(subject)  ## fetch the subject's child mix info
            if not subject_mix_data:
                continue
            for v in subject_mix_data:
                child_name = v['name']
                child_subject_id = v['subject_id']
                if child_name:
                    subject_info = self.build_subject_info(child_subject_id, child_name, path + [child_name], path_id + [child_subject_id])
                    subject_list.append(subject_info)
                else:
                    # Name missing: fetch the subject detail to resolve it.
                    headers = self.make_headers()
                    url = 'http://apiv3.yangkeduo.com/subject/'+str(child_subject_id)
                    meta = {'path':path, 'path_id':path_id}
                    yield scrapy.Request(url, meta=meta, callback=self.parse_subject_info, headers=headers,dont_filter=True,errback=self.errback_httpbin)
        item = CategoryItem()
        item['cat_list'] = subject_list
        #print(item)
        yield item
def brand_parse_subject(self, response):
    """Brand pavilion (品牌馆): extract subject tabs from the embedded page JSON."""
    body = response.body.decode()
    logging.debug(json.dumps({'body': body}))
    path = response.meta["path"]
    path_id = response.meta["path_id"]
    # The page embeds its data as a JSON blob ending at the CDN host string.
    result = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
    if result:
        subject_list = []
        result = json.loads(result.group())
        tabList = self.dict_get(result, 'tabList', None)
        if tabList and len(tabList) > 0:
            a = 1  # 1-based rank of the tab within the list
            for i in tabList:
                subject_id = str(i["web_url"])
                # The numeric subject id is embedded in the tab's web_url.
                subject_id = re.search(r"\d+", subject_id).group()
                name = i['tab_name']
                subject_info = self.build_subject_info(
                    subject_id, name, path + [name],
                    path_id + [subject_id], 31, 3, a)
                a += 1
                subject_list.append(subject_info)
                self.save_log(
                    json.dumps({"subject_info_brand": subject_info}))
        item = CategoryItem()
        logging.debug(json.dumps({'subject_list_brand': subject_list}))
        self.save_log(json.dumps({'subject_list_brand': subject_list}))
        item['cat_list'] = subject_list
        yield item
def shopping_parse_subject(self, response):
    """爱逛街 channel: build one subject entry per tab in the response JSON."""
    subject_list = []
    path = response.meta["path"]
    path_id = response.meta["path_id"]
    body = response.body.decode('utf-8')
    logging.debug(json.dumps({'body': body}))
    result = json.loads(body)
    list_subject = self.dict_get(result, 'list', None)
    if list_subject:
        a = 1  # 1-based rank of the tab within the list
        for i in list_subject:
            subject_id = i["tab_id"]
            name = i["subject"]
            subject_info = self.build_subject_info(subject_id, name,
                                                   path + [name],
                                                   path_id + [subject_id],
                                                   51, 5, a)
            a += 1
            subject_list.append(subject_info)
            self.save_log(
                json.dumps({"subject_info_shopping": subject_info}))
    item = CategoryItem()
    logging.debug(json.dumps({'subject_list_shopping': subject_list}))
    self.save_log(json.dumps({'subject_list_shopping': subject_list}))
    item['cat_list'] = subject_list
    yield item
def parse_subject_banner(self, response):
    """Home-page carousel: build a subject entry per id found in the page JSON."""
    subject_list = []
    body = response.body.decode('utf-8')
    logging.debug(json.dumps({'body': body}))
    # The page embeds a store JSON blob ending at "ssr":true.
    result = re.search(r'{"store".*?"ssr":true}', body)
    path = response.meta['path']
    path_id = response.meta['path_id']
    if result:
        result_dict = json.loads(result.group())
        result_str = result.group()
        try:
            name = result_dict["store"]["pageTitle"]
        except Exception:
            name = ''
        # Ids appear either unquoted or quoted in the raw JSON text.
        f = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
        if not f:
            f = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
        subject_id_list = re.findall(r"\d+", str(f))
        a = 1  # 1-based rank of the id within the page
        for subject_id in subject_id_list:
            subject_info = self.build_subject_info(subject_id, name,
                                                   path + [name],
                                                   path_id + [subject_id],
                                                   61, 8, a)
            a += 1
            subject_list.append(subject_info)
            self.save_log(json.dumps({"subject_info_banner": subject_info}))
    item = CategoryItem()
    logging.debug(json.dumps({'subject_list_banner': subject_list}))
    self.save_log(json.dumps({'subject_list_banner': subject_list}))
    item['cat_list'] = subject_list
    yield item
def get_third_category(self, response):
    """Append third-level categories from ``opt_infos`` to the accumulated
    cat_list carried in the request meta, then yield the combined item."""
    payload = json.loads(response.body.decode('utf-8'))
    if payload['opt_infos']:
        item = CategoryItem()
        names = response.meta['cat_name']
        ids = response.meta['cat_id']
        cat_list = response.meta['cat_list']
        for entry in payload['opt_infos']:
            opt_name = entry['opt_name']
            opt_id = entry['id']
            # Human-readable and id paths use '>' as the level separator.
            cat_list.append({
                'subject_id': opt_id,
                'name': opt_name,
                'type': 2,
                'path': names['first_name'] + '>' + names['second_name'] + '>' + opt_name,
                'path_id': str(ids['first_id']) + '>' + str(ids['second_id']) + '>' + str(opt_id),
            })
        item['cat_list'] = cat_list
        yield item
def short_parse_subject(self, response):
    """断码清仓 (clearance) page: extract brand tabs from the embedded JSON.

    Fixes:
    - the brand-name regex result is now checked before use; a tab without
      a matching ``brand_name`` no longer raises AttributeError on
      ``.group()`` — it is skipped instead;
    - the name is extracted via a capture group instead of ``re.sub``,
      which used to leave the surrounding quotes and a leading space in
      the name (e.g. ``" 'Nike'"`` instead of ``"Nike"``).
    """
    path = response.meta["path"]
    path_id = response.meta["path_id"]
    body = response.body.decode()
    logging.debug(json.dumps({'body': body}))
    # The page embeds its data as a JSON blob ending at the CDN host string.
    result = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
    if result:
        result = json.loads(result.group())
        result = self.dict_get(result, 'filterTabList', None)
        subject_list = []
        a = 1  # 1-based rank of the tab within the list
        if result and len(result) > 0:
            for i in result:
                subject_id = i["id"]
                name_match = re.search(r"'brand_name': '(\w+)'", str(i))
                if not name_match:
                    continue  # no brand name in this entry: nothing to index
                name = name_match.group(1)
                subject_info = self.build_subject_info(
                    subject_id, name, path + [name],
                    path_id + [subject_id], 21, 6, a)
                a += 1
                subject_list.append(subject_info)
                self.save_log(
                    json.dumps({"subject_info_short": subject_info}))
        item = CategoryItem()
        logging.debug(json.dumps({'subject_list_short': subject_list}))
        self.save_log(json.dumps({'subject_list_short': subject_list}))
        item['cat_list'] = subject_list
        yield item
def parse_keyword_extra(self, response):
    """Merge keyword-extension data into keyword_data; cache it in ssdb.

    On a bad response (or failed merge) the request is retried up to three
    times; after that the keyword is emitted without extension data.
    """
    keyword_data = response.meta['keyword_data']
    times = response.meta['times']
    query_data = response.meta['query_data']
    get_data = True
    result = json.loads(response.body.decode('utf-8'))
    if 'errorCode' in result.keys() and result['errorCode'] == 1000000:
        pass
        keyword_extend_data = result['result'][0]
        keyword = keyword_extend_data['word']
        keyword_merge_data = self.merge_keyword_data(keyword_data,
                                                     keyword_extend_data)
        if keyword_merge_data is False:
            get_data = False
        else:
            # Cache the extension data so later runs skip the API call.
            self.ssdb_client.hset(self.keyword_extend_hash, keyword,
                                  json.dumps(keyword_extend_data))
    else:
        get_data = False
    if get_data is False:  ## no data obtained
        if times < 3:  ## fewer than three retries so far
            meta = {'keyword_data': keyword_data, 'times': times + 1,
                    'query_data': query_data}
            headers = self.make_headers()
            yield scrapy.Request(response.url, method="POST", meta=meta,
                                 body=query_data, headers=headers,
                                 dont_filter=True,
                                 callback=self.parse_keyword_extra)
        else:
            # Retries exhausted: emit the keyword without extension data.
            keyword_merge_data = self.merge_keyword_data(keyword_data, {})
            get_data = True
    if get_data is True:
        keywordItem = CategoryItem()
        keywordItem['cat_list'] = keyword_merge_data
        yield keywordItem
def parse(self, response):
    """Walk the category tree: emit every category at this level and recurse
    into non-leaf (level != 3) categories with the same callback; on auth
    error 43001, refresh the login info.

    Fixes: removed the dead ``pass`` statement at the top of the body and
    the redundant ``.keys()`` membership calls.
    """
    cat_list = []
    categoryInfo = json.loads(response.body.decode('utf-8'))  # bytes -> dict
    if 'errorCode' in categoryInfo and categoryInfo['errorCode'] == 1000000:
        for cat in categoryInfo['result']:
            cat_id = cat['id']
            cat_name = cat['cat_name']
            parent_id = cat['parent_id']
            cat_level = cat['level']
            info = {'cat_id': cat_id, 'cat_name': cat_name,
                    'level': cat_level, 'parent_id': parent_id}
            info['cat_id_1'] = cat['cat_id_1']
            info['cat_id_2'] = cat['cat_id_2']
            info['cat_id_3'] = cat['cat_id_3']
            info['cat_id_4'] = cat['cat_id_4']
            cat_list.append(info)
            if cat_level != 3:
                # Non-leaf category: fetch its children with the same parser.
                headers = self.make_headers()
                yield scrapy.FormRequest(self.url + '?&parentId=' + str(cat_id),
                                         callback=self.parse, headers=headers)
        CatItem = CategoryItem()
        CatItem['cat_list'] = cat_list
        yield CatItem
    elif 'error_code' in categoryInfo and categoryInfo['error_code'] == 43001:
        # Session expired: refresh the login credentials.
        self.get_pdd_login_info()
def special_parse_subject(self, response):
    """9块9特卖 (9.9 sale): build one subject entry per tab in the JSON list."""
    subject_list = []
    path = response.meta["path"]
    path_id = response.meta["path_id"]
    body = response.body.decode()
    logging.debug(json.dumps({'body': body}))
    result = json.loads(body)
    if 'list' in result.keys() and len(result['list']) > 0:
        a = 1  # 1-based rank of the tab within the list
        for i in result['list']:
            subject_id = i["tab_id"]
            name = i["subject"]
            subject_info = self.build_subject_info(subject_id, name,
                                                   path + [name],
                                                   path_id + [subject_id],
                                                   41, 4, a)
            a += 1
            subject_list.append(subject_info)
            self.save_log(
                json.dumps({"subject_info_special": subject_info}))
    item = CategoryItem()
    logging.debug(json.dumps({'subject_list_special': subject_list}))
    self.save_log(json.dumps({'subject_list_special': subject_list}))
    item['cat_list'] = subject_list
    yield item
def parse_subjects(self, response):
    """Turn every entry of the response's ``list`` into a subject dict
    (path extended with the subject name) and yield them as one item."""
    parent_path = response.meta['path']
    payload = json.loads(response.body.decode('utf-8'))
    cat_list = [
        {
            'subject_id': entry['subject_id'],
            'path': parent_path + [entry['subject']],
            'name': entry['subject'],
            'type': 1,
            'activity_type': 2,
            'path_id': [],
        }
        for entry in payload['list']
    ]
    item = CategoryItem()
    item['cat_list'] = cat_list
    yield item
def kill_parse_subject(self, response):
    """限时秒杀 (flash-sale): extract brand entries from the embedded page JSON.

    Fix: ``re.search(...).group()`` was called before the None check, so a
    page without the embedded JSON blob raised AttributeError; the match
    object is now tested first and ``.group()`` is only called on success.
    """
    path = response.meta["path"]
    path_id = response.meta["path_id"]
    subject_list = response.meta["subject_list"]
    self.save_log(json.dumps({"kill_subject_list": subject_list}))
    body = response.body.decode()
    logging.debug(json.dumps({'body': body}))
    # The page embeds its data as a JSON blob ending at the CDN host string.
    json_match = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
    if json_match:
        result = json.loads(json_match.group())
        result = self.dict_get(result, 'brandList', None)
        if result:
            a = 1  # 1-based rank of the brand within the list
            for i in result:
                subject_id = i["data"]["id"]
                name = i["data"]["name"]
                subject_info = self.build_subject_info(
                    subject_id, name, path + [name],
                    path_id + [subject_id], 14, 7, a)
                a += 1
                subject_list.append(subject_info)
                self.save_log(
                    json.dumps({"subject_info_kill": subject_info}))
    item = CategoryItem()
    logging.debug(json.dumps({'subject_list_kill': subject_list}))
    self.save_log(json.dumps({'subject_list_kill': subject_list}))
    item['cat_list'] = subject_list
    yield item
def short_parse_subject(self, response):
    """断码清仓 (clearance): build subject entries from filterTabList."""
    path = response.meta["path"]
    path_id = response.meta["path_id"]
    body = response.body.decode()
    # The page embeds its data as a JSON blob ending at the CDN host string.
    result = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
    if result:
        result = json.loads(result.group())
        result = self.dict_get(result, 'filterTabList', None)
        subject_list = []
        a = 0  # rank counter, incremented before use (first entry gets 1)
        if result and len(result) > 0:
            for i in result:
                a += 1
                subject_id = i["id"]
                name = i['tabName']
                subject_info = self.build_subject_info_brand_time(
                    subject_id, name, path + [name],
                    path_id + [subject_id], 21, 6, a)
                subject_list.append(subject_info)
                self.save_log(
                    json.dumps({"subject_info_short": subject_info}))
        item = CategoryItem()
        item['cat_list'] = subject_list
        yield item
def special_parse_subject(self, response):
    """9块9特卖 (9.9 sale): build subject entries from the embedded tabList."""
    subject_list = []
    path = response.meta["path"]
    path_id = response.meta["path_id"]
    body = response.body.decode()
    # The page embeds its data as a JSON blob ending at the CDN host string.
    result = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
    if result:
        result = json.loads(result.group())
        tab_list = self.dict_get(result, 'tabList', None)
        if tab_list:
            a = 0  # rank counter, incremented before use (first entry gets 1)
            for i in tab_list:
                a += 1
                subject_id = i["tab_id"]
                name = i["subject"]
                subject_info = self.build_subject_info_brand_time(
                    subject_id, name, path + [name],
                    path_id + [subject_id], 41, 4, a)
                subject_list.append(subject_info)
                self.save_log(
                    json.dumps({"subject_info_special": subject_info}))
        item = CategoryItem()
        item['cat_list'] = subject_list
        yield item
def parse(self, response):
    """Home page: build subject entries for the pull-down goods activities."""
    body = response.body.decode("utf-8")
    result = json.loads(body)
    # Activities inside the home pull-down goods section.
    logging.debug(json.dumps({'result': result}))
    list_activity = self.dict_get(result, 'crossSlideList', None)
    logging.debug(json.dumps({'list_activity': list_activity}))
    # NOTE(review): ``banner`` is fetched and logged but never used below.
    banner = self.dict_get(result, 'carouselData', None)
    logging.debug(json.dumps({'banner': banner}))
    if list_activity:
        subject_list = []
        # Root entry for the home goods feed.
        subject_info = self.build_subject_info(71, "首页商品", "首页商品", [71],
                                               71, 1)
        subject_list.append(subject_info)
        a = 0  # 1-based rank of the activity, incremented before use
        for i in list_activity:
            subject_list_id = re.findall(r"'brand_id': '\d+'", str(i))
            subject_id_list = list(set(subject_list_id))
            a += 1
            logging.debug(json.dumps({'brand_id': subject_list_id}))
            if subject_id_list:
                if len(subject_id_list) == 1:
                    name = self.dict_get(i, "subject", None)
                    subject_id = self.dict_get(i, "subject_id", None)
                    subject_info = self.build_subject_info(
                        subject_id, name, name, [subject_id], 72, 1, a)
                    subject_list.append(subject_info)
                    logging.debug(
                        json.dumps({'subject_info_home_1': subject_info}))
                    self.save_log(
                        json.dumps({"subject_info_home_1": subject_info}))
                else:
                    b = 0
                    # NOTE(review): inner loop reuses the name ``i``,
                    # shadowing the outer loop variable — confirm intended.
                    for i in i["subject_list"]:
                        b += 1
                        subject_id = i['p_rec']["brand_id"]
                        name = i["name"]
                        subject_info = self.build_subject_info(
                            subject_id, name, name, [subject_id], 72, 1, a, b)
                        subject_list.append(subject_info)
                        logging.debug(
                            json.dumps(
                                {'subject_info_home_2': subject_info}))
                        self.save_log(
                            json.dumps(
                                {"subject_info_home_2": subject_info}))
        item = CategoryItem()
        logging.debug(json.dumps({'subject_list_home': subject_list}))
        self.save_log(json.dumps({'subject_list_home': subject_list}))
        item['cat_list'] = subject_list
        yield item
def curl_sub_info(self, response):
    """Fill in a subject's name (fetched from the subject API) and yield it
    as a one-element CategoryItem."""
    info = response.meta['sub_info']
    fetched_name = json.loads(response.body.decode('utf-8'))['subject']
    info['path'] = info['path'] + [fetched_name]
    info['name'] = fetched_name
    item = CategoryItem()
    item['cat_list'] = [info]
    yield item
def parse_category(response):
    """Load the floor_1 classify block into a CategoryItem: category names
    from the anchor text, urls from the anchor hrefs."""
    loader = ItemLoader(item=CategoryItem(), response=response)
    loader.add_xpath(
        "category",
        '//div[@id="floor_1"]/div[@class="classify_kind"]'
        '/ul[@class="classify_kind_detail"]/li/a/text()')
    loader.add_xpath(
        "url",
        '//div[@id="floor_1"]/div[@class="classify_kind"]'
        '/ul[@class="classify_kind_detail"]/li/a/@href')
    # e.g. {"影视写真": "http://category.dangdang.com/cp01.01.13.00.00.00.html"}
    return loader.load_item()
def parse(self, response):
    """Keyword-heat listing: emit one item per keyword, then paginate while
    more results remain and the page cap is not reached."""
    result = json.loads(response.body.decode('utf-8'))
    logging.debug(json.dumps(result))
    if 'errorCode' in result.keys() and result['errorCode'] == 1000000:
        if result['result']['items'] is None:
            return None
        meta = response.meta
        cat = meta['cat']
        cat_id = cat['cat_id']
        level = cat['level']
        day = meta['day']
        rank = meta['rank']
        page = meta['page']
        # logging.log(logging.WARNING, str(cat_id)+'+'+str(len(result['result']['items'])))
        for keyword_item in result['result']['items']:
            keyword = keyword_item['query']
            rank += 1  # running rank across pages
            keyword_data = {}
            keyword_data['keyword'] = keyword
            keyword_data['category'] = cat_id
            keyword_data['cat_id_1'] = cat['cat_id_1']
            # Deeper levels also carry their ancestor category ids.
            if level == 2 or level == 3:
                keyword_data['cat_id_2'] = cat['cat_id_2']
            if level == 3:
                keyword_data['cat_id_3'] = cat['cat_id_3']
            keyword_data['hotness'] = keyword_item['heat']
            keyword_data['richness'] = 27
            keyword_data['rank'] = rank
            keyword_data['day'] = day
            keywords_data = self.merge_keyword_data(keyword_data, {})
            keywordItem = CategoryItem()
            keywordItem['cat_list'] = keywords_data
            yield keywordItem
        # More rows than we have ranked so far: request the next page.
        if result['result']['count'] > rank and page < self.max_page:
            page += 1
            meta['page'] = page
            meta['rank'] = rank
            query_data = self.build_query_data(cat, page, self.size, day)
            headers = self.make_headers()
            yield scrapy.Request(self.url, method="POST",
                                 body=json.dumps(query_data), meta=meta,
                                 headers=headers, callback=self.parse)
    '''先从ssdb中获取关键词扩展信息 ssdb没有则拉取接口'''
def parse_subject_children(self, response):
    """Expand a subject list and its mix children into subject entries.

    Children with a name become entries directly; children without a name
    are resolved via a follow-up request to the subject API.
    """
    subject_list = []
    result = json.loads(response.body.decode('utf-8'))
    lists = result['list']
    path = response.meta['path']
    path_id = response.meta['path_id']
    for data in lists:
        name = data['subject']
        subject_id = data['subject_id']
        new_path = path + [name]
        new_path_id = path_id + [subject_id]
        info = self.build_subject_info(subject_id, name, new_path,
                                       new_path_id, 1, 2)
        subject_list.append(info)
        if data['mix']:  ## has child activities
            pass
            for mix in data['mix']:
                subject_mix_data = self.get_child_subject_info(
                    mix)  ## fetch the activity's child mix info
                if not subject_mix_data:
                    continue
                for v in subject_mix_data:
                    child_name = v['name']
                    child_subject_id = v['subject_id']
                    if child_name:
                        subject_info = self.build_subject_info(
                            child_subject_id, child_name,
                            new_path + [child_name],
                            new_path_id + [child_subject_id])
                        subject_list.append(subject_info)
                    else:
                        # Name missing: fetch the subject detail to resolve it.
                        headers = self.make_headers()
                        url = 'http://apiv3.yangkeduo.com/subject/' + str(
                            child_subject_id)
                        meta = {'path': new_path, 'path_id': new_path_id}
                        yield scrapy.Request(
                            url,
                            meta=meta,
                            callback=self.parse_subject_info,
                            headers=headers,
                            dont_filter=True,
                            errback=self.errback_httpbin)
    item = CategoryItem()
    item['cat_list'] = subject_list
    #print(item)
    yield item
def parse_subject_info(self, response):
    """Build a single subject entry from the subject-detail API response,
    extending the parent path carried in the request meta."""
    detail = json.loads(response.body.decode('utf-8'))
    sid = detail['id']
    sname = detail['subject']
    meta = response.meta
    entry = self.build_subject_info(
        sid, sname,
        meta['path'] + [sname],
        meta['path_id'] + [sid],
        1, 2)
    item = CategoryItem()
    item['cat_list'] = [entry]
    yield item
def parse(self, response):
    """Parse the home banner carousel: follow sub-pages that declare child
    categories (``subjects_id``), collect leaf activities (``subject_id``)
    directly, and skip everything else.

    Fixes: removed the dead ``pass`` statement at the top of the body, the
    unused ``img_url`` local, and the redundant two-step query reassignment.
    """
    self.activity_list.clear()
    item = CategoryItem()
    result = json.loads(response.body.decode('utf-8'))
    if result['result']:
        for data in result['result']:
            title = data['title']
            url = data['link_url']
            # Split the link into its query parameters.
            url_arr = urlparse.urlparse(url)
            url_query = urlparse.parse_qs(url_arr.query)
            path = ['首页banner轮播', title]  # activity image title
            query_keys = url_query.keys()
            if 'subjects_id' in query_keys:  # sub-page has child categories
                subject_id = url_query['subjects_id'][0]
                if int(subject_id) in [12, 14]:  # skip 9.9特卖 and 品牌清仓
                    continue
                new_url = 'http://apiv4.yangkeduo.com/subject_collection/' + str(
                    subject_id)
                headers = self.make_headers()
                meta = {'path': path}
                # Fetch the child categories.
                yield scrapy.Request(new_url, meta=meta,
                                     callback=self.parse_subjects,
                                     headers=headers)
            elif 'subject_id' in query_keys:  # sub-page without children
                subject_id = url_query['subject_id'][0]
                self.activity_list.append({
                    'path': path,
                    'subject_id': subject_id,
                    'name': title,
                    'type': 1,
                    'activity_type': 1,
                    'path_id': []
                })
            else:  # not crawlable: skip
                continue
    item['cat_list'] = self.activity_list
    yield item
def parse(self, response):
    """Ad-keyword ranking: on success emit one item per keyword row; on
    failure log and retry the request with a fresh mall PASS_ID cookie."""
    result = json.loads(response.body.decode('utf-8'))
    logging.debug(json.dumps(result))
    meta = response.meta
    cat = meta['cat']
    day = meta['day']
    rank_type = meta['rank_type']
    cat_id = cat['cat_id']
    cat_id_1 = cat['cat_id_1']
    cat_id_2 = cat['cat_id_2']
    cat_id_3 = cat['cat_id_3']
    if 'success' in result.keys() and result['success'] and 'result' in result.keys():
        content = json.dumps({'status': 'success', 'cat': cat,
                              "result": result['result']})
        self.save_cat_log(content)
        for keyword_item in result['result']:
            keyword_data = {'category': cat_id, 'day': day,
                            'rank_type': rank_type}
            keyword_data['cat_id_1'] = cat['cat_id_1']
            # Ancestor ids are only present for deeper categories.
            if cat_id_2:
                keyword_data['cat_id_2'] = cat_id_2
            if cat_id_3:
                keyword_data['cat_id_3'] = cat_id_3
            keyword_data['rank_num'] = keyword_item['rankNum']
            keyword_data['click_num'] = keyword_item['clickNum']
            keyword_data['compete_value'] = keyword_item['competeValue']
            keyword_data['ctr'] = keyword_item['ctr']
            keyword_data['cvr'] = keyword_item['cvr']
            keyword_data['impr_avg_bid'] = keyword_item['imprAvgBid']
            keyword_data['pv'] = keyword_item['pv']
            keyword_data['word'] = keyword_item['word']
            keywordItem = CategoryItem()
            keywordItem['cat_list'] = keyword_data
            yield keywordItem
    else:
        content = json.dumps({'status': 'fail', 'cat': cat, "result": result})
        self.save_cat_log(content)
        # 三级分类 — retry with a fresh PASS_ID bound to the level-1 category.
        mall_result = self.get_pass_mall_id(cat_id_1)
        mall_id, pass_id = mall_result
        if not mall_id:
            return None
        cookie = 'PASS_ID' + '=' + pass_id + ";"
        cat["mall_id"] = mall_id
        meta["proxy"] = self.get_proxy_ip(False)
        headers = self.make_headers(cookie)
        query_data = self.build_query_data(cat, meta['page'], self.size, day,
                                           rank_type)
        yield scrapy.Request(self.url, method="POST",
                             body=json.dumps(query_data), meta=meta,
                             headers=headers, callback=self.parse,
                             dont_filter=True, errback=self.errback_httpbin)
        return None
def parse(self, response):
    """Emit a CategoryItem for every top-level category, then delegate to
    the level-two parser for each one's children."""
    top_nodes = response.xpath(
        "//div[contains(@class, 'category-items')]//div[contains(@class, 'category-item')]"
    )
    for node in top_nodes:
        cat_name = node.xpath("div[@class='mt']//span/text()").get()
        yield CategoryItem(
            level=CATEGORY.LEVEL_ONE,
            name=cat_name,
            url='',  # level-one categories carry no url
            path=cat_name,
            is_list=CATEGORY.LIST_NO,
            cat_id=None
        )
        yield from self.parse_level_two_cates(node, cat_name)
def parse(self, response):
    """Activity page: collect subjects and their mix children.

    Children with a name become entries directly (preferring the banner
    text for the path); children without a name are resolved via a
    follow-up request to the subject API (``curl_sub_info``).
    """
    meta = response.meta
    sub_list = []
    activity_type = meta['activity_type']
    data = json.loads(response.body.decode('utf-8'))
    if data['list']:
        for subject_info in data['list']:
            name = subject_info['subject']
            subject_id = subject_info['subject_id']
            path = [meta['name'], name]
            path_id = [meta['subject_id'], subject_id]
            info = {'subject_id':subject_id,'path':path,'name':name,'type':1,'activity_type':activity_type,'path_id':path_id}
            sub_list.append(info)
            if subject_info['mix']:  ## has child subjects
                for child_sub in subject_info['mix']:
                    sub_info = self.get_child_subject_info(path, path_id, child_sub, activity_type)  ## fetch child subject info
                    if sub_info:
                        for sub in sub_info:
                            new_path_id = path_id + [int(sub['subject_id'])]
                            sub['path_id'] = new_path_id
                            sub['type'] = 1
                            sub['activity_type'] = activity_type
                            if sub['name']:  ## has a subject name
                                # Prefer the banner text in the path, then drop it.
                                if 'banner' in sub:
                                    new_path = path + [sub['banner']]
                                    del sub['banner']
                                else:
                                    new_path = path + [sub['name']]
                                sub['path'] = new_path
                                sub_list.append(sub)
                            else:  ## no name: fetch the subject info to get it
                                sub['path'] = path
                                url = 'http://apiv3.yangkeduo.com/subject/'+str(sub['subject_id'])
                                yield scrapy.Request(url, meta={'sub_info':sub}, callback=self.curl_sub_info, headers=self.make_headers())
    item = CategoryItem()
    item['cat_list'] = sub_list
    yield item
def parse_level_two_cates(self, level_one_cate, level_one_name):
    """Yield a CategoryItem for each second-level category found under
    *level_one_cate*, then delegate to the level-three parser.

    :param level_one_cate: html node of the level-one category
    :param level_one_name: name of the level-one category
    """
    for dl_node in level_one_cate.xpath("div[@class='mc']/div[@class='items']/dl"):
        cat_name = dl_node.xpath("dt/a/text()").get().strip()
        if not cat_name:
            # Some entries keep the name outside the anchor tag.
            cat_name = dl_node.xpath("dt//text()").get().strip()
        yield CategoryItem(
            level=CATEGORY.LEVEL_TWO,
            name=cat_name,
            url=dl_node.xpath("dt/a/@href").get(),
            path=self.generate_path([level_one_name, cat_name]),
            is_list=CATEGORY.LIST_NO,
            cat_id=None
        )
        yield from self.parse_level_three_cates(dl_node, level_one_name, cat_name)
def parse(self, response):
    """Keyword-rank listing: emit one item per keyword row for the category
    carried in the request meta.

    Fix: removed the dead ``pass`` statement at the top of the body.
    """
    result = json.loads(response.body.decode('utf-8'))
    logging.debug(json.dumps(result))
    if 'errorCode' in result.keys() and result['errorCode'] == 1000:
        if result['result'] is None:
            return None
        meta = response.meta
        cat = meta['cat']
        cat_id = cat['cat_id']
        level = cat['level']
        day = meta['day']
        rank_type = meta['rank_type']
        for keyword_item in result['result']:
            keyword_data = {
                'category': cat_id,
                'day': day,
                'rank_type': rank_type
            }
            keyword_data['cat_id_1'] = cat['cat_id_1']
            # Deeper levels also carry their ancestor category ids.
            if level == 2 or level == 3:
                keyword_data['cat_id_2'] = cat['cat_id_2']
            if level == 3:
                keyword_data['cat_id_3'] = cat['cat_id_3']
            keyword_data['rank_num'] = keyword_item['rankNum']
            keyword_data['click_num'] = keyword_item['clickNum']
            keyword_data['compete_value'] = keyword_item['competeValue']
            keyword_data['ctr'] = keyword_item['ctr']
            keyword_data['cvr'] = keyword_item['cvr']
            keyword_data['impr_avg_bid'] = keyword_item['imprAvgBid']
            keyword_data['pv'] = keyword_item['pv']
            keyword_data['word'] = keyword_item['word']
            keywordItem = CategoryItem()
            keywordItem['cat_list'] = keyword_data
            yield keywordItem
def parse_level_three_cates(self, level_two_cate, level_one_name, level_two_name):
    """Yield a CategoryItem for each third-level category anchor found
    under *level_two_cate*.

    :param level_two_cate: html node of the level-two category
    :param level_one_name: name of the level-one category
    :param level_two_name: name of the level-two category
    """
    for anchor in level_two_cate.xpath("dd/a"):
        cat_name = anchor.xpath("text()").get()
        href = anchor.xpath("@href").get()
        # A list.jd.com url marks a listable category; its cat param is the id.
        matched = match(r"/{0,2}list\.jd\.com/list.html\?cat=(\d+,\d+,\d+)?(.*)", href)
        if matched:
            list_flag = CATEGORY.LIST_YES
            cat_id = matched.group(1)
        else:
            list_flag = CATEGORY.LIST_NO
            cat_id = None
        yield CategoryItem(
            level=CATEGORY.LEVEL_THREE,
            name=cat_name,
            url=href,
            path=self.generate_path([level_one_name, level_two_name, cat_name]),
            is_list=list_flag,
            cat_id=cat_id,
            hot=0
        )
def parse_subject_banner(self, response):
    """Home-page carousel: build subject entries according to ``url_type``.

    url_type 1/2: a single fixed entry; 4/6/12: one entry per id found in
    the embedded JSON; 17: not handled (login-protected); 20: one entry per
    distinct id, or a single aggregated entry with a comma-joined id string.
    """
    subject_list = []
    body = response.body.decode('utf-8')
    # The page embeds a store JSON blob ending at "ssr":true.
    result = re.search(r'{"store".*?"ssr":true}', body)
    path = response.meta['path']
    path_id = response.meta['path_id']
    url_type = response.meta["url_type"]
    item = CategoryItem()
    if result:
        result_str = result.group()
        if url_type in [1, 2]:
            name = '轮播活动' + str(1) + str(url_type)
            subject_id = 0
            subject_info = self.build_subject_info(subject_id, name,
                                                   path + [name],
                                                   path_id + [subject_id],
                                                   61, 7, url_type, 1)
            subject_list.append(subject_info)
            self.save_log(json.dumps(
                {"subject_info_banner" + str(url_type): subject_info}))
        if url_type in [4, 6, 12]:
            # Ids appear either unquoted or quoted in the raw JSON text.
            f = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
            if not f:
                f = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
            subject_id_list = re.findall(r"\d+", str(f))
            a = 1
            for subject_id in subject_id_list:
                name = '轮播活动' + str(2) + str(a)
                subject_info = self.build_subject_info(subject_id, name,
                                                       path + [name],
                                                       path_id + [subject_id],
                                                       62, 7, url_type, a)
                a += 1
                subject_list.append(subject_info)
                self.save_log(json.dumps(
                    {"subject_info_banner_" + str(url_type): subject_info}))
        if url_type == 17:
            # 登陆接口 — login-protected endpoint, not handled.
            pass
        if url_type == 20:
            name_list = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
            if not name_list:
                name_list = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
            id_list = re.findall(r'\d+', str(name_list))
            if id_list:
                id_list = list(set(id_list))
            else:
                id_list = []
            if len(id_list) > 1:
                a = 0
                for id in id_list:
                    a += 1
                    name = "轮播活动" + str(3) + str(a)
                    subject_info = self.build_subject_info(id, name,
                                                           path + [name],
                                                           path_id + [id],
                                                           63, 7, url_type, a)
                    subject_list.append(subject_info)
                    self.save_log(json.dumps(
                        {"subject_info_banner_" + str(url_type): subject_info}))
            else:
                # Zero or one id: aggregate all DIY goods ids into one entry.
                f = re.findall(r'"DIYGoodsIDs":".*?"', result_str)
                subject_id_list = re.findall(r"\d+", str(f))
                subject_str = ''
                if len(id_list) == 1:
                    subject_id = id_list[0]
                else:
                    # NOTE(review): IndexError if subject_id_list is empty —
                    # confirm the page always contains DIYGoodsIDs here.
                    subject_id = subject_id_list[0]
                for i in subject_id_list:
                    i = i + ','
                    subject_str += i
                name = "轮播活动" + str(4) + str(1)
                subject_info = self.build_subject_20_info(subject_id, name,
                                                          path + [name],
                                                          path_id + [subject_id],
                                                          subject_str, 64, 7,
                                                          url_type)
                subject_list.append(subject_info)
                self.save_log(json.dumps(
                    {"subject_info_banner_id_" + str(url_type): subject_info}))
    item['cat_list'] = subject_list
    yield item
def parse(self, response):
    """Home page: follow each carousel banner to ``parse_subject_banner``
    and build subject entries for the pull-down goods activities."""
    body = response.body.decode("utf-8")
    # The embedded JSON blob ends differently across page versions; try each.
    result = re.search(r'{"pageProps".*?null}}', body)
    if not result:
        result = re.search(r'{"props".*?206]}', body)
    if not result:
        result = re.search(r'{"props".*?355]}', body)
    if not result:
        result = re.search(r'{"props".*?344]}', body)
    # NOTE(review): result.group() raises AttributeError if none of the
    # patterns matched — confirm the page always embeds one of them.
    result = json.loads(result.group())
    # 处理轮播活动 — carousel activities.
    banner = self.dict_get(result, 'carouselData', None)
    if len(banner) > 0:
        path = ['首页滚动banner']
        for i in banner:
            title = i["title"]
            if not title:
                title = '商品'
            end_url = i["forwardURL"]
            subject = self.get_subject_type_id(end_url)
            if not subject:
                continue
            subject_id = subject["subject_id"]
            type = subject["type"]
            headers = self.make_headers()
            new_url = "https://mobile.yangkeduo.com/" + end_url
            meta = {'path': path + [title], 'path_id': [subject_id],
                    'url_type': type}
            yield scrapy.Request(new_url, meta=meta,
                                 callback=self.parse_subject_banner,
                                 headers=headers, dont_filter=True,
                                 errback=self.errback_httpbin)
    # 处理首页下拉商品中的活动 — activities in the home pull-down goods section.
    list_activity = self.dict_get(result, 'crossSlideList', None)
    if list_activity:
        subject_list = []
        # Root entry for the home goods feed.
        subject_info = self.build_subject_info(71, "首页商品", "首页商品", [71],
                                               71, 1)
        subject_list.append(subject_info)
        a = 0  # 1-based rank of the activity, incremented before use
        for i in list_activity:
            subject_list_id = re.findall(r"'brand_id': '\d+'", str(i))
            subject_id_list = list(set(subject_list_id))
            a += 1
            if subject_id_list:
                if len(subject_id_list) == 1:
                    name = self.dict_get(i, "subject", None)
                    subject_id = self.dict_get(i, "subject_id", None)
                    subject_info = self.build_subject_info(subject_id, name,
                                                           name, [subject_id],
                                                           72, 1, a)
                    subject_list.append(subject_info)
                    logging.debug(
                        json.dumps({'subject_info_home_1': subject_info}))
                    self.save_log(
                        json.dumps({"subject_info_home_1": subject_info}))
                else:
                    b = 0
                    # NOTE(review): inner loop reuses the name ``i``,
                    # shadowing the outer loop variable — confirm intended.
                    for i in i["subject_list"]:
                        b += 1
                        subject_id = i['p_rec']["brand_id"]
                        if not subject_id:
                            subject_id = 0
                        name = i["name"]
                        if not name:
                            name = '商品' + str(b)
                        subject_info = self.build_subject_info(subject_id,
                                                               name, name,
                                                               [subject_id],
                                                               72, 1, a, b)
                        subject_list.append(subject_info)
                        logging.debug(
                            json.dumps({'subject_info_home_2': subject_info}))
                        self.save_log(
                            json.dumps({"subject_info_home_2": subject_info}))
        item = CategoryItem()
        item['cat_list'] = subject_list
        yield item
def parse(self, response):
    """Mall keyword metrics: on pass/auth error codes (9, 1000) retry the
    request with a fresh PASS_ID; on success emit one item per keyword row."""
    result = json.loads(response.body.decode('utf-8'))
    logging.debug(json.dumps(result))
    meta = response.meta
    cat = meta['cat']
    mall_id = meta["mall_id"]
    pass_id = meta["pass_id"]
    day = meta['day']
    rank_type = meta['rank_type']
    cat_id_1 = cat['cat_id_1']
    cat_id_2 = cat['cat_id_2']
    cat_id_3 = cat['cat_id_3']
    level = cat['level']
    if 'errorCode' in result.keys():
        if result['errorCode'] == 9 or result['errorCode'] == 1000:
            content = json.dumps({
                "mall_id": mall_id,
                "pass_id": pass_id,
                "status": "fail",
                "leval": level,
                "day": day,
                "rank_type": rank_type,
                "cat_id_1": cat_id_1,
                "cat_id_2": cat_id_2,
                "cat_id_3": cat_id_3,
                "result": result['errorCode']
            }) + ","
            self.save_mall_log(content)
            # 一级分类 — level-1 retry.
            # NOTE(review): this branch does not return after yielding, so a
            # level-1 category appears to fall through into the level-2 and
            # level-3 retry blocks as well — confirm whether returns are
            # missing here.
            if not cat_id_2:
                mall_result = self.get_pass_mall_id(cat_id_1)
                mall_id, pass_id = mall_result
                if not mall_id:
                    self.fail_count += 1
                    content = json.dumps({
                        "cat_id": cat_id_1,
                        "fail_reason": "当前分类没有pass_id",
                        "fail_count": self.fail_count
                    })
                    self.save_mall_log(content)
                    return None
                cat["mall_id"] = mall_id
                cookie = 'PASS_ID' + '=' + pass_id + ";"
                headers = self.make_headers(cookie)
                query_data = self.build_query_data(cat, meta['page'],
                                                   self.size, day, rank_type)
                yield scrapy.Request(self.url, method="POST",
                                     body=json.dumps(query_data), meta=meta,
                                     headers=headers, callback=self.parse)
            # 二级分类 — level-2 retry.
            if not cat_id_3:
                mall_result = self.get_pass_mall_id(cat_id_1)
                mall_id, pass_id = mall_result
                if not mall_id:
                    self.fail_count += 1
                    content = json.dumps({
                        "cat_id": cat_id_1,
                        "fail_reason": "当前分类没有pass_id",
                        "fail_count": self.fail_count
                    })
                    self.save_mall_log(content)
                    return None
                cat["mall_id"] = mall_id
                cookie = 'PASS_ID' + '=' + pass_id + ";"
                headers = self.make_headers(cookie)
                query_data = self.build_query_data(cat, meta['page'],
                                                   self.size, day, rank_type)
                yield scrapy.Request(self.url, method="POST",
                                     body=json.dumps(query_data), meta=meta,
                                     headers=headers, callback=self.parse)
            # 三级分类 — level-3 retry.
            mall_result = self.get_pass_mall_id(cat_id_1)
            mall_id, pass_id = mall_result
            if not mall_id:
                self.fail_count += 1
                content = json.dumps({
                    "cat_id": cat_id_1,
                    "fail_reason": "当前分类没有pass_id",
                    "fail_count": self.fail_count
                })
                self.save_mall_log(content)
                return None
            cookie = 'PASS_ID' + '=' + pass_id + ";"
            cat["mall_id"] = mall_id
            headers = self.make_headers(cookie)
            query_data = self.build_query_data(cat, meta['page'], self.size,
                                               day, rank_type)
            yield scrapy.Request(self.url, method="POST",
                                 body=json.dumps(query_data), meta=meta,
                                 headers=headers, callback=self.parse)
            return None
    if 'result' in result.keys():
        self.success_count += 1
        content = json.dumps({
            "pass_id": pass_id,
            "leval": level,
            "day": day,
            "rank_type": rank_type,
            "cat_id_1": cat_id_1,
            "cat_id_2": cat_id_2,
            "cat_id_3": cat_id_3,
            "seccuss": self.success_count,
            "goods_count": len(result['result'])
        }) + ","
        self.save_mall_log(content)
        for keyword_item in result['result']:
            keyword_data = {
                'category': cat_id_1,
                'day': day,
                'rank_type': rank_type
            }
            keyword_data['cat_id_1'] = cat['cat_id_1']
            # Ancestor ids are only present for deeper categories.
            if cat_id_2:
                keyword_data['cat_id_2'] = cat_id_2
            if cat_id_3:
                keyword_data['cat_id_3'] = cat_id_3
            keyword_data['rank_num'] = keyword_item['rankNum']
            keyword_data['click_num'] = keyword_item['clickNum']
            keyword_data['compete_value'] = keyword_item['competeValue']
            keyword_data['ctr'] = keyword_item['ctr']
            keyword_data['cvr'] = keyword_item['cvr']
            keyword_data['impr_avg_bid'] = keyword_item['imprAvgBid']
            keyword_data['pv'] = keyword_item['pv']
            keyword_data['word'] = keyword_item['word']
            keywordItem = CategoryItem()
            keywordItem['cat_list'] = keyword_data
            yield keywordItem
def parse_subject_banner(self, response):
    """Home-page carousel (pageTitle variant): build subject entries per
    ``url_type``; the entry name is taken from the page's store.pageTitle."""
    subject_list = []
    body = response.body.decode('utf-8')
    logging.debug(json.dumps({'body': body}))
    # The page embeds a store JSON blob ending at "ssr":true.
    result = re.search(r'{"store".*?"ssr":true}', body)
    path = response.meta['path']
    path_id = response.meta['path_id']
    url_type = response.meta["url_type"]
    item = CategoryItem()
    if result:
        result_dict = json.loads(result.group())
        result_str = result.group()
        try:
            name = result_dict["store"]["pageTitle"]
        except Exception:
            name = ''
        if url_type in [1, 2]:
            # NOTE(review): subject_id is assigned the whole path_id list
            # here (not a single id) — confirm this is intended.
            subject_id = path_id
            subject_info = self.build_subject_info(subject_id, name,
                                                   path + [name],
                                                   path_id + [subject_id],
                                                   61, url_type)
            subject_list.append(subject_info)
            self.save_log(json.dumps(
                {"subject_info_banner" + str(url_type): subject_info}))
        if url_type in [4, 6, 12]:
            # Ids appear either unquoted or quoted in the raw JSON text.
            f = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
            if not f:
                f = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
            subject_id_list = re.findall(r"\d+", str(f))
            a = 1
            for subject_id in subject_id_list:
                subject_info = self.build_subject_info(subject_id, name,
                                                       path + [name],
                                                       path_id + [subject_id],
                                                       62, url_type, a)
                a += 1
                subject_list.append(subject_info)
                self.save_log(json.dumps(
                    {"subject_info_banner_" + str(url_type): subject_info}))
        if url_type == 17:
            # 接口不确定 — endpoint undetermined, not handled.
            pass
        if url_type == 20:
            name_list = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
            if not name_list:
                name_list = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
            id_list = re.findall(r'\d+', str(name_list))
            if id_list:
                id_list = list(set(id_list))
            else:
                id_list = []
            if len(id_list) > 1:
                a = 0
                for id in id_list:
                    a += 1
                    name = "商品" + str(a)
                    subject_info = self.build_subject_info(id, name,
                                                           path + [name],
                                                           path_id + [id],
                                                           63, url_type, a)
                    subject_list.append(subject_info)
                    self.save_log(json.dumps(
                        {"subject_info_banner_" + str(url_type): subject_info}))
            else:
                # Zero or one id: aggregate all DIY goods ids into one entry.
                f = re.findall(r'"DIYGoodsIDs":".*?"', result_str)
                subject_id_list = re.findall(r"\d+", str(f))
                subject_str = ''
                if len(id_list) == 1:
                    subject_id = id_list[0]
                else:
                    # NOTE(review): IndexError if subject_id_list is empty —
                    # confirm the page always contains DIYGoodsIDs here.
                    subject_id = subject_id_list[0]
                for i in subject_id_list:
                    i = i + ','
                    subject_str += i
                subject_info = self.build_subject_20_info(subject_id, name,
                                                          path + [name],
                                                          path_id + [subject_id],
                                                          subject_str, 64,
                                                          url_type)
                subject_list.append(subject_info)
                self.save_log(json.dumps(
                    {"subject_info_banner_id_" + str(url_type): subject_info}))
    item['cat_list'] = subject_list
    yield item