def get_promo_info(shop_id):
    """
    Coupon / promotion info
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?' \
          'shopId=' + str(shop_id) + \
          '&cityId=19&mainCategoryId=2821' \
          '&_token=' + str(get_token(shop_url)) + \
          '&uuid=38af1c67-4a50-3220-06f6-bf9f16e71c41.1611146098' \
          '&platform=1&partner=150&optimusCode=10' \
          '&originUrl=' + shop_url
    r = requests_util.get_requests(url, request_type='json')
    r_text = requests_util.replace_json_text(r.text, get_font_msg())
    r_json = json.loads(r_text)
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        logger.warning('Solve the captcha, then press Enter to continue: ' + verify_page_url)
        input()
    elif r_json['code'] == 200:
        msg = r_json['msg']['shopInfo']
        shop_name = msg['shopName']
        shop_address = BeautifulSoup(msg['address'], 'lxml').text + \
                       BeautifulSoup(msg['crossRoad'], 'lxml').text
        shop_number = BeautifulSoup(msg['phoneNo'], 'lxml').text + \
                      BeautifulSoup(msg['phoneNo2'], 'lxml').text
        return [shop_name, shop_address, shop_number]
    else:
        logger.warning('Unexpected JSON response code; consider opening a PR or an issue')
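
# A hedged alternative sketch: the hand-concatenated query string above, built
# with urllib.parse.urlencode instead. The endpoint and parameter names mirror
# the URL above; the helper itself is illustrative, not part of this module.
# Note urlencode percent-encodes originUrl, which the raw concatenation above
# does not, so treat this as a sketch rather than a drop-in replacement.
from urllib.parse import urlencode

def _build_dynamic_url(endpoint, shop_id, token, uuid, origin_url):
    params = {
        'shopId': shop_id, 'cityId': 19, 'mainCategoryId': 2821,
        '_token': token, 'uuid': uuid, 'platform': 1, 'partner': 150,
        'optimusCode': 10, 'originUrl': origin_url,
    }
    return ('http://www.dianping.com/ajax/json/shopDynamic/'
            + endpoint + '?' + urlencode(params))

# e.g. _build_dynamic_url('reviewAndStar', 'H2noKWCDigM0H9c1', token, uuid, shop_url)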
def get_basic_hidden_info(shop_id):
    """
    Fetch basic hidden info (name, address, phone number, cityid)
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/basicHideInfo?' \
          'shopId=' + str(shop_id) + \
          '&_token=' + str(get_token(shop_url)) + \
          '&tcv=ck9rmnrofg' \
          '&uuid=6ca1f51a-7653-b987-3cd6-95f3aadb13b8.1619854599' \
          '&platform=1&partner=150&optimusCode=10' \
          '&originUrl=' + str(shop_url)
    r = requests_util.get_requests(url, request_type='json')
    r_text = requests_util.replace_json_text(r.text, get_font_msg())
    r_json = json.loads(r_text)
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        logger.warning('Solve the captcha, then press Enter to continue: ' + verify_page_url)
        input()
    elif r_json['code'] == 200:
        msg = r_json['msg']['shopInfo']
        shop_name = msg['shopName']
        shop_address = BeautifulSoup(msg['address'], 'lxml').text + \
                       BeautifulSoup(msg['crossRoad'], 'lxml').text
        shop_number = BeautifulSoup(msg['phoneNo'], 'lxml').text + \
                      BeautifulSoup(msg['phoneNo2'], 'lxml').text
        return [shop_name, shop_address, shop_number]
    else:
        logger.warning('Unexpected JSON response code; consider opening a PR or an issue')
def get_basic_hidden_info(shop_id):
    """
    Fetch basic hidden info (name, address, phone number, cityid)
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/basicHideInfo?' \
          'shopId=' + str(shop_id) + \
          '&_token=' + str(get_token(shop_url)) + \
          '&tcv=' + str(spider_config.TCV) + \
          '&uuid=' + str(spider_config.UUID) + \
          '&platform=1' \
          '&partner=150' \
          '&optimusCode=10' \
          '&originUrl=' + str(shop_url)
    # Retry loop to work around intermittently malformed responses
    retry_time = 5
    while True:
        retry_time -= 1
        r = requests_util.get_requests(url, request_type='proxy, no cookie')
        r_text = requests_util.replace_json_text(r.text, get_font_msg())
        try:
            r_json = json.loads(r_text)
            # Break on 406 as well, so the captcha prompt below is reachable
            if r_json['code'] in (200, 406):
                break
        except:
            pass
        # Give up once retries are exhausted (also covers repeated parse failures)
        if retry_time == 0:
            logger.warning('Replace the TCV and UUID')
            exit()
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        print('Solve the captcha, then press Enter to continue:', verify_page_url)
        input()
    elif r_json['code'] == 200:
        msg = r_json['msg']['shopInfo']
        shop_name = msg['shopName']
        # Build each fragment separately so a None field falls back to ''
        # without silently dropping the other fragment
        address = BeautifulSoup(msg['address'], 'lxml').text if msg['address'] is not None else ''
        cross_road = BeautifulSoup(msg['crossRoad'], 'lxml').text if msg['crossRoad'] is not None else ''
        shop_address = address + cross_road
        phone1 = BeautifulSoup(msg['phoneNo'], 'lxml').text if msg['phoneNo'] is not None else ''
        phone2 = BeautifulSoup(msg['phoneNo2'], 'lxml').text if msg['phoneNo2'] is not None else ''
        shop_number = ', '.join(p for p in (phone1, phone2) if p)
        return {
            '店铺id': shop_id,
            '店铺名': shop_name,
            '店铺地址': shop_address,
            '店铺电话': shop_number
        }
    else:
        logger.warning('Unexpected JSON response code; consider opening a PR or an issue')
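
# Hedged sketch: the None-tolerant HTML-to-text step above, factored into a
# standalone helper. Assumes only bs4 + lxml; the fragment in the example
# comment is made up.
def _soup_text_or_empty(html_fragment):
    """Return the visible text of an HTML fragment, or '' for None."""
    from bs4 import BeautifulSoup
    if html_fragment is None:
        return ''
    return BeautifulSoup(html_fragment, 'lxml').text

# e.g. _soup_text_or_empty('<span>中山路</span>') + _soup_text_or_empty(None) == '中山路'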
def get_review_and_star(shop_id):
    """
    Fetch rating, average price and review count
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?' \
          'shopId=' + str(shop_id) + \
          '&cityId=19&mainCategoryId=2821' \
          '&_token=' + str(get_token(shop_url)) + \
          '&uuid=38af1c67-4a50-3220-06f6-bf9f16e71c41.1611146098' \
          '&platform=1&partner=150&optimusCode=10' \
          '&originUrl=' + shop_url
    r = requests_util.get_requests(url, request_type='json')
    r_text = requests_util.replace_json_text(r.text, get_font_msg())
    r_json = json.loads(r_text)
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        logger.warning('Solve the captcha, then press Enter to continue: ' + verify_page_url)
        input()
    elif r_json['code'] == 200:
        shop_base_score = r_json['fiveScore']
        score_title_list = r_json['shopScoreTitleList']
        avg_price = BeautifulSoup(r_json['avgPrice'], 'lxml').text
        review_count = BeautifulSoup(r_json['defaultReviewCount'], 'lxml').text
        score_list = []
        for each in r_json['shopRefinedScoreValueList']:
            score_list.append(BeautifulSoup(each, 'lxml').text)
        scores = ''
        for i, score in enumerate(score_list):
            scores = scores + ' ' + score_title_list[i] + score_list[i]
        return [shop_base_score, scores, avg_price, review_count]
    else:
        logger.warning('Unexpected JSON response code; consider opening a PR or an issue')
def get_review_and_star(shop_id):
    """
    Fetch rating, average price and review count
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?' \
          'shopId=' + str(shop_id) + \
          '&cityId=19' \
          '&mainCategoryId=2821' \
          '&_token=' + str(get_token(shop_url)) + \
          '&uuid=' + str(spider_config.UUID) + \
          '&platform=1' \
          '&partner=150' \
          '&optimusCode=10' \
          '&originUrl=' + shop_url
    # Retry loop to work around intermittently malformed responses
    while True:
        r = requests_util.get_requests(url, request_type='proxy, no cookie')
        r_text = requests_util.replace_json_text(r.text, get_font_msg())
        try:
            r_json = json.loads(r_text)
            # Break on 406 as well, so the captcha prompt below is reachable
            if r_json['code'] in (200, 406):
                break
        except:
            pass
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        print('Solve the captcha, then press Enter to continue:', verify_page_url)
        input()
    elif r_json['code'] == 200:
        shop_base_score = r_json['fiveScore']
        score_title_list = r_json['shopScoreTitleList']
        avg_price = BeautifulSoup(r_json['avgPrice'], 'lxml').text
        review_count = BeautifulSoup(r_json['defaultReviewCount'], 'lxml').text
        score_list = []
        for each in r_json['shopRefinedScoreValueList']:
            score_list.append(BeautifulSoup(each, 'lxml').text)
        # scores = ''
        # for i, score in enumerate(score_list):
        #     scores = scores + ' ' + score_title_list[i] + score_list[i]
        scores = {}
        for i, score in enumerate(score_list):
            scores[score_title_list[i]] = score_list[i]
        # return [shop_base_score, scores, avg_price, review_count]
        return {
            '店铺id': shop_id,
            '店铺总分': shop_base_score,
            '店铺评分': scores,
            '人均价格': avg_price,
            '评论总数': review_count
        }
    else:
        logger.warning('Unexpected JSON response code; consider opening a PR or an issue')
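
# Hedged sketch of the retry-until-valid-JSON loop above, as a reusable helper.
# `fetch` stands in for the request + font replacement + json.loads sequence;
# the helper and its names are illustrative, not part of this module's API.
def _retry_json(fetch, max_retries=5, ok_codes=(200, 406)):
    """Call fetch() until it returns a dict whose 'code' is in ok_codes."""
    for _ in range(max_retries):
        try:
            r_json = fetch()
        except ValueError:  # response body was not valid JSON
            continue
        if r_json.get('code') in ok_codes:
            return r_json
    raise RuntimeError('no usable response; the TCV/UUID may need refreshing')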
def download_woff(woff_url, filename):
    """
    Download a font file
    :param woff_url:
    :param filename:
    :return:
    """
    r = requests_util.get_requests(woff_url, need_header=False)
    with open('./tmp/' + filename, 'wb') as f:
        f.write(r.content)
def get_detail_font_mapping(self, shop_id):
    """
    Fetch the detail page's font mapping. Do not parse the page itself; only the
    encrypted font mapping is needed, for use with the JSON endpoints.
    @param shop_id:
    @return:
    """
    url = 'http://www.dianping.com/shop/' + str(shop_id)
    r = requests_util.get_requests(url, request_type='proxy, no cookie')
    # Some sensitive IPs may only be granted access with a cookie
    if r.status_code == 403:
        r = requests_util.get_requests(url, request_type='no proxy, cookie')
        if r.status_code == 403:
            logger.error('Use a proxy')
            exit()
    text = r.text
    file_map = get_search_map_file(text)
    cache.search_font_map = file_map
    return file_map
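
# Hedged sketch of the 403 fallback above as a generic helper: try each
# request_type in order until one is not blocked. `get` stands in for
# requests_util.get_requests; the helper is illustrative only.
def _get_with_fallback(get, url, request_types=('proxy, no cookie', 'no proxy, cookie')):
    for request_type in request_types:
        r = get(url, request_type=request_type)
        if r.status_code != 403:
            return r
    raise RuntimeError('blocked under every request type; use a proxy')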
def search(self, key_word, only_need_first=True, needed_pages=50):
    """
    Search
    :param key_word: keyword
    :param only_need_first: only the first hit is needed
    :param needed_pages: how many pages are needed
    :return:
    """
    # Todo: when detail pages and reviews are not needed, only the search
    #       front page is required and no cookie is necessary
    assert isinstance(key_word, str)
    assert key_word is not None and key_word.strip() != ''
    if self.custom_search_url != '':
        key_word = self.custom_search_url
    logger.info('Search started: ' + key_word)
    # header = self.get_header()
    for i in tqdm(range(1, needed_pages + 1), desc='pages'):
        # When only the first hit is needed, skip every page after the first
        if only_need_first is True and i != 1:
            break
        url = 'http://www.dianping.com/search/keyword/' + str(self.location_id) + \
              '/' + str(self.channel_id) + '_' + str(key_word) + '/p' + str(i)
        if self.custom_search_url != '':
            url = self.custom_search_url + str(i)
        r = requests_util.get_requests(url)
        # r = requests.get(url, headers=header)
        text = r.text
        # Fetch the encrypted font files
        file_map = get_search_map_file(text)
        # Replace the encrypted glyphs
        text = requests_util.replace_search_html(text, file_map)
        # Parse the page
        html = BeautifulSoup(text, 'lxml')
        shop_all_list = html.select('.shop-list')[0].select('li')
        search_res = []
        for shop in shop_all_list:
            try:
                image_path = shop.select('.pic')[0].select('a')[0].select('img')[0]['src']
            except:
                image_path = '-'
            try:
                shop_id = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['data-shopid']
            except:
                shop_id = '-'
            try:
                detail_url = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['href']
            except:
                detail_url = '-'
            try:
                name = shop.select('.txt')[0].select('.tit')[0].select('a')[0].text.strip()
            except:
                name = '-'
            # Stars come in two styles: some pages show an exact score, others an icon
            # Parse the icon style
            try:
                star_point = shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0] \
                    .select('span')[0]['class'][1].split('_')[1]
                star_point = str(float(star_point) / 10)
            except:
                star_point = '-'
            # Parse the exact-score style
            try:
                star_point = shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
                star_point = str(float(star_point))
            except:
                pass
            try:
                review_number = shop.select('.txt')[0].select('.comment')[0].select('.review-num')[0] \
                    .text.replace('\n', '')
            except:
                review_number = '-'
            try:
                mean_price = shop.select('.txt')[0].select('.comment')[0].select('.mean-price')[0] \
                    .select('b')[0].text
            except:
                mean_price = '¥0'
            try:
                tags = shop.select('.txt')[0].select('.tag-addr')[0].select('.tag')
                tag1 = tags[0].text.replace('\n', ' ').strip()
                tag2 = tags[1].text.replace('\n', ' ').strip()
            except:
                tag1 = '-'
                tag2 = '-'
            try:
                addr = shop.select('.txt')[0].select('.tag-addr')[0].select('.addr')[0] \
                    .text.replace('\n', ' ').strip()
            except:
                addr = '-'
            try:
                recommend = shop.select('.recommend')[0].text.replace('\n', ' ').strip()
            except:
                recommend = '-'
            try:
                commend_list = shop.select('.comment-list')[0].text.replace('\n', ' ').strip()
            except:
                commend_list = '-'
            # The last two fields are search flags; this structure is unused for now
            one_step_search_res = [
                shop_id, name, star_point, review_number, mean_price, tag1,
                tag2, addr, recommend, commend_list, image_path, detail_url,
                1, 1
            ]
            search_res.append(one_step_search_res)
            # Only the first hit is needed: bail out
            if only_need_first is True:
                break
            # Parse the detail page
            if self.need_detail == '1':
                try:
                    detail = Detail().get_detail(shop_id)
                    print('\n' + ','.join(map(str, detail)) + '\n')
                    self.saver.save_data([detail], 'detail')
                except:
                    # Mark the detail fetch as failed
                    one_step_search_res[-2] = 0
                    logger.warning('Detail fetch failed, shop id: ' + shop_id)
                    print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
                    if self.jump_wait is False:
                        print('Check the browser, solve the captcha, then enter y to continue or n to skip this check',
                              'http://www.dianping.com/shop/' + str(shop_id))
                        choice = input()
                        if choice == 'y':
                            continue
                        elif choice == 'n':
                            self.jump_wait = True
            else:
                print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
            # Parse the review pages
            if self.need_comment == '1':
                try:
                    review = Review().get_review(shop_id)
                    print('Fetched', len(review), 'reviews for', name)
                    self.saver.save_data(review, 'review')
                except:
                    # Mark the review fetch as failed
                    one_step_search_res[-1] = 0
                    logger.warning('Review fetch failed, shop id: ' + shop_id)
            # Save the data
            self.saver.save_data([one_step_search_res], 'search')
    logger.info('Search finished: ' + key_word)
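
# Hedged sketch of the icon-star decoding above: the second CSS class is
# assumed to carry the score times ten after an underscore (e.g. 'star_45'
# decodes to 4.5); the class name here is illustrative, not confirmed markup.
def _star_from_icon_class(css_class):
    return float(css_class.split('_')[1]) / 10

# e.g. _star_from_icon_class('star_45') -> 4.5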
def get_search_map_file(page_source):
    """
    Fetch the search page's font mapping files
    :param page_source: page source
    :return:
    """
    # Create the temporary cache directory
    create_dir('./tmp')
    # JSON mapping to return
    return_file_map = {}
    # If the font CSS file cannot be found in the page, we are being blocked or the cookie has expired
    try:
        font_base_url = re.findall(' href="(//s3plus.meituan.net/v1/.*?)">', page_source)[0]
    except:
        global_logger.warning('Cookie expired or access restricted; refresh the cookie or solve the Dianping slider captcha')
        sys.exit()
    # global_logger.info('Updating the search page font mapping files')
    font_base_url = 'https:' + font_base_url
    # header = get_header()
    # r = requests.get(font_base_url, headers=header)
    r = requests_util.get_requests(url=font_base_url, need_header=False)
    text = r.text
    woff_urls = re.findall(r',url\("(.*?\.woff"\).*?\{)', text)
    # Raise the logger level: parsing woff files emits noisy unrelated logs
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)
    # Every font class gets the same treatment: extract the woff URL, derive the
    # cached json path from the URL's hash, and download/parse only on a cache
    # miss. The substring checks are deliberately not exclusive: a 'shopNum'
    # rule, for example, also matches the 'num' key.
    font_keys = ['address', 'shopNum', 'tagName', 'reviewTag', 'num',
                 'dishname', 'shopdesc', 'review', 'hours']
    for each in woff_urls:
        for key in font_keys:
            if key not in each:
                continue
            woff_url = 'https:' + re.findall('(//.*?woff)', each)[0]
            # Derive the file name from the URL hash
            file_name = woff_url[-13:-5]
            return_file_map[key] = './tmp/' + file_name + '.json'
            # Skip download and parsing when the cached json already exists
            if os.path.exists('./tmp/' + file_name + '.json'):
                continue
            # Download and parse the font file
            download_woff(woff_url, file_name + '.woff')
            parse_woff(file_name + '.woff')
            parse_xml(file_name + '.xml')
            os.remove('./tmp/' + file_name + '.woff')
            os.remove('./tmp/' + file_name + '.xml')
    # Restore the logger level
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # global_logger.info('Font mapping files ready')
    return return_file_map
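
# Hedged illustration of the file-name convention used above: the woff URL is
# assumed to end in an 8-character hash before '.woff', so url[-13:-5]
# extracts it. The sample URL below is made up.
def _woff_hash(woff_url):
    """Extract the 8-character hash that names the cached font files."""
    return woff_url[-13:-5]

# e.g. _woff_hash('//s3plus.meituan.net/v1/mss_x/font/a1b2c3d4.woff') -> 'a1b2c3d4'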
def get_review_map_file(page_source):
    """
    Fetch the review page's encrypted font files
    :param page_source:
    :return:
    """
    create_dir('./tmp')
    # If the font CSS file cannot be found in the page, we are being blocked or the cookie has expired
    try:
        css_url = 'https:' + re.findall(' href="(//s3plus.meituan.net/v1/.*?)">', page_source)[0]
    except:
        global_logger.warning('Cookie expired or access restricted; refresh the cookie or solve the Dianping slider captcha')
        sys.exit()
    # Download the css file
    r = requests_util.get_requests(css_url, need_header=False)
    with open('./tmp/review_css.css', 'wb') as f:
        f.write(r.content)
    # Parse the css file
    css_role = re.findall(r'.(.*?)\{background:-(.*?)px -(.*?)px;}', r.text, re.S)
    css_loc = []
    for each in css_role:
        # Filter out svg rules, which the regex also matches
        if '[' in each[0]:
            continue
        css_loc.append([each[0], int(float(each[1])), int(float(each[2]))])
    # Parse the svg fonts
    svg_url = re.findall(r'\[class\^="(.*?)"\].*?url\((//s3plus.meituan.net/v1/.*?)\)', r.text, re.S)
    svg_map = {}
    return_svg_name = {}
    for each in svg_url:
        url = 'https:' + each[1]
        r = requests_util.get_requests(url, need_header=False)
        svg_name = each[1][-18:-3] + 'json'
        # Check the cached json file to save parsing time
        if os.path.exists('./tmp/' + svg_name):
            return_svg_name[each[0]] = './tmp/' + svg_name
            continue
        # Font variant: different fonts use different height/weight offsets
        if '#333' in r.text:
            font_height_offset = 23
            font_weight_offset = 0
        elif '#666' in r.text:
            font_height_offset = 15
            font_weight_offset = 0
        else:
            global_logger.warning('Review page fonts changed; try patching the code or contact the author')
            sys.exit()
        # First file format
        re_font_loc = re.findall('<path id="(.*?)" d="M0 (.*?) H600"/>', r.text)
        font_loc = {}
        for i in range(len(re_font_loc)):
            font_loc[int(re_font_loc[i][1])] = i + 1
        font_list = re.findall('>(.*?)</textPath>', r.text)
        # If the first format fails, fall back to the second file format
        if len(font_loc) == 0:
            font_loc = {}
            font_list = []
            font_loc_tmp = re.findall('<text x=".*?" y="(.*?)">(.*?)</text>', r.text)
            for i in range(len(font_loc_tmp)):
                font_loc[int(font_loc_tmp[i][0])] = i + 1
                font_list.append(font_loc_tmp[i][1])
        # Todo: this svg_map layout was needed by an earlier storage structure and is
        #       now redundant, but it is kept for simplicity until a future refactor
        svg_map[each[0]] = [
            font_loc, font_list, font_height_offset, font_weight_offset,
            svg_name, each[0]
        ]
        css_map_result = {}
        css_key = each[0][:3]
        # Resolve each css rule against the svg font
        for each_css in css_loc:
            if each_css[0][:len(each[0])] != each[0]:
                continue
            loc_x, loc_y = each_css[1], each_css[2]
            # Height/weight offsets for this font
            font_height_offset, font_weight_offset = svg_map[css_key][2], svg_map[css_key][3]
            # Compute the glyph position
            loc_x_line = (loc_x + font_weight_offset) // 14
            loc_y_line = svg_map[css_key][0][loc_y + font_height_offset]
            # Look up the glyph
            css_value = svg_map[css_key][1][loc_y_line - 1][loc_x_line]
            css_map_result[each_css[0]] = css_value
        # Save the json file
        with open('./tmp/' + str(svg_map[css_key][4]), 'w', encoding='utf-8') as f:
            json.dump(css_map_result, f, ensure_ascii=False)
        return_svg_name[str(svg_map[css_key][5])] = './tmp/' + str(svg_map[css_key][4])
    return return_svg_name
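
# Hedged sketch of the offset-to-glyph arithmetic above: each glyph is 14px
# wide, so the x offset divided by 14 gives the column, while the y offset
# (shifted by the per-font height offset) selects the line via the font_loc
# table. The sample values below are made up, not real Dianping data.
def _locate_glyph(loc_x, loc_y, font_loc, font_list,
                  font_height_offset, font_weight_offset=0):
    col = (loc_x + font_weight_offset) // 14
    line = font_loc[loc_y + font_height_offset]
    return font_list[line - 1][col]

# e.g. _locate_glyph(28, 0, {23: 1}, ['某店铺推荐'], 23) -> '铺'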
def get_detail(self, shop_id, request_type='proxy, cookie'):
    url = 'http://www.dianping.com/shop/' + str(shop_id)
    r = requests_util.get_requests(url, request_type=request_type)
    if r.status_code == 403:
        print('Check the browser, solve the captcha, replace the cookie, then enter y to lift the block',
              'http://www.dianping.com/shop/' + str(shop_id))
        while input() != 'y':
            import time
            time.sleep(1)
        requests_util.update_cookie()
        r = requests_util.get_requests(url, request_type=request_type)
    text = r.text
    # Fetch the encrypted font files
    file_map = get_search_map_file(text)
    # Replace the encrypted glyphs
    text = requests_util.replace_search_html(text, file_map)
    # Parse the page
    html = BeautifulSoup(text, 'lxml')
    """
    Parse format 1 (mostly restaurants)
    """
    # Basic info
    main_info = html.select('.main')[0]
    shop_name = '-'
    review_count = '-'
    avg_price = '-'
    score = '-'
    address = '-'
    phone = '-'
    other_info = '-'
    try:
        base_info = main_info.select('#basic-info')[0]
        try:
            shop_name = base_info.select('.shop-name')[0].text
            # Strip title suffixes, e.g. "手机扫码 优惠买单"
            remove_a = base_info.select('a')
            for each in remove_a:
                shop_name = shop_name.replace(each.text, '')
            shop_name = shop_name.strip()
        except:
            shop_name = '-'
        try:
            brief_info = main_info.select('.brief-info')[0]
            # Todo: served by a separate json endpoint with js-encrypted parameters, to
            #       be solved later; the field is kept since other parse paths can fill it
            # try:
            #     score = brief_info.select('.star-wrapper')[0].select('.mid-score')[0].text.strip()
            # except:
            #     score = None
            try:
                review_count = brief_info.select('#reviewCount')[0].text.strip()
            except:
                review_count = '-'
            try:
                avg_price = brief_info.select('#avgPriceTitle')[0].text.strip()
            except:
                avg_price = '-'
            # Todo: prefer the info endpoint for this value; the one here can be
            #       inaccurate since the dynamic parameters come back as json
            # try:
            #     comment_score = brief_info.select('#comment_score')[0].text.strip()
            # except:
            #     comment_score = None
            try:
                address = main_info.find(attrs={'itemprop': 'street-address'}).text.strip()
            except:
                address = '-'
            try:
                phone = main_info.select('.tel')[0].text.strip()
            except:
                phone = '-'
            try:
                other_info = main_info.select('.other')[0].text.replace('修改', '').strip()
            except:
                other_info = '-'
        except:
            # Todo: show a manual slider-captcha prompt on the front end
            # self.get_detail(shop_id)
            pass
        # Todo: promotion info (separate endpoint, js-encrypted)
        # try:
        #     sale_info = ''
        #     sales = main_info.select('#sales')
        #     for sale in sales:
        #         for tag in sale.select('.item'):
        #             try:
        #                 title = tag.select('.title')[0].text
        #                 price = tag.select('.price')[0].text
        #                 del_price = tag.select('.del-price')[0].text
        #                 sale_info += title + '\t' + price + '\t' + del_price + '\n'
        #             except:
        #                 continue
        # except:
        #     sales = None
    except:
        # Switch to the other parse format
        pass
    """
    Parse format 2 (mostly hotels)
    """
    # Todo: this format is not encrypted and raises while parsing the font files;
    #       it is rare enough that it is left unhandled for now
    # if shop_name == '-':
    #     # Name could not be parsed; try the other format
    #     try:
    #         base_info = html.select('base-info')[0]
    #         try:
    #             shop_name = base_info.select('.hotel-title')[0].text
    #         except:
    #             shop_name = None
    #         try:
    #             address = base_info.find(attrs={'itemprop': 'address'}).text.strip()
    #         except:
    #             address = None
    #         try:
    #             score = base_info.select('.hotel-scope')[0].select('.score')[0].text
    #         except:
    #             score = None
    #     except:
    #         # Todo: show a manual slider-captcha prompt on the front end
    #         # self.get_detail(shop_id)
    #         pass
    detail_info = {
        '店铺id': shop_id,
        '店铺名': shop_name,
        '评论总数': review_count,
        '人均价格': avg_price,
        '店铺地址': address,
        '店铺电话': phone,
        '其他信息': other_info
    }
    return detail_info
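
# Hedged sketch: the repeated try/select/strip-or-'-' pattern above, as a
# helper. Illustrative only; this module does not actually use it.
def _select_text(node, css, default='-'):
    """Return the stripped text of the first css match under node, else default."""
    try:
        return node.select(css)[0].text.strip()
    except (IndexError, AttributeError):
        return default

# e.g. phone = _select_text(main_info, '.tel')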
def get_basic_review(shop_id):
    """
    Fetch aggregate review info: summary tags, review counts, featured reviews
    and recommended dishes
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/allReview?' \
          'shopId=' + str(shop_id) + \
          '&cityId=19' \
          '&shopType=10' \
          '&tcv=' + str(spider_config.TCV) + \
          '&_token=' + str(get_token(shop_url)) + \
          '&uuid=' + str(spider_config.UUID) + \
          '&platform=1' \
          '&partner=150' \
          '&optimusCode=10' \
          '&originUrl=' + shop_url
    # Retry loop to work around intermittently malformed responses
    while True:
        r = requests_util.get_requests(url, request_type='proxy, no cookie')
        r_text = requests_util.replace_json_text(r.text, get_font_msg())
        try:
            r_json = json.loads(r_text)
            # Break on 406 as well, so the captcha prompt below is reachable
            if r_json['code'] in (200, 406):
                break
        except:
            pass
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        print('Solve the captcha, then press Enter to continue:', verify_page_url)
        input()
        return get_basic_review(shop_id)
    elif r_json['code'] == 200:
        # Review summary tags and their counts
        summaries = []
        for summary in r_json['summarys']:
            summaries.append({
                '描述': summary['summaryString'],
                '个数': summary['summaryCount']
            })
        # Review counts
        all_review_count = r_json['reviewCountAll']
        review_with_pic_count = r_json['reviewCountPic']
        good_review_count = r_json['reviewCountGood']
        mid_review_count = r_json['reviewCountCommon']
        bad_review_count = r_json['reviewCountBad']
        # Featured review details
        reviews = []
        for review in r_json['reviewAllDOList']:
            # Basic review info
            review_info = review['reviewDataVO']
            review_id = review_info['reviewData']['reviewId']
            review_star = review_info['reviewData']['star']
            review_body = BeautifulSoup(review_info['reviewData']['reviewBody'], 'lxml').text
            review_vote_count = review_info['reviewData']['voteCount']
            review_reply_count = review_info['reviewData']['replyCount']
            review_view_count = review_info['reviewData']['viewCount']
            # Liked dishes
            if review_info['reviewData']['extInfoList'] is not None:
                review_like_dish = review_info['reviewData']['extInfoList'][0]['values']
            else:
                review_like_dish = []
            review_avg_price = review_info['reviewData']['avgPrice']
            review_publish_time = review_info['addTimeVO']
            # Merchant reply
            review_merchant_reply = review_info['followNoteString']
            # Review pictures
            if review['picList'] is not None:
                review_pic_list = []
                for each_pic in review['picList']:
                    review_pic_list.append(each_pic['bigPicture'])
            else:
                review_pic_list = []
            # User info
            review_username = review['user']['userNickName']
            user_id = review['user']['userId']
            # each_review = [shop_id, review_id, user_id, review_username, review_star, review_body,
            #                review_vote_count, review_reply_count, review_view_count, review_avg_price,
            #                review_like_dish, review_publish_time, review_merchant_reply, review_pic_list]
            each_review = {
                '店铺id': shop_id,
                '评论id': review_id,
                '用户id': user_id,
                '用户名': review_username,
                '用户打分': review_star,
                '评论内容': review_body,
                '点赞个数': review_vote_count,
                '回复个数': review_reply_count,
                '浏览次数': review_view_count,
                '人均价格': review_avg_price,
                '喜欢的菜': review_like_dish,
                '发布时间': review_publish_time,
                '商家回复': review_merchant_reply,
                '评论图片': review_pic_list,
            }
            reviews.append(each_review)
        # Recommended dishes
        dish_tag_list = r_json['dishTagStrList']
        # return [summaries, all_review_count, good_review_count, mid_review_count,
        #         bad_review_count, review_with_pic_count, reviews, dish_tag_list]
        return {
            '店铺id': shop_id,
            '评论摘要': summaries,
            '评论总数': all_review_count,
            '好评个数': good_review_count,
            '中评个数': mid_review_count,
            '差评个数': bad_review_count,
            '带图评论个数': review_with_pic_count,
            '精选评论': reviews,
            '推荐菜': dish_tag_list,
        }
    else:
        logger.warning('Unexpected JSON response code; consider opening a PR or an issue')
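
# Hedged sketch: the None-tolerant extInfoList handling above as a tiny
# helper (illustrative; the JSON shape assumed is the one consumed above).
def _liked_dishes(ext_info_list):
    """Return the 'values' of the first ext-info entry, or [] when absent."""
    if not ext_info_list:
        return []
    return ext_info_list[0].get('values') or []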
def search(self, search_url, request_type='proxy, cookie'):
    """
    Search
    :param search_url: full search page url, including the page suffix
    :param request_type: request mode passed through to requests_util
    :return:
    """
    r = requests_util.get_requests(search_url, request_type=request_type)
    text = r.text
    # Fetch the encrypted font files
    file_map = get_search_map_file(text)
    # Replace the encrypted glyphs
    text = requests_util.replace_search_html(text, file_map)
    # Parse the page
    html = BeautifulSoup(text, 'lxml')
    shop_all_list = html.select('.shop-list')[0].select('li')
    search_res = []
    for shop in shop_all_list:
        try:
            image_path = shop.select('.pic')[0].select('a')[0].select('img')[0]['src']
        except:
            image_path = '-'
        try:
            shop_id = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['data-shopid']
        except:
            shop_id = '-'
        try:
            detail_url = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['href']
        except:
            detail_url = '-'
        try:
            name = shop.select('.txt')[0].select('.tit')[0].select('a')[0].text.strip()
        except:
            name = '-'
        # Stars come in two styles: some pages show an exact score, others an icon
        # Parse the icon style
        try:
            star_point = shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0] \
                .select('span')[0]['class'][1].split('_')[1]
            star_point = str(float(star_point) / 10)
        except:
            star_point = '-'
        # Parse the exact-score style
        try:
            star_point = shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
            star_point = str(float(star_point))
        except:
            pass
        try:
            review_number = shop.select('.txt')[0].select('.comment')[0].select('.review-num')[0] \
                .text.replace('\n', '')
        except:
            review_number = '-'
        try:
            mean_price = shop.select('.txt')[0].select('.comment')[0].select('.mean-price')[0] \
                .select('b')[0].text
        except:
            mean_price = '¥0'
        try:
            tags = shop.select('.txt')[0].select('.tag-addr')[0].select('.tag')
            tag1 = tags[0].text.replace('\n', ' ').strip()
            tag2 = tags[1].text.replace('\n', ' ').strip()
        except:
            tag1 = '-'
            tag2 = '-'
        try:
            addr = shop.select('.txt')[0].select('.tag-addr')[0].select('.addr')[0] \
                .text.replace('\n', ' ').strip()
        except:
            addr = '-'
        try:
            recommend = shop.select('.recommend')[0].text.replace('\n', ' ').strip()
        except:
            recommend = '-'
        try:
            comment_list = shop.select('.comment-list')[0].text.replace('\n', ' ').strip()
        except:
            comment_list = '-'
        one_step_search_res = {
            '店铺id': shop_id,
            '店铺名': name,
            '评论个数': review_number,
            '人均价格': mean_price,
            '标签1': tag1,
            '标签2': tag2,
            '店铺地址': addr,
            '详情链接': detail_url,
            '图片链接': image_path,
            '详细评分': comment_list,
            '推荐菜': recommend,
            '店铺均分': star_point,
        }
        search_res.append(one_step_search_res)
        # yield one_step_search_res
    return search_res
def get_review(self, shop_id):
    all_pages = -1
    cur_pages = 1
    all_review = []
    while all_pages == -1 or all_pages > 0:
        url = 'http://www.dianping.com/shop/' + str(shop_id) + '/review_all/p' + str(cur_pages)
        # Hitting p1 triggers the captcha, so the first page is special-cased
        if cur_pages == 1:
            url = 'http://www.dianping.com/shop/' + str(shop_id) + '/review_all'
        r = requests_util.get_requests(url, request_type='review')
        if r.status_code == 403:
            logger.warning('Review page request was banned')
            raise Exception
        text = r.text
        # Fetch the encrypted font files
        file_map = get_review_map_file(text)
        # Replace the encrypted glyphs
        text = requests_util.replace_review_html(text, file_map)
        html = BeautifulSoup(text, 'lxml')
        # Determine the page count once
        if all_pages == -1:
            all_pages = min(int(html.select('.reviews-pages')[0].select('a')[-2].text),
                            int(self.pages_needed))
        reviews = html.select('.reviews-items')[0].select('.main-review')
        for review in reviews:
            # single_review = []
            try:
                user_name = review.select('.name')[0].text.strip()
            except:
                user_name = '-'
            try:
                score = review.select('.score')[0].text.replace(' ', '').replace('\n', ' ').strip()
            except:
                score = '-'
            try:
                review_text = review.select('.review-words')[0].text.replace(' ', '') \
                    .replace('收起评价', '').replace('\r', ' ').replace('\n', ' ').strip()
            except:
                review_text = '-'
            try:
                like = review.select('.review-recommend')[0].text.replace(' ', '') \
                    .replace('\r', ' ').replace('\n', ' ').strip()
            except:
                like = '-'
            try:
                time = review.select('.time')[0].text.strip()
            except:
                time = '-'
            try:
                review_id = review.select('.actions')[0].select('a')[0].attrs['data-id']
            except:
                review_id = '-'
            all_review.append([review_id, shop_id, user_name, score, review_text, like, time])
        cur_pages += 1
        all_pages -= 1
    return all_review
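
# Hedged sketch of the page-URL rule above: page 1 uses the bare /review_all
# path (requesting /review_all/p1 tends to trigger the captcha), and later
# pages append /pN. Illustrative helper, not used by this module.
def _review_page_url(shop_id, page):
    base = 'http://www.dianping.com/shop/' + str(shop_id) + '/review_all'
    return base if page == 1 else base + '/p' + str(page)

# e.g. _review_page_url('H2noKWCDigM0H9c1', 3) ends with '/review_all/p3'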
def get_review(self, shop_id, request_type='proxy, cookie'):
    all_pages = -1
    cur_pages = 1
    all_review = []
    while all_pages == -1 or all_pages > 0:
        url = 'http://www.dianping.com/shop/' + str(shop_id) + '/review_all/p' + str(cur_pages)
        # Hitting p1 triggers the captcha, so the first page is special-cased
        if cur_pages == 1:
            url = 'http://www.dianping.com/shop/' + str(shop_id) + '/review_all'
        r = requests_util.get_requests(url, request_type=request_type)
        if r.status_code == 403:
            logger.warning('Review page request was banned')
            raise Exception
        text = r.text
        # Fetch the encrypted font files
        file_map = get_review_map_file(text)
        # Replace the encrypted glyphs
        text = requests_util.replace_review_html(text, file_map)
        html = BeautifulSoup(text, 'lxml')
        # Determine the page count once
        if all_pages == -1:
            all_pages = min(int(html.select('.reviews-pages')[0].select('a')[-2].text),
                            int(self.pages_needed))
        # Things that only need parsing once, such as review counts, also live here
        summaries = []
        for summary in html.select('.content')[0].select('span'):
            tag_string = summary.text.strip().replace('\n', '').split()
            string = tag_string[0]
            count = tag_string[1][1:-1]
            summaries.append({
                '描述': string,
                '个数': count,
            })
        # The various review counts
        review_with_pic_count = html.select('.filter-pic')[0].select('.count')[0].text[1:-1]
        good_review_count = html.select('.filter-good')[0].select('.count')[0].text[1:-1]
        mid_review_count = html.select('.filter-middle')[0].select('.count')[0].text[1:-1]
        bad_review_count = html.select('.filter-bad')[0].select('.count')[0].text[1:-1]
        try:
            all_review_count = int(good_review_count) + int(mid_review_count) + int(bad_review_count)
        except:
            all_review_count = '-'
        reviews = html.select('.reviews-items')[0].select('.main-review')
        for review in reviews:
            try:
                review_username = review.select('.name')[0].text.strip()
            except:
                review_username = '******'
            try:
                user_id = review.select('.name')[0]['href'].split('/')[-1]
            except:
                user_id = '-'
            try:
                review_score_detail = {}
                review_avg_price = ''
                review_score_detail_temp = review.select('.score')[0].text.replace(' ', '') \
                    .replace('\n', ' ').strip().split()
                for each in review_score_detail_temp:
                    if '人均' in each:
                        review_avg_price = each.split(':')[1].replace('元', '')
                    else:
                        temp = each.split(':')
                        review_score_detail[temp[0]] = temp[1]
            except:
                review_score_detail = {}
                review_avg_price = ''
            try:
                review_text = review.select('.review-words')[0].text.replace(' ', '') \
                    .replace('收起评价', '').replace('\r', ' ').replace('\n', ' ').strip()
            except:
                review_text = '-'
            try:
                review_like_dish = review.select('.review-recommend')[0].text.replace(' ', '') \
                    .replace('\r', ' ').replace('\n', ' ').strip()[5:].split()
            except:
                review_like_dish = []
            try:
                review_publish_time = review.select('.time')[0].text.strip()
            except:
                review_publish_time = '-'
            try:
                review_id = review.select('.actions')[0].select('a')[0].attrs['data-id']
            except:
                review_id = '-'
            try:
                review_pic_list = []
                review_pic_list_temp = review.select('.review-pictures')[0].select('a')
                for each in review_pic_list_temp:
                    url = each['href']
                    review_pic_list.append('http://www.dianping.com' + str(url))
            except:
                review_pic_list = []
            try:
                review_merchant_reply = review.select('.shop-reply-content')[0].text.strip()
            except:
                review_merchant_reply = ''
            each_review = {
                '店铺id': shop_id,
                '评论id': review_id,
                '用户id': user_id,
                '用户名': review_username,
                '用户打分': review_score_detail,
                '评论内容': review_text,
                '人均价格': review_avg_price,
                '喜欢的菜': review_like_dish,
                '发布时间': review_publish_time,
                '商家回复': review_merchant_reply,
                '评论图片': review_pic_list,
            }
            all_review.append(each_review)
        cur_pages += 1
        all_pages -= 1
    return_data = {
        '店铺id': shop_id,
        '评论摘要': summaries,
        '评论总数': all_review_count,
        '好评个数': good_review_count,
        '中评个数': mid_review_count,
        '差评个数': bad_review_count,
        '带图评论个数': review_with_pic_count,
        '精选评论': all_review,
    }
    return return_data
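
# Hedged sketch of the per-review score parsing above: tokens are assumed to
# look like '口味:4.5' or '人均:80元'; the '人均' token feeds the average
# price and the rest fill the score dict. The sample string is made up.
def _parse_score_detail(raw):
    detail, avg_price = {}, ''
    for token in raw.split():
        key, _, value = token.partition(':')
        if '人均' in key:
            avg_price = value.replace('元', '')
        else:
            detail[key] = value
    return detail, avg_price

# e.g. _parse_score_detail('口味:4.5 环境:4.0 人均:80元') -> ({'口味': '4.5', '环境': '4.0'}, '80')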