def get_detail(self, shop_id, request_type='proxy, cookie'):
    """Fetch and parse one shop detail page.

    :param shop_id: shop id, appended to the detail-page URL
    :param request_type: forwarded to ``requests_util.get_requests``
    :return: dict with keys 店铺id/店铺名/评论总数/人均价格/店铺地址/店铺电话/其他信息;
             fields that fail to parse fall back to ``'-'``
    """
    import time

    url = 'http://www.dianping.com/shop/' + str(shop_id)
    r = requests_util.get_requests(url, request_type=request_type)
    if r.status_code == 403:
        # Blocked by the anti-crawler: the operator must solve the captcha in
        # a browser and refresh the cookie, then type 'y' to continue.
        print('检查浏览器,处理验证码,替换cookie,输入y解除限制', url)
        while input() != 'y':
            time.sleep(1)
        requests_util.update_cookie()
        r = requests_util.get_requests(url, request_type=request_type)
    text = r.text
    # Fetch the glyph-obfuscation map file and de-obfuscate the HTML.
    file_map = get_search_map_file(text)
    text = requests_util.replace_search_html(text, file_map)
    html = BeautifulSoup(text, 'lxml')

    # --- Layout 1 (mostly restaurants) -----------------------------------
    main_info = html.select('.main')[0]
    shop_name = '-'
    review_count = '-'
    avg_price = '-'
    score = '-'
    address = '-'
    phone = '-'
    other_info = '-'
    try:
        base_info = main_info.select('#basic-info')[0]
        try:
            shop_name = base_info.select('.shop-name')[0].text
            # Strip promotional suffixes from the title, e.g. "手机扫码 优惠买单".
            for each in base_info.select('a'):
                shop_name = shop_name.replace(each.text, '')
            shop_name = shop_name.strip()
        except Exception:
            shop_name = '-'
        try:
            brief_info = main_info.select('.brief-info')[0]
            # TODO: the star score comes from a separate JSON endpoint with
            # JS-encrypted parameters; the field is kept because other parse
            # paths can sometimes fill it.
            try:
                review_count = brief_info.select('#reviewCount')[0].text.strip()
            except Exception:
                review_count = '-'
            try:
                avg_price = brief_info.select('#avgPriceTitle')[0].text.strip()
            except Exception:
                avg_price = '-'
            # TODO: comment_score here may be inaccurate (returned via JSON);
            # prefer the value from the info endpoint.
            try:
                address = main_info.find(attrs={
                    'itemprop': 'street-address'
                }).text.strip()
            except Exception:
                address = '-'
            try:
                phone = main_info.select('.tel')[0].text.strip()
            except Exception:
                phone = '-'
            try:
                other_info = main_info.select('.other')[0].text.replace(
                    '修改', '').strip()
            except Exception:
                other_info = '-'
        except Exception:
            # TODO: surface a manual slider-unlock prompt in the front end.
            pass
        # TODO: promotion info lives behind a separate JS-encrypted endpoint.
    except Exception:
        # Fall through and return whatever defaults survived.
        pass

    # --- Layout 2 (mostly hotels) ----------------------------------------
    # NOTE(review): the hotel layout is not glyph-obfuscated and breaks the
    # map-file decoding step; it is rare enough that parsing it is disabled.

    detail_info = {
        '店铺id': shop_id,
        '店铺名': shop_name,
        '评论总数': review_count,
        '人均价格': avg_price,
        '店铺地址': address,
        '店铺电话': phone,
        '其他信息': other_info
    }
    return detail_info
def search(self, key_word, only_need_first=True, needed_pages=50):
    """Crawl search-result pages for a keyword and persist rows via self.saver.

    :param key_word: search keyword (non-empty string)
    :param only_need_first: stop after the first result of the first page
    :param needed_pages: number of result pages to crawl
    :return: None — rows are written through ``self.saver.save_data``
    """
    # TODO: the bare search listing needs no cookie when detail/review
    # crawling is disabled.
    assert isinstance(key_word, str)
    # Bug fix: the original used `or`, which made the check vacuous for any
    # non-None string and never rejected empty keywords.
    assert key_word is not None and key_word.strip() != ''
    if self.custom_search_url != '':
        key_word = self.custom_search_url
    logger.info('开始搜索:' + key_word)
    for i in tqdm(range(1, needed_pages + 1), desc='页数'):
        # When only the first hit is wanted, stop after page 1.
        if only_need_first is True and i != 1:
            break
        url = 'http://www.dianping.com/search/keyword/' + str(
            self.location_id) + '/' + str(
                self.channel_id) + '_' + str(key_word) + '/p' + str(i)
        if self.custom_search_url != '':
            url = self.custom_search_url + str(i)
        r = requests_util.get_requests(url)
        text = r.text
        # Fetch the glyph-obfuscation map and de-obfuscate the HTML.
        file_map = get_search_map_file(text)
        text = requests_util.replace_search_html(text, file_map)
        html = BeautifulSoup(text, 'lxml')
        shop_all_list = html.select('.shop-list')[0].select('li')
        search_res = []
        for shop in shop_all_list:
            try:
                image_path = shop.select('.pic')[0].select('a')[0].select(
                    'img')[0]['src']
            except Exception:
                image_path = '-'
            try:
                shop_id = shop.select('.txt')[0].select('.tit')[0].select(
                    'a')[0]['data-shopid']
            except Exception:
                shop_id = '-'
            try:
                detail_url = shop.select('.txt')[0].select(
                    '.tit')[0].select('a')[0]['href']
            except Exception:
                detail_url = '-'
            try:
                name = shop.select('.txt')[0].select('.tit')[0].select(
                    'a')[0].text.strip()
            except Exception:
                name = '-'
            # The star rating appears in two forms: an icon class or an
            # explicit numeric score. Try the icon first…
            try:
                star_point = \
                    shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0].select('span')[0]['class'][
                        1].split('_')[1]
                star_point = str(float(star_point) / 10)
            except Exception:
                star_point = '-'
            # …then prefer the explicit score when the page carries one.
            try:
                star_point = \
                    shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
                star_point = str(float(star_point))
            except Exception:
                pass
            try:
                review_number = shop.select('.txt')[0].select(
                    '.comment')[0].select('.review-num')[0].text.replace(
                        '\n', '')
            except Exception:
                review_number = '-'
            try:
                mean_price = shop.select('.txt')[0].select('.comment')[
                    0].select('.mean-price')[0].select('b')[0].text
            except Exception:
                mean_price = '¥0'
            try:
                tags = shop.select('.txt')[0].select(
                    '.tag-addr')[0].select('.tag')
                tag1 = tags[0].text.replace('\n', ' ').strip()
                tag2 = tags[1].text.replace('\n', ' ').strip()
            except Exception:
                tag1 = '-'
                tag2 = '-'
            try:
                addr = shop.select('.txt')[0].select(
                    '.tag-addr')[0].select('.addr')[0].text.replace(
                        '\n', ' ').strip()
            except Exception:
                addr = '-'
            try:
                recommend = shop.select('.recommend')[0].text.replace(
                    '\n', ' ').strip()
            except Exception:
                recommend = '-'
            try:
                commend_list = shop.select(
                    '.comment-list')[0].text.replace('\n', ' ').strip()
            except Exception:
                commend_list = '-'
            # Last two elements are status flags: detail ok / review ok.
            one_step_search_res = [
                shop_id, name, star_point, review_number, mean_price, tag1,
                tag2, addr, recommend, commend_list, image_path, detail_url,
                1, 1
            ]
            search_res.append(one_step_search_res)
            # Only the first row is wanted: bail out before detail/review
            # crawling. NOTE(review): this also skips save_data — confirm
            # that is intended.
            if only_need_first is True:
                break
            # Crawl the detail page for this shop.
            if self.need_detail == '1':
                try:
                    detail = Detail().get_detail(shop_id)
                    print('\n' + ','.join(detail) + '\n')
                    self.saver.save_data([detail], 'detail')
                except Exception:
                    # Mark the detail flag as failed.
                    one_step_search_res[-2] = 0
                    logger.warning('详情信息获取失败,失败id:' + shop_id)
                    # Bug fix: join() over a list containing int flags raised
                    # TypeError — stringify elements first.
                    print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
                    if self.jump_wait is False:
                        print(
                            '检查浏览器,处理验证码,输入y程序继续运行,输入n跳过检查',
                            'http://www.dianping.com/shop/' + str(shop_id))
                        # Bug fix: the original called input() twice, forcing
                        # a second blind prompt — read the answer once.
                        answer = input()
                        if answer == 'y':
                            continue
                        elif answer == 'n':
                            self.jump_wait = True
            else:
                print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
            # Crawl the review pages for this shop.
            if self.need_comment == '1':
                try:
                    review = Review().get_review(shop_id)
                    print('获取', name, '评论', len(review), '条')
                    self.saver.save_data(review, 'review')
                except Exception:
                    # Mark the review flag as failed.
                    one_step_search_res[-1] = 0
                    logger.warning('评论获取失败,失败id:' + shop_id)
            # Persist the search row.
            self.saver.save_data([one_step_search_res], 'search')
    logger.info('解析完成:' + key_word)
def search(self, search_url, request_type='proxy, cookie'):
    """Parse a single search-result page and return the shops on it.

    :param search_url: full URL of the search-result page
    :param request_type: forwarded to ``requests_util.get_requests``
    :return: list of dicts, one per shop; unparsable fields become ``'-'``
             (``'¥0'`` for the mean price)
    """
    r = requests_util.get_requests(search_url, request_type=request_type)
    text = r.text
    # Fetch the glyph-obfuscation map and de-obfuscate the HTML.
    file_map = get_search_map_file(text)
    text = requests_util.replace_search_html(text, file_map)
    html = BeautifulSoup(text, 'lxml')
    shop_all_list = html.select('.shop-list')[0].select('li')
    search_res = []
    for shop in shop_all_list:
        try:
            image_path = shop.select('.pic')[0].select('a')[0].select('img')[0]['src']
        except Exception:
            image_path = '-'
        try:
            shop_id = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['data-shopid']
        except Exception:
            shop_id = '-'
        try:
            detail_url = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['href']
        except Exception:
            detail_url = '-'
        try:
            name = shop.select('.txt')[0].select('.tit')[0].select('a')[0].text.strip()
        except Exception:
            name = '-'
        # The star rating appears in two forms: an icon class or an explicit
        # numeric score. Try the icon first…
        try:
            star_point = \
                shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0].select('span')[0]['class'][
                    1].split('_')[1]
            star_point = str(float(star_point) / 10)
        except Exception:
            star_point = '-'
        # …then prefer the explicit score when the page carries one.
        try:
            star_point = \
                shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
            star_point = str(float(star_point))
        except Exception:
            pass
        try:
            review_number = shop.select('.txt')[0].select('.comment')[0].select(
                '.review-num')[0].text.replace('\n', '')
        except Exception:
            review_number = '-'
        try:
            mean_price = shop.select('.txt')[0].select('.comment')[0].select(
                '.mean-price')[0].select('b')[0].text
        except Exception:
            mean_price = '¥0'
        try:
            tags = shop.select('.txt')[0].select('.tag-addr')[0].select('.tag')
            tag1 = tags[0].text.replace('\n', ' ').strip()
            tag2 = tags[1].text.replace('\n', ' ').strip()
        except Exception:
            tag1 = '-'
            tag2 = '-'
        try:
            addr = shop.select('.txt')[0].select('.tag-addr')[0].select(
                '.addr')[0].text.replace('\n', ' ').strip()
        except Exception:
            addr = '-'
        try:
            recommend = shop.select('.recommend')[0].text.replace('\n', ' ').strip()
        except Exception:
            recommend = '-'
        try:
            comment_list = shop.select('.comment-list')[0].text.replace('\n', ' ').strip()
        except Exception:
            comment_list = '-'
        one_step_search_res = {
            '店铺id': shop_id,
            '店铺名': name,
            '评论个数': review_number,
            '人均价格': mean_price,
            '标签1': tag1,
            '标签2': tag2,
            '店铺地址': addr,
            '详情链接': detail_url,
            '图片链接': image_path,
            '详细评分': comment_list,
            '推荐菜': recommend,
            '店铺均分': star_point,
        }
        search_res.append(one_step_search_res)
    return search_res