def get_font_msg():
    """
    Fetch the encrypted-font mapping. In the normal flow this is produced
    during search and stored in the cache; on a cold start, one detail fetch
    refreshes the cache.
    @return:
    """
    if cache.search_font_map != {}:
        return cache.search_font_map
    else:
        Detail().get_detail('l3BEUN08X4TT52bm', just_need_map=True)
        return cache.search_font_map
def get_font_msg():
    """
    Fetch the encrypted-font mapping. In the normal flow this is produced
    during search and stored in the cache; on a cold start, one detail fetch
    refreshes the cache.
    @return:
    """
    if cache.search_font_map != {}:
        return cache.search_font_map
    else:
        Detail().get_detail_font_mapping('H2noKWCDigM0H9c1')
        return cache.search_font_map
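# A minimal, self-contained sketch of the cache-or-warm pattern that both
# get_font_msg variants above implement: return the cached mapping when it is
# already populated, otherwise run exactly one fetch to warm it. The
# fetch_font_map callable is a hypothetical stand-in for
# Detail().get_detail_font_mapping and is not part of this repo.
_font_map_cache = {}


def get_cached_font_map(fetch_font_map):
    if _font_map_cache:
        return _font_map_cache
    # Cold start: a single fetch populates the shared cache.
    _font_map_cache.update(fetch_font_map())
    return _font_map_cache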
                    required=False, default='', help='custom shop id')
args = parser.parse_args()

if __name__ == '__main__':
    # args.review = 1
    # args.normal = 0
    # args.shop_id = 'l8QDQukrl2tXhzmY'
    if args.normal == 1:
        keyword = global_config.getRaw('detail', 'keyword')
        need_first = global_config.getRaw('detail', 'need_first') == 'True'
        need_pages = int(global_config.getRaw('detail', 'need_pages'))
        s = Search()
        s.search(keyword, need_first, need_pages)
    if args.detail == 1:
        from function.detail import Detail
        shop_id = args.shop_id
        logger.info('Crawling detail for shop id: ' + shop_id)
        d = Detail()
        d.get_detail(shop_id)
    if args.review == 1:
        from function.review import Review
        shop_id = args.shop_id
        logger.info('Crawling reviews for shop id: ' + shop_id)
        r = Review()
        r.get_review(shop_id)
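# Example invocations for the argparse entry point above (assuming this file
# is main.py and that --normal, --detail, --review and --shop_id were all
# defined on `parser` earlier; the parser setup is truncated in this excerpt):
#   python main.py --normal 1
#   python main.py --detail 1 --shop_id l8QDQukrl2tXhzmY
#   python main.py --review 1 --shop_id l8QDQukrl2tXhzmY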
class Controller:
    """
    Controller for the whole program.
    Chooses the crawling strategy and aggregates/stores the data.
    """

    def __init__(self):
        self.s = Search()
        self.d = Detail()
        self.r = Review()
        # Initialize the base search URL
        if spider_config.SEARCH_URL == '':
            keyword = spider_config.KEYWORD
            channel_id = spider_config.CHANNEL_ID
            city_id = spider_config.LOCATION_ID
            self.base_url = 'http://www.dianping.com/search/keyword/' + str(
                city_id) + '/' + str(channel_id) + '_' + str(keyword) + '/p'
        else:
            self.base_url = spider_config.SEARCH_URL

    def main(self):
        """
        Scheduling entry point.
        @return:
        """
        # Todo: undecided between crawling all search pages first and then the
        # details, or interleaving the two. To dilute the frequency of
        # same-type requests, interleaving is used for now.
        # Run the search
        for page in tqdm(range(1, spider_config.NEED_SEARCH_PAGES + 1), desc='search pages'):
            # Build the url
            search_url, request_type = self.get_search_url(page)
            """
            {
                '店铺id': -, '店铺名': -, '评论个数': -, '人均价格': -,
                '标签1': -, '标签2': -, '店铺地址': -, '详情链接': -,
                '图片链接': -, '详细评分': -, '推荐菜': -, '店铺均分': -,
            }
            """
            search_res = self.s.search(search_url, request_type)
            for each_search_res in tqdm(search_res, desc='detail crawl'):
                each_detail_res = {}
                each_review_res = {}
                # Crawl the detail page
                if spider_config.NEED_DETAIL:
                    shop_id = each_search_res['店铺id']
                    if spider_config.NEED_PHONE_DETAIL:
                        """
                        {
                            '店铺id': -, '店铺名': -, '评论总数': -, '人均价格': -,
                            '店铺地址': -, '店铺电话': -, '其他信息': -
                        }
                        """
                        each_detail_res = self.d.get_detail(shop_id)
                        # Format adaptation across crawler versions
                        each_detail_res.update({
                            '店铺总分': '-',
                            '店铺评分': '-',
                        })
                    else:
                        """
                        {
                            '店铺id': -, '店铺名': -, '店铺地址': -, '店铺电话': -,
                            '店铺总分': -, '店铺评分': -, '人均价格': -, '评论总数': -,
                        }
                        """
                        hidden_info = get_basic_hidden_info(shop_id)
                        review_and_star = get_review_and_star(shop_id)
                        each_detail_res.update(hidden_info)
                        each_detail_res.update(review_and_star)
                        # Format adaptation across crawler versions
                        each_detail_res.update({'其他信息': '-'})
                # Crawl the reviews
                if spider_config.NEED_REVIEW:
                    shop_id = each_search_res['店铺id']
                    if spider_config.NEED_REVIEW_DETAIL:
                        """
                        {
                            '店铺id': -, '评论摘要': -, '评论总数': -, '好评个数': -,
                            '中评个数': -, '差评个数': -, '带图评论个数': -, '精选评论': -,
                        }
                        """
                        each_review_res = self.r.get_review(shop_id)
                        each_review_res.update({'推荐菜': '-'})
                    else:
                        """
                        {
                            '店铺id': -, '评论摘要': -, '评论总数': -, '好评个数': -,
                            '中评个数': -, '差评个数': -, '带图评论个数': -, '精选评论': -,
                            '推荐菜': -,
                        }
                        """
                        each_review_res = get_basic_review(shop_id)
                self.saver(each_search_res, each_detail_res, each_review_res)

    def get_review(self, shop_id, detail=False):
        if detail:
            each_review_res = self.r.get_review(shop_id)
        else:
            each_review_res = get_basic_review(shop_id)
        saver.save_data(each_review_res, 'review')

    def get_detail(self, shop_id, detail=False):
        each_detail_res = {}
        if detail:
            each_detail_res = self.d.get_detail(shop_id)
            # Format adaptation across crawler versions
            each_detail_res.update({
                '店铺总分': '-',
                '店铺评分': '-',
            })
        else:
            hidden_info = get_basic_hidden_info(shop_id)
            review_and_star = get_review_and_star(shop_id)
            each_detail_res.update(hidden_info)
            each_detail_res.update(review_and_star)
            # Format adaptation across crawler versions
            each_detail_res.update({'其他信息': '-'})
        saver.save_data(each_detail_res, 'detail')

    def get_search_url(self, cur_page):
        """
        Build the search url for the given page.
        @param cur_page:
        @return:
        """
        if cur_page == 1:
            # Page 1 carries no '/p1' suffix, so strip the trailing '/p'
            # return self.base_url[:-2], 'no proxy, no cookie'
            return self.base_url[:-2], 'proxy, cookie'
        else:
            return self.base_url + str(cur_page), 'proxy, cookie'

    def saver(self, each_search_res, each_detail_res, each_review_res):
        # save search
        saver.save_data(each_search_res, 'search')
        # save detail
        if spider_config.NEED_DETAIL:
            saver.save_data(each_detail_res, 'detail')
        # save review
        if spider_config.NEED_REVIEW:
            saver.save_data(each_review_res, 'review')
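# A minimal driver for the Controller above, assuming spider_config already
# carries the needed settings (KEYWORD, CHANNEL_ID, LOCATION_ID or SEARCH_URL,
# NEED_SEARCH_PAGES, and the NEED_* switches):
if __name__ == '__main__':
    controller = Controller()
    # Full pipeline: search pages, then details/reviews per shop, interleaved.
    controller.main()
    # Or target a single known shop id directly, bypassing search:
    # controller.get_detail('k55CTXmrQdpFgFaf', detail=False)
    # controller.get_review('k55CTXmrQdpFgFaf', detail=False)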
def search(self, key_word, only_need_first=True, needed_pages=50):
    """
    Search.
    :param key_word: keyword to search for
    :param only_need_first: only the first result is needed
    :param needed_pages: how many pages to crawl
    :return:
    """
    # Todo: when only the first search page is needed (no detail pages or
    # reviews), no cookie is required
    assert isinstance(key_word, str)
    assert key_word is not None and key_word.strip() != ''
    if self.custom_search_url != '':
        key_word = self.custom_search_url
    logger.info('Start searching: ' + key_word)
    # header = self.get_header()
    for i in tqdm(range(1, needed_pages + 1), desc='pages'):
        # If only the first result is needed, leave the page loop early
        if only_need_first is True and i != 1:
            break
        url = 'http://www.dianping.com/search/keyword/' + str(
            self.location_id) + '/' + str(
            self.channel_id) + '_' + str(key_word) + '/p' + str(i)
        if self.custom_search_url != '':
            url = self.custom_search_url + str(i)
        r = requests_util.get_requests(url)
        # r = requests.get(url, headers=header)
        text = r.text
        # Fetch the encrypted font file
        file_map = get_search_map_file(text)
        # Replace the encrypted glyphs in the html
        text = requests_util.replace_search_html(text, file_map)
        # Parse the page
        html = BeautifulSoup(text, 'lxml')
        shop_all_list = html.select('.shop-list')[0].select('li')
        search_res = []
        for shop in shop_all_list:
            try:
                image_path = shop.select('.pic')[0].select('a')[0].select('img')[0]['src']
            except Exception:
                image_path = '-'
            try:
                shop_id = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['data-shopid']
            except Exception:
                shop_id = '-'
            try:
                detail_url = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['href']
            except Exception:
                detail_url = '-'
            try:
                name = shop.select('.txt')[0].select('.tit')[0].select('a')[0].text.strip()
            except Exception:
                name = '-'
            # Stars are rendered two ways: some pages show a detailed star
            # score, others only an icon.
            # Parse the icon variant
            try:
                star_point = \
                    shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0].select('span')[0]['class'][
                        1].split('_')[1]
                star_point = str(float(star_point) / 10)
            except Exception:
                star_point = '-'
            # Parse the detailed-score variant (overrides the icon value)
            try:
                star_point = shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
                star_point = str(float(star_point))
            except Exception:
                pass
            try:
                review_number = shop.select('.txt')[0].select('.comment')[0].select(
                    '.review-num')[0].text.replace('\n', '')
            except Exception:
                review_number = '-'
            try:
                mean_price = shop.select('.txt')[0].select('.comment')[0].select(
                    '.mean-price')[0].select('b')[0].text
            except Exception:
                mean_price = '¥0'
            try:
                tags = shop.select('.txt')[0].select('.tag-addr')[0].select('.tag')
                tag1 = tags[0].text.replace('\n', ' ').strip()
                tag2 = tags[1].text.replace('\n', ' ').strip()
            except Exception:
                tag1 = '-'
                tag2 = '-'
            try:
                addr = shop.select('.txt')[0].select('.tag-addr')[0].select(
                    '.addr')[0].text.replace('\n', ' ').strip()
            except Exception:
                addr = '-'
            try:
                recommend = shop.select('.recommend')[0].text.replace('\n', ' ').strip()
            except Exception:
                recommend = '-'
            try:
                commend_list = shop.select('.comment-list')[0].text.replace('\n', ' ').strip()
            except Exception:
                commend_list = '-'
            one_step_search_res = [
                shop_id, name, star_point, review_number, mean_price, tag1,
                tag2, addr, recommend, commend_list, image_path, detail_url,
                1, 1
            ]  # The last two entries are success flags for detail/review
            # This data structure is unused for now
            search_res.append(one_step_search_res)
            # Only the first result is needed, break
            if only_need_first is True:
                break
            # Parse the detail page
            if self.need_detail == '1':
                try:
                    detail = Detail().get_detail(shop_id)
                    print('\n' + ','.join(detail) + '\n')
                    self.saver.save_data([detail], 'detail')
                except Exception:
                    # Mark the detail fetch as failed
                    one_step_search_res[-2] = 0
                    logger.warning('Failed to fetch detail, shop id: ' + shop_id)
                    print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
                    if self.jump_wait is False:
                        print('Check the browser and solve the captcha; '
                              'enter y to continue, n to skip this check',
                              'http://www.dianping.com/shop/' + str(shop_id))
                        answer = input()
                        if answer == 'y':
                            continue
                        elif answer == 'n':
                            self.jump_wait = True
            else:
                print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
            # Parse the review page
            if self.need_comment == '1':
                try:
                    review = Review().get_review(shop_id)
                    print('Fetched', len(review), 'reviews for', name)
                    self.saver.save_data(review, 'review')
                except Exception:
                    # Mark the review fetch as failed
                    one_step_search_res[-1] = 0
                    logger.warning('Failed to fetch reviews, shop id: ' + shop_id)
            # Save the search row
            self.saver.save_data([one_step_search_res], 'search')
    logger.info('Search finished: ' + key_word)
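# The repeated try/except blocks above all follow one pattern: walk a chain of
# CSS selectors and fall back to a placeholder when any hop is missing. A small
# helper like this (illustrative only, not part of the repo) captures that
# pattern once:
def select_text(shop, *selectors, default='-'):
    node = shop
    try:
        for sel in selectors:
            node = node.select(sel)[0]
        return node.text.replace('\n', ' ').strip()
    except (IndexError, AttributeError):
        return default

# e.g. name = select_text(shop, '.txt', '.tit', 'a')
#      addr = select_text(shop, '.txt', '.tag-addr', '.addr')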
# Search().search('一方', only_need_first=False, needed_pages=2)

# debug review font parse
# header = get_header()
# url = 'http://www.dianping.com/shop/i24HGIrTSjD3Tcyy/review_all'
# r = requests.get(url, headers=header)
# get_review_map_file(r.text)

# debug requests utils
# from utils.requests_utils import requests_util
# print(requests_util.parse_stop_time('5,10;20,100'))
# for i in range(1, 7):
#     requests_util.get_requests('http://www.baidu.com')
#     print(i)

# debug detail
from function.detail import Detail

Detail().get_detail('k55CTXmrQdpFgFaf')
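# The repeated get_requests calls above exercise the request-throttling logic
# behind parse_stop_time. Below is a minimal sketch of one plausible reading of
# the '5,10;20,100' rule string -- "every 5th request pause 10s, every 20th
# pause 100s". This reading is an assumption; the real semantics live in
# utils.requests_utils.
import time


def parse_rules(rule_string):
    # '5,10;20,100' -> [(5, 10), (20, 100)]
    return [tuple(int(x) for x in pair.split(',')) for pair in rule_string.split(';')]


def maybe_pause(request_count, rules):
    # Apply at most one pause per request, checking the coarser rule first.
    for every, pause in sorted(rules, reverse=True):
        if request_count % every == 0:
            time.sleep(pause)
            break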