class Engine: def __init__(self): self.crawl = Crawl() self.analysis = Analysis() self.pipe = Pipeline() def _engine_city_link(self): """ 获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中 :return: """ content = self.crawl.crawl_by_get(setting.START_URL, headers=setting.HEADERS, proxies=self._engine_use_proxy()) element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A) city_list = [] for each_element in element_city: city_name = self.analysis.analysis_by_xpath( each_element, setting.XPATH_CITY_NAME) city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL) city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url))) self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST) def _engine_amuse_link(self): """ 获取每个城市中所有的娱乐场所的链接 :return: """ city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST) for each_city in city_list: try: url = each_city.strip().split('\u0001')[1] + '-wanle' name = each_city.strip().split('\u0001')[0] params_city = {'page': 0} maxpage = 200 # 默认最大页数 while True: save_list = [] params_city['page'] += 1 content = self.crawl.crawl_by_get( url, headers=setting.HEADERS, params=params_city, proxies=self._engine_use_proxy(), retry=2, timeout=15) if not content: break # 获取总页数 if params_city['page'] == 1: # 找到最大页数,使用map函数 pagecount = map( lambda x: int(x) if x != '下一页' else -1, self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_NEXTPAGE)) try: maxpage = max(pagecount) except: break element_li = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_LI) if not element_li: break for each_ele in element_li: amuse_name = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_AMUSE_NAME) amuse_type = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_AMUSE_TYPE) amuse_url = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_AMUSE_URL) try: save_info = '{}\u0001{}\u0001{}\u0001{}'.format( name, ''.join(amuse_name), ''.join(amuse_type), ''.join(amuse_url)) except: continue save_list.append(save_info) self.pipe.pipe_txt_save(save_list, filename=setting.FILE_AMUSE_LIST, savetype='a') if params_city['page'] >= maxpage: break time.sleep(0.2) except: continue def _engine_amuse_info(self): """ 获取所有娱乐场所详细数据 :return: """ amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST) for each_amuse in amuse_list: try: # 娱乐场所数据 amuse_info = each_amuse.strip().split('\u0001') city_name = amuse_info[0] amuse_name = amuse_info[1] amuse_type = amuse_info[2] amuse_url = amuse_info[3] find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url) if find_id: amuse_id = find_id.group(1) else: amuse_id = 0 # 获取娱乐场所详细信息 content = self.crawl.crawl_by_get( amuse_url, headers=setting.HEADERS, proxies=self._engine_use_proxy(), retry=5, timeout=10) detail = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_AMUSE_DETAIL) detail['city_name'] = city_name detail['amuse_name'] = amuse_name detail['amuse_type'] = amuse_type detail['amuse_url'] = amuse_url detail['amuse_id'] = amuse_id detail['get_time'] = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') # 存储数据 # 字段顺序 # city_name, amuse_name, amuse_type, amuse_id, # score, ranking, describe, address, tel, open_time, arrive, intro, web, get_time, amuse_url save_data = '{0[city_name]}\u0001{0[amuse_name]}\u0001{0[amuse_type]}\u0001' \ '{0[amuse_id]}\u0001{0[score]}\u0001{0[ranking]}\u0001' \ '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001' \ '{0[open_time]}\u0001{0[arrive]}\u0001{0[intro]}\u0001' \ 
'{0[web]}\u0001{0[get_time]}\u0001{0[amuse_url]}\u0001'.format(detail) self.pipe.pipe_txt_save(save_data, filename=setting.FILE_AMUSE_INFO, savetype='a') # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info') time.sleep(0.1) except Exception as e: print('crawl error', e) continue def _engine_amuse_comments(self): """ 获取所有购物店评论数据 :return: """ amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST) # 每个店铺最新评论时间表 check_dict = self.pipe.pipe_pickle_load( filename=setting.FILE_COMMENTS_CHECK) if not check_dict: check_dict = {} for each_amuse in amuse_list: try: # 店铺数据 city = each_amuse.strip().split('\u0001')[0] amuse = each_amuse.strip().split('\u0001')[1] type = each_amuse.strip().split('\u0001')[2] amuse_url = each_amuse.strip().split('\u0001')[3] find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url) if not find_id: break amuse_id = find_id.group(1) api = setting.COMMENTS_API.format(amuse_id) setting.HEADERS_COMMENTS['Referer'] = amuse_url params = { 'page': 0, 'pageSize': '10', 'poiList': 'true', 'rank': 0, # 全部评论 'sortField': 0 # 按照时间排序 } comments_time = set([]) current_time = check_dict.get(amuse_id, '0') max_page = 1 while True: params['page'] += 1 content = self.crawl.crawl_by_get( api, headers=setting.HEADERS_COMMENTS, proxies=self._engine_use_proxy(), params=params, retry=2, timeout=15) try: content_dict = json.loads(content) except: break if not content_dict.get('data'): break content_comments = content_dict.get('data') # 第一遍抓取要确定评论页数 if params['page'] == 1: page = self.analysis.analysis_by_xpath( content_comments, xpahter=setting.XPATH_COMMENTS_PAGE) if page: max_page = int(''.join(page)) elements_com = self.analysis.analysis_by_xpath( content_comments, xpahter=setting.XPATH_COMMENTS_LI) if not elements_com: break for each_element in elements_com: title = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_TITLE) start = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_START) nick = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_NICK) more = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_MORE) if more: content_more = self.crawl.crawl_by_get( more[0], headers=setting.HEADERS, proxies=self._engine_use_proxy()) content = self.analysis.analysis_by_xpath( content_more, xpahter=setting.XPATH_COMMENTS_DETAIL) else: content = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_CONTENT) date = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_DATE) deal_content = ''.join( list( map( lambda x: x.replace('\n', '').replace( '\r', '').replace('\t', '').replace( ' ', ''), content))) if ''.join(date) > current_time: commetents_info = { 'city': city, 'amuse': amuse, 'amuse_id': amuse_id, 'type': type, 'title': ''.join(title), 'nick': ''.join(nick), 'start': ''.join(start), 'content': deal_content, 'date': ''.join(date), 'get_time': datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), 'url': amuse_url } for eachkey in commetents_info.keys(): commetents_info[eachkey] = commetents_info[ eachkey].replace('\n', '').replace('\r', '') # 存储数据 # 字段顺序 # city, amuse, amuse_id, type, title, nick, start, content, date, get_time, url save_data = '{0[city]}\u0001{0[amuse]}\u0001{0[amuse_id]}\u0001' \ '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \ '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \ '{0[get_time]}\u0001{0[url]}'.format(commetents_info) self.pipe.pipe_txt_save( save_data, 
filename=setting.FILE_AMUSE_COMMENTS, savetype='a') # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments') comments_time.add(''.join(date)) # 超过评论最大页数则切换 if params['page'] >= max_page: break # 当前页面没有新增评论也切换至下一店铺 if not len(comments_time): break # 每个店铺最新的评论时间 if comments_time: check_dict[amuse_id] = max(comments_time) # 抓取到的评论数据 self.pipe.pipe_pickle_save( check_dict, filename=setting.FILE_COMMENTS_CHECK) except: continue def _temp_city_info(self, cityname): """ 做22项数据处理时临时用 :return: """ citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt') city_params = { '国别': '&', '省自治区全称': '&', '省自治区简称': '&', '市州全称': '&', '市州简称': '&', '区县全称': '&', '区县简称': '&', '地区编码': '&', '等级': '&' } spec_city = { '北京': '110000', '天津': '120000', '上海': '310000', '重庆': '500000' } for each in citylist: cityinfo = each.split('\u0001') if cityname in cityinfo: site = cityinfo.index(cityname) if site == 4 or site == 5: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['区县全称'] = cityinfo[4].strip() city_params['区县简称'] = cityinfo[5].strip() city_params['地区编码'] = cityinfo[-1].strip() city_params['等级'] = '区县级' elif site == 2 or site == 3: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00' city_params['等级'] = '地市级' elif cityname in ['北京', '重庆', '上海', '天津']: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityname + '市' city_params['省自治区简称'] = cityname city_params['市州全称'] = cityname + '市' city_params['市州简称'] = cityname city_params['地区编码'] = spec_city[cityname] city_params['等级'] = '直辖' break return city_params @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass } proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def start_engine(self): # self._engine_city_link() # self._engine_amuse_link() # 店铺信息和店铺评论可以同时抓取的,用多进程实现,后期可根据需求添加该功能,目前未开发循环抓取功能 # self._engine_amuse_info() self._engine_amuse_comments()
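# A minimal sketch of the Pipeline helper used by the Engine class above (and
# the ones that follow). The real pipe_* implementations are not shown in this
# section, so the DATA/ directory layout and the method signatures below are
# assumptions inferred from the call sites (pipe_txt_save / pipe_txt_load with
# a savetype flag, pipe_remove_file, and pickle-based load/save for the
# comment checkpoints).
import os
import pickle


class Pipeline:
    data_dir = 'DATA'

    def __init__(self):
        os.makedirs(self.data_dir, exist_ok=True)

    def _path(self, filename):
        return os.path.join(self.data_dir, filename)

    def pipe_txt_save(self, data, filename, savetype='w'):
        """Write one record per line; records are '\\u0001'-delimited fields."""
        lines = [data] if isinstance(data, str) else list(data)
        with open(self._path(filename), savetype, encoding='utf-8') as f:
            for line in lines:
                f.write(line.rstrip('\n') + '\n')

    def pipe_txt_load(self, filename, loadtype='r'):
        """Return all lines of a record file, or [] if it does not exist yet."""
        if not os.path.exists(self._path(filename)):
            return []
        with open(self._path(filename), loadtype, encoding='utf-8') as f:
            return f.readlines()

    def pipe_remove_file(self, filename):
        if os.path.exists(self._path(filename)):
            os.remove(self._path(filename))

    def pipe_pickle_save(self, obj, filename):
        with open(self._path(filename), 'wb') as f:
            pickle.dump(obj, f)

    def pipe_pickle_load(self, filename):
        if not os.path.exists(self._path(filename)):
            return None
        with open(self._path(filename), 'rb') as f:
            return pickle.load(f)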
class Engine: def __init__(self): self.crawl = Crawl() self.analysis = Analysis() self.pipe = Pipeline() def _engine_city_link(self): """ 获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中 :return: """ content = self.crawl.crawl_by_get(setting.START_URL, headers=setting.HEADERS, proxies=self._engine_use_proxy()) element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A) city_list = [] for each_element in element_city: city_name = self.analysis.analysis_by_xpath( each_element, setting.XPATH_CITY_NAME) city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL) city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url))) self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST) def _engine_tactic_link(self): """ 获取每个城市中所有的攻略的链接 :return: """ city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST) tactic_check = self.pipe.pipe_pickle_load( filename=setting.FILE_TACTIC_CHECK) if not tactic_check: tactic_check = set([]) for each_city in city_list: """ http://travel.qunar.com/travelbook/list/22-城市拼音-城市id/ hot(hot为热门游记,elite为精华游记,start为行程计划)_ctime(ctime为按最新发表排序,heat为热度排序)/页码.htm """ try: url = each_city.strip().split('\u0001')[1] name = each_city.strip().split('\u0001')[0] pattern = re.compile(r'p-cs(\d+)-(\w+)') city_pname = re.search(pattern, url).group(2) city_id = re.search(pattern, url).group(1) # 拼接攻略所在url(1.城市拼音名称:city_pname, 2.城市id:city_id, 3.分类) tactic_type = ['hot', 'elite', 'start'] # 攻略分类,目前脚本先抓取hot类 tactic_url = setting.TACTIC_URL.format(city_pname, city_id, tactic_type[0]) current_page = 0 maxpage = 200 # 默认最大页数 while True: save_list = [] current_page += 1 content = self.crawl.crawl_by_get( tactic_url + '{}.htm'.format(current_page), headers=setting.HEADERS, retry=2, timeout=15, proxies=self._engine_use_proxy()) if not content: break # 获取总页数 if current_page == 1: # 找到最大页数,使用map函数 pagecount = map( lambda x: int(x) if x != '下一页>' else -1, self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_NEXTPAGE)) try: maxpage = max(pagecount) except: break tactic_ids = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_ID) for each_id in tactic_ids: each_url = 'http://travel.qunar.com/youji/{}'.format( each_id) save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format( name, city_pname, city_id, each_id, each_url) if each_id not in tactic_check: save_list.append(save_info) tactic_check.add(each_id) if save_list: self.pipe.pipe_txt_save( save_list, filename=setting.FILE_TACTIC_LIST, savetype='a') if current_page >= maxpage: break time.sleep(0.2) except: continue def _engine_tactic_info(self): """ 获取所有攻略详细数据 :return: """ tactic_list = self.pipe.pipe_txt_load( filename=setting.FILE_TACTIC_LIST) for each_tactic in tactic_list: try: # 攻略数据 tactic_info = each_tactic.strip().split('\u0001') city_name = tactic_info[0] city_pname = tactic_info[1] city_id = tactic_info[2] tactic_id = tactic_info[3] tactic_url = tactic_info[4] # 获取娱乐场所详细信息 content = self.crawl.crawl_by_get( tactic_url, headers=setting.HEADERS, proxies=self._engine_use_proxy(), retry=3, timeout=15) detail = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_TACTIC_DETAIL) detail['city_name'] = city_name detail['city_pname'] = city_pname detail['city_id'] = city_id detail['tactic_id'] = tactic_id detail['tactic_url'] = tactic_url detail['get_time'] = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') # 存储数据 # 字段顺序 # city_name, city_pname, city_id, # tactic_id,title,author, # create_date,start_date,days, # avgs_price,person,play_type, # 
content,get_time, tactic_url save_data = '{0[city_name]}\u0001{0[city_pname]}\u0001{0[city_id]}\u0001' \ '{0[tactic_id]}\u0001{0[title]}\u0001{0[author]}\u0001' \ '{0[create_date]}\u0001{0[start_date]}\u0001{0[days]}\u0001' \ '{0[avgs_price]}\u0001{0[person]}\u0001{0[play_type]}\u0001' \ '{0[content]}\u0001{0[get_time]}\u0001{0[tactic_url]}\u0001'.format(detail) self.pipe.pipe_txt_save(save_data, filename=setting.FILE_TACTIC_INFO, savetype='a') # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info') time.sleep(0.1) except Exception as e: print('crawl error', e) continue def _engine_tactic_comments(self): """ 获取所有攻略评论数据 :return: """ tactic_list = self.pipe.pipe_txt_load( filename=setting.FILE_TACTIC_LIST) # 每个店铺最新评论时间表 for each_tactic in tactic_list: try: # 店铺数据 each_info = each_tactic.strip().split('\u0001') city_name = each_info[0] city_pname = each_info[1] city_id = each_info[2] tactic_id = each_info[3] tactic_url = each_info[4] setting.HEADERS_COMMENTS['Referer'] = tactic_url params = { 'bookId': tactic_id, # 攻略id 'csrfToken': 'o7mGNaK63wbEaYFJTnDue14WX7sPlyXB', # 暂时固定token 'page': 0, # 页码 'pageSize': 30, # 每页数量 } while True: params['page'] += 1 content = self.crawl.crawl_by_get( setting.COMMENTS_API, headers=setting.HEADERS_COMMENTS, proxies=self._engine_use_proxy(), params=params, retry=2, timeout=15) try: content_dict = json.loads(content) except: break if not content_dict.get('data', {}).get('html'): break content_comments = content_dict.get('data', {}).get('html') # 第一遍抓取要确定评论页数 elements_com = self.analysis.analysis_by_xpath( content_comments, xpahter=setting.XPATH_COMMENTS_LI) if not elements_com: break for each_element in elements_com: ask_content = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ASK_CONTENT) answer_content = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ANSWER_CONTENT) ask_date = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ASK_DATE) answer_date = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_ANSWER_DATE) commetents_info = { 'city_name': city_name, 'city_id': city_id, 'tactic_id': tactic_id, 'ask_content': ask_content, 'answer_content': answer_content, 'ask_date': ask_date, 'answer_date': answer_date, 'get_time': datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), 'tactic_url': tactic_url } for eachkey in commetents_info.keys(): if isinstance(commetents_info[eachkey], str): commetents_info[eachkey] = commetents_info[eachkey]\ .replace('\n', '').replace('\r', '').replace('\xa0', '') elif isinstance(commetents_info[eachkey], list): commetents_info[eachkey] = ''.join(commetents_info[eachkey])\ .replace('\n', '').replace('\r', '') # 存储数据 # 字段顺序 # city_name, city_id, tactic_id, # ask_content, answer_content, ask_date, # answer_date, get_time, tactic_url, save_data = '{0[city_name]}\u0001{0[city_id]}\u0001{0[tactic_id]}\u0001' \ '{0[ask_content]}\u0001{0[answer_content]}\u0001{0[ask_date]}\u0001' \ '{0[answer_date]}\u0001{0[get_time]}\u0001' \ '{0[tactic_url]}\u0001'.format(commetents_info) self.pipe.pipe_txt_save( save_data, filename=setting.FILE_TACTIC_COMMENTS, savetype='a') # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments') except: continue def _temp_city_info(self, cityname): """ 做22项数据处理时临时用 :return: """ citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt') city_params = { '国别': '&', '省自治区全称': '&', '省自治区简称': '&', '市州全称': '&', '市州简称': '&', '区县全称': 
'&', '区县简称': '&', '地区编码': '&', '等级': '&' } spec_city = { '北京': '110000', '天津': '120000', '上海': '310000', '重庆': '500000' } for each in citylist: cityinfo = each.split('\u0001') if cityname in cityinfo: site = cityinfo.index(cityname) if site == 4 or site == 5: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['区县全称'] = cityinfo[4].strip() city_params['区县简称'] = cityinfo[5].strip() city_params['地区编码'] = cityinfo[-1].strip() city_params['等级'] = '区县级' elif site == 2 or site == 3: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityinfo[0].strip() city_params['省自治区简称'] = cityinfo[1].strip() city_params['市州全称'] = cityinfo[2].strip() city_params['市州简称'] = cityinfo[3].strip() city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00' city_params['等级'] = '地市级' elif cityname in ['北京', '重庆', '上海', '天津']: city_params['国别'] = 'CN' city_params['省自治区全称'] = cityname + '市' city_params['省自治区简称'] = cityname city_params['市州全称'] = cityname + '市' city_params['市州简称'] = cityname city_params['地区编码'] = spec_city[cityname] city_params['等级'] = '直辖' break return city_params @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass } proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def start_engine(self): self._engine_city_link() # 本版块循环策略为循环抓取攻略,然后评论每次抓取一次攻略列表之后,抓取一遍所有攻略所有评论,并入存入新的文本 self._engine_tactic_link() self._engine_tactic_info() self._engine_tactic_comments()
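# A minimal sketch of the Crawl.crawl_by_get helper assumed by the Engine
# classes here. The real Crawl class is not part of this section; this version
# is built on requests and only mirrors what the call sites need (headers,
# params, proxies, a retry count and a timeout), returning '' on failure so
# callers can simply test `if not content`.
import time

import requests


class Crawl:
    def crawl_by_get(self, url, headers=None, params=None, proxies=None,
                     retry=1, timeout=15, **kwargs):
        """GET a page with simple retry/backoff; return '' if every try fails."""
        for attempt in range(max(retry, 1)):
            try:
                resp = requests.get(url, headers=headers, params=params,
                                    proxies=proxies, timeout=timeout, **kwargs)
                if resp.status_code == 200:
                    resp.encoding = resp.apparent_encoding
                    return resp.text
            except requests.RequestException:
                pass
            time.sleep(1 + attempt)  # back off a little more on each retry
        return ''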
class Engine: def __init__(self): self.crawl = Crawl() self.pipe = Pipeline() self.analysis = Analysis() # def _engine_residential_area_by_json(self): # """ # 获取小区数据,output为json, # 但是高德那边返回的json数据小区更位置对应不上,只能使用xml数据,故不用该模块,使用xml # """ # citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID) # types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID) # current_params = deepcopy(setting.PARAMS) # current_params['key'] = setting.KEY # # 每种类型 # for each_type in types: # typeinfo = each_type.strip().split('\u0001') # type_id = typeinfo[0] # 类型id # type_large = typeinfo[1] # 类型大分类 # type_middle = typeinfo[2] # 类型中分类 # type_small = typeinfo[3] # 类型小分类 # current_params['types'] = type_id # save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small) # # 每个城市 # for each_city in citys: # cityinfo = each_city.strip().split('\u0001') # province = cityinfo[0] # 省名 # city_name = cityinfo[1] # 城市名 # city_id = cityinfo[2] # 城市id # current_params['city'] = city_id # current_params['page'] = 0 # save_data = [] # while True: # current_params['page'] += 1 # content_json = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params, # retry=2, timeout=30) # try: # data_json = json.loads(content_json) # except: # continue # pois_list = data_json.get('pois') # if not pois_list: # break # for each_poi in pois_list: # """ # 字段说明: # id: 唯一ID, name: 名称, pcode: poi所在省份编码, pname: poi所在省份名称,citycode: 城市编码, # cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址, alias: 别名, # biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目, # distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度, # exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据, # indoor_map: 是否有室内地图标志, location: 经纬度, navi_poiid: 地图编号, photos: 照片相关信息, # postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码, # website: 该POI的网址 # """ # save_dict = {} # save_dict['id'] = each_poi.get('id', '') # id: 唯一ID # save_dict['name'] = each_poi.get('name', '') # name: 名称 # save_dict['pcode'] = each_poi.get('pcode', '') # pcode: poi所在省份编码 # save_dict['pname'] = each_poi.get('pname', '') # pname: poi所在省份名称 # save_dict['citycode'] = each_poi.get('citycode', '') # citycode: 城市编码 # save_dict['cityname'] = each_poi.get('cityname', '') # cityname: 城市名 # save_dict['adcode'] = each_poi.get('adcode', '') # adcode: 区域编码 # save_dict['adname'] = each_poi.get('adname', '') # adname: 区域名称 # save_dict['address'] = each_poi.get('address', '') # address: 地址 # save_dict['alias'] = each_poi.get('alias', '') # alias: 别名 # save_dict['biz_ext'] = each_poi.get('biz_ext', '') # biz_ext: 深度信息 # save_dict['biz_type'] = each_poi.get('biz_type', '') # biz_type: 行业类型 # save_dict['business_area'] = each_poi.get('business_area', '') # business_area: 所在商圈 # save_dict['discount_num'] = each_poi.get('discount_num', '') # discount_num: 优惠信息数目 # save_dict['email'] = each_poi.get('email', '') # email: 该POI的电子邮箱 # save_dict['entr_location'] = each_poi.get('entr_location', '') # entr_location: 入口经纬度 # save_dict['exit_location'] = each_poi.get('exit_location', '') # exit_location: 出口经纬度 # save_dict['gridcode'] = each_poi.get('gridcode', '') # gridcode: 地理格ID # save_dict['groupbuy_num'] = each_poi.get('groupbuy_num', '') # groupbuy_num: 团购数据 # save_dict['indoor_data'] = each_poi.get('indoor_data', '') # indoor_data: 室内地图相关数据 # save_dict['indoor_map'] = each_poi.get('indoor_map', '') # indoor_map: 是否有室内地图标志 # save_dict['location'] = each_poi.get('location', '') # location: 经纬度 # 
save_dict['navi_poiid'] = each_poi.get('navi_poiid', '') # navi_poiid: 地图编号 # photos = each_poi.get('photos', []) # photos: 照片相关信息 # save_dict['photo_info'] = '' # for each_photo in photos: # if isinstance(each_photo.get('title', {}), dict): # each_photo['title'] = 'notitle' # save_dict['photo_info'] += '{0[title]}-{0[url]},'.format(each_photo) # save_dict['postcode'] = each_poi.get('postcode', '') # postcode: 邮编 # save_dict['tag'] = each_poi.get('tag', '') # tag: 该POI的特色内容 # save_dict['tel'] = each_poi.get('tel', '') # tel: 该POI的电话 # save_dict['type'] = each_poi.get('type', '') # type: 兴趣点类型 # save_dict['typecode'] = each_poi.get('typecode', '') # typecode: 兴趣点类型编码 # save_dict['website'] = each_poi.get('website', '') # website: 该POI的网址 # for each_key in save_dict.keys(): # save_dict[each_key] = \ # save_dict[each_key] if not isinstance(save_dict[each_key], dict) else '' # # 存储字段类型 # # id, name, pcode, pname, citycode, cityname, adcode, adname, # # address, alias, biz_type, business_area, discount_num, email, # # entr_location, exit_location, gridcode, groupbuy_num, indoor_data, # # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website, # save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \ # '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \ # '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \ # '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \ # '{0[exit_location]}\u0001' \ # '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \ # '{0[indoor_map]}\u0001' \ # '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \ # '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \ # '{0[website]}'.format(save_dict) # save_data.append(save_info) # time.sleep(0.1) # self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a') def _engine_residential_area(self): """获取小区数据""" citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID) types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID) current_params = deepcopy(setting.PARAMS) current_params['key'] = setting.KEY # 每种类型 for each_type in types: typeinfo = each_type.strip().split('\u0001') type_id = typeinfo[0] # 类型id type_large = typeinfo[1] # 类型大分类 type_middle = typeinfo[2] # 类型中分类 type_small = typeinfo[3] # 类型小分类 current_params['types'] = type_id save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small) # 每个城市 for each_city in citys: cityinfo = each_city.strip().split('\u0001') province = cityinfo[0] # 省名 city_name = cityinfo[1] # 城市名 city_id = cityinfo[2] # 城市id current_params['city'] = city_id current_params['page'] = 0 save_data = [] while True: current_params['page'] += 1 content = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params, retry=2, timeout=30) try: con = re.search(re.compile(r'<response>(.*?)</response>', re.S), content).group(1) pois_list = self.analysis.analysis_by_xpath(con, xpahter=setting.XPATH_POIS) except: continue if not pois_list: break for each_poi in pois_list: """ 字段说明: id: 唯一ID, name: 名称, pcode: poi所在省份编码, pname: poi所在省份名称,citycode: 城市编码, cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址, alias: 别名, biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目, distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度, exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据, indoor_map: 是否有室内地图标志, location: 经纬度, 
navi_poiid: 地图编号, photos: 照片相关信息, postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码, website: 该POI的网址 """ save_dict = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_DETAIL) photos = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_PHOTOS) photo_info = '' for each_photo in photos: photo_dict = self.analysis.analysis_by_xpath(each_photo, xpahter=setting.XPATH_PHOTO_DETAIL) photo_dict['title'] = photo_dict['title'] if photo_dict['title'] else 'no_title' photo_info += '{0[title]}-{0[url]},'.format(photo_dict) save_dict['photo_info'] = photo_info # 存储字段类型 # id, name, pcode, pname, citycode, cityname, adcode, adname, # address, alias, biz_type, business_area, discount_num, email, # entr_location, exit_location, gridcode, groupbuy_num, indoor_data, # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website, save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \ '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \ '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \ '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \ '{0[exit_location]}\u0001' \ '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \ '{0[indoor_map]}\u0001' \ '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \ '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \ '{0[website]}'.format(save_dict) save_data.append(save_info) time.sleep(5) self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a') @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {"host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass} proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def run_engine(self): self._engine_residential_area()
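# A sketch of the Analysis.analysis_by_xpath helper assumed by these Engine
# classes (the real implementation is not shown). Judging from the call sites
# it accepts either an HTML/XML string or an already-parsed lxml element,
# keeps the keyword name `xpahter` exactly as spelled by the callers, and when
# given a dict of xpaths (e.g. setting.XPATH_DETAIL above) returns a dict of
# joined text values. The dict behaviour in particular is an inference, and it
# assumes each mapped xpath selects text() or attribute values.
from lxml import etree


class Analysis:
    def analysis_by_xpath(self, content, xpahter):
        """Run one xpath, or a dict of xpaths, against a string or element."""
        if isinstance(content, (str, bytes)):
            element = etree.HTML(content) if content else None
        else:
            element = content  # already an element from a previous call
        if element is None:
            return {} if isinstance(xpahter, dict) else []
        if isinstance(xpahter, dict):
            # Field-name -> xpath mapping, joined into one string per field.
            return {key: ''.join(element.xpath(xp)).strip()
                    for key, xp in xpahter.items()}
        return element.xpath(xpahter)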
class Engine: def __init__(self): self.crawl = Crawl() self.analysis = Analysis() self.pipe = Pipeline() def _engine_city_link(self): """ 获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中 :return: """ content = self.crawl.crawl_by_get(setting.START_URL, headers=setting.HEADERS, proxies=self._engine_use_proxy()) element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A) city_list = [] for each_element in element_city: city_name = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_NAME) city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL) city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url))) self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST) def _engine_surround_link(self): """ 获取每个城市中所有的周边游玩地点的链接 :return: """ city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST) for each_city in city_list: url = each_city.strip().split('\u0001')[1] + '-zhoubian' name = each_city.strip().split('\u0001')[0] page = 1 maxpage = 200 # 默认最大页数 while True: try: next_url = url + '-2-1-{}'.format(page) save_list = [] # 获取总页数 content = self.crawl.crawl_by_get(next_url, headers=setting.HEADERS, proxies=self._engine_use_proxy(), retry=3, timeout=15) # 找到最大页数,使用map函数 if page == 1: pagecount = map(lambda x: int(x) if x != '下一页' else -1, self.analysis.analysis_by_xpath(content, xpahter=setting.XPATH_NEXTPAGE)) if pagecount: maxpage = max(pagecount) element_li = self.analysis.analysis_by_xpath(content, xpahter=setting.XPATH_DIV) if not element_li: break for each_ele in element_li: surround_name = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_SURROUND_NAME) surround_type = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_SURROUND_TYPE) surround_url = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_SURROUND_URL) try: save_info = '{}\u0001{}\u0001{}\u0001{}'.format(name, ''.join(surround_name), '-'.join(surround_type), ''.join(surround_url)) except: continue save_list.append(save_info) self.pipe.pipe_txt_save(save_list, filename=setting.FILE_SURROUND_LIST, savetype='a') if page >= maxpage: break page += 1 time.sleep(0.2) except: break def _engine_surround_info(self): """ 获取所有周边游场所详细数据 :return: """ surround_list = self.pipe.pipe_txt_load(filename=setting.FILE_SURROUND_LIST) for each_res in surround_list: try: # 景区数据 surround_info = each_res.strip().split('\u0001') city_name = surround_info[0] surround_name = surround_info[1] surround_url = surround_info[3] surround_type = surround_info[2] find_id = re.search(re.compile(r'p-oi(\d+)-'), surround_url) if find_id: surround_id = find_id.group(1) else: surround_id = 0 # 获取店铺详细信息 content = self.crawl.crawl_by_get(surround_url, headers=setting.HEADERS, proxies=self._engine_use_proxy(), retry=5, timeout=15) detail = self.analysis.analysis_by_xpath(content, xpahter=setting.XPATH_SURROUND_DETAIL) detail['city_name'] = city_name detail['surround_name'] = surround_name detail['surround_url'] = surround_url detail['surround_id'] = surround_id detail['surround_type'] = surround_type detail['get_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # 字段顺序:city_name, surround_name, surround_id, surround_type # score, ranking, describe, address, tel, web, time, open_time, arrive, # ticket, travel_time, tip, surround_url, get_time save_data = '{0[city_name]}\u0001{0[surround_name]}\u0001{0[surround_id]}\u0001{0[surround_type]}\u0001' \ '{0[score]}\u0001{0[ranking]}\u0001{0[describe]}\u0001' \ 
'{0[address]}\u0001{0[tel]}\u0001{0[web]}\u0001' \ '{0[time]}\u0001{0[open_time]}\u0001{0[arrive]}\u0001' \ '{0[ticket]}\u0001{0[travel_time]}\u0001{0[tip]}\u0001' \ '{0[surround_url]}\u0001{0[get_time]}'.format(detail) self.pipe.pipe_txt_save(save_data, filename=setting.FILE_SURROUND_INFO, savetype='a') # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_scenic_info') time.sleep(0.2) except: continue def _engine_surround_comments(self): """ 获取所有景区评论数据 :return: """ scen_list = self.pipe.pipe_txt_load(filename=setting.FILE_SURROUND_LIST) # 每个景区最新评论时间表 check_dict = self.pipe.pipe_pickle_load(filename=setting.FILE_COMMENTS_CHECK) if not check_dict: check_dict = {} for each_res in scen_list: try: # 景区数据 city = each_res.strip().split('\u0001')[0] surround = each_res.strip().split('\u0001')[1] surround_type = each_res.strip().split('\u0001')[2] surround_url = each_res.strip().split('\u0001')[3] find_id = re.search(re.compile(r'p-oi(\d+)-'), surround_url) if find_id: surround_id = find_id.group(1) else: continue api = setting.COMMENTS_API.format(surround_id) setting.HEADERS_COMMENTS['Referer'] = surround_url params = { 'page': 0, 'pageSize': '10', 'poiList': 'true', 'rank': 0, # 全部评论 'sortField': 0 # 按照时间排序 } comments_time = set([]) current_time = check_dict.get(surround_id, '0') max_page = 1 while True: params['page'] += 1 content = self.crawl.crawl_by_get(api, headers=setting.HEADERS_COMMENTS, proxies=self._engine_use_proxy(), params=params, retry=3, timeout=15) try: content_dict = json.loads(content) except: break if not content_dict.get('data'): break content_comments = content_dict.get('data') # 第一遍抓取要确定评论页数 if params['page'] == 1: page = self.analysis.analysis_by_xpath(content_comments, xpahter=setting.XPATH_COMMENTS_PAGE) if page: max_page = int(''.join(page)) elements_com = self.analysis.analysis_by_xpath(content_comments, xpahter=setting.XPATH_COMMENTS_LI) if not elements_com: break for each_element in elements_com: title = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_TITLE) start = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_START) nick = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_NICK) more = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_MORE) if more: content_more = self.crawl.crawl_by_get(more[0], headers=setting.HEADERS, proxies=self._engine_use_proxy(), retry=2, timeout=15) content = self.analysis.analysis_by_xpath(content_more, xpahter=setting.XPATH_COMMENTS_DETAIL) else: content = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_CONTENT) date = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_DATE) deal_content = ''.join( list(map(lambda x: x.replace('\n', '').replace('\r', '').replace('\t', ''). 
replace(' ', ''), content))) if ''.join(date) > current_time: commetents_info = { 'city': city, 'surround': surround, 'surround_id': surround_id, 'title': ''.join(title), 'nick': ''.join(nick), 'start': ''.join(start), 'content': deal_content, 'date': ''.join(date), 'get_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'url': surround_url } comments_time.add(''.join(date)) for eachkey in commetents_info.keys(): commetents_info[eachkey] = commetents_info[eachkey].replace('\n', '').replace('\r', '') # 存储数据 # 字段顺序 # city, surround, surround_id, title, nick, start, content, date, get_time, url save_data = '{0[city]}\u0001{0[surround]}\u0001{0[surround_id]}\u0001' \ '{0[title]}\u0001{0[nick]}\u0001{0[start]}\u0001' \ '{0[content]}\u0001{0[date]}\u0001{0[get_time]}\u0001' \ '{0[url]}\u0001'.format(commetents_info) self.pipe.pipe_txt_save(save_data, filename=setting.FILE_SURROUND_COMMENTS, savetype='a') # self.pipe.pipe_mongo_save(save_list, dbname='db_qunaer', colname='col_scenic_comments') if params['page'] >= max_page: break # 当前页面没有新增评论也切换至下一店铺 if not len(comments_time): break if comments_time: check_dict[surround_id] = max(comments_time) # 抓取到的评论数据 self.pipe.pipe_pickle_save(check_dict, filename=setting.FILE_COMMENTS_CHECK) except: continue # 每个店铺最新的评论时间 @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {"host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass} proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def start_engine(self): self._engine_city_link() self._engine_surround_link() self._engine_surround_info() self._engine_surround_comments()
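# A condensed, standalone sketch of the incremental-comment checkpoint used by
# the *_comments methods above: the newest review date per POI is kept in a
# dict, persisted with pickle, and new reviews are detected by comparing
# 'YYYY-MM-DD' date strings lexicographically. The function and file names
# here are illustrative, not part of the original code.
import pickle


def load_checkpoint(path='DATA/file_comments_check.pkl'):
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError):
        return {}


def filter_new_comments(poi_id, comments, check_dict):
    """Keep only comments dated after the stored checkpoint for this POI."""
    last_seen = check_dict.get(poi_id, '0')
    fresh = [c for c in comments if c['date'] > last_seen]
    if fresh:
        # Advance the checkpoint to the newest date seen in this batch.
        check_dict[poi_id] = max(c['date'] for c in fresh)
    return fresh


def save_checkpoint(check_dict, path='DATA/file_comments_check.pkl'):
    with open(path, 'wb') as f:
        pickle.dump(check_dict, f)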
class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_scenic_link(self):
        """
        获取每个城市中所有的热门景点的链接
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        for each_city in city_list:
            url = each_city.strip().split('\u0001')[1] + '-jingdian'
            city_name = each_city.strip().split('\u0001')[0]
            content = self.crawl.crawl_by_get(url, headers=setting.HEADERS,
                                              proxies=self._engine_use_proxy(),
                                              retry=2, timeout=15)
            element_a = self.analysis.analysis_by_xpath(content, xpahter=setting.XPATH_HOT_A)
            save_list = []
            for each_ele in element_a:
                scenic_full_name = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_HOT_NAME)
                current_url = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_HOT_HREF)
                scenic_name = ''.join(scenic_full_name).replace('旅游攻略', '')
                scenic_url = ''.join(current_url)
                scenic_id = re.search(re.compile(r'p-oi(\d+)-'), scenic_url).group(1)
                # 存储字段
                # city_name, scenic_id, scenic_name, scenic_url
                save_info = '{}\u0001{}\u0001{}\u0001{}'.format(city_name, scenic_id, scenic_name, scenic_url)
                save_list.append(save_info)
            self.pipe.pipe_txt_save(save_list, filename=setting.FILE_SCENIC_LIST, savetype='a')

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}
        return proxies

    def start_engine(self):
        self._engine_city_link()
        self._engine_scenic_link()
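# Every spider in this section recovers the numeric Qunar POI id from a detail
# URL with the same `p-oi(\d+)-` pattern used in _engine_scenic_link above.
# A small standalone helper; the sample URL in the usage example is made up
# for illustration only.
import re

POI_ID_PATTERN = re.compile(r'p-oi(\d+)-')


def extract_poi_id(url, default='0'):
    """Return the POI id embedded in a Qunar detail URL, or `default`."""
    match = POI_ID_PATTERN.search(url)
    return match.group(1) if match else default


if __name__ == '__main__':
    sample = 'http://travel.qunar.com/p-oi12345-example'
    print(extract_poi_id(sample))  # -> '12345'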
class Engine: def __init__(self): self.crawl = Crawl() self.pipe = Pipeline() self.analysis = Analysis() def _engine_search_by_city(self): """指定城市检索关键字数据""" city_id = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID) history_id = list( map(lambda x: x.strip(), self.pipe.pipe_txt_load(filename=setting.FILE_HISTORY_ID))) current_params = deepcopy(setting.PARAMS) current_params['ak'] = setting.KEY for k, v in setting.QUERY_DICT.items(): filename = 'baidu_{}.txt'.format(k) for query in v.get('query'): current_params['query'] = query # 检索内容 for each_city in city_id: current_params['page_num'] = 0 citycode = each_city.strip().split('\u0001')[1] current_params['region'] = citycode # citycode,检索行政区域 while True: time.sleep(0.2) # 每种类型 current_params['page_num'] += 1 content = self.crawl.crawl_by_get( setting.SEARCH_API, params=current_params, retry=2, timeout=20) try: content_dict = json.loads(content) except: continue results = content_dict.get('results', []) if not results: break for each in results: """ 字段说明: uid: 唯一标识, name: 名称, address: 地址, province: 所在省, city: 所在城市, area: 所在区域, street_id: 街道id, location: 地图坐标 tag: 标签类型, type: 类型, detail_url: 详情url, """ # 存储数据 # uid, name, address, province, city, area, street_id, location # (detail_info) tag, type, detail_url, lat = each.get('location', {}).get('lat', 0) lng = each.get('location', {}).get('lng', 0) tag = each.get('detail_info', {}).get('tag', '') uid = each.get('uid', '') if uid in history_id: continue check_tag = tag.split(';')[0] # 过滤一下,如果抓取到的数据不存在标签也默认为是正确的数据 if check_tag in v.get('tag') or check_tag == '': save_dict = { 'uid': each.get('uid', ''), 'name': each.get('name', ''), 'address': each.get('address', ''), 'province': each.get('province', ''), 'city': each.get('city', ''), 'area': each.get('area', ''), 'street_id': each.get('street_id', ''), 'location': '{},{}'.format(lat, lng), 'tag': tag, 'type': each.get('detail_info', {}).get('type', ''), 'detail_url': each.get('detail_info', {}).get('detail_url', '') } save_info = '{0[uid]}\u0001{0[name]}\u0001{0[address]}\u0001' \ '{0[province]}\u0001{0[city]}\u0001{0[area]}\u0001' \ '{0[street_id]}\u0001{0[location]}\u0001' \ '{0[tag]}\u0001' \ '{0[type]}\u0001{0[detail_url]}'.format(save_dict) self.pipe.pipe_txt_save( uid, filename=setting.FILE_HISTORY_ID, savetype='a') self.pipe.pipe_txt_save(save_info, filename=filename, savetype='a') def _engine_search_by_location(self): """ 指定坐标点检索关键字数据 所有坐标数据来自 _engine_search_by_city 模块根据城市检索关键字的数据 此模块开发原因是百度返回数据量只有400,想通过坐标获取更多数据 :return: """ city_name = list( map(lambda x: x.strip().split('\u0001')[1], self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID))) location_list = self._engine_all_location() history_id = list( map(lambda x: x.strip(), self.pipe.pipe_txt_load(filename=setting.FILE_HISTORY_ID))) current_params = deepcopy(setting.PARAMS) current_params['ak'] = setting.KEY for k, v in setting.QUERY_DICT.items(): filename = 'baidu_{}.txt'.format(k) for query in v.get('query'): current_params['query'] = query # 检索内容 for each_location in location_list: current_params['page_num'] = 0 current_params['location'] = each_location # 检索坐标 while True: time.sleep(0.2) # 每种类型 current_params['page_num'] += 1 content = self.crawl.crawl_by_get( setting.SEARCH_API, params=current_params, retry=2, timeout=20) try: content_dict = json.loads(content) except: continue results = content_dict.get('results', []) if not results: break for each in results: """ 字段说明: uid: 唯一标识, name: 名称, address: 地址, province: 所在省, city: 所在城市, area: 所在区域, street_id: 街道id, location: 
地图坐标 tag: 标签类型, type: 类型, detail_url: 详情url, """ # 存储数据 # uid, name, address, province, city, area, street_id, location # (detail_info) tag, type, detail_url, area = each.get('area', '') if area not in city_name: # 根绝坐标点抓取数据可能会超出目前限制的大成都范围,所以限制个区域吧 continue lat = each.get('location', {}).get('lat', 0) lng = each.get('location', {}).get('lng', 0) tag = each.get('detail_info', {}).get('tag', '') uid = each.get('uid', '') if uid in history_id: continue check_tag = tag.split(';')[0] # 过滤一下,如果抓取到的数据不存在标签也默认为是正确的数据 if check_tag in v.get('tag') or check_tag == '': save_dict = { 'uid': each.get('uid', ''), 'name': each.get('name', ''), 'address': each.get('address', ''), 'province': each.get('province', ''), 'city': each.get('city', ''), 'area': each.get('area', ''), 'street_id': each.get('street_id', ''), 'location': '{},{}'.format(lat, lng), 'tag': tag, 'type': each.get('detail_info', {}).get('type', ''), 'detail_url': each.get('detail_info', {}).get('detail_url', '') } save_info = '{0[uid]}\u0001{0[name]}\u0001{0[address]}\u0001' \ '{0[province]}\u0001{0[city]}\u0001{0[area]}\u0001' \ '{0[street_id]}\u0001{0[location]}\u0001' \ '{0[tag]}\u0001' \ '{0[type]}\u0001{0[detail_url]}'.format(save_dict) self.pipe.pipe_txt_save( uid, filename=setting.FILE_HISTORY_ID, savetype='a') self.pipe.pipe_txt_save(save_info, filename=filename, savetype='a') def _engine_all_location(self): """ 获取所有坐标点 :return: """ all_location = [] for k, v in setting.QUERY_DICT.items(): filename = 'baidu_{}.txt'.format(k) area_list = self.pipe.pipe_txt_load(filename=filename) if not area_list: continue all_location.extend( list(map(lambda x: x.strip().split('\u0001')[7], area_list))) return set(all_location) @staticmethod def _engine_use_proxy(): """ 使用代理ip :return: 代理ip """ proxy_host = "proxy.abuyun.com" proxy_port = "9010" proxy_user = "******" proxy_pass = "******" proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": proxy_host, "port": proxy_port, "user": proxy_user, "pass": proxy_pass } proxies = {"http": proxy_meta, "https": proxy_meta} return proxies def run_engine(self): while True: self._engine_search_by_city() self._engine_search_by_location() nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') save_log = [] for k, v in setting.QUERY_DICT.items(): filename = 'baidu_{}.txt'.format(k) save_log.append('[{}] {}: {} 条'.format( nowtime, k, len(self.pipe.pipe_txt_load(filename=filename)))) save_log.append('=' * 30) self.pipe.pipe_txt_save(save_log, filename=setting.FILE_LOG_INFO, savetype='a')
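# A condensed sketch of the paginated place-search loop used by
# _engine_search_by_city and _engine_search_by_location above: keep bumping
# page_num until the API returns an empty result list, and skip uids that are
# already recorded in the history file. `fetch_page` stands in for the
# crawl_by_get call against setting.SEARCH_API and is a placeholder here.
import json


def crawl_paged(fetch_page, base_params, history_ids, max_pages=20):
    """Yield result dicts not seen before; stop on an empty page or max_pages."""
    params = dict(base_params, page_num=0)
    while params['page_num'] < max_pages:
        params['page_num'] += 1
        try:
            payload = json.loads(fetch_page(params))
        except (ValueError, TypeError):
            continue  # malformed or empty response: try the next page number
        results = payload.get('results', [])
        if not results:
            break
        for item in results:
            uid = item.get('uid', '')
            if uid and uid not in history_ids:
                history_ids.add(uid)
                yield item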
class Engine: def __init__(self): self.crawl = Crawl() self.analysis = Analysis() self.pipe = Pipeline() self._use_log() try: self.args_dict = eval(sys.argv[1:]) if not isinstance(self.args_dict, dict): raise ValueError('args must be like key-value ') except Exception as e: self.args_dict = {} logging.warning('get args failed:{}'.format(e)) self.proxies = self.args_dict.get('proxies') # 代理配置 self.hdfs = self.args_dict.get('hdfs', {}) # hdfs配置 # 如果没有这两个参数 直接报异常 不执行 if not self.hdfs or not self.proxies: raise ValueError('args not have hdfs or proxies') self.sleep_time = self.args_dict.get('sleep_time', 0.2) # 休眠时间 self.service_args = self.args_dict.get('service_args', {}) # PhantomJS代理配置 self.aliyun_log = self.args_dict.get('aliyun_log', {}) self.alilog = AliyunLog( '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME), endp=self.aliyun_log.get('endpoint', endpoint), accid=self.aliyun_log.get('accessKeyId', accessKeyId), acckey=self.aliyun_log.get('accessKey', accessKey), proj=self.aliyun_log.get('project', project), logst=self.aliyun_log.get('logstore', logstore)) # 阿里云log配置文件,需要校验如果没有该参数会不会报错 try: self.HDFS = HDFileSystem(host=self.hdfs.get( 'ip', '192.168.100.178'), port=self.hdfs.get('port', 8020)) except: pass def _engine_city_link(self): """ 获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中 :return: """ content = self.crawl.crawl_by_get(setting.START_URL, headers=setting.HEADERS, proxies=self.proxies) element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A) city_list = [] for each_element in element_city: city_name = self.analysis.analysis_by_xpath( each_element, setting.XPATH_CITY_NAME) city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL) city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url))) self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST) def _engine_restaurant_link(self): """ 获取每个城市中所有的美食店铺的链接 抓取之前获取当前已抓取的美食店铺id,当前抓取的id或进行校验是否为新增 新增数据则存入到对应的TEMP文件中,最后本次循化完毕后,统一推送新增数据到HDFS 本次循化所有模块执行完毕后,新增数据要追加入历史数据中,追加成功后修改新增数据文件名称,以便后面的新增文件不与前一次数据冲突 修改新政文件名称时候使用完成抓取当日的日期作为文件名称前缀 :return: """ city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST) # 获取已经抓取店铺id,便于识别新增数据 history_restautrant = self.pipe.pipe_txt_load( filename=setting.FILE_RESTAURANT_LIST) history_id = set( map(lambda x: x.strip().split('\u0001')[2], [each for each in history_restautrant])) for each_city in set(city_list): # try: url = each_city.strip().split('\u0001')[1] + '-meishi' name = each_city.strip().split('\u0001')[0] params_city = {'page': 0} maxpage = 200 # 默认最大页数 while True: save_list = [] params_city['page'] += 1 content = self.crawl.crawl_by_get(url, headers=setting.HEADERS, params=params_city, proxies=self.proxies, retry=5) if not content: break element_li = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_LI) if not element_li: break for each_ele in element_li: restaurant_name = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_RES_NAME) restaurant_type = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_RES_TYPE) restaurant_url = self.analysis.analysis_by_xpath( each_ele, xpahter=setting.XPATH_RES_URL) current_id = re.search(re.compile(r'p-oi(\d+)-'), ''.join(restaurant_url)).group(1) if current_id in history_id: continue else: history_id.add(current_id) try: # 存储字段 # name, restaurant_name, current_id, restaurant_type,, restaurant_url save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format( name, ''.join(restaurant_name), current_id, ''.join(restaurant_type), 
''.join(restaurant_url)) except Exception as e: self.alilog.warning('[list] {}'.format(e)) continue save_list.append(save_info) if save_list: self.pipe.pipe_txt_save( save_list, filename=setting.TEMP_RESTAURANT_LIST, savetype='a') if params_city['page'] >= maxpage: break time.sleep(self.sleep_time) # except: # continue def _engine_restaurant_info(self): """ 获取所有餐厅详细数据 :return: """ res_list = self.pipe.pipe_txt_load( filename=setting.FILE_RESTAURANT_LIST) temp_list = self.pipe.pipe_txt_load( filename=setting.TEMP_RESTAURANT_LIST) res_list.extend(temp_list) history_restautrant = self.pipe.pipe_txt_load( filename=setting.FILE_RESTAURANT_INFO) history_id = set( map(lambda x: x.strip().split('\u0001')[2], [each for each in history_restautrant])) for each_res in set(res_list): try: # 店铺数据 res_info = each_res.strip().split('\u0001') city_name = res_info[0] res_name = res_info[1] res_id = res_info[2] if res_id in history_id: continue else: history_id.add(res_id) res_type = res_info[3] res_url = res_info[4] # 获取店铺详细信息 content = self.crawl.crawl_by_get(res_url, headers=setting.HEADERS, proxies=self.proxies, retry=5, timeout=10) detail = self.analysis.analysis_by_xpath( content, xpahter=setting.XPATH_RES_DETAIL) detail['city_name'] = city_name detail['restaurant_name'] = res_name detail['restaurant_type'] = res_type detail['restaurant_url'] = res_url detail['restaurant_id'] = res_id detail['get_time'] = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') # 构建存储的数据 # 字段: # city_name, restaurant_name, restaurant_id, restaurant_type, # score, ranking, price, describe, address, tel, open_time, dish, arrive, intro, restaurant_url, # get_time datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') savedata = '{0[city_name]}\u0001{0[restaurant_name]}\u0001{0[restaurant_id]}\u0001' \ '{0[restaurant_type]}\u0001{0[score]}\u0001{0[ranking]}\u0001{0[price]}\u0001' \ '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001{0[open_time]}\u0001' \ '{0[dish]}\u0001{0[arrive]}\u0001{0[intro]}\u0001{0[restaurant_url]}\u0001' \ '{0[get_time]}'.format(detail) self.pipe.pipe_txt_save(savedata, filename=setting.TEMP_RESTAURANT_INFO, savetype='a') time.sleep(self.sleep_time) except Exception as e: self.alilog.warning('[detail] {}'.format(e)) continue def _engine_restaurant_comments(self): """ 获取所有餐厅评论数据 :return: """ res_list = self.pipe.pipe_txt_load( filename=setting.FILE_RESTAURANT_LIST) temp_list = self.pipe.pipe_txt_load( filename=setting.TEMP_RESTAURANT_LIST) res_list.extend(temp_list) # 每个店铺最新评论时间表 check_dict = self.pipe.pipe_pickle_load( filename=setting.FILE_COMMENTS_CHECK) if not check_dict: check_dict = {} for each_res in res_list: try: # 店铺数据 city = each_res.strip().split('\u0001')[0] food = each_res.strip().split('\u0001')[1] res_id = each_res.strip().split('\u0001')[2] type = each_res.strip().split('\u0001')[3] res_url = each_res.strip().split('\u0001')[4] api = setting.COMMENTS_API.format(res_id) setting.HEADERS_COMMENTS['Referer'] = res_url params = { 'page': 0, 'pageSize': '10', 'poiList': 'true', 'rank': 0, # 全部评论 'sortField': 0 # 按照时间排序 } comments_time = set([]) current_time = check_dict.get(res_id, '0') while True: time.sleep(self.sleep_time) try: params['page'] += 1 content = self.crawl.crawl_by_get( api, headers=setting.HEADERS_COMMENTS, proxies=self.proxies, params=params, retry=3, timeout=20) content_dict = json.loads(content) if not content_dict.get('data'): break content_comments = content_dict.get('data') elements_com = self.analysis.analysis_by_xpath( content_comments, 
xpahter=setting.XPATH_COMMENTS_LI) if not elements_com: break for each_element in elements_com: title = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_TITLE) start = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_START) nick = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_NICK) more = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_MORE) if more: content_more = self.crawl.crawl_by_get( more[0], headers=setting.HEADERS, proxies=self.proxies) content = self.analysis.analysis_by_xpath( content_more, xpahter=setting.XPATH_COMMENTS_DETAIL) else: content = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_CONTENT) date = self.analysis.analysis_by_xpath( each_element, xpahter=setting.XPATH_COMMENTS_DATE) try: deal_content = ''.join( list( map( lambda x: x.replace('\n', ''). replace('\r', '').replace( '\t', '').replace(' ', ''), content))) except: self.alilog.info( '[review] have no deal_content') deal_content = '' if ''.join(date) > current_time: commetents_info = { 'city': city, 'food': food, 'food_id': res_id, 'type': type, 'title': ''.join(title), 'nick': ''.join(nick), 'start': ''.join(start), 'content': deal_content, 'date': ''.join(date), 'get_time': datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), 'url': res_url } for eachkey in commetents_info.keys(): commetents_info[eachkey] = commetents_info[ eachkey].replace('\n', '').replace('\r', '') # 存储数据 # 字段顺序:city, food, food_id, type, title, nick, start, content, date, get_time, url save_info = '{0[city]}\u0001{0[food]}\u0001{0[food_id]}\u0001' \ '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \ '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \ '{0[get_time]}\u0001{0[url]}'.format(commetents_info) self.pipe.pipe_txt_save( save_info, filename=setting.TEMP_RESTAURANT_COMMENTS, savetype='a') comments_time.add(''.join(date)) # 当前页面没有新增评论也切换至下一店铺 if not len(comments_time): break except Exception as e: self.alilog.warning('[review] {}'.format(e)) break # 每个店铺最新的评论时间 if comments_time: check_dict[res_id] = max(comments_time) # 抓取到的评论数据 self.pipe.pipe_pickle_save( check_dict, filename=setting.FILE_COMMENTS_CHECK) except Exception as e: self.alilog.warning('[review] {}'.format(e)) continue def _engine_restaurant_link_by_args(self): """ 根据配置参数来进行抓取,从该模块提供参数的接口 :return: """ # 传入的参数中是否有dist参数,此处暂时默认arg_dist为一个字符串参数,实际是一个列表 arg_dist = self.args_dict.get('dist', []) # 如果没该参数,则全部抓取所有城市数据 if not arg_dist: self._engine_restaurant_link() else: try: city_dict = eval( self.pipe.pipe_txt_load( filename='./DATA/file_city_dict.txt')) except Exception as e: logging.warning('get city dict error: {}'.format(e)) # 假设此处获取到了待抓取的url prov = arg_dist[0] # 省 city = arg_dist[1] # 市 area = arg_dist[2] # 县 city_dict = { '四川省': { '成都市': { '': 'http1' }, '德阳市': { '': 'http2' }, '眉山市': { '': 'http3' }, '人寿市': { '': 'http4' }, } } if prov and city and area: current_list = city_dict.get(prov, {}).get(city, {}).get(area, '') city_list = [current_list] elif prov and city and not area: current_list = city_dict.get(prov, {}).get(city, {}) city_list = set([]) for name, url in current_list.items(): city_list.add(url) elif prov and not city and not area: current_list = city_dict.get(prov, {}) city_list = set([]) for eachkey in current_list.keys(): for url in current_list[eachkey].values(): city_list.add(url) else: raise ValueError('args_dist error') # 获取已经抓取店铺id,便于识别新增数据 history_restautrant = self.pipe.pipe_txt_load( 
            filename=setting.FILE_RESTAURANT_LIST)
        history_id = set(
            map(lambda x: x.strip().split('\u0001')[2], history_restautrant))
        for each_city in set(city_list):
            # try:
            url = each_city.strip().split('\u0001')[1] + '-meishi'
            name = each_city.strip().split('\u0001')[0]
            params_city = {'page': 0}
            maxpage = 200  # default upper page limit
            while True:
                save_list = []
                params_city['page'] += 1
                content = self.crawl.crawl_by_get(url,
                                                  headers=setting.HEADERS,
                                                  params=params_city,
                                                  proxies=self.proxies,
                                                  retry=5)
                if not content:
                    break
                element_li = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_LI)
                if not element_li:
                    break
                for each_ele in element_li:
                    restaurant_name = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_NAME)
                    restaurant_type = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_TYPE)
                    restaurant_url = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_URL)
                    match_id = re.search(re.compile(r'p-oi(\d+)-'),
                                         ''.join(restaurant_url))
                    if not match_id:  # skip entries whose url carries no shop id
                        continue
                    current_id = match_id.group(1)
                    if current_id in history_id:
                        continue
                    else:
                        history_id.add(current_id)
                    try:
                        # fields: name, restaurant_name, current_id, restaurant_type, restaurant_url
                        save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            name, ''.join(restaurant_name), current_id,
                            ''.join(restaurant_type), ''.join(restaurant_url))
                    except Exception as e:
                        self.alilog.warning('[list] {}'.format(e))
                        continue
                    save_list.append(save_info)
                if save_list:
                    self.pipe.pipe_txt_save(save_list,
                                            filename=setting.TEMP_RESTAURANT_LIST,
                                            savetype='a')
                if params_city['page'] >= maxpage:
                    break
                time.sleep(self.sleep_time)
            # except:
            #     continue

    def _temp_city_info(self, cityname):
        """
        Temporary helper used while normalising the 22-field data set.
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break
        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration.
        :return: proxies dict
        """
        proxy_host = "****"
        proxy_port = "****"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}
        return proxies

    # HDFS cluster operations
    def _engine_push_hdfs(self, filename):
        try:
            if os.path.exists('DATA/' + filename):
                # HDFS.put(local file, target file)
                self.HDFS.put('DATA/' + filename,
                              '/user/spider/everyday/{}'.format(filename))
            # push the backup copies
            for eachfile in [
                    setting.FILE_RESTAURANT_LIST, setting.FILE_RESTAURANT_INFO,
                    setting.FILE_RESTAURANT_COMMENTS
            ]:
                if os.path.exists('DATA/' + eachfile):
                    # HDFS.put(local file, target file)
                    self.HDFS.put(
                        'DATA/' + eachfile,
                        '/user/spider/xieyangjie/Qunar/{}'.format(eachfile))
        except Exception as e:
            print('HDFS push failed', e)

    @staticmethod
    def _use_log(LOGFMT=None, DATEFMT=None):
        """
        Local logging: configures the log format and the log file location.
        :return:
        """
        LOGFMT = "%(asctime)s - %(levelname)s - %(message)s"
        DATEFMT = "%Y/%m/%d %H:%M:%S"
        logging.basicConfig(filename='./logbag/{}_{}_{}.log'.format(
            setting.OTA_NAME, setting.CATEGORY_NAME,
            datetime.datetime.today().strftime('%Y%m%d')),
                            format=LOGFMT,
                            datefmt=DATEFMT,
                            level=logging.INFO)

    def start_engine(self):
        logging.info('{}_{} spider running'.format(setting.OTA_NAME,
                                                   setting.CATEGORY_NAME))
        try:
            self._engine_city_link()
            self.alilog.debug('script {}_{} running'.format(
                setting.OTA_NAME, setting.CATEGORY_NAME))
            while True:
                self._engine_restaurant_link()
                self._engine_restaurant_info()
                self._engine_restaurant_comments()
                current_time = datetime.datetime.now().strftime('%Y-%m-%d')
                file_dict = {
                    setting.FILE_RESTAURANT_LIST: setting.TEMP_RESTAURANT_LIST,
                    setting.FILE_RESTAURANT_INFO: setting.TEMP_RESTAURANT_INFO,
                    setting.FILE_RESTAURANT_COMMENTS: setting.TEMP_RESTAURANT_COMMENTS
                }
                for f, t in file_dict.items():
                    newname = 'qunar{}({}).txt'.format(t[4:-4], current_time)
                    if os.path.exists('DATA/{}'.format(f)):
                        temp = self.pipe.pipe_txt_load(filename=t)
                        if temp:
                            # append the new records to the history file, then archive the temp file
                            self.pipe.pipe_txt_save(list(map(lambda x: x.strip(), temp)),
                                                    filename=f,
                                                    savetype='a')
                            os.rename('DATA/{}'.format(t), 'DATA/{}'.format(newname))
                        else:
                            # no new data this round: create an empty dated placeholder
                            self.pipe.pipe_txt_save('', filename=newname)
                    else:
                        # first run: the temp file becomes the history file
                        os.rename('DATA/{}'.format(t), 'DATA/{}'.format(f))
                        shutil.copy('DATA/{}'.format(f), 'DATA/{}'.format(newname))
                    self._engine_push_hdfs(newname)
                self.alilog.debug('script {}_{} finish'.format(
                    setting.OTA_NAME, setting.CATEGORY_NAME))
        except Exception as e:
            self.alilog.error('script {}_{} error {}'.format(
                setting.OTA_NAME, setting.CATEGORY_NAME, e))
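All of the engines in this listing funnel their output through the Pipeline helper (pipe_txt_save / pipe_txt_load), which is defined outside the excerpt. The following is only a rough sketch of what those two text helpers are assumed to look like, inferred from the call sites (files under DATA/, savetype/loadtype passed straight to open()); it is not the original implementation.

import os


class Pipeline:
    """Minimal sketch of the text I/O helper; not the original class."""

    DATA_DIR = 'DATA'  # assumed location, matching the rename/copy calls in start_engine

    def pipe_txt_save(self, data, filename, savetype='w'):
        # callers pass either a single string or an iterable of lines
        lines = [data] if isinstance(data, str) else list(data)
        os.makedirs(self.DATA_DIR, exist_ok=True)
        with open(os.path.join(self.DATA_DIR, filename), savetype, encoding='utf-8') as fp:
            for line in lines:
                fp.write(line.rstrip('\n') + '\n')

    def pipe_txt_load(self, filename, loadtype='r'):
        path = os.path.join(self.DATA_DIR, filename)
        if not os.path.exists(path):
            return []  # callers treat a missing file as "no history yet"
        with open(path, loadtype, encoding='utf-8') as fp:
            return fp.readlines()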
class Engine(object):
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_get_citylist(self):
        """
        Fetch the city list: each city's url and name.
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS)
        res = self.analysis.analysis_by_xpath(content,
                                              xpahter=setting.XPATH_CITYLIST_A)
        saveinfo = set([])
        for each in res:
            cityname = self.analysis.analysis_by_xpath(
                each, xpahter=setting.XPATH_TEXT)
            cityhref = self.analysis.analysis_by_xpath(
                each, xpahter=setting.XPATH_HREF)
            citylink = setting.START_URL + cityhref[0][1:]
            try:
                savelist = '{}\u0001{}'.format(cityname[0], citylink)
                saveinfo.add(savelist)
            except:
                continue
        self.pipe.pipe_txt_save(saveinfo,
                                filename=setting.FILE_CITY_LIST,
                                savetype='w')

    def _engine_get_touristlist(self):
        """
        Fetch every scenic spot's link and id.
        :return:
        """
        # clear the output file first
        self.pipe.pipe_remove_file(setting.FILE_TOURIST_LIST)
        citylist = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST,
                                           loadtype='r')
        for eachcity in citylist:
            try:
                saveinfo = set([])
                params = {
                    'from': 'mpshouye_hotdest_more',
                    'keyword': '柳州',
                    'page': 1
                }
                cityname = eachcity.strip().split('\u0001')[0]
                params['keyword'] = cityname
                while True:
                    content = self.crawl.crawl_by_get(
                        setting.TOURIS_URL,
                        params=params,
                        headers=setting.HEADERS,
                        proxies=self._engine_use_proxy(),
                        retry=3,
                        timeout=15)
                    res_element = self.analysis.analysis_by_xpath(
                        content, xpahter=setting.XPATH_TOURIST_A)
                    if not res_element:
                        break
                    for eachelement in res_element:
                        tourist_name = self.analysis.analysis_by_xpath(
                            eachelement, xpahter=setting.XPATH_TEXT)
                        tourist_href = self.analysis.analysis_by_xpath(
                            eachelement, xpahter=setting.XPATH_HREF)
                        tourist_link = setting.START_URL + tourist_href[0][1:]
                        pattern = re.compile(r'detail_(\d+)', re.S)
                        re_id = re.search(pattern, tourist_link)
                        if re_id:
                            tourist_id = re_id.group(1)
                        else:
                            tourist_id = ''
                        # fields, in order: spot name, spot id, spot link
                        saveinfo.add('{}\u0001{}\u0001{}'.format(
                            tourist_name[0], tourist_id, tourist_link))
                    # print(saveinfo)
                    params['page'] += 1
                self.pipe.pipe_txt_save(saveinfo,
                                        filename=setting.FILE_TOURIST_LIST,
                                        savetype='a')
            except:
                continue

    def _engine_get_touristinfo(self):
        """
        Fetch the detail data of every scenic spot.
        :return:
        """
        tourist_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_TOURIST_LIST)
        for eachtourist in tourist_list:
            try:
                tourist_url = eachtourist.strip().split('\u0001')[2]
                content = self.crawl.crawl_by_get(
                    tourist_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy())
                res = self.analysis.analysis_by_xpath(
                    content, setting.XPATH_TOURIST_DETAIL)
                # stored fields, in order:
                # t_name, t_type, t_des, address, score, price, describe
                save_data = '{0[t_name]}\u0001{0[t_type]}\u0001{0[t_des]}\u0001' \
                            '{0[address]}\u0001{0[score]}\u0001{0[price]}\u0001' \
                            '{0[describe]}'.format(res)
                self.pipe.pipe_txt_save(save_data,
                                        filename=setting.FILE_TOURIST_INFO,
                                        savetype='a')
                time.sleep(0.1)
            except:
                continue

    def _engine_get_comments(self):
        """
        Fetch scenic-spot comment data.
        :return:
        """
        # spot name / id / link
        tourist_list = self.pipe.pipe_txt_load(
            filename='file_tourist_list.txt', loadtype='r')
        for each_tourist in tourist_list:
            try:
                tourist_id = each_tourist.strip().split('\u0001')[1]
                tourist_url = each_tourist.strip().split('\u0001')[2]
                tourist_name = each_tourist.strip().split('\u0001')[0]
                # paging parameters for the comments api
                params_comments = {
                    'sightId': '12579',
                    'index': 0,
                    'page': 0,
                    'pageSize': '10',
                    'tagType': '0',
                }
                # load the checkpoint node for this spot
                check_node = self.pipe.pipe_pickle_load(
                    filename=setting.FILE_TOURIST_CHECK)
                if not check_node:
                    check_node = {}
                tourist_node = check_node.get(tourist_id, {})
                # comment count recorded at the last crawl
                node_count = tourist_node.get('comments_count', 0)
                # latest comment time recorded at the last crawl
                node_latest = tourist_node.get('comments_latest', '0')
                savelist = []  # new (not yet stored) comments
                latest_time = set([])  # comment timestamps seen in this run
                datanum = -1  # live comment count
                while True:
                    params_comments['sightId'] = tourist_id
                    params_comments['index'] += 1
                    params_comments['page'] += 1
                    setting.HEADERS_COMMENTS['Referer'] = tourist_url
                    content = self.crawl.crawl_by_get(
                        setting.COMMENTS_API,
                        headers=setting.HEADERS_COMMENTS,
                        params=params_comments,
                        proxies=self._engine_use_proxy(),
                        retry=2,
                        timeout=15)
                    content_dict = json.loads(content)
                    # read the live comment count; only done on the first page
                    if params_comments['page'] == 1:
                        taglist = content_dict.get('data', {}).get('tagList', [])
                        if taglist:
                            for each in taglist:
                                if each.get('tagName') == '全部':
                                    datanum = each.get('tagNum')
                                    break
                        # if the stored count equals the live count there are no new comments
                        if node_count == datanum:
                            break
                    # comment list of the current page
                    datalist = content_dict.get('data', {}).get('commentList', [])
                    if not datalist:
                        break
                    # written straight to the text file; change here if another sink is needed later
                    current_data = False
                    for each in datalist:
                        current_time = each.get('date')
                        each['tourist_id'] = tourist_id
                        each['tourist_name'] = tourist_name
                        each['tourist_url'] = tourist_url
                        each['get_time'] = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        if current_time > node_latest:
                            # stored fields, in order:
                            # tourist_name, tourist_id, author, commentId, content, date, score, get_time, tourist_url
                            save_data = '{0[tourist_name]}\u0001{0[tourist_id]}\u0001{0[author]}\u0001' \
                                        '{0[commentId]}\u0001{0[content]}\u0001{0[date]}\u0001' \
                                        '{0[score]}\u0001{0[get_time]}\u0001{0[tourist_url]}\u0001'.format(each)
                            self.pipe.pipe_txt_save(
                                save_data,
                                filename=setting.FILE_TOURIST_COMMENTS,
                                savetype='a')
                            latest_time.add(current_time)
                            current_data = True
                    # stop for this spot once a page brings no new data and we are past page 15
                    if not current_data and params_comments['page'] >= 15:
                        break
                    time.sleep(0.2)
                # keep the old count if the live count could not be read
                if datanum != -1:
                    tourist_node['comments_count'] = datanum
                # keep the old time node if no new comments were found
                if latest_time:
                    tourist_node['comments_latest'] = max(latest_time)
                check_node[tourist_id] = tourist_node
                self.pipe.pipe_pickle_save(check_node,
                                           filename=setting.FILE_TOURIST_CHECK)
            except:
                continue

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration.
        :return: proxies dict
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}
        return proxies

    def engine_run(self):
        self._engine_get_citylist()
        self._engine_get_touristlist()
        self._engine_get_touristinfo()
        self._engine_get_comments()
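Every engine here delegates HTTP fetching to Crawl.crawl_by_get(...) with headers, params, proxies, retry and timeout keywords, but that class is not part of this listing. A minimal sketch of the assumed interface, built on requests and returning an empty string on failure (which is how the callers test `if not content`), could be:

import time

import requests


class Crawl:
    """Sketch of the assumed crawl_by_get interface; not the original class."""

    def crawl_by_get(self, url, headers=None, params=None, proxies=None,
                     retry=1, timeout=10):
        for _ in range(max(int(retry), 1)):
            try:
                resp = requests.get(url, headers=headers, params=params,
                                    proxies=proxies, timeout=timeout)
                if resp.status_code == 200:
                    return resp.text
            except requests.RequestException:
                pass  # swallow network errors and try again
            time.sleep(1)  # brief pause before the next attempt
        return ''  # callers treat an empty string as "no content"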
class Engine:
    """
    Crawler for Chengdu bus-route data.
    """

    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_bus_info(self):
        """
        Collect the urls of all bus routes.
        :return:
        """
        content_home = self.crawl.crawl_by_get(setting.START_URL,
                                               headers=setting.HEADERS,
                                               retry=2,
                                               timeout=30)
        each_list = self.analysis.analysis_by_xpath(content_home,
                                                    xpahter=setting.XPATH_LIST)
        urls = list(map(lambda x: setting.DOMAIN_URL.format(x), each_list))
        for each in urls:
            content_bus = self.crawl.crawl_by_get(each,
                                                  headers=setting.HEADERS,
                                                  retry=2,
                                                  timeout=30)
            bus_list = self.analysis.analysis_by_xpath(
                content_bus, xpahter=setting.XPATH_BUS)
            bus_urls = list(
                map(lambda x: setting.DOMAIN_URL.format(x), bus_list))
            if bus_urls:
                self.pipe.pipe_txt_save(bus_urls,
                                        filename=setting.FILE_BUS_LIST)

    def _engine_bus_detail(self):
        """
        Fetch the detail page of each bus route.
        :return:
        """
        bus_urls = self.pipe.pipe_txt_load(filename=setting.FILE_BUS_LIST)
        for each_bus in bus_urls:
            content_detail = self.crawl.crawl_by_get(each_bus,
                                                     headers=setting.HEADERS,
                                                     retry=2,
                                                     timeout=30)
            detail_info = self.analysis.analysis_by_xpath(
                content_detail, xpahter=setting.XPATH_DETAIL)
            # stored fields: name, time, ticket, company, update, station
            # name: route name, time: first/last bus time, ticket: fare,
            # company: operator, update: last update time, station: stops along the route
            save_info = '{0[name]}\u0001{0[time]}\u0001{0[ticket]}\u0001' \
                        '{0[company]}\u0001{0[update]}\u0001{0[station]}'.format(detail_info)
            self.pipe.pipe_txt_save(save_info,
                                    filename=setting.FILE_BUS_DETAIL)
            time.sleep(2)

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration.
        :return: proxies dict
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}
        return proxies

    def run_engine(self):
        self._engine_bus_info()
        self._engine_bus_detail()
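The entry point for this bus script is not shown in the listing; under the usual module-guard convention it would presumably look like the following (the guard itself is an assumption, not part of the original source):

if __name__ == '__main__':
    engine = Engine()
    engine.run_engine()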
class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        Fetch every city's name and url; the result goes to file_city_list.txt.
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_restaurant_link(self):
        """
        Fetch the links of every restaurant in each city.
        Before crawling, load the restaurant ids already collected and check every id found
        in this run against them; only new records are written to the TEMP files. Once the
        whole cycle has finished, the new data is pushed to HDFS and appended to the history
        files, and the TEMP files are then renamed, using the crawl date as a prefix, so the
        next run's new data does not clash with this one.
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        # restaurant ids already crawled, used to recognise new shops
        history_restautrant = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        history_id = set(
            map(lambda x: x.strip().split('\u0001')[2], history_restautrant))
        for each_city in city_list:
            # try:
            url = each_city.strip().split('\u0001')[1] + '-meishi'
            name = each_city.strip().split('\u0001')[0]
            params_city = {'page': 0}
            maxpage = 200  # default upper page limit
            while True:
                save_list = []
                params_city['page'] += 1
                content = self.crawl.crawl_by_get(
                    url,
                    headers=setting.HEADERS,
                    params=params_city,
                    proxies=self._engine_use_proxy(),
                    retry=5)
                if not content:
                    break
                element_li = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_LI)
                if not element_li:
                    break
                for each_ele in element_li:
                    restaurant_name = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_NAME)
                    restaurant_type = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_TYPE)
                    restaurant_url = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_URL)
                    match_id = re.search(re.compile(r'p-oi(\d+)-'),
                                         ''.join(restaurant_url))
                    if not match_id:  # skip entries whose url carries no shop id
                        continue
                    current_id = match_id.group(1)
                    if current_id in history_id:
                        continue
                    try:
                        # fields: name, restaurant_name, current_id, restaurant_type, restaurant_url
                        save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            name, ''.join(restaurant_name), current_id,
                            ''.join(restaurant_type), ''.join(restaurant_url))
                    except:
                        continue
                    save_list.append(save_info)
                if save_list:
                    self.pipe.pipe_txt_save(
                        save_list,
                        filename=setting.TEMP_RESTAURANT_LIST,
                        savetype='a')
                if params_city['page'] >= maxpage:
                    break
                time.sleep(0.1)
            # except:
            #     continue

    def _engine_restaurant_info(self):
        """
        Fetch the detail data of every restaurant.
        :return:
        """
        res_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        temp_list = self.pipe.pipe_txt_load(
            filename=setting.TEMP_RESTAURANT_LIST)
        res_list.extend(temp_list)
        history_restautrant = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_INFO)
        history_id = set(
            map(lambda x: x.strip().split('\u0001')[2], history_restautrant))
        for each_res in res_list:
            try:
                # shop record
                res_info = each_res.strip().split('\u0001')
                city_name = res_info[0]
                res_name = res_info[1]
                res_id = res_info[2]
                if res_id in history_id:
                    continue
                res_type = res_info[3]
                res_url = res_info[4]
                # fetch the shop detail page
                content = self.crawl.crawl_by_get(
                    res_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy(),
                    retry=5,
                    timeout=10)
                detail = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_RES_DETAIL)
                detail['city_name'] = city_name
                detail['restaurant_name'] = res_name
                detail['restaurant_type'] = res_type
                detail['restaurant_url'] = res_url
                detail['restaurant_id'] = res_id
                detail['get_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                # build the record to store
                # fields, in order:
                # city_name, restaurant_name, restaurant_id, restaurant_type,
                # score, ranking, price, describe, address, tel, open_time,
                # dish, arrive, intro, restaurant_url, get_time
                savedata = '{0[city_name]}\u0001{0[restaurant_name]}\u0001{0[restaurant_id]}\u0001' \
                           '{0[restaurant_type]}\u0001{0[score]}\u0001{0[ranking]}\u0001{0[price]}\u0001' \
                           '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001{0[open_time]}\u0001' \
                           '{0[dish]}\u0001{0[arrive]}\u0001{0[intro]}\u0001{0[restaurant_url]}\u0001' \
                           '{0[get_time]}'.format(detail)
                self.pipe.pipe_txt_save(savedata,
                                        filename=setting.TEMP_RESTAURANT_INFO,
                                        savetype='a')
                # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_food_info')
                time.sleep(0.02)
            except Exception as e:
                print('crawl error', e)
                continue

    def _engine_restaurant_comments(self):
        """
        Fetch the comment data of every restaurant.
        :return:
        """
        res_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        temp_list = self.pipe.pipe_txt_load(
            filename=setting.TEMP_RESTAURANT_LIST)
        res_list.extend(temp_list)
        # latest comment time recorded for each shop
        check_dict = self.pipe.pipe_pickle_load(
            filename=setting.FILE_COMMENTS_CHECK)
        if not check_dict:
            check_dict = {}
        for each_res in res_list:
            try:
                # shop record
                city = each_res.strip().split('\u0001')[0]
                food = each_res.strip().split('\u0001')[1]
                res_id = each_res.strip().split('\u0001')[2]
                type = each_res.strip().split('\u0001')[3]
                res_url = each_res.strip().split('\u0001')[4]
                api = setting.COMMENTS_API.format(res_id)
                setting.HEADERS_COMMENTS['Referer'] = res_url
                params = {
                    'page': 0,
                    'pageSize': '10',
                    'poiList': 'true',
                    'rank': 0,  # all comments
                    'sortField': 0  # sort by time
                }
                comments_time = set([])
                current_time = check_dict.get(res_id, '0')
                while True:
                    time.sleep(0.2)
                    try:
                        params['page'] += 1
                        content = self.crawl.crawl_by_get(
                            api,
                            headers=setting.HEADERS_COMMENTS,
                            proxies=self._engine_use_proxy(),
                            params=params,
                            retry=3,
                            timeout=20)
                        content_dict = json.loads(content)
                        if not content_dict.get('data'):
                            break
                        content_comments = content_dict.get('data')
                        elements_com = self.analysis.analysis_by_xpath(
                            content_comments, xpahter=setting.XPATH_COMMENTS_LI)
                        if not elements_com:
                            break
                        for each_element in elements_com:
                            title = self.analysis.analysis_by_xpath(
                                each_element, xpahter=setting.XPATH_COMMENTS_TITLE)
                            start = self.analysis.analysis_by_xpath(
                                each_element, xpahter=setting.XPATH_COMMENTS_START)
                            nick = self.analysis.analysis_by_xpath(
                                each_element, xpahter=setting.XPATH_COMMENTS_NICK)
                            more = self.analysis.analysis_by_xpath(
                                each_element, xpahter=setting.XPATH_COMMENTS_MORE)
                            if more:
                                # a "more" link means the full text sits on a separate page
                                content_more = self.crawl.crawl_by_get(
                                    more[0],
                                    headers=setting.HEADERS,
                                    proxies=self._engine_use_proxy())
                                content = self.analysis.analysis_by_xpath(
                                    content_more,
                                    xpahter=setting.XPATH_COMMENTS_DETAIL)
                            else:
                                content = self.analysis.analysis_by_xpath(
                                    each_element,
                                    xpahter=setting.XPATH_COMMENTS_CONTENT)
                            date = self.analysis.analysis_by_xpath(
                                each_element, xpahter=setting.XPATH_COMMENTS_DATE)
                            try:
                                deal_content = ''.join(
                                    list(
                                        map(
                                            lambda x: x.replace('\n', '').replace(
                                                '\r', '').replace('\t', '').replace(' ', ''),
                                            content)))
                            except:
                                deal_content = ''
                            if ''.join(date) > current_time:
                                comments_info = {
                                    'city': city,
                                    'food': food,
                                    'food_id': res_id,
                                    'type': type,
                                    'title': ''.join(title),
                                    'nick': ''.join(nick),
                                    'start': ''.join(start),
                                    'content': deal_content,
                                    'date': ''.join(date),
                                    'get_time': datetime.datetime.now().strftime(
                                        '%Y-%m-%d %H:%M:%S'),
                                    'url': res_url
                                }
                                for eachkey in comments_info.keys():
                                    comments_info[eachkey] = comments_info[
                                        eachkey].replace('\n', '').replace('\r', '')
                                # stored fields, in order:
                                # city, food, food_id, type, title, nick, start, content, date, get_time, url
                                save_info = '{0[city]}\u0001{0[food]}\u0001{0[food_id]}\u0001' \
                                            '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \
                                            '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \
                                            '{0[get_time]}\u0001{0[url]}'.format(comments_info)
                                self.pipe.pipe_txt_save(
                                    save_info,
                                    filename=setting.TEMP_RESTAURANT_COMMENTS,
                                    savetype='a')
                                comments_time.add(''.join(date))
                        # move on to the next shop when a page brings no new comments
                        if not len(comments_time):
                            break
                    except:
                        break
                # record the latest comment time for this shop
                if comments_time:
                    check_dict[res_id] = max(comments_time)
                # persist the checkpoint of crawled comments
                self.pipe.pipe_pickle_save(
                    check_dict, filename=setting.FILE_COMMENTS_CHECK)
            except Exception as e:
                print(e)
                continue

    def _temp_city_info(self, cityname):
        """
        Temporary helper used while normalising the 22-field data set.
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break
        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration.
        :return: proxies dict
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}
        return proxies

    # HDFS cluster operations
    @staticmethod
    def _engine_push_hdfs(filename):
        try:
            if os.path.exists('DATA/' + filename):
                # HDFS.put(local file, target file)
                HDFS.put('DATA/' + filename,
                         '/user/spider/everyday/{}'.format(filename))
        except Exception as e:
            print('HDFS push failed', e)

    def start_engine(self):
        self._engine_city_link()
        while True:
            self._engine_restaurant_link()
            self._engine_restaurant_info()
            self._engine_restaurant_comments()
            current_time = datetime.datetime.now().strftime('%Y-%m-%d')
            file_dict = {
                setting.FILE_RESTAURANT_LIST: setting.TEMP_RESTAURANT_LIST,
                setting.FILE_RESTAURANT_INFO: setting.TEMP_RESTAURANT_INFO,
                setting.FILE_RESTAURANT_COMMENTS: setting.TEMP_RESTAURANT_COMMENTS
            }
            for f, t in file_dict.items():
                newname = 'qunar{}({}).txt'.format(t[4:-4], current_time)
                if os.path.exists('DATA/{}'.format(f)):
                    temp = self.pipe.pipe_txt_load(filename=t)
                    if temp:
                        # append the new records to the history file, then archive the temp file
                        self.pipe.pipe_txt_save(temp, filename=f, savetype='a')
                        os.rename('DATA/{}'.format(t), 'DATA/{}'.format(newname))
                else:
                    # first run: the temp file becomes the history file
                    os.rename('DATA/{}'.format(t), 'DATA/{}'.format(f))
                    shutil.copy('DATA/{}'.format(f), 'DATA/{}'.format(newname))
                self._engine_push_hdfs(newname)
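The incremental comment crawl above keys everything off pipe_pickle_load / pipe_pickle_save, which persist the per-shop checkpoint (latest comment date keyed by shop id). Those helpers are not part of this listing; a rough sketch, under the assumption that the checkpoint file shares the DATA/ directory with the text files, could be:

import os
import pickle

DATA_DIR = 'DATA'  # assumed location, matching the text-file helpers


def pipe_pickle_save(obj, filename):
    # overwrite the checkpoint file with the current dict
    os.makedirs(DATA_DIR, exist_ok=True)
    with open(os.path.join(DATA_DIR, filename), 'wb') as fp:
        pickle.dump(obj, fp)


def pipe_pickle_load(filename):
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        return {}  # callers treat a missing checkpoint as an empty dict
    with open(path, 'rb') as fp:
        return pickle.load(fp)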