def hilton_to_database(tid, used_times, source, keyword, extra, spider_tag, need_cache=True):
    """Crawl 'suggest' results for *keyword* with the spider registered under *spider_tag*.

    NOTE(review): another ``hilton_to_database`` with a different signature is
    defined later in this file and shadows this one at import time — confirm
    which definition callers expect and rename one of them.

    :param tid: task id (currently unused here; kept for signature parity).
    :param used_times: retry counter (currently unused here).
    :param source: source name (currently unused here).
    :param keyword: search keyword, becomes ``task.content``.
    :param extra: extra payload forwarded on the task.
    :param spider_tag: key used to look the spider up in the factory.
    :param need_cache: when False, crawl with the non-caching config.
    :return: tuple ``(error_code, suggest_result_list)``.
    """
    task = Task()
    task.content = keyword
    task.extra = extra
    spider = factory.get_spider_by_old_source(spider_tag)
    spider.task = task
    # Single crawl call; only the cache configuration differs.
    config = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['suggest'], cache_config=config)
    logger.info(str(len(spider.result['suggest'])) + ' -- ' + keyword)
    return error_code, spider.result['suggest']
def poidetail_to_database(tid, used_times, source, url, need_cache=True):
    """Crawl a POI detail page with the ``<source>_detail`` spider.

    :param tid: ticket/task id, forwarded to the spider via ``task.ticket_info``.
    :param used_times: retry counter, forwarded likewise.
    :param source: base source name; the spider key is ``source + '_detail'``.
    :param url: POI detail page url, becomes ``task.content``.
    :param need_cache: when False, crawl with the non-caching config.
    :return: tuple ``(error_code, POIdetail_result, page_store_key_list)``.
    """
    task = Task()
    task.content = url
    task.ticket_info = {
        'tid': tid,
        'used_times': used_times
    }
    spider = factory.get_spider_by_old_source(source + '_detail')
    spider.task = task
    config = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['POIdetail'], cache_config=config)
    # Leftover debug prints replaced with logger calls so output is captured
    # by the logging setup instead of going to stdout.
    logger.debug('%s_detail error_code: %s', source, error_code)
    logger.info(str(spider.result['POIdetail']) + ' -- ' + task.content)
    return error_code, spider.result['POIdetail'], spider.page_store_key_list
def hilton_to_database(tid, used_times, source, source_id, city_id, check_in, need_cache=True):
    """Crawl Hilton hotel list/room info for one hotel and check-in date.

    :param tid: ticket/task id, forwarded via ``task.ticket_info``.
    :param used_times: retry counter, forwarded likewise.
    :param source: source name (currently unused; the spider key is fixed).
    :param source_id: hotel id on the source site.
    :param city_id: city id on the source site.
    :param check_in: check-in date string appended to the task content.
    :param need_cache: when False, crawl with the non-caching config.
    :return: tuple ``(error_code, room_result, page_store_key_list)``.
    """
    task = Task()
    # Content layout expected by the hiltonHotel2 spider:
    # 'NULL&<city_id>&<source_id>&2&<check_in>' (the '2' is the occupancy field).
    task.content = 'NULL&{0}&{1}&2&{2}'.format(city_id, source_id, check_in)
    task.ticket_info = {
        'tid': tid,
        'used_times': used_times,
        'room_info': [{"occ": 2, "num": 1}]
    }
    spider = factory.get_spider_by_old_source('hiltonHotel2')
    spider.task = task
    config = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['list', 'room'], cache_config=config)
    # Debug print replaced with a logger call.
    logger.debug('hiltonHotel2 error_code: %s', error_code)
    logger.info(str(spider.result['room']) + ' -- ' + task.content)
    return error_code, spider.result['room'], spider.page_store_key_list
def qyer_list_to_database(tid, used_times, source, city_id, check_in, city_url, need_cache=True):
    """Crawl a qyer city list page with the 'qyerList' spider.

    :param tid: ticket/task id, forwarded via ``task.ticket_info``.
    :param used_times: retry counter, forwarded likewise.
    :param source: source name (currently unused; the spider key is fixed).
    :param city_id: city id (currently unused here; kept for signature parity).
    :param check_in: check-in date (currently unused here).
    :param city_url: city list page url, becomes ``task.content``.
    :param need_cache: when False, crawl with the non-caching config.
    :return: tuple ``(error_code, list_result, page_store_key_list, types_result_num)``.
    """
    task = Task()
    task.content = city_url
    task.ticket_info = {'tid': tid, 'used_times': used_times}
    spider = factory.get_spider_by_old_source('qyerList')
    spider.task = task
    config = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['list'], cache_config=config)
    # Debug print replaced with a logger call.
    logger.debug('qyerList error_code: %s', error_code)
    logger.info(str(spider.result['list']) + ' -- ' + task.content)
    return (error_code, spider.result['list'],
            spider.page_store_key_list, spider.types_result_num)
def hotel_rest_list_task(self, source, url, city_id, **kwargs):
    """Crawl a restaurant list page and merge each row into ``HotelRestList``.

    :param source: crawl source name (stored title-cased on the task).
    :param url: list page url to crawl.
    :param city_id: id of the city the rows belong to.
    :return: True when the crawl succeeded (per-row SQL errors are logged,
             not raised).
    :raises Exception: re-raised after logging when the crawl itself fails.
    """
    try:
        self.task_source = source.title()
        self.task_type = 'DaodaoListInfo'
        logger.info("任务进行中。。。")
        # NOTE(review): called with (source, url) here, while other callers in
        # this file pass a keyword-rich signature — confirm this resolves to
        # the intended helper.
        code, result = hotel_list_database(source, url)
        logger.info("code : %s" % str(code))
        if int(code) != 0:
            logger.info("=======================0=========================\n")
            logger.info(str(code) + ' | ' + str(result))
            logger.info("\n=======================1=========================")
            # BUG FIX: raise an instance with context instead of the bare class.
            raise Exception('crawl failed with code %s' % code)
        self.error_code = str(code)
        for one in result:
            for key, view in one.items():
                rest = HotelRestList()
                rest.source = source
                rest.source_id = int(view['source_id'])
                rest.city_id = int(city_id)
                rest.url = view['view_url']
                rest.name = view['view_name'].strip('\n').strip()
                ss = DBSession_mb4()
                try:
                    ss.merge(rest)
                    ss.commit()
                except Exception:
                    # BUG FIX: roll back the failed transaction so the pooled
                    # session is returned in a clean state.
                    ss.rollback()
                    logger.info(
                        "======================= sql 异常========================="
                    )
                    # BUG FIX: traceback.format_exc() takes no exception
                    # argument (the old call passed `e` as the `limit` param).
                    logger.exception(traceback.format_exc())
                finally:
                    ss.close()
        return True
    except Exception:
        logger.exception('================== 异常 0==================')
        logger.exception(source + ' | ' + str(city_id) + ' | ' + url)
        logger.exception(traceback.format_exc())
        logger.exception('================== 异常 1==================')
        # BUG FIX: bare `raise` preserves the original exception and traceback
        # instead of wrapping it in a new Exception(e).
        raise
def _execute(self, **kwargs):
    """Run a hotel-list crawl task: fetch the list, normalize rows per source,
    persist them to MySQL, and set ``self.task.error_code`` accordingly.

    Reads ``source``, ``city_id``, ``country_id``, ``check_in`` (and optional
    flags) from ``self.task.kwargs``.

    :return: ``(row_count, error_code, task.error_code, task.task_name)`` in
             the normal path, or a 5-tuple ending in the suggest string when
             ``list_more`` is set.
    :raises ServiceStandardError: on empty results, MySQL or Mongo errors.
    """
    source = self.task.kwargs['source']
    city_id = self.task.kwargs['city_id']
    country_id = self.task.kwargs['country_id']
    fla = self.task.kwargs.get('list_more', False)

    @func_time_logger
    def hotel_list_crawl():
        # Only use the page cache on the first attempt (used_times == 0).
        error_code, result, page_store_key = hotel_list_database(
            tid=self.task.task_id,
            used_times=self.task.used_times,
            source=source,
            city_id=city_id,
            check_in=self.task.kwargs['check_in'],
            is_new_type=self.task.kwargs.get('is_new_type', False),
            suggest_type=self.task.kwargs.get('suggest_type', '1'),
            suggest=self.task.kwargs.get('suggest', ''),
            need_cache=self.task.used_times == 0,
            flag=fla)
        return error_code, result, page_store_key

    error_code, result, page_store_key = hotel_list_crawl()
    # TODO(review): leftover debug print — consider routing through the logger.
    print(result)

    # more_list: persist the filter collection and return early.
    if fla:
        for line in result['filter']:
            line['country_id'] = country_id
            line['source'] = source
        # BUG FIX: insert_many raises on an empty document list — guard it.
        # An empty filter list still falls through to the error handling below.
        if result['filter']:
            filter_collections.insert_many(result['filter'])
        if len(result['filter']) > 0:
            self.task.error_code = 0
        elif int(error_code) == 0:
            raise ServiceStandardError(ServiceStandardError.EMPTY_TICKET)
        else:
            raise ServiceStandardError(error_code=error_code)
        return (result, error_code, self.task.error_code,
                self.task.task_name, self.task.kwargs['suggest'])

    if source == 'starwood' and error_code == 29:
        self.task.error_code = 109
        error_code = 109
    else:
        self.task.error_code = error_code

    # Normalize each source's row layout into
    # (source, source_id, city_id, country_id, hotel_url).
    res_data = []
    if source in ('ctrip', 'ctripcn', 'starwood', 'gha'):
        for line in result['hotel']:
            sid = line[3]
            hotel_url = line[-1]
            res_data.append((source, sid, city_id, country_id, hotel_url))
    # BUG FIX: `source in ('bestwest')` was a substring test against the
    # string 'bestwest' (parentheses without a comma are not a tuple);
    # same for 'fourseasons' and 'hyatt' below.
    elif source == 'bestwest':
        # NOTE(review): this loop rebinds the outer `city_id` with the
        # per-row value — preserved as-is; confirm it is intentional.
        for sr, sid, city_id, hotel_url in result['hotel']:
            res_data.append((source, sid, city_id, country_id, hotel_url))
    elif source == 'fourseasons':
        for line in result['hotel']:
            sid = line[-1]
            hotel_url = line[0]
            res_data.append((source, sid, city_id, country_id, hotel_url))
    elif source == 'hyatt':
        for line in result['hotel']:
            sid = line[-1]
            hotel_url = line[1]
            res_data.append((source, sid, city_id, country_id, hotel_url))
    elif source == 'hilton':
        for dict_obj in result['hotel']:
            # BUG FIX: dict.values() is a view and not subscriptable on
            # Python 3 — materialize it before indexing.
            line = list(dict_obj.values())
            res_data.append(
                (source, line[2], city_id, country_id, line[0]))
    else:
        for sid, hotel_url in result['hotel']:
            res_data.append((source, sid, city_id, country_id, hotel_url))

    @func_time_logger
    def hotel_list_insert_db():
        """Bulk-insert the normalized rows into the task's MySQL table."""
        try:
            service_platform_conn = service_platform_pool.connection()
            cursor = service_platform_conn.cursor()
            sql = "INSERT IGNORE INTO {} (source, source_id, city_id, country_id, hotel_url) VALUES (%s,%s,%s,%s,%s)".format(
                self.task.task_name)
            _res = cursor.executemany(sql, res_data)
            service_platform_conn.commit()
            cursor.close()
            service_platform_conn.close()
            self.task.list_task_insert_db_count = _res
            self.task.get_data_per_times = len(res_data)
        except Exception as e:
            self.logger.exception(msg="[mysql error]", exc_info=e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

    hotel_list_insert_db()

    try:
        data_collections = mongo_data_client['ServicePlatform'][
            self.task.task_name]
        data_collections.create_index([('source', 1), ('source_id', 1),
                                       ('city_id', 1)],
                                      unique=True,
                                      background=True)
        data = []
        # NOTE(review): `if data:` right after `data = []` is always False,
        # so this Mongo mirror insert never runs — it looks deliberately
        # disabled; confirm before re-enabling (behavior preserved here).
        if data:
            for line in res_data:
                data.append({
                    'list_task_token': self.task.list_task_token,
                    'task_id': self.task.task_id,
                    'source': line[0],
                    'source_id': line[1],
                    'city_id': line[2],
                    'country_id': line[3],
                    'hotel_url': line[4]
                })
            data_collections.insert(data, continue_on_error=True)
    except pymongo.errors.DuplicateKeyError:
        logger.info("[Duplicate Key]")
    except Exception as exc:
        raise ServiceStandardError(
            error_code=ServiceStandardError.MONGO_ERROR,
            wrapped_exception=exc)

    # All hard failures above are raised, so reaching this point means the
    # returned data is valid and has been persisted. Use res_data to decide
    # the final error code: rows present → success; otherwise distinguish an
    # "empty but successful" crawl from a crawl-level error.
    if len(res_data) > 0:
        self.task.error_code = 0
    elif int(error_code) == 0:
        raise ServiceStandardError(ServiceStandardError.EMPTY_TICKET)
    else:
        raise ServiceStandardError(error_code=error_code)
    return len(
        res_data), error_code, self.task.error_code, self.task.task_name