def send_result_detail_task(source, tasks, detail_tables, priority):
    """Fan hotel-detail tasks out, grouped per (task table, source) pair.

    Each row of ``tasks`` is ``(hid, source_list_json, timestamp)`` where the
    JSON maps a source name to its hotel url.  Sources missing from
    ``detail_tables`` are skipped, as are placeholder urls and
    tripadvisor.cn urls (those go through the daodao filter path instead).
    Returns the timestamp of the last row iterated, or ``None`` when
    ``tasks`` is empty.
    """
    timestamp = None
    if not tasks:
        return timestamp

    # Regroup flat rows into (task_table, source) -> [(hid, url, ts), ...]
    grouped = defaultdict(list)
    for hid, source_list, timestamp in tasks:
        for src, hotel_url in json.loads(source_list).items():
            table = detail_tables.get(src)
            if not table:
                continue
            grouped[(table, src)].append((hid, hotel_url, timestamp))

    placeholder_urls = ('null', '{}', None, 'http://', '', 'https://')
    for (table, src), rows in grouped.items():
        with InsertTask(worker='proj.total_tasks.hotel_detail_task',
                        queue='hotel_detail',
                        routine_key='hotel_detail',
                        task_name=table,
                        source=src.title(),
                        _type='Hotel',
                        priority=priority) as it:
            for hid, hotel_url, timestamp in rows:
                if hotel_url in placeholder_urls:
                    continue
                # tripadvisor.cn urls are not queued here
                if hotel_url.strip().startswith('https://www.tripadvisor.cn'):
                    continue
                it.insert_task({
                    'source': src,
                    'url': hotel_url,
                    'part': table,
                    'source_id': 'NULL',
                    'hid': hid,
                    'city_id': 'NULL',
                    'country_id': 'NULL'
                })
    return timestamp
def qyer_supplement_map_info(tasks):
    """Queue 'supplement map info' tasks for qyer rows.

    Each row of ``tasks`` is ``(table_name, type, source, sid, other_info,
    status, utime)``.  Returns the ``utime`` of the last row processed, or
    ``None`` when ``tasks`` is empty.
    """
    utime = None
    if not tasks:
        return utime
    # NOTE(review): rows unpack as (table_name, type, source, ...), so
    # tasks[0][0] is the *table name*, not the source column (contrast
    # send_hotel_detail_task, where column 0 really is the source).  It looks
    # like this should be tasks[0][2] -- confirm before changing, since
    # existing queues may already key on the current value.
    source = tasks[0][0]
    with InsertTask(worker='proj.total_tasks.supplement_map_info',
                    queue='supplement_field',
                    routine_key='supplement_field',
                    task_name='supplement_field',
                    source=source.title(),
                    _type='SupplementField',
                    priority=3) as it:
        # `type` renamed to `_type` to avoid shadowing the builtin; the dead
        # `_count` counter (incremented but never read) has been removed.
        for table_name, _type, source, sid, other_info, status, utime in tasks:
            it.insert_task({
                'table_name': table_name,
                'source': source,
                'sid': sid,
                'other_info': other_info,
            })
    return utime
def send_qyer_detail_task(tasks, task_tag, priority):
    """Queue qyer poi-detail tasks.

    Each row of ``tasks`` is ``(source, source_id, city_id, qyer_url, utime)``;
    only the url is forwarded — city_id is sent as the literal 'NULL'
    placeholder.  Returns the ``utime`` of the last row, or ``None`` when
    ``tasks`` is empty.
    """
    utime = None
    with InsertTask(worker='proj.total_tasks.qyer_detail_task',
                    queue='poi_detail',
                    routine_key='poi_detail',
                    task_name=task_tag,
                    source='Qyer',
                    _type='QyerDetail',
                    priority=priority) as it:
        for _source, _source_id, _city_id, qyer_url, utime in tasks:
            it.insert_task({
                'target_url': qyer_url,
                'city_id': 'NULL',
                'part': task_tag
            })
    return utime
def send_GT_detail_task(tasks, task_tag, priority):
    """Queue group-travel detail tasks.

    Each row of ``tasks`` is ``(source, source_id, city_id, country_id, url,
    utime)`` where ``url`` is itself a JSON blob of task arguments; it is
    enriched with the destination and source before queueing.  Returns the
    ``utime`` of the last row, or ``None`` when ``tasks`` is empty.
    """
    utime = None
    with InsertTask(worker='proj.total_tasks.GT_detail_task',
                    queue='grouptravel',
                    routine_key='grouptravel',
                    task_name=task_tag,
                    source='GT',
                    _type='GTDetail',
                    priority=priority) as it:
        for source, _source_id, _city_id, country_id, url, utime in tasks:
            payload = json.loads(url)
            payload['dest_id'] = country_id
            payload['source'] = source
            it.insert_task(payload)
    return utime
def send_ctripPoi_detail_task(tasks, task_tag, priority):
    """Queue ctrip poi-detail tasks.

    Each row of ``tasks`` is ``(source, source_id, city_id, country_id, url,
    utime)``.  Returns the ``utime`` of the last row, or ``None`` when
    ``tasks`` is empty.
    """
    utime = None
    with InsertTask(worker='proj.total_tasks.ctrip_poi_detail_task',
                    queue='poi_detail',
                    routine_key='poi_detail',
                    task_name=task_tag,
                    source='CtripPoi',
                    _type='CtripPoiDetail',
                    priority=priority) as it:
        for source, source_id, city_id, country_id, url, utime in tasks:
            # NOTE(review): country_id travels under the 'tag' key -- looks
            # deliberate, but verify against the downstream worker's schema.
            it.insert_task({
                'source': source,
                'poi_id': source_id,
                'tag': country_id,
                'city_id': city_id,
                'detail_url': url
            })
    return utime
def send_poi_detail_task(tasks, task_tag, priority):
    """Queue daodao poi-detail tasks.

    ``task_tag`` must contain exactly four '_'-separated fields; only the
    second one (the poi type) is used in the payload.  Each row of ``tasks``
    is ``(source, source_id, city_id, url, utime)``.  Returns the ``utime``
    of the last row, or ``None`` when ``tasks`` is empty.
    """
    utime = None
    _typ1, poi_type, _src, _tag = task_tag.split('_')
    with InsertTask(worker='proj.total_tasks.poi_detail_task',
                    queue='poi_detail',
                    routine_key='poi_detail',
                    task_name=task_tag,
                    source='Daodao',
                    _type='DaodaoDetail',
                    priority=priority) as it:
        for _source, _source_id, _city_id, poi_url, utime in tasks:
            it.insert_task({
                'target_url': poi_url,
                'city_id': 'NULL',
                'poi_type': poi_type,
                'country_id': 'NULL',
                'part': task_tag
            })
    return utime
def send_hotel_detail_task(tasks, task_tag, priority):
    """Queue hotel-detail tasks.

    Each row of ``tasks`` is ``(source, source_id, city_id, hotel_url,
    timestamp)``; the InsertTask source label comes from the first row.
    Returns the timestamp of the last row, or ``None`` when ``tasks`` is
    empty.
    """
    timestamp = None
    if not tasks:
        return timestamp
    first_source = tasks[0][0]
    with InsertTask(worker='proj.total_tasks.hotel_detail_task',
                    queue='hotel_detail',
                    routine_key='hotel_detail',
                    task_name=task_tag,
                    source=first_source.title(),
                    _type='Hotel',
                    priority=priority) as it:
        for source, source_id, _city_id, hotel_url, timestamp in tasks:
            it.insert_task({
                'source': source,
                'url': hotel_url,
                'part': task_tag,
                'source_id': source_id,
                'city_id': 'NULL',
                'country_id': 'NULL'
            })
    return timestamp
def send_result_daodao_filter(source, tasks, daodao_filter_table, priority):
    """Queue daodao URL-filter tasks for tripadvisor.cn hotel urls.

    Each row of ``tasks`` is ``(id, source_list_json, timestamp)``.  Only
    urls from the whitelisted sources that point at tripadvisor.cn are
    queued.  Returns the timestamp of the last row, or ``None`` when
    ``tasks`` is empty.
    """
    timestamp = None
    if not tasks:
        return timestamp
    # derived task name: append 'f' and swap 'list' for 'detail'
    task_name = (daodao_filter_table + 'f').replace('list', 'detail')
    allowed_sources = ('agoda', 'booking', 'ctrip', 'elong', 'hotels')
    placeholder_urls = ('null', '{}', None, 'http://', '', 'https://')
    with InsertTask(worker='proj.total_tasks.result_daodao_filter', queue='hotel_detail',
                    routine_key='hotel_detail', task_name=task_name, source=source.title(),
                    _type='daodaoURLFilter', task_type=TaskType.NORMAL,
                    priority=priority) as it:
        for hid, source_list, timestamp in tasks:
            for _source, hotel_url in json.loads(source_list).items():
                if _source not in allowed_sources:
                    continue
                if hotel_url in placeholder_urls:
                    continue
                url = hotel_url.strip()
                # BUG FIX: strip *before* the prefix test (matching
                # send_result_detail_task) so tripadvisor urls with leading
                # whitespace are no longer silently dropped.
                if not url.startswith('https://www.tripadvisor.cn'):
                    continue
                it.insert_task({
                    'url': url,
                    'source': _source,
                    'id': hid,
                    'table_name': daodao_filter_table,
                    # 'date_index': 0
                })
    return timestamp
def city2list():
    """Re-queue unfinished 'city' queue entries as 'list' tasks.

    Scans every Mongo collection named ``City_Queue_*`` (minus two
    blacklisted ones), derives a ``list_`` task name from the stored
    ``city_`` task name, and for each unfinished entry (``finished == 0``)
    queues a list task for the entry's next check-in date, then increments
    its ``date_index``.  An entry is marked finished when enough distinct
    task ids have come back (MAX_TASK_PER_CITY) or when the last
    FINISHED_ZERO_COUNT successful results all carried a zero count.
    """
    # aaa = str([str(collection_name) for collection_name in db.collection_names() if str(collection_name).startswith('City_Queue_')])
    for collection_name in db.collection_names():
        if not str(collection_name).startswith('City_Queue_'):
            continue
        # two collections explicitly excluded from processing
        if collection_name in ('City_Queue_grouptravel_TaskName_city_total_GT_20180312a',
                               'City_Queue_grouptravel_TaskName_city_total_GT_20180314a'):
            continue
        # if not collection_name.endswith('0416a'):continue
        collections = db[collection_name]
        _count = 0
        # Fetch one representative document first to initialise the task
        # inserter; the very first document may be malformed, so scan until
        # one with a 'task_name' field is found.
        # NOTE(review): if no document has 'task_name', per_data stays
        # unbound and the per_data['task_name'] line raises NameError --
        # confirm every collection holds at least one well-formed document.
        for each in collections.find({}):
            if 'task_name' in each:
                per_data = copy.deepcopy(each)
                break
        # per_data = collections.find_one()
        task_name = per_data['task_name']
        new_task_name = re.sub('city_', 'list_', task_name)
        create_table(new_task_name)
        logger.info('转换任务名 %s : %s' % (task_name, new_task_name))
        with InsertTask(worker=per_data['worker'], queue=per_data['queue'],
                        routine_key=per_data['routing_key'], task_name=new_task_name,
                        source=per_data['source'], _type=per_data['type'],
                        priority=per_data['priority'], task_type=TaskType.LIST_TASK) as it:
            for line in collections.find({"finished": 0}):
                # Documents were fetched in bulk above; skip any that lack a
                # task_name field.
                if 'task_name' not in line:
                    continue
                if int(line['date_index']) > len(set(list(map(lambda x: x[0], line['data_count'])))):
                    # More dates dispatched than distinct task ids returned:
                    # previously sent tasks have not all come back yet, so
                    # wait before dispatching the next date.
                    continue
                if len(set(list(map(lambda x: x[0], line['data_count'])))) >= MAX_TASK_PER_CITY:
                    # Enough distinct tasks completed for this city -- mark it
                    # finished.
                    # NOTE(review): no 'continue' here, so one more task is
                    # still dispatched below for this entry -- confirm that is
                    # intentional.
                    collections.update({'list_task_token': line['list_task_token']},
                                       {"$set": {"finished": 1}})
                if len(filter(lambda x: x[-1], line['data_count'])) > FINISHED_ZERO_COUNT:
                    # If the last FINISHED_ZERO_COUNT successful results
                    # (sorted by x[1]) all returned a count of 0, treat the
                    # city as exhausted and mark it finished.
                    if all(
                            map(
                                lambda x: int(x[3]) == 0, list(
                                    sorted(
                                        filter(
                                            lambda x: x[-1], line['data_count']
                                        ), key=lambda x: x[1]
                                    )
                                )[-FINISHED_ZERO_COUNT:]
                            )
                    ):
                        # all zero -> accumulation for this city is complete
                        collections.update({'list_task_token': line['list_task_token']},
                                           {"$set": {"finished": 1}})
                        continue
                # if all(map(lambda x: x[3] == 0,
                #            list(filter(lambda x: x[-1], line['data_count']))[-FINISHED_ZERO_COUNT:])):
                #     # all zero -> accumulation for this city is complete
                #     collections.update({'list_task_token': line['list_task_token']}, {"$set": {"finished": 1}})
                #     continue
                _count += 1
                if _count == MAX_CITY_TASK_PER_SEARCH:
                    # stop once the per-search cap on city tasks is reached
                    break
                # basic info: which date (index) to dispatch next
                date_index = line['date_index']
                args = line['args']
                new_date = get_city_date(task_name, date_index)
                args['check_in'] = new_date
                args['date_index'] = date_index
                it.insert_task(args=args)
                # advance this entry to the next date
                collections.update({
                    '_id': line['_id']
                }, {
                    '$inc': {'date_index': 1}
                })
def send_image_task(tasks, task_tag, priority, is_poi_task):
    """Queue image download tasks and record seen url md5s in MySQL.

    For each row ``(source, source_id, city_id, img_items, update_time)``
    the '|'-separated ``img_items`` urls are queued for download, skipping
    urls whose md5(source + source_id + url) is already present in redis
    (the legacy 20171120a/20171122a tags bypass this de-dup).  New md5s are
    batch-inserted into ServicePlatform.crawled_url every 5000 urls, with a
    final flush for the remainder.  Returns the ``update_time`` of the last
    row, or ``None`` when ``tasks`` is empty.
    """
    update_time = None
    if not tasks:
        # BUG FIX: bail out *before* opening the MySQL connection so empty
        # input no longer leaks a connection.
        return update_time

    source = tasks[0][0]
    suffix = task_tag.split('_', 1)[1]

    # bucket name / file prefix depend only on task_tag -- compute once
    # instead of once per url.
    task_type = task_tag.split('_')[1]
    if task_type == 'total':
        bucket_name = 'mioji-attr'
    else:
        bucket_name = "mioji-{}".format(task_type)
    file_prefix = "huantaoyou" if bucket_name == 'mioji-wanle' else ""
    # legacy tags bypass the redis de-dup check
    skip_dedup = '20171122a' in task_tag or '20171120a' in task_tag

    conn = pymysql.connect(host='10.10.228.253', user='******', password='******',
                           charset='utf8', db='ServicePlatform')
    cursor = conn.cursor()
    try:
        _count = 0
        md5_data = []
        with InsertTask(worker='proj.total_tasks.images_task', queue='file_downloader',
                        routine_key='file_downloader', task_name='images_' + suffix,
                        source=source.title(), _type='DownloadImages',
                        priority=priority) as it:
            for source, source_id, city_id, img_items, update_time in tasks:
                if img_items is None:
                    continue
                for url in img_items.split('|'):
                    if not url:
                        continue
                    md5 = hashlib.md5(source + str(source_id) + url).hexdigest()
                    if not skip_dedup:
                        if redis_md5.get(md5):
                            continue
                        redis_md5.set(md5, 1)
                    md5_data.append((md5, datetime.datetime.now()))
                    _count += 1
                    it.insert_task({
                        'source': source,
                        'new_part': task_tag,
                        'target_url': url,
                        'is_poi_task': is_poi_task,
                        'source_id': source_id,
                        'part': task_tag.split('_')[-1],
                        'bucket_name': bucket_name,
                        'file_prefix': file_prefix
                    })
                    if _count % 5000 == 0:
                        # flush a full batch of md5s; 'insert ignore' makes
                        # duplicate rows harmless
                        cursor.executemany(
                            'insert ignore into crawled_url(md5, update_time) values(%s, %s)',
                            args=md5_data)
                        conn.commit()
                        md5_data = []
            if md5_data:
                # flush the final partial batch
                cursor.executemany(
                    'insert ignore into crawled_url(md5, update_time) values(%s, %s)',
                    args=md5_data)
                conn.commit()
    finally:
        # BUG FIX: always release MySQL resources, even when queueing raises
        cursor.close()
        conn.close()
    return update_time