def insert_task(queue, limit):
    _count = 0
    # max_retry_times = get_max_retry_times(queue_name=queue)
    for task in get_task_total_simple(queue=queue, limit=limit, used_times=6):
        _count += 1
        # Worker to switch to; stays None when no switch is configured
        changed_worker = None
        if task.source.lower() in hotel_slow_source:
            s = task.source.lower()
            change_worker_dict = hotel_slow_source[s]
            if task.worker in change_worker_dict:
                changed_worker, changed_queue, changed_routine_key = change_worker_dict[
                    task.worker]
        if changed_worker is None:
            app.send_task(task.worker,
                          task_id="[collection: {}][tid: {}]".format(
                              task.collection, task.task_id),
                          kwargs={'task': task},
                          queue=task.queue,
                          routing_key=task.routine_key)
        else:
            # The source is on the slow list: dispatch to the slow-task worker instead
            app.send_task(changed_worker,
                          task_id="[collection: {}][tid: {}]".format(
                              task.collection, task.task_id),
                          kwargs={'task': task},
                          queue=changed_queue,
                          routing_key=changed_routine_key)
    logger.info("Insert queue: {0} task count: {1}".format(queue, _count))
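# The tuple unpacking above implies hotel_slow_source maps a lower-cased source name
# to {worker: (slow worker, slow queue, slow routing key)}. A minimal sketch of that
# shape; the entries below are illustrative assumptions, not the real config:
#
# hotel_slow_source = {
#     'booking': {
#         'proj.hotel_tasks.hotel_base_data': (
#             'proj.hotel_tasks.hotel_base_data_slow',  # worker to switch to (hypothetical)
#             'hotel_base_data_slow',                   # its queue (hypothetical)
#             'hotel_base_data_slow',                   # its routing key (hypothetical)
#         ),
#     },
# }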
def list_page_task(self, ctx, city_id, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'HotelList'
    with MySession() as session:
        try:
            session.headers.update(ctx['headers'])
            resp = session.get(ctx['url'])
            jq = PyQuery(resp.text)
            # Collect the detail-page links on this list page
            doc_a_href = jq(".property_title")
            for each in doc_a_href.items():
                # Detail page id
                detail_id = each.attr("id").split('_')[-1]
                # Detail page url
                detail_url = urlparse.urljoin(resp.url, each.attr("href"))
                collections.save({
                    'city_id': city_id,
                    'source_id': detail_id,
                    'source_url': detail_url,
                    'task_id': kwargs['task_id'],
                    'page_index': ctx['page_index']
                })
            # Fan out the remaining pages; only the first page does this
            if ctx['page_index'] == 0:
                total_page = jq(".pageNum.last").attr("data-page-number")
                # Keep the remote site's city id before ctx is rebuilt inside the loop
                source_city_id = ctx['source_city_id']
                for i in range(1, int(total_page) + 1):
                    # Build the fetch context with the remote site's city id
                    ctx = init_header(source_city_id, i)
                    # Dispatch the async task
                    app.send_task('proj.tripadvisor_list_tasks.list_page_task',
                                  args=(ctx, city_id,),
                                  kwargs=kwargs,
                                  queue='tripadvisor_list_tasks',
                                  routing_key='tripadvisor_list_tasks')
            update_task(kwargs['task_id'])
        except Exception as exc:
            session.update_proxy('23')
            # retry expects the exception itself; format_exc takes no exception argument
            self.retry(exc=exc)
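# list_page_task reads ctx['url'], ctx['headers'], ctx['page_index'] and
# ctx['source_city_id'], so init_header(source_city_id, page_index) is assumed to
# return a dict of that shape. A hedged sketch; the URL helper and headers are
# illustrative assumptions, not the project's real code:
#
# def init_header(source_city_id, page_index):
#     return {
#         'url': list_page_url(source_city_id, page_index),  # hypothetical helper
#         'headers': {'User-Agent': 'Mozilla/5.0'},
#         'page_index': page_index,
#         'source_city_id': source_city_id,
#     }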
    add_target(args['url'], args['mid'], args['special_str'], task_id=task_id)
    _count += 1

# todo hotel list init by Task
if worker == 'hotel_list':
    # hotel_list_task.delay(args['source'], args['city_id'], args['part'], task_id=task_id)
    kwargs = {}
    app.send_task('proj.hotel_list_task.hotel_list_task',
                  args=(args['source'], args['city_id'], args['check_in'], args['part'],),
                  kwargs={'task_id': task_id},
                  queue='hotel_list_task',
                  routing_key='hotel_list_task')
    _count += 1

# todo hotel base data init by Task
if worker == 'hotel_base_data':
    # hotel_base_data.delay(args['source'], args['hotel_url'], args['other_info'], args['part'], task_id=task_id)
    kwargs = {}
    app.send_task('proj.hotel_tasks.hotel_base_data',
                  args=(args['source'], args['hotel_url'], args['other_info'], args['part'],),
                  kwargs={'task_id': task_id},
                  # the snippet was cut off mid-call; args/kwargs are completed from the
                  # .delay call commented above, while queue and routing_key are
                  # assumptions by analogy with the hotel_list branch
                  queue='hotel_base_data',
                  routing_key='hotel_base_data')
def full_site_spider(self, url, level, parent_url, parent_info, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'WholeSiteCrawl'
    with MySession() as session:
        try:
            page = session.get(url)
            content_type = page.headers.get('Content-type', '')
            if ('text/html' in content_type) or ('text/plain' in content_type):
                # Parse out image, pdf and follow-up links
                img_url_set, pdf_url_set, next_url_set = full_website_parser(
                    page.text, url)
                # Record the crawled page url
                urlSaver.add_url(parent_url, url)
                # Save the crawl result
                save_crawl_result(parent_url, parent_info, level, url,
                                  img_url=list(img_url_set),
                                  pdf_url=list(pdf_url_set),
                                  next_url=list(next_url_set))
                # Dispatch new tasks
                for next_url in next_url_set:
                    if not (urlSaver.has_crawled(parent_url, next_url)
                            or urlSaver.has_crawled('static_data', next_url)
                            or urlSaver.crawled_enough(parent_url)):
                        if level < MAX_LEVEL - 1:
                            # Mark the url as crawled at dispatch time; the delay before
                            # the task actually runs would otherwise let the number of
                            # queued tasks grow exponentially
                            urlSaver.add_url(parent_url, next_url)
                            # full_site_spider.delay(next_url, level + 1, parent_url, parent_info, **kwargs)
                            app.send_task(
                                'proj.full_website_spider_task.full_site_spider',
                                args=(next_url, level + 1, parent_url, parent_info,),
                                kwargs=kwargs,
                                queue='full_site_task',
                                routing_key='full_site_task')
            elif 'image' in content_type:
                # The page body alone does not reveal that this is an image; detect it
                # via Content-type and save it
                urlSaver.add_url(parent_url, url)
                save_crawl_result(parent_url, parent_info, level, url,
                                  img_url=[url])
            elif 'application/pdf' in content_type:
                # Likewise, detect pdf responses via Content-type and save them
                urlSaver.add_url(parent_url, url)
                save_crawl_result(parent_url, parent_info, level, url,
                                  pdf_url=[url])
            else:
                # Put other non-html responses (e.g. mp3, rar) into the 'static_data'
                # set so they are not fetched again, and record the url
                urlSaver.add_url('static_data', url)
                save_crawl_result(parent_url, parent_info, level, url,
                                  unknown_static_file=[url])
        except Exception as exc:
            session.update_proxy('23')
            # retry expects the exception itself; format_exc takes no exception argument
            self.retry(exc=exc)
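# full_site_spider leans on three urlSaver operations: add_url(site, url),
# has_crawled(site, url) and crawled_enough(site). A minimal Redis-set-backed sketch
# of that contract; the class name, key prefix and per-site limit are assumptions,
# not the project's real implementation:
#
# import redis
#
# class UrlSaver(object):
#     def __init__(self, max_urls=100000):
#         self.r = redis.StrictRedis()
#         self.max_urls = max_urls
#
#     def add_url(self, site, url):
#         # remember that `url` was (or is about to be) crawled for `site`
#         self.r.sadd('crawled:' + site, url)
#
#     def has_crawled(self, site, url):
#         return self.r.sismember('crawled:' + site, url)
#
#     def crawled_enough(self, site):
#         # stop fanning out once the per-site budget is exhausted
#         return self.r.scard('crawled:' + site) >= self.max_urls
#
# urlSaver = UrlSaver()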
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/7/15 11:00 PM
# @Author  : Hou Rong
# @Site    :
# @File    : spider_init_by_mongo.py
# @Software: PyCharm
import os

from proj.celery import app
from proj.my_lib.task_module.mongo_task_func import get_task_total

if __name__ == '__main__':
    _count = 0
    for mongo_task_id, args in get_task_total(50000):
        _count += 1
        t_id = app.send_task('proj.file_downloader_task.file_downloader',
                             args=(args['source_url'],
                                   args['type'],
                                   os.path.join('/data/nfs/image/hotel_whole_site',
                                                args['mid']),),
                             kwargs={'mongo_task_id': mongo_task_id},
                             queue='file_downloader',
                             routing_key='file_downloader')
    # total number of dispatched file_downloader tasks
    print _count
import datetime

import pymongo

from proj.celery import app

client = pymongo.MongoClient(host='10.10.231.105')
collections = client['Task']['FullSite']

if __name__ == '__main__':
    kwargs = {}
    # Pick the 500 least recently selected sites and seed a level-0 full-site crawl
    for line in collections.find().sort('select_time', 1).limit(500):
        _id = line['_id']
        mid = line['mid']
        website_url = line['website_url']
        # Touch select_time so the same site is not picked again on the next run
        collections.update({'_id': _id},
                           {'$set': {'select_time': datetime.datetime.now()}},
                           upsert=False, multi=False)
        print(mid, website_url)
        app.send_task('proj.full_website_spider_task.full_site_spider',
                      args=(website_url, 0, website_url, {'id': mid},),
                      kwargs=kwargs,
                      queue='full_site_task',
                      routing_key='full_site_task')