Example #1
def insert_task(queue, limit):
    _count = 0
    # max_retry_times = get_max_retry_times(queue_name=queue)
    for task in get_task_total_simple(queue=queue, limit=limit, used_times=6):
        _count += 1
        # Initialize the worker that may need to be replaced
        changed_worker = None
        if task.source.lower() in hotel_slow_source:
            s = task.source.lower()
            change_worker_dict = hotel_slow_source[s]
            if task.worker in change_worker_dict:
                changed_worker, changed_queue, changed_routine_key = change_worker_dict[
                    task.worker]
        if changed_worker is None:
            app.send_task(task.worker,
                          task_id="[collection: {}][tid: {}]".format(
                              task.collection, task.task_id),
                          kwargs={'task': task},
                          queue=task.queue,
                          routing_key=task.routine_key)
        else:
            # If the task should be routed to the slow queue
            app.send_task(changed_worker,
                          task_id="[collection: {}][tid: {}]".format(
                              task.collection, task.task_id),
                          kwargs={'task': task},
                          queue=changed_queue,
                          routing_key=changed_routine_key)
    logger.info("Insert queue: {0} task count: {1}".format(queue, _count))
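
In Example #1 the rerouting decision hinges entirely on `hotel_slow_source`. Judging from how it is unpacked above, it is assumed to be a nested dict keyed by lower-cased source name and then by the original worker path, with each value a `(worker, queue, routing_key)` triple; the snippet below is only a sketch of that assumed shape, with placeholder names.

# A minimal sketch of the assumed hotel_slow_source layout; the source name,
# slow worker path and queue names below are placeholders, not real configuration.
hotel_slow_source = {
    'example_source': {
        'proj.hotel_tasks.hotel_base_data': (
            'proj.hotel_tasks.hotel_base_data_slow',  # changed_worker
            'hotel_base_data_slow',                   # changed_queue
            'hotel_base_data_slow',                   # changed_routine_key
        ),
    },
}
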
Example #2
def list_page_task(self, ctx, city_id, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'HotelList'
    with MySession() as session:
        try:
            session.headers.update(ctx['headers'])
            resp = session.get(ctx['url'])
            jq = PyQuery(resp.text)

            # Crawl the detail pages
            doc_a_href = jq(".property_title")

            for each in doc_a_href.items():
                # Detail page id
                detail_id = each.attr("id").split('_')[-1]
                # Detail page url
                detail_url = urlparse.urljoin(resp.url, each.attr("href"))
                collections.save({
                    'city_id': city_id,
                    'source_id': detail_id,
                    'source_url': detail_url,
                    'task_id': kwargs['task_id'],
                    'page_index': ctx['page_index']
                })

            # Crawl the following pages; skip this step unless this is the first page
            if ctx['page_index'] == 0:
                total_page = jq(".pageNum.last").attr("data-page-number")
                for i in range(1, int(total_page) + 1):
                    # Build the crawl context from the source site's city_id
                    ctx = init_header(ctx['source_city_id'], i)

                    # Dispatch the async task
                    app.send_task('proj.tripadvisor_list_tasks.list_page_task',
                                  args=(
                                      ctx,
                                      city_id,
                                  ),
                                  kwargs=kwargs,
                                  queue='tripadvisor_list_tasks',
                                  routing_key='tripadvisor_list_tasks')

            update_task(kwargs['task_id'])
        except Exception as exc:
            session.update_proxy('23')
            self.retry(exc=exc)
Example #3
            add_target(args['url'],
                       args['mid'],
                       args['special_str'],
                       task_id=task_id)
            _count += 1

        # todo hotel list init by Task
        if worker == 'hotel_list':
            # hotel_list_task.delay(args['source'], args['city_id'], args['part'], task_id=task_id)

            kwargs = {}
            app.send_task('proj.hotel_list_task.hotel_list_task',
                          args=(
                              args['source'],
                              args['city_id'],
                              args['check_in'],
                              args['part'],
                          ),
                          kwargs={'task_id': task_id},
                          queue='hotel_list_task',
                          routing_key='hotel_list_task')
            _count += 1

        # todo hotel base data init by Task
        if worker == 'hotel_base_data':
            # hotel_base_data.delay(args['source'], args['hotel_url'], args['other_info'], args['part'], task_id=task_id)
            kwargs = {}
            app.send_task('proj.hotel_tasks.hotel_base_data',
                          args=(
                              args['source'],
                              args['hotel_url'],
                              args['other_info'],
Example #4
def full_site_spider(self, url, level, parent_url, parent_info, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'WholeSiteCrawl'
    with MySession() as session:
        try:
            page = session.get(url)
            if ('text/html' in page.headers['Content-type']) or (
                    'text/plain' in page.headers['Content-type']):
                # Parse the page
                img_url_set, pdf_url_set, next_url_set = full_website_parser(
                    page.text, url)

                # Record the crawled page url
                urlSaver.add_url(parent_url, url)

                # Save the crawl result
                save_crawl_result(parent_url,
                                  parent_info,
                                  level,
                                  url,
                                  img_url=list(img_url_set),
                                  pdf_url=list(pdf_url_set),
                                  next_url=list(next_url_set))

                # Dispatch new tasks
                for next_url in next_url_set:
                    if not (urlSaver.has_crawled(parent_url, next_url)
                            or urlSaver.has_crawled('static_data', next_url)
                            or urlSaver.crawled_enough(parent_url)):
                        if level < MAX_LEVEL - 1:
                            # Mark the url as crawled at dispatch time, so the queue does not grow exponentially during the gap before the task runs
                            urlSaver.add_url(parent_url, next_url)

                            # full_site_spider.delay(next_url, level + 1, parent_url, parent_info, **kwargs)
                            app.send_task(
                                'proj.full_website_spider_task.full_site_spider',
                                args=(
                                    next_url,
                                    level + 1,
                                    parent_url,
                                    parent_info,
                                ),
                                kwargs=kwargs,
                                queue='full_site_task',
                                routing_key='full_site_task')
            elif 'image' in page.headers['Content-type']:
                # The page itself does not say whether it is an image; detect images via Content-type and save them
                urlSaver.add_url(parent_url, url)
                save_crawl_result(parent_url,
                                  parent_info,
                                  level,
                                  url,
                                  img_url=[
                                      url,
                                  ])

            elif 'application/pdf' in page.headers['Content-type']:
                # The page itself does not say whether it is a pdf; detect pdfs via Content-type and save them
                urlSaver.add_url(parent_url, url)
                save_crawl_result(parent_url,
                                  parent_info,
                                  level,
                                  url,
                                  pdf_url=[
                                      url,
                                  ])
            else:
                # Add non-html pages (static files such as mp3, rar, etc.) to the set so they are not crawled again, and record the crawled url
                urlSaver.add_url('static_data', url)
                save_crawl_result(parent_url,
                                  parent_info,
                                  level,
                                  url,
                                  unknown_static_file=[
                                      url,
                                  ])

        except Exception as exc:
            session.update_proxy('23')
            self.retry(exc=exc)
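
Example #4's dispatch guard relies on three `urlSaver` operations: `add_url`, `has_crawled` and `crawled_enough`. The sketch below shows one way that interface could be backed by a Redis set per parent_url; the Redis key scheme and the size cap are illustrative assumptions, not the project's actual implementation.

# Illustrative urlSaver backed by Redis sets; key names and the cap are assumptions.
import redis


class UrlSaver(object):
    def __init__(self, host='localhost', port=6379, max_urls=100000):
        self.r = redis.StrictRedis(host=host, port=port)
        self.max_urls = max_urls

    def add_url(self, parent_url, url):
        # Record url as already dispatched/crawled under its parent site.
        self.r.sadd('crawled:{0}'.format(parent_url), url)

    def has_crawled(self, parent_url, url):
        return self.r.sismember('crawled:{0}'.format(parent_url), url)

    def crawled_enough(self, parent_url):
        # Stop dispatching once the per-site set grows past the cap.
        return self.r.scard('crawled:{0}'.format(parent_url)) >= self.max_urls


urlSaver = UrlSaver()
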
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/7/15 11:00 PM
# @Author  : Hou Rong
# @Site    :
# @File    : spider_init_by_mongo.py
# @Software: PyCharm
import os
from proj.celery import app
from proj.my_lib.task_module.mongo_task_func import get_task_total

if __name__ == '__main__':
    _count = 0
    for mongo_task_id, args in get_task_total(50000):
        _count += 1
        t_id = app.send_task('proj.file_downloader_task.file_downloader',
                             args=(
                                 args['source_url'],
                                 args['type'],
                                 os.path.join(
                                     '/data/nfs/image/hotel_whole_site',
                                     args['mid']),
                             ),
                             kwargs={'mongo_task_id': mongo_task_id},
                             queue='file_downloader',
                             routing_key='file_downloader')
    print _count
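
`get_task_total` comes from `proj.my_lib.task_module.mongo_task_func` and yields `(mongo_task_id, args)` pairs; the script above only relies on `args` carrying `source_url`, `type` and `mid`. One possible shape of that generator, reading pending documents from a Mongo collection, is sketched below; the collection name and the filter are guesses, not the real implementation.

# Possible shape of get_task_total; the 'Task'/'FileDownload' collection name
# and the {'finished': False} filter are illustrative guesses.
import pymongo

client = pymongo.MongoClient(host='10.10.231.105')
task_collection = client['Task']['FileDownload']


def get_task_total(limit):
    # Yield (mongo_task_id, args) pairs for at most `limit` pending tasks.
    for doc in task_collection.find({'finished': False}).limit(limit):
        yield doc['_id'], {
            'source_url': doc['source_url'],
            'type': doc['type'],
            'mid': doc['mid'],
        }
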
import datetime
import pymongo
from proj.celery import app

client = pymongo.MongoClient(host='10.10.231.105')
collections = client['Task']['FullSite']

if __name__ == '__main__':
    kwargs = {}
    for line in collections.find().sort('select_time', 1).limit(500):
        _id = line['_id']
        mid = line['mid']
        website_url = line['website_url']
        collections.update({'_id': _id},
                           {'$set': {
                               'select_time': datetime.datetime.now()
                           }},
                           upsert=False,
                           multi=False)
        print(mid, website_url)
        app.send_task('proj.full_website_spider_task.full_site_spider',
                      args=(
                          website_url,
                          0,
                          website_url,
                          {
                              'id': mid
                          },
                      ),
                      kwargs=kwargs,
                      queue='full_site_task',
                      routing_key='full_site_task')