Example #1
def schedule():
    global filtered_newtasks
    while True:
        try:
            # Back-pressure gate: only pull new work while the parse,
            # pipeline, and schedule queues are below twice the task
            # limit and the request queue is below the limit itself.
            if (get_message_queue_size('parse') <
                    config.max_task_queue_size * 2
                    and get_message_queue_size('pipeline') <
                    config.max_task_queue_size * 2
                    and get_message_queue_size('schedule') <
                    config.max_task_queue_size * 2
                    and get_message_queue_size('request') <
                    config.max_task_queue_size):
                for crawler in celeryconfig.crawlers:
                    # (an older path dispatched straight from each crawler's
                    # new_tasks_generator(); superseded by the shared
                    # filtered-task pool below)
                    with scheduel_lock:
                        db = tasks[
                            crawler['name'] +
                            '.schedule'].scheduler.get_redis_connection()
                        # Pull one batch of filtered tasks from the shared
                        # Redis pool; fall back to the in-process buffer
                        # when the pool is empty.
                        new_tasks = db.lpop(config.filtered_task_pool_key)
                        if new_tasks:
                            new_tasks = json.loads(new_tasks)
                        else:
                            new_tasks = filtered_newtasks
                            filtered_newtasks = []

                        for task in new_tasks:
                            if task.get('priority', None):
                                #tasks[crawler['name']+'.request_priority'].delay(task)
                                tasks[crawler['name'] +
                                      '.request_priority'].apply_async(
                                          (task, ), compression='zlib')
                            else:
                                #tasks[crawler['name']+'.request'].delay(task)
                                tasks[crawler['name'] +
                                      '.request'].apply_async(
                                          (task, ), compression='zlib')
            else:
                # Queues are saturated; yield to other greenlets for a second.
                gevent.sleep(1)
        except Exception as exc:
            logging.exception(exc)
            # Persist each crawler's dedup filter before re-raising so
            # visited-URL state survives the crash.
            for crawler in celeryconfig.crawlers:
                tasks[crawler['name'] + '.schedule'].scheduler.save_filter()
            raise
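
Example #1 assumes module-level imports (json, logging, gevent) and globals (config, celeryconfig, tasks, scheduel_lock, filtered_newtasks) defined elsewhere in the source module. The gate at the top of the loop also relies on a get_message_queue_size() helper that is not shown; a minimal sketch of one way to implement it, assuming the Celery broker is Redis and each queue's pending messages sit in a Redis list keyed by the queue name (the connection settings below are placeholders, not from the source):

import redis

# Placeholder connection settings; the real module would read these
# from config.
_broker = redis.Redis(host='localhost', port=6379, db=0)

def get_message_queue_size(queue_name):
    # With Celery's default Redis broker layout, LLEN on the list named
    # after the queue reports how many messages are waiting.
    return _broker.llen(queue_name)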
Example #2
def schedule():
    global filtered_newtasks
    while True:
        try:
            if (get_message_queue_size('parse') < config.max_task_queue_size * 2 and 
                get_message_queue_size('pipeline') < config.max_task_queue_size * 2 and
                get_message_queue_size('schedule') < config.max_task_queue_size * 2 and
                get_message_queue_size('request') < config.max_task_queue_size):
                for crawler in celeryconfig.crawlers:
                    # (older direct dispatch from new_tasks_generator(),
                    # superseded by the shared filtered-task pool below)
                    with scheduel_lock:
                        db = tasks[crawler['name']+'.schedule'].scheduler.get_redis_connection()
                        new_tasks = db.lpop(config.filtered_task_pool_key)
                        if new_tasks:
                            new_tasks = json.loads(new_tasks)
                        else:
                            new_tasks = filtered_newtasks
                            filtered_newtasks = []

                        for task in new_tasks:
                            if task.get('priority', None):
                                #tasks[crawler['name']+'.request_priority'].delay(task)
                                tasks[crawler['name']+'.request_priority'].apply_async((task, ), compression='zlib')
                            else:
                                #tasks[crawler['name']+'.request'].delay(task)
                                tasks[crawler['name']+'.request'].apply_async((task, ), compression='zlib')
            else:
                gevent.sleep(1)
        except Exception as exc:
            logging.exception(exc)
            for crawler in celeryconfig.crawlers:
                tasks[crawler['name']+'.schedule'].scheduler.save_filter()
            raise
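
Example #2 is the same loop before reformatting. For context, the pool it drains with lpop must be filled by a matching producer elsewhere; a hedged sketch of that side (push_filtered_tasks is a hypothetical helper, not part of the source):

import json

def push_filtered_tasks(db, batch):
    # Hypothetical producer for config.filtered_task_pool_key: serialize
    # a batch of task dicts and append it to the shared Redis list that
    # schedule() above lpops and json.loads.
    db.rpush(config.filtered_task_pool_key, json.dumps(batch))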
Example #3
    def schedule(self, tasks=None, check_task=False, save_filter=False):
        if save_filter:
            self.scheduler.save_filter()
            return

        # Initial tasks: seed the request queues from the scheduler's
        # init generator.
        if not tasks and not check_task:
            for task in self.scheduler.init_generator():
                if task.get('priority', None):
                    #app.tasks[self.crawler_name+'.request_priority'].delay(task)
                    app.tasks[self.crawler_name +
                              '.request_priority'].apply_async(
                                  (task, ), compression='zlib')
                else:
                    #app.tasks[self.crawler_name+'.request'].delay(task)
                    app.tasks[self.crawler_name + '.request'].apply_async(
                        (task, ), compression='zlib')
                #group(app.tasks[self.crawler_name + '.request'].s(task)
                #        | app.tasks[self.crawler_name + '.parse'].s()
                #        | app.tasks[self.crawler_name + '.pipeline'].s()
                #        ).delay()
            #app.tasks[self.crawler_name+'.schedule'].apply_async(
            #        args=[],
            #        kwargs = {
            #            'check_task': True,
            #            },
            #        eta=datetime.datetime.now())
        # Add a batch of new tasks; invoked via task.apply().
        elif tasks and not check_task:
            self.scheduler.add_new_task(tasks)
            #app.tasks[self.crawler_name+'.new_task'].delay(task)
        # Check for newly discovered tasks and move them to the request queue.
        elif check_task:
            timedelta = config.new_task_check_interval
            if (get_message_queue_size('request') <
                    config.max_task_queue_size):
                for task in self.scheduler.new_tasks_generator():
                    if task.get('priority', None):
                        app.tasks[self.crawler_name +
                                  '.request_priority'].delay(task)
                    else:
                        app.tasks[self.crawler_name + '.request'].delay(task)
                timedelta = 1
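
The check_task branch sets timedelta to 1 second when new tasks were found and to config.new_task_check_interval otherwise, and the commented-out block above hints that the task re-queues itself with apply_async and an eta. A hedged sketch of that self-rescheduling pattern using Celery's countdown argument (the demo app, task name, and broker URL are placeholders):

from celery import Celery

app = Celery('demo', broker='redis://localhost:6379/0')

@app.task(name='demo.schedule')
def schedule_check(interval=60):
    # ... drain new tasks into the request queue, then pick the next
    # interval the same way the snippet above picks timedelta ...
    # Re-queue this check to run again after `interval` seconds.
    schedule_check.apply_async(kwargs={'interval': interval},
                               countdown=interval)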
Example #4
    def schedule(self, tasks=None, check_task=False, save_filter=False):
        if save_filter:
            self.scheduler.save_filter()
            return

        # Initial tasks:
        if not tasks and not check_task:
            for task in self.scheduler.init_generator():
                if task.get('priority', None):
                    #app.tasks[self.crawler_name+'.request_priority'].delay(task)
                    app.tasks[self.crawler_name+'.request_priority'].apply_async((task, ), compression='zlib')
                else:
                    #app.tasks[self.crawler_name+'.request'].delay(task)
                    app.tasks[self.crawler_name+'.request'].apply_async((task, ), compression='zlib')
                #group(app.tasks[self.crawler_name + '.request'].s(task) 
                #        | app.tasks[self.crawler_name + '.parse'].s() 
                #        | app.tasks[self.crawler_name + '.pipeline'].s()
                #        ).delay()
            #app.tasks[self.crawler_name+'.schedule'].apply_async(
            #        args=[],
            #        kwargs = {
            #            'check_task': True,
            #            },
            #        eta=datetime.datetime.now())
        # Add new tasks; called via task.apply().
        elif tasks and not check_task:
            self.scheduler.add_new_task(tasks)
            #app.tasks[self.crawler_name+'.new_task'].delay(task)
        # Schedule task check.
        elif check_task:
            timedelta = config.new_task_check_interval
            if (get_message_queue_size('request') < config.max_task_queue_size):
                for task in self.scheduler.new_tasks_generator():
                    if task.get('priority', None):
                        app.tasks[self.crawler_name+'.request_priority'].delay(task)
                    else:
                        app.tasks[self.crawler_name+'.request'].delay(task)
                timedelta = 1
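
All four examples pass compression='zlib' to apply_async, which zlib-compresses the message body before it reaches the broker; that is worthwhile here because crawl task dicts can be large. A standalone illustration (the app name, broker URL, and payload are placeholders, not from the source):

from celery import Celery

app = Celery('demo', broker='redis://localhost:6379/0')

@app.task(name='demo.request')
def request(task):
    return task['url']

# The message body is zlib-compressed on the wire; the worker
# decompresses it transparently before invoking the task.
request.apply_async(({'url': 'http://example.com', 'priority': 1}, ),
                    compression='zlib')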