Example #1
    def __init__(self, index):
        super(GetWorker, self).__init__()
        self.index = index
        self._batch_param = {}
        self.manager = RedisManager(RECORD_REDIS, QUEUE_REDIS, CACHE_REDIS)

        for batch_id in Record.instance().get_unfinished_batch():
            parameter = Record.instance().get_parameter(batch_id)
            total_count = Record.instance().get_total_number(batch_id)
            if total_count is None:
                continue
            self.manager.worker_init_distributed_queue(batch_id,
                                                       int(total_count))
            self._batch_param[batch_id] = parameter
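Example #1 restores every unfinished batch through Record.instance(), which behaves like a lazily created singleton accessor. Below is a minimal, self-contained sketch of that pattern; the real Record presumably wraps a Redis connection, so the class body here is only a placeholder.

class Record(object):
    # Sketch of the singleton accessor that Record.instance() appears to
    # provide; the real class talks to Redis, which is omitted here.
    _instance = None

    @classmethod
    def instance(cls):
        if cls._instance is None:          # create the shared object lazily
            cls._instance = cls()
        return cls._instance

assert Record.instance() is Record.instance()   # every caller gets the same object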
Example #2
    def run(self, *args, **kwargs):
        """ end, background_cleansing, status:
         None,   None,                 begin
         None,   1,                    begin cleaning
         None,   0,                    finish cleaning
         time,   0,                    begin delete
         time,   None,                 finish delete
         None,   0,                    finish cleaning then exception
        """
        for batch_id, queue_dict in self.manager.get_queue_with_priority():
            queue = queue_dict['queue']
            if Record.instance().is_finished(batch_id) is True:
                # This queue and its entry in distributed_queues could be
                # deleted here, but that is not needed: once run() and
                # background_cleansing finish, this process ends anyway.
                # The worker instance also reboots every 10 minutes, and if
                # another worker deletes the queue, nothing needs to happen here.
                continue

            background = queue.get_background_cleaning_status()
            if background is None:
                self.work(batch_id, queue_dict, *args, **kwargs)
            elif background == '1':
                self.work(batch_id, queue_dict, *args, **kwargs)

            elif background == '0':
                pass
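The docstring above encodes a small state table keyed on the batch end timestamp and the background-cleansing flag. The sketch below simply restates that table as a lookup, with the state names copied from the docstring; the real values are read from Redis, which is assumed away here.

def describe_state(end, background_cleansing):
    # States copied from the docstring above; 'time' stands for any non-None
    # end timestamp. Everything else in this helper is illustrative.
    table = {
        (None, None): 'begin',
        (None, '1'): 'begin cleaning',
        (None, '0'): 'finish cleaning (or cleaning finished, then exception)',
        ('time', '0'): 'begin delete',
        ('time', None): 'finish delete',
    }
    return table.get((None if end is None else 'time', background_cleansing),
                     'unknown')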
Example #3
    def run(self, *args, **kwargs):
        """ end, background_cleansing, status:
         None,   None,                 begin
         None,   1,                    begin cleaning
         None,   0,                    finish cleaning

         time,   0,                    begin delete
         time,   None,                 finish delete

         None,   0,                    finish cleaning then exception
        """
        checkout_cache = {}
        for batch_id, queue_dict in self.manager.get_queue_with_priority():
            queue = queue_dict['queue']
            if Record.instance().is_finished(batch_id) is True:
                # This queue and its entry in distributed_queues could be
                # deleted here, but that is not needed: once run() and
                # background_cleansing finish, this process ends anyway.
                # The worker instance also reboots every 10 minutes, and if
                # another worker deletes the queue, nothing needs to happen here.
                continue

            background = queue.get_background_cleaning_status()

            if background == '0':
                pass
            elif background is None or background == '1':
                checkout_cache[batch_id] = queue_dict


        today_str = datetime.now().strftime('%Y%m%d')
        while len(checkout_cache) > 0:
            removes = []
            batch_urlid = {}

            for batch_id, queue_dict in checkout_cache.iteritems(): # get url_ids from queue
                get_logger(batch_id, today_str, '/opt/service/log/').info('begin get items from queue')
                results = queue_dict['queue'].get(block=True, timeout=3, interval=1)
                get_logger(batch_id, today_str, '/opt/service/log/').info('finish get items from queue')

                if not results:
                    removes.append(batch_id)
                    continue
                batch_urlid[batch_id] = results
            for i in removes:
                checkout_cache.pop(i)

            while len(batch_urlid) > 0:
                removes = []
                for batch_id, results in batch_urlid.iteritems(): # download and process
                    if len(results) > 0:
                        url_id = results.pop()
                        other_batch_process_time = self.get_other_batch_process_time(
                            set(batch_urlid.keys()) - set([batch_id]))
                        start = time.time()

                        self.work(batch_id, checkout_cache[batch_id], url_id, other_batch_process_time, *args, **kwargs)
                        self.update_process_time_of_this_batch(batch_id, start)
                    else:
                        removes.append(batch_id)
                for i in removes:
                    batch_urlid.pop(i)
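This run() drains several batch queues in round-robin fashion: it first pulls a slice of url ids for every live batch, then pops one id per batch on each pass, so a large batch cannot starve the smaller ones. A self-contained sketch of that drain loop follows, with plain in-memory dicts standing in for the Redis-backed queues (Python 3 syntax, hence items() instead of iteritems()).

def round_robin_drain(batch_urlid, handle):
    # batch_urlid maps batch_id -> list of pending url ids; handle() is a
    # stand-in for self.work(). One id per batch is processed on each pass.
    while batch_urlid:
        finished = []
        for batch_id, url_ids in batch_urlid.items():
            if url_ids:
                handle(batch_id, url_ids.pop())
            else:
                finished.append(batch_id)
        for batch_id in finished:
            batch_urlid.pop(batch_id)

round_robin_drain({'batch-a': [1, 2, 3], 'batch-b': [4]},
                  lambda batch_id, url_id: print(batch_id, url_id))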
Example #4
def get_status(batch_id):
    if not hasattr(get_status, '_queue'):
        setattr(get_status, '_queue', Queue(batch_id))

    print('job {} remain {} urls need to crawl'.format(
        batch_id, get_status._queue.conn.scard(batch_id)))
    for key, value in Record.instance().conn.hgetall(batch_id).iteritems():
        print('\t{}: {}'.format(key, value))
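get_status caches its Queue on a function attribute, so the connection is created on the first call and reused afterwards (note that the cached queue always belongs to whichever batch_id the function was first called with). The same pattern in a self-contained form, with a plain list standing in for the expensive resource:

def get_resource():
    # The resource is created once, on the first call, and then reused;
    # a list plays the role of the Queue connection from the example above.
    if not hasattr(get_resource, '_cache'):
        setattr(get_resource, '_cache', [])
    return get_resource._cache

assert get_resource() is get_resource()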
Example #5
    def worker_init_distributed_queue(self, batch_id, total_count):
        """ init distributed queue for worker side
        """
        priority = int(Record.instance().get_priority(batch_id))
        thinhash = ThinHash(batch_id, total_count)
        queue = Queue(batch_id, priority=priority)
        self.set_distributed_queue(batch_id, queue, thinhash, priority, True)
        return self.cache[batch_id]
Example #6
    def work(self, batch_id, queue_dict, *args, **kwargs):
        batch_key_filename = batch_id.rsplit('-', 1)[0].replace('-', '_')
        module = __import__('workers.{}'.format(batch_key_filename),
                            fromlist=['process'])

        while 1:
            results = queue_dict['queue'].get(block=True,
                                              timeout=3,
                                              interval=1)
            if results == []: break
            for url_id, count in results:
                url = queue_dict['thinhash'].hget(url_id)

                process_status = module.process(url,
                                                self._batch_param[batch_id],
                                                self.manager, *args, **kwargs)
                if process_status:
                    queue_dict['queue'].task_done(url_id)
                else:
                    Record.instance().increase_failed(batch_id)
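The worker module is chosen at runtime from the batch id: everything before the last '-' becomes a module name under the workers package. A rough sketch of that lookup with importlib follows; the fallback to workers.prefetch is taken from Examples #8 and #11, while the concrete batch id and whether those packages exist are assumptions.

import importlib

def load_worker(batch_id, default='workers.prefetch'):
    # 'sina-news-20240101' -> 'workers.sina_news'; the batch id is
    # hypothetical, only the rsplit/replace derivation comes from the code.
    name = 'workers.{}'.format(batch_id.rsplit('-', 1)[0].replace('-', '_'))
    try:
        return importlib.import_module(name)
    except ImportError:
        return importlib.import_module(default)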
Example #7
    def init_distributed_queue(self,
                               batch_id,
                               parameter,
                               total_count,
                               priority=1,
                               timeout=180,
                               failure_times=3):
        """ init distributed queue for master side

        :param total_count: can be a predetermined number larger than the real total_count
        """
        # keep the step order
        Record.instance().begin(batch_id, parameter, total_count, priority)
        thinhash = ThinHash(batch_id, total_count)
        queue = Queue(batch_id,
                      priority=priority,
                      timeout=timeout,
                      failure_times=failure_times)

        self.set_distributed_queue(batch_id, queue, thinhash, priority, True)
        return self.cache[batch_id]
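A hedged master-side usage sketch: only the RedisManager construction (from Example #1) and the init_distributed_queue signature come from the snippets; the batch id and parameter dict are hypothetical, and loading urls into the returned queue is not shown here.

manager = RedisManager(RECORD_REDIS, QUEUE_REDIS, CACHE_REDIS)
queue_dict = manager.init_distributed_queue(
    'sina-news-20240101',          # hypothetical batch id
    parameter={'depth': 1},        # hypothetical crawl parameter
    total_count=10000,             # may be larger than the real total
    priority=2,
    timeout=180,
    failure_times=3)
# queue_dict exposes the 'queue' and 'thinhash' objects the workers use.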
Example #8
    def work(self, batch_id, queue_dict, url_id, other_batch_process_time, *args, **kwargs):
        try:
            batch_key_filename = batch_id.rsplit('-', 1)[0].replace('-', '_')
            module = __import__('workers.{}'.format(batch_key_filename), fromlist=['process'])
        except:
            module = __import__('workers.prefetch', fromlist=['process'])

        today_str = datetime.now().strftime('%Y%m%d')
        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str, '/opt/service/log/').info('begin get url from thinhash redis')

        # TODO change to hmget
        url = queue_dict['thinhash'].hget(url_id)

        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str, '/opt/service/log/').info('end get url from thinhash redis')

        try:
            process_status = module.process(url,
                                            batch_id,
                                            self._batch_param[batch_id],
                                            self.manager,
                                            other_batch_process_time,
                                            *args,
                                            **kwargs)
        except Exception as e:
            Record.instance().add_exception(batch_id, url, repr(e))
            queue_dict['queue'].task_done(url_id)
            return

        if process_status:
            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str, '/opt/service/log/').info('begin task done for record redis')

            queue_dict['queue'].task_done(url_id)
            Record.instance().increase_success(batch_id)

            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str, '/opt/service/log/').info('end task done for record redis')
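The TODO above suggests replacing the per-id hget with a single hmget. A rough illustration with a plain redis-py hash is below; the key name and field layout are assumptions, not the real ThinHash encoding.

import redis

conn = redis.StrictRedis()
url_ids = ['101', '102', '103']
# One round trip for all fields instead of one hget per url id; 'batch:urls'
# is a made-up key, the real ThinHash stores its fields differently.
urls = conn.hmget('batch:urls', url_ids)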
Example #9
    def delete_queue(self, batch_id):
        """
        :return: False error,
                 None  not finish yet,
                 True  delete queue, or already deleted.
        """
        distributed = self.get_distributed_queue(batch_id)
        if distributed is None:
            return False
        if distributed['queue'].get_background_cleaning_status() != '0':
            return

        # for each failed field, replace the stored failure count with its url
        for field, times in distributed['queue'].get_failed_fields().iteritems():
            url = distributed['thinhash'].hget(field)
            distributed['queue'].set_failed_times_to_url(field, url)

        if Record.instance().if_not_finish_set(batch_id) == 1:
            distributed['thinhash'].delete()
            distributed['queue'].flush()
            self.cache.pop(batch_id)
            return True
        return True
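The docstring gives delete_queue a three-valued contract (False / None / True). A small caller sketch interpreting those values follows; only the return values come from the docstring, the wording of each branch is an assumption.

def try_cleanup(manager, batch_id):
    result = manager.delete_queue(batch_id)
    if result is False:
        return 'queue unknown on this worker, nothing to clean'
    if result is None:
        return 'background cleaning not finished yet, retry later'
    return 'queue deleted, or already deleted by another worker'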
Example #10
    def run(self, *args, **kwargs):
        """ end, background_cleansing, status:
         None,   None,                 begin
         None,   1,                    begin cleaning
         None,   0,                    finish cleaning

         time,   0,                    begin delete
         time,   None,                 finish delete

         None,   0,                    finish cleaning then exception
        """
        for batch_id, queue_dict in self.manager.cache.iteritems():
            queue = queue_dict['queue']
            if Record.instance().is_finished(batch_id) is True:
                # This queue and its entry in distributed_queues could be
                # deleted here, but that is not needed: once run() and
                # background_cleansing finish, this process ends anyway.
                # The worker instance also reboots every 10 minutes, and if
                # another worker deletes the queue, nothing needs to happen here.
                continue

            tasks = []
            background = queue.get_background_cleaning_status()
            if background is None:
                tasks.append(gevent.spawn(queue.background_cleaning))
                tasks.append(
                    gevent.spawn(self.work, batch_id, queue_dict, *args,
                                 **kwargs))
            elif background == '1':
                tasks.append(
                    gevent.spawn(self.work, batch_id, queue_dict, *args,
                                 **kwargs))

            elif background == '0':
                self.manager.delete_queue(batch_id)

            gevent.joinall(tasks)
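This run() overlaps the background cleaning and the work loop by spawning both as gevent greenlets and joining them per batch. A minimal, self-contained illustration of that spawn/joinall pattern, with toy functions standing in for queue.background_cleaning and self.work:

import gevent

def clean():
    gevent.sleep(0.1)                  # toy stand-in for background_cleaning
    print('cleaning done')

def work():
    gevent.sleep(0.1)                  # toy stand-in for the work loop
    print('work done')

tasks = [gevent.spawn(clean), gevent.spawn(work)]
gevent.joinall(tasks)                  # both greenlets finish before moving on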
Example #11
    def work(self, batch_id, queue_dict, *args, **kwargs):
        batch_key_filename = batch_id.rsplit('-', 1)[0].replace('-', '_')
        try:
            module = __import__('workers.{}'.format(batch_key_filename),
                                fromlist=['process'])
        except:
            module = __import__('workers.prefetch', fromlist=['process'])

        while 1:
            today_str = datetime.now().strftime('%Y%m%d')

            if kwargs and kwargs.get("debug"):
                get_logger(
                    batch_id, today_str,
                    '/opt/service/log/').info('begin get items from queue')

            results = queue_dict['queue'].get(block=True,
                                              timeout=3,
                                              interval=1)

            if kwargs and kwargs.get("debug"):
                get_logger(
                    batch_id, today_str,
                    '/opt/service/log/').info('finish get items from queue')

            if not results:
                break

            for url_id in results:
                if kwargs and kwargs.get("debug"):
                    get_logger(batch_id, today_str, '/opt/service/log/').info(
                        'begin get url from thinhash redis')

                # TODO change to hmget
                url = queue_dict['thinhash'].hget(url_id)

                if kwargs and kwargs.get("debug"):
                    get_logger(batch_id, today_str, '/opt/service/log/').info(
                        'end get url from thinhash redis')

                try:
                    process_status = module.process(
                        url, batch_id, self._batch_param[batch_id],
                        self.manager, *args, **kwargs)
                except Exception as e:
                    Record.instance().add_exception(batch_id, url, repr(e))
                    queue_dict['queue'].task_done(url_id)
                    continue

                if process_status:
                    if kwargs and kwargs.get("debug"):
                        get_logger(batch_id, today_str,
                                   '/opt/service/log/').info(
                                       'begin task done for record redis')

                    queue_dict['queue'].task_done(url_id)
                    Record.instance().increase_success(batch_id)

                    if kwargs and kwargs.get("debug"):
                        get_logger(batch_id, today_str,
                                   '/opt/service/log/').info(
                                       'end task done for record redis')