Example #1
def create_unit_of_work(process_name,
                        start_id,
                        end_id,
                        timeperiod='INVALID_TIMEPERIOD',
                        state=unit_of_work.STATE_REQUESTED,
                        creation_at=None,
                        uow_id=None):
    """ method creates and returns unit_of_work """
    try:
        source_collection = ProcessContext.get_source(process_name)
        target_collection = ProcessContext.get_sink(process_name)
    except KeyError:
        source_collection = None
        target_collection = None

    uow = UnitOfWork()
    uow.timeperiod = timeperiod
    uow.start_timeperiod = timeperiod
    uow.end_timeperiod = timeperiod
    uow.start_id = start_id
    uow.end_id = end_id
    uow.source = source_collection
    uow.sink = target_collection
    uow.state = state
    # resolve the default at call time: a datetime.utcnow() default in the signature
    # would be evaluated only once, at import, stamping every uow with the same time
    uow.created_at = creation_at if creation_at is not None else datetime.utcnow()
    uow.process_name = process_name
    uow.number_of_retries = 0

    if uow_id is not None:
        uow.document['_id'] = uow_id

    return uow
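
A minimal usage sketch for the factory above; the process name, the ObjectId bounds, and the UnitOfWorkDao used to persist the document are illustrative assumptions, not part of the shown source:

import logging
from datetime import datetime

logger = logging.getLogger(__name__)

# hypothetical call: process name and id bounds below are purely illustrative
uow = create_unit_of_work(process_name='SiteHourlyAggregator',
                          start_id='000000000000000000000000',
                          end_id='ffffffffffffffffffffffff',
                          timeperiod='2011091612',
                          creation_at=datetime.utcnow())

uow_dao = UnitOfWorkDao(logger)  # hypothetical DAO, mirroring self.uow_dao in Example #2
uow_id = uow_dao.insert(uow)
logger.info('persisted unit_of_work %r' % uow_id)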
Example #2
    def compute_scope_of_processing(self, process_name, start_timeperiod, end_timeperiod, job_record):
        """method reads collection and identify slice for processing"""
        source_collection_name = ProcessContext.get_source(process_name)
        target_collection_name = ProcessContext.get_sink(process_name)

        # boundary keys of the timeperiod slice; note the inversion in the source:
        # start_id takes the highest primary key and end_id the lowest
        start_id = self.ds.highest_primary_key(source_collection_name, start_timeperiod, end_timeperiod)
        end_id = self.ds.lowest_primary_key(source_collection_name, start_timeperiod, end_timeperiod)

        uow = UnitOfWork()
        uow.timeperiod = start_timeperiod
        uow.start_id = str(start_id)
        uow.end_id = str(end_id)
        uow.start_timeperiod = start_timeperiod
        uow.end_timeperiod = end_timeperiod
        uow.created_at = datetime.utcnow()
        uow.source = source_collection_name
        uow.sink = target_collection_name
        uow.state = unit_of_work.STATE_REQUESTED
        uow.process_name = process_name
        uow.number_of_retries = 0
        uow_id = self.uow_dao.insert(uow)

        mq_request = WorkerMqRequest()
        mq_request.process_name = process_name
        mq_request.unit_of_work_id = uow_id

        publisher = self.publishers.get(process_name)
        publisher.publish(mq_request.document)
        publisher.release()

        msg = 'Published: UOW %r for %r in timeperiod %r.' % (uow_id, process_name, start_timeperiod)
        self._log_message(INFO, process_name, job_record, msg)
        return uow
Example #3
    def __init__(self, process_name):
        """@param process_name: id of the process, the worker will be performing """
        super(AbstractMqWorker, self).__init__(process_name)
        self.queue_source = ProcessContext.get_source(self.process_name)
        self.queue_sink = ProcessContext.get_sink(self.process_name)
        self.consumer = None
        self._init_mq_consumer()

        self.main_thread = None
        self.performance_ticker = None
        self._init_performance_ticker(self.logger)

        msg_suffix = 'in Production Mode'
        if settings['under_test']:
            msg_suffix = 'in Testing Mode'
        self.logger.info('Started %s %s' % (self.process_name, msg_suffix))
Example #4
    def _start_process(self, start_timeperiod, end_timeperiod, arguments):
        try:
            input_file = ProcessContext.get_source(self.process_name)

            self.logger.info('start: %s {' % self.process_name)
            p = psutil.Popen([settings['bash_shell'],
                              settings['pig_command'],
                              '-f', '/home/bmushkevych/git/synergy-pig/script.pig',
                              '-p', 'input_file=' + input_file + '/' + start_timeperiod,
                              '-p', 'timeperiod=' + start_timeperiod],
                             close_fds=True,
                             cwd=settings['process_cwd'],
                             stdin=PIPE,
                             stdout=PIPE,
                             stderr=PIPE)
            self.cli_process = p
            self.logger.info('Started %s with pid = %r' % (self.process_name, p.pid))
        except Exception:
            self.logger.error('Exception on starting: %s' % self.process_name, exc_info=True)
        finally:
            self.logger.info('}')
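
The spawned Pig process is typically polled until it exits. A companion sketch under that assumption; the method name _poll_process and the stderr draining are assumptions, not shown source (psutil.Popen proxies the subprocess.Popen API, so poll() and the pipes behave as in the standard library):

    def _poll_process(self):
        """ sketch: checks on the Pig subprocess and returns (is_alive, return_code) """
        return_code = self.cli_process.poll()
        if return_code is None:
            # subprocess is still running
            return True, None

        # subprocess finished: surface its stderr in the log before reporting
        for line in self.cli_process.stderr:
            self.logger.warning(line.rstrip())
        self.logger.info('Process %s exited with code %r' % (self.process_name, return_code))
        return False, return_code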
Example #5
    def _mq_callback(self, message):
        """ try/except wrapper
        in case exception breaks the abstract method, this method:
        - catches the exception
        - logs the exception
        - marks unit of work as INVALID"""
        try:
            mq_request = WorkerMqRequest(message.body)
            uow = self.uow_dao.get_one(mq_request.unit_of_work_id)
            if uow.state in [unit_of_work.STATE_CANCELED, unit_of_work.STATE_PROCESSED]:
                # garbage collector might have reposted this UOW
                self.logger.warning('Skipping unit_of_work: message %r; state %s;' % (message.body, uow.state),
                                    exc_info=False)
                self.consumer.acknowledge(message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Can not identify unit_of_work %s' % str(message.body), exc_info=True)
            self.consumer.acknowledge(message.delivery_tag)
            return

        try:
            start_id_obj = uow.start_id
            end_id_obj = uow.end_id
            start_timeperiod = uow.start_timeperiod
            end_timeperiod = uow.end_timeperiod

            uow.state = unit_of_work.STATE_IN_PROGRESS
            uow.started_at = datetime.utcnow()
            self.uow_dao.update(uow)
            self.performance_ticker.start_uow(uow)

            bulk_threshold = settings['bulk_threshold']
            iteration = 0
            while True:
                collection_name = ProcessContext.get_source(self.process_name)
                cursor = self.ds.cursor_for(collection_name,
                                            start_id_obj,
                                            end_id_obj,
                                            iteration,
                                            start_timeperiod,
                                            end_timeperiod,
                                            bulk_threshold)
                count = cursor.count(with_limit_and_skip=True)
                if count == 0 and iteration == 0:
                    msg = 'No entries in %s at range [%s : %s]' % (collection_name, uow.start_id, uow.end_id)
                    self.logger.warning(msg)
                    break
                else:
                    shall_continue, new_start_id = self._process_not_empty_cursor(cursor)
                    if shall_continue:
                        start_id_obj = new_start_id
                        iteration += 1
                    else:
                        break

            msg = 'Cursor exhausted after %s iterations' % iteration
            self.logger.info(msg)

            self.perform_post_processing(uow.timeperiod)
            number_of_aggregated_objects = self._flush_aggregated_objects()
            uow.number_of_aggregated_documents = number_of_aggregated_objects
            uow.number_of_processed_documents = self.performance_ticker.per_job
            uow.finished_at = datetime.utcnow()
            uow.state = unit_of_work.STATE_PROCESSED
            self.uow_dao.update(uow)
            self.performance_ticker.finish_uow()
        except Exception as e:
            uow.state = unit_of_work.STATE_INVALID
            self.uow_dao.update(uow)
            self.performance_ticker.cancel_uow()

            del self.aggregated_objects
            self.aggregated_objects = dict()
            gc.collect()

            self.logger.error('Safety fuse while processing unit_of_work %s in timeperiod %s : %r'
                              % (message.body, uow.timeperiod, e), exc_info=True)
        finally:
            # acknowledge in all cases so the broker does not redeliver this message
            self.consumer.acknowledge(message.delivery_tag)
            self.consumer.close()
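
Example #5 delegates the actual batching to self._process_not_empty_cursor, which is not shown. A hedged reconstruction of its contract, inferred only from the call site above: it must consume one cursor slice and return a (shall_continue, new_start_id) pair. The per-document hook is a placeholder assumption:

    def _process_not_empty_cursor(self, cursor):
        """ sketch of the helper used in Example #5; internals here are assumptions """
        last_object_id = None
        for document in cursor:
            last_object_id = document['_id']
            self._process_single_document(document)  # hypothetical per-document hook

        # a non-empty slice may be followed by more records: resume after the last _id
        shall_continue = last_object_id is not None
        return shall_continue, last_object_id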