class AbstractUowAwareWorker(AbstractMqWorker):
    """ Abstract class is inherited by all workers/aggregators
    that are aware of unit_of_work and capable of processing it """

    def __init__(self, process_name, perform_db_logging=False):
        super(AbstractUowAwareWorker, self).__init__(process_name)
        self.perform_db_logging = perform_db_logging
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.mq_transmitter = MqTransmitter(self.logger)

    def __del__(self):
        del self.mq_transmitter
        super(AbstractUowAwareWorker, self).__del__()

    # **************** Abstract Methods ************************
    def _init_performance_tracker(self, logger):
        self.performance_tracker = UowAwareTracker(logger)
        self.performance_tracker.start()

    def _process_uow(self, uow):
        """
        :param uow: unit_of_work to process
        :return: a tuple (number of processed items/documents/etc, desired unit_of_work state) or None
        if None is returned then it is assumed that the return tuple is (0, unit_of_work.STATE_PROCESSED)
        :raise an Exception if the UOW shall be marked as STATE_INVALID
        """
        raise NotImplementedError('method _process_uow must be implemented by {0}'.format(self.__class__.__name__))

    def _clean_up(self):
        """ method is called from the *finally* clause and is suppose to clean up after the uow processing """
        pass

    def _mq_callback(self, message):
        try:
            mq_request = MqTransmission.from_json(message.body)
            uow = self.uow_dao.get_one(mq_request.record_db_id)
            if not uow.is_requested:
                # accept only UOW in STATE_REQUESTED
                self.logger.warning('Skipping UOW: id {0}; state {1};'.format(message.body, uow.state),
                                    exc_info=False)
                self.consumer.acknowledge(message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Can not identify UOW {0}'.format(message.body), exc_info=True)
            self.consumer.acknowledge(message.delivery_tag)
            return

        db_log_handler = UowLogHandler(self.logger, uow.db_id)
        try:
            uow.state = unit_of_work.STATE_IN_PROGRESS
            uow.started_at = datetime.utcnow()
            self.uow_dao.update(uow)
            self.performance_tracker.start_uow(uow)

            if self.perform_db_logging:
                db_log_handler.attach()

            result = self._process_uow(uow)
            if result is None:
                self.logger.warning('method {0}._process_uow returned None. Assuming happy flow.'
                                    .format(self.__class__.__name__))
                number_of_aggregated_objects, target_state = 0, unit_of_work.STATE_PROCESSED
            else:
                number_of_aggregated_objects, target_state = result

            uow.number_of_aggregated_documents = number_of_aggregated_objects
            uow.number_of_processed_documents = self.performance_tracker.success_per_job
            uow.finished_at = datetime.utcnow()
            uow.state = target_state
            self.uow_dao.update(uow)

            if uow.is_finished:
                self.performance_tracker.finish_uow()
            else:
                self.performance_tracker.cancel_uow()

        except Exception as e:
            fresh_uow = self.uow_dao.get_one(mq_request.record_db_id)
            self.performance_tracker.cancel_uow()
            if fresh_uow.is_canceled:
                self.logger.warning('UOW {0} for {1}@{2} was likely marked by MX as SKIPPED. No UOW update performed.'
                                    .format(uow.db_id, uow.process_name, uow.timeperiod), exc_info=False)
            else:
                self.logger.error('Safety fuse while processing UOW {0} for {1}@{2}: {3}'
                                  .format(uow.db_id, uow.process_name, uow.timeperiod, e), exc_info=True)
                uow.state = unit_of_work.STATE_INVALID
                self.uow_dao.update(uow)

        finally:
            self.consumer.acknowledge(message.delivery_tag)
            self.consumer.close()
            self._clean_up()
            db_log_handler.detach()

        try:
            self.mq_transmitter.publish_uow_status(uow)
            self.logger.info('UOW *{0}* status report published into MQ'.format(uow.state))
        except Exception:
            self.logger.error('Error on UOW status report publishing', exc_info=True)
Пример #2
0
class BashRunnable(threading.Thread):
    """Process starts remote or local bash script job, supervises its execution and updates mq"""

    def __init__(self, logger, message, consumer, performance_ticker):
        self.logger = logger
        self.message = message
        self.mq_request = SynergyMqTransmission.from_json(message.body)
        self.consumer = consumer
        self.performance_ticker = performance_ticker
        self.alive = False
        self.return_code = -1
        self.uow_dao = UnitOfWorkDao(self.logger)

        self.thread_name = str(self.mq_request)
        super(BashRunnable, self).__init__(name=self.thread_name)

    def _poll_process(self):
        return self.alive, self.return_code

    def _start_process(self):
        try:
            uow = self.uow_dao.get_one(self.mq_request.unit_of_work_id)
            if not uow.is_requested:
                # accept only UOW in STATE_REQUESTED
                self.logger.warn('Skipping unit_of_work: id %s; state %s;' % (str(self.message.body), uow.state),
                                 exc_info=False)
                self.consumer.acknowledge(self.message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Can not identify unit_of_work %s' % str(self.message.body), exc_info=True)
            self.consumer.acknowledge(self.message.delivery_tag)
            return

        try:
            self.logger.info('start: %s {' % self.thread_name)
            self.alive = True

            uow.state = unit_of_work.STATE_IN_PROGRESS
            uow.started_at = datetime.utcnow()
            self.uow_dao.update(uow)

            fabric.operations.env.warn_only = True
            fabric.operations.env.abort_on_prompts = True
            fabric.operations.env.use_ssh_config = True
            fabric.operations.env.host_string = uow.arguments[ARGUMENT_CMD_HOST]

            command = os.path.join(uow.arguments[ARGUMENT_CMD_PATH],
                                   uow.arguments[ARGUMENT_CMD_FILE])
            command += ' %s' % uow.arguments[ARGUMENT_CMD_ARGS]

            run_result = fabric.operations.run(command, pty=False)
            if run_result.succeeded:
                self.return_code = 0

            uow.finished_at = datetime.utcnow()
            uow.state = unit_of_work.STATE_PROCESSED
            self.uow_dao.update(uow)

            self.logger.info('Completed %s with result = %r' % (self.thread_name, self.return_code))
        except Exception:
            self.logger.error('Exception on starting: %s' % self.thread_name, exc_info=True)
            uow.state = unit_of_work.STATE_INVALID
            self.uow_dao.update(uow)
        finally:
            self.logger.info('}')
            self.alive = False

    def run(self):
        try:
            self._start_process()
            code = None
            alive = True
            while alive:
                alive, code = self._poll_process()
                time.sleep(0.1)

            if code == 0:
                self.performance_ticker.tracker.increment_success()
            else:
                self.performance_ticker.tracker.increment_failure()

            self.logger.info('BashDriver for %s return code is %r' % (self.thread_name, code))
        except Exception as e:
            self.performance_ticker.tracker.increment_failure()
            self.logger.error('Safety fuse while processing request %r: %r' % (self.message.body, e), exc_info=True)
        finally:
            self.consumer.acknowledge(self.message.delivery_tag)
class AbstractStateMachine(object):
    """ Abstract state machine used to host common logic for the rest of registered State Machines """

    def __init__(self, logger, timetable, name):
        self.name = name
        self.logger = logger
        self.mq_transmitter = MqTransmitter(self.logger)
        self.timetable = timetable
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)

    @property
    def run_on_active_timeperiod(self):
        """
        :return: True if given State Machine allows execution on *live* timeperiod, as opposed to the finished one
        """
        raise NotImplementedError('property run_on_active_timeperiod must be implemented by {0}'
                                  .format(self.__class__.__name__))

    def _log_message(self, level, process_name, timeperiod, msg):
        """ method performs logging into log file and Timetable's tree node"""
        self.timetable.add_log_entry(process_name, timeperiod, msg)
        self.logger.log(level, msg)

    @with_reconnect
    def _insert_uow(self, process_name, timeperiod, start_timeperiod, end_timeperiod, start_id, end_id):
        """creates unit_of_work and inserts it into the DB
            :raise DuplicateKeyError: if unit_of_work with given parameters already exists """
        uow = UnitOfWork()
        uow.process_name = process_name
        uow.timeperiod = timeperiod
        uow.start_id = str(start_id)
        uow.end_id = str(end_id)
        uow.start_timeperiod = start_timeperiod
        uow.end_timeperiod = end_timeperiod
        uow.created_at = datetime.utcnow()
        uow.submitted_at = datetime.utcnow()
        uow.source = context.process_context[process_name].source
        uow.sink = context.process_context[process_name].sink
        uow.state = unit_of_work.STATE_REQUESTED
        uow.unit_of_work_type = unit_of_work.TYPE_MANAGED
        uow.number_of_retries = 0
        uow.arguments = context.process_context[process_name].arguments
        uow.db_id = self.uow_dao.insert(uow)

        msg = 'Created: UOW {0} for {1}@{2}.'.format(uow.db_id, process_name, start_timeperiod)
        self._log_message(INFO, process_name, start_timeperiod, msg)
        return uow

    def _publish_uow(self, uow):
        self.mq_transmitter.publish_managed_uow(uow)
        msg = 'Published: UOW {0} for {1}@{2}.'.format(uow.db_id, uow.process_name, uow.start_timeperiod)
        self._log_message(INFO, uow.process_name, uow.start_timeperiod, msg)

    def insert_and_publish_uow(self, job_record, start_id, end_id):
        """ method creates and publishes a unit_of_work. it also handles DuplicateKeyError and attempts recovery
        :return: tuple (uow, is_duplicate)
        :raise UserWarning: if the recovery from DuplicateKeyError was unsuccessful
        """
        process_name = job_record.process_name
        timeperiod = job_record.timeperiod
        start_timeperiod = self.compute_start_timeperiod(job_record.process_name, job_record.timeperiod)
        end_timeperiod = self.compute_end_timeperiod(job_record.process_name, job_record.timeperiod)

        try:
            is_duplicate = False
            uow = self._insert_uow(process_name, timeperiod, start_timeperiod, end_timeperiod, start_id, end_id)
        except DuplicateKeyError as e:
            is_duplicate = True
            msg = 'Catching up with latest UOW {0}@{1}, because of: {2}' \
                  .format(process_name, start_timeperiod, e)
            self._log_message(WARNING, process_name, start_timeperiod, msg)
            uow = self.uow_dao.recover_from_duplicatekeyerror(e)

        if not uow:
            msg = 'PERSISTENT TIER ERROR! Unable to locate UOW for {0}@{1}' \
                  .format(process_name, start_timeperiod)
            self._log_message(WARNING, process_name, start_timeperiod, msg)
            raise UserWarning(msg)

        if uow.is_canceled:
            # this UOW was marked for re-processing. recycle it
            uow.created_at = datetime.utcnow()      # reset created_at to bypass GC cancellation logic
            uow.submitted_at = datetime.utcnow()    # reset submitted_at to allow 1 hour free of GC resubmitting
            del uow.started_at
            del uow.finished_at
            del uow.number_of_aggregated_documents
            del uow.number_of_processed_documents
            uow.state = unit_of_work.STATE_REQUESTED
            self.uow_dao.update(uow)

        # publish the created/recovered/recycled unit_of_work
        self._publish_uow(uow)
        return uow, is_duplicate

    def notify(self, uow):
        """ method is used by StateMachine's users to provide notifications about change in UOW
        if applicable, method will update job_record state and Timetable tree node state
        :assumptions: uow is in [STATE_NOOP, STATE_CANCELED, STATE_PROCESSED] """
        pass

    def compute_start_timeperiod(self, process_name, timeperiod):
        """ computes lowest *inclusive* timeperiod boundary for job to process
            for process with time_grouping == 1, it returns given timeperiod with no change
            for process with time_grouping != 1, it computes first timeperiod, not processed by the previous job run
            For instance: with time_grouping = 3, QUALIFIER_HOURLY, and timeperiod = 2016042018,
            the start_timeperiod will be = 2016042016 (computed as 2016042018 - 3 + 1)
        """
        time_grouping = context.process_context[process_name].time_grouping
        if time_grouping == 1:
            return timeperiod

        # step1: translate given timeperiod to the time grouped one
        process_hierarchy = self.timetable.get_tree(process_name).process_hierarchy
        timeperiod_dict = process_hierarchy[process_name].timeperiod_dict
        translated_timeperiod = timeperiod_dict._translate_timeperiod(timeperiod)

        # step 2: compute previous grouped period
        # NOTICE: simple `time_helper.increment_timeperiod(time_qualifier, timeperiod)` is insufficient
        #         as it does not address edge cases, such as the last day of the month or the last hour of the day
        # For instance: with time_grouping=3, QUALIFIER_DAILY, and 2016123100
        # the `increment_timeperiod` will yield 2016122800 instead of 2016123100
        time_qualifier = context.process_context[process_name].time_qualifier
        for i in range(1, time_grouping + 1):
            prev_timeperiod = time_helper.increment_timeperiod(time_qualifier, translated_timeperiod, delta=-i)
            if prev_timeperiod == timeperiod_dict._translate_timeperiod(prev_timeperiod):
                # prev_timeperiod is currently at the last grouped timeperiod
                break

        # step 3: compute first exclusive timeperiod after the *prev_timeperiod*,
        # which becomes first inclusive timeperiod for this job run
        over_the_edge_timeperiod = time_helper.increment_timeperiod(time_qualifier, prev_timeperiod, delta=-1)
        if prev_timeperiod != timeperiod_dict._translate_timeperiod(over_the_edge_timeperiod):
            # over_the_edge_timeperiod fell into previous day or month or year
            # *prev_timeperiod* points to the first month, first day of the month or 00 hour
            start_timeperiod = prev_timeperiod
        else:
            start_timeperiod = self.compute_end_timeperiod(process_name, prev_timeperiod)

        return start_timeperiod

    def compute_end_timeperiod(self, process_name, timeperiod):
        """ computes first *exclusive* timeperiod for job to process """
        time_qualifier = context.process_context[process_name].time_qualifier
        end_timeperiod = time_helper.increment_timeperiod(time_qualifier, timeperiod)
        return end_timeperiod

    def _is_noop_timeperiod(self, process_name, timeperiod):
        """ method verifies if the given timeperiod for given process is valid or falls in-between grouping checkpoints
        :param process_name: name of the process
        :param timeperiod: timeperiod to verify
        :return: False, if given process has no time_grouping set or it is equal to 1.
                 False, if time_grouping is custom but the given timeperiod matches the grouped timeperiod.
                 True, if the timeperiod falls in-between grouping cracks
        """
        time_grouping = context.process_context[process_name].time_grouping
        if time_grouping == 1:
            return False

        process_hierarchy = self.timetable.get_tree(process_name).process_hierarchy
        timeperiod_dict = process_hierarchy[process_name].timeperiod_dict
        return timeperiod_dict._translate_timeperiod(timeperiod) != timeperiod

    def _process_noop_timeperiod(self, job_record):
        """ method is valid for processes having time_grouping != 1.
            should a job record fall in-between grouped time milestones,
            its state should be set to STATE_NOOP without any processing """
        job_record.state = job.STATE_NOOP
        self.job_dao.update(job_record)

        time_grouping = context.process_context[job_record.process_name].time_grouping
        msg = 'Job {0}@{1} with time_grouping {2} was transferred to STATE_NOOP' \
              .format(job_record.process_name, job_record.timeperiod, time_grouping)
        self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

    def _process_state_embryo(self, job_record):
        """ method that takes care of processing job records in STATE_EMBRYO state"""
        pass

    def _process_state_in_progress(self, job_record):
        """ method that takes care of processing job records in STATE_IN_PROGRESS state"""
        pass

    def _process_state_final_run(self, job_record):
        """method takes care of processing job records in STATE_FINAL_RUN state"""
        uow = self.uow_dao.get_one(job_record.related_unit_of_work)
        if uow.is_processed:
            self.update_job(job_record, uow, job.STATE_PROCESSED)
        elif uow.is_noop:
            self.update_job(job_record, uow, job.STATE_NOOP)
        elif uow.is_canceled:
            self.update_job(job_record, uow, job.STATE_SKIPPED)
        elif uow.is_invalid:
            msg = 'Job {0}: UOW for {1}@{2} is in {3}; ' \
                  'relying on the Garbage Collector to either recycle or cancel the UOW.' \
                  .format(job_record.db_id, job_record.process_name, job_record.timeperiod, uow.state)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)
        else:
            msg = 'Suppressed creating UOW for {0}@{1}; Job is in {2}; uow is in {3}' \
                  .format(job_record.process_name, job_record.timeperiod, job_record.state, uow.state)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

        timetable_tree = self.timetable.get_tree(job_record.process_name)
        timetable_tree.build_tree()

    def _process_terminal_state(self, job_record):
        """ method logs a warning message notifying that the job is no longer govern by this state machine """
        msg = 'Job {0} for {1}@{2} is in the terminal state {3}, ' \
              'and is no further govern by the State Machine {4}' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod, job_record.state, self.name)
        self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

    def manage_job_with_blocking_children(self, job_record):
        """ method will trigger job processing only if:
            - all children are finished (STATE_PROCESSED, STATE_SKIPPED, STATE_NOOP)
            - all external dependencies are finalized (STATE_PROCESSED, STATE_SKIPPED, STATE_NOOP) """
        is_job_finalizable = self.timetable.is_job_record_finalizable(job_record)
        if is_job_finalizable:
            self.manage_job(job_record)
        else:
            msg = 'Job {0}@{1} is blocked by unprocessed children/dependencies. Waiting another tick' \
                  .format(job_record.process_name, job_record.timeperiod)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

    def manage_job_with_blocking_dependencies(self, job_record):
        """ method will trigger job processing only if:
            - all dependencies are in [STATE_PROCESSED, STATE_NOOP]
            NOTICE: method will transfer current job into STATE_SKIPPED if any dependency is in STATE_SKIPPED """
        composite_state = self.timetable.dependent_on_composite_state(job_record)
        assert isinstance(composite_state, NodesCompositeState)

        if composite_state.all_processed:
            self.manage_job(job_record)
        elif composite_state.skipped_present:
            # As soon as among <dependent on> periods are in STATE_SKIPPED
            # there is very little sense in waiting for them to become STATE_PROCESSED
            # Skip this timeperiod itself
            job_record.state = job.STATE_SKIPPED
            self.job_dao.update(job_record)
            self.mq_transmitter.publish_job_status(job_record)

            msg = 'Job {0}@{1} is blocked by STATE_SKIPPED dependencies. ' \
                  'Transferred the job to STATE_SKIPPED'.format(job_record.process_name, job_record.timeperiod)
            self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)
        else:
            msg = 'Job {0}@{1} is blocked by unprocessed dependencies. Waiting another tick' \
                  .format(job_record.process_name, job_record.timeperiod)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

    def manage_job(self, job_record):
        """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed
            In case the State Machine sees that the unit_of_work already exist it could either:
            - update boundaries of the processing; republish
            - wait another tick; idle """
        assert isinstance(job_record, Job)
        try:
            if self._is_noop_timeperiod(job_record.process_name, job_record.timeperiod):
                self._process_noop_timeperiod(job_record)
                return

            if job_record.is_embryo:
                self._process_state_embryo(job_record)

            elif job_record.is_in_progress:
                self._process_state_in_progress(job_record)

            elif job_record.is_final_run:
                self._process_state_final_run(job_record)

            elif job_record.is_skipped:
                self._process_terminal_state(job_record)

            elif job_record.is_processed:
                self._process_terminal_state(job_record)

            elif job_record.is_noop:
                self._process_terminal_state(job_record)

            else:
                msg = 'Unknown state {0} of the job {1}'.format(job_record.state, job_record.db_id)
                self._log_message(ERROR, job_record.process_name, job_record.timeperiod, msg)

        except LookupError as e:
            job_record.number_of_failures += 1
            self.job_dao.update(job_record)
            self.timetable.skip_if_needed(job_record)
            msg = 'Increasing fail counter for Job {0}@{1}, because of: {2}' \
                  .format(job_record.process_name, job_record.timeperiod, e)
            self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

        finally:
            self.mq_transmitter.publish_job_status(job_record)

    def reprocess_job(self, job_record):
        """ method marks given job for reprocessing:
            - if no UOW was bind with the job record - it is transferred to STATE_EMBRYO only
            - if related UOW was not finished - we mark the UOW as STATE_CANCELED
                and pass the job record thru the _process_in_progress routine
            - otherwise job record is passed thru the _process_in_progress routine """
        original_job_state = job_record.state
        job_record.number_of_failures = 0

        if not job_record.related_unit_of_work:
            job_record.state = job.STATE_EMBRYO
            self.job_dao.update(job_record)
        else:
            uow = self.uow_dao.get_one(job_record.related_unit_of_work)
            if not uow.is_finished:
                uow.state = unit_of_work.STATE_CANCELED
                uow.submitted_at = datetime.utcnow()
                self.uow_dao.update(uow)
            self._process_state_in_progress(job_record)

        msg = 'Reprocessed Job {0} for {1}@{2}: state transfer {3} -> {4};' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod,
                      original_job_state, job_record.state)
        self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

    def skip_job(self, job_record):
        """ method transfers:
            - given job into STATE_SKIPPED if it is not not in finished state
            - UOW into STATE_CANCELED if it is not in finished state """
        original_job_state = job_record.state

        if not job_record.is_finished:
            job_record.state = job.STATE_SKIPPED
            self.job_dao.update(job_record)

        if job_record.related_unit_of_work:
            uow = self.uow_dao.get_one(job_record.related_unit_of_work)
            if not uow.is_finished:
                uow.state = unit_of_work.STATE_CANCELED
                uow.submitted_at = datetime.utcnow()
                self.uow_dao.update(uow)

        msg = 'Skipped Job {0} for {1}@{2}: state transfer {3} -> {4};' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod,
                      original_job_state, job_record.state)
        self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

    def create_job(self, process_name, timeperiod):
        """ method creates a job record in STATE_EMBRYO for given process_name and timeperiod
            :returns: created job record of type <Job>"""
        job_record = Job()
        job_record.state = job.STATE_EMBRYO
        job_record.timeperiod = timeperiod
        job_record.process_name = process_name
        self.job_dao.update(job_record)

        self.logger.info('Created Job {0} for {1}@{2}'
                         .format(job_record.db_id, job_record.process_name, job_record.timeperiod))
        return job_record

    def update_job(self, job_record, uow, new_state):
        """ method updates job record with a new unit_of_work and new state"""
        original_job_state = job_record.state
        job_record.state = new_state
        job_record.related_unit_of_work = uow.db_id
        self.job_dao.update(job_record)

        msg = 'Updated Job {0} for {1}@{2}: state transfer {3} -> {4};' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod, original_job_state, new_state)
        self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)
Пример #4
0
class Timetable(object):
    """ Timetable holds all known process trees, where every node presents a timeperiod-driven job"""

    def __init__(self, logger):
        self.lock = RLock()
        self.logger = logger
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)
        self.reprocess = dict()

        # self.trees contain all of the trees and manages much of their life cycle
        # remember to enlist there all trees the system is working with
        self.trees = self._construct_trees_from_context()

        self._register_callbacks()
        self._register_dependencies()
        self.load_tree()
        self.build_trees()
        self.validate()

    def _construct_trees_from_context(self):
        trees = dict()
        for tree_name, context_entry in context.timetable_context.items():
            tree = MultiLevelTree(process_names=context_entry.enclosed_processes,
                                  tree_name=tree_name,
                                  mx_name=context_entry.mx_name,
                                  mx_page=context_entry.mx_page)
            trees[tree_name] = tree
        return trees

    def _register_dependencies(self):
        """ register dependencies between trees"""
        for tree_name, context_entry in context.timetable_context.items():
            tree = self.trees[tree_name]
            assert isinstance(tree, MultiLevelTree)
            for dependent_on in context_entry.dependent_on:
                dependent_on_tree = self.trees[dependent_on]
                assert isinstance(dependent_on_tree, MultiLevelTree)
                tree.register_dependent_on(dependent_on_tree)

    def _register_callbacks(self):
        """ register logic that reacts on reprocessing request
        and create embryo timetable record request"""

        # reprocessing request
        for tree_name, tree in self.trees.items():
            tree.register_reprocess_callback(self._callback_reprocess)

        # skip request
        for tree_name, tree in self.trees.items():
            tree.register_skip_callback(self._callback_skip)

        # callbacks register
        for tree_name, tree in self.trees.items():
            tree.register_create_callbacks(self._callback_create_job_record)

    # *** Call-back methods ***
    def _find_dependant_trees(self, tree_obj):
        """ returns list of trees that are dependent_on given tree_obj """
        dependant_trees = []
        for tree_name, tree in self.trees.items():
            if tree_obj in tree.dependent_on:
                dependant_trees.append(tree)
        return dependant_trees

    def _find_dependant_tree_nodes(self, node_a):
        dependant_nodes = set()
        for tree_b in self._find_dependant_trees(node_a.tree):
            node_b = node_a.find_counterpart_in(tree_b)
            if node_b is None:
                continue
            dependant_nodes.add(node_b)
        return dependant_nodes

    def _reprocess_single_tree_node(self, tree_node):
        """ is called from tree to answer reprocessing request.
        It is possible that timetable record will be transferred to STATE_IN_PROGRESS with no related unit_of_work"""
        uow_id = tree_node.job_record.related_unit_of_work
        if uow_id is not None:
            tree_node.job_record.state = job.STATE_IN_PROGRESS
            uow = self.uow_dao.get_one(uow_id)
            uow.state = unit_of_work.STATE_INVALID
            uow.number_of_retries = 0
            uow.created_at = datetime.utcnow()
            self.uow_dao.update(uow)
            msg = 'Transferred job record %s for %s in timeperiod %s to %s; Transferred unit_of_work to %s' \
                  % (tree_node.job_record.db_id,
                     tree_node.process_name,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state,
                     uow.state)

            if tree_node.process_name not in self.reprocess:
                self.reprocess[tree_node.process_name] = dict()
            self.reprocess[tree_node.process_name][tree_node.timeperiod] = tree_node

        else:
            tree_node.job_record.state = job.STATE_EMBRYO
            msg = 'Transferred job record %s for %s in timeperiod %s to %s;' \
                  % (tree_node.job_record.db_id,
                     tree_node.process_name,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state)

        tree_node.job_record.number_of_failures = 0
        self.job_dao.update(tree_node.job_record)
        self.logger.warn(msg)
        tree_node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg])

    @thread_safe
    def _callback_reprocess(self, tree_node):
        """ is called from tree to answer reprocessing request.
        It is possible that job record will be transferred to STATE_IN_PROGRESS with no related unit_of_work"""
        if (tree_node.job_record.is_embryo and tree_node.job_record.number_of_failures == 0) \
            or (tree_node.process_name in self.reprocess
                and tree_node.timeperiod in self.reprocess[tree_node.process_name]):
            # the node has already been marked for re-processing or does not require one
            pass
        else:
            self._reprocess_single_tree_node(tree_node)

        reprocessing_nodes = self._find_dependant_tree_nodes(tree_node)
        for node in reprocessing_nodes:
            node.request_reprocess()

    @thread_safe
    def _callback_skip(self, tree_node):
        """ is called from tree to answer skip request"""
        tree_node.job_record.state = job.STATE_SKIPPED
        uow_id = tree_node.job_record.related_unit_of_work
        if uow_id is not None:
            uow = self.uow_dao.get_one(uow_id)
            uow.state = unit_of_work.STATE_CANCELED
            self.uow_dao.update(uow)
            msg = 'Transferred job record %s in timeperiod %s to %s; Transferred unit_of_work to %s' \
                  % (tree_node.job_record.db_id,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state,
                     uow.state)
        else:
            msg = 'Transferred job record %s in timeperiod %s to %s;' \
                  % (tree_node.job_record.db_id,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state)

        self.job_dao.update(tree_node.job_record)
        self.logger.warn(msg)
        tree_node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg])

        if tree_node.process_name in self.reprocess \
                and tree_node.timeperiod in self.reprocess[tree_node.process_name]:
            del self.reprocess[tree_node.process_name][tree_node.timeperiod]

    @thread_safe
    def _callback_create_job_record(self, tree_node):
        """ is called from a tree to create job record in STATE_EMBRYO and bind it to the given tree node"""

        try:
            job_record = self.job_dao.get_one(tree_node.process_name, tree_node.timeperiod)
        except LookupError:
            job_record = Job()
            job_record.state = job.STATE_EMBRYO
            job_record.timeperiod = tree_node.timeperiod
            job_record.process_name = tree_node.process_name

            tr_id = self.job_dao.update(job_record)
            self.logger.info('Created job record %s, with timeperiod %s for process %s'
                             % (str(tr_id), tree_node.timeperiod, tree_node.process_name))
        tree_node.job_record = job_record

    # *** Tree-manipulation methods ***
    @thread_safe
    def get_tree(self, process_name):
        """ return tree that is managing time-periods for given process"""
        for tree_name, tree in self.trees.items():
            if process_name in tree:
                return tree

    @thread_safe
    def _build_tree_by_level(self, collection_name, since):
        """ method iterated thru all documents in all job collections and builds a tree of known system state"""
        try:
            unsupported_records = dict()
            job_records = self.job_dao.get_all(collection_name, since)
            for job_record in job_records:
                tree = self.get_tree(job_record.process_name)
                if tree is not None:
                    tree.update_node(job_record)
                else:
                    unsupported_records[job_record.process_name] = \
                        unsupported_records.get(job_record.process_name, 0) + 1

            for name, counter in unsupported_records.items():
                self.logger.warn('Skipping %r Job records for %s as no tree is handling it.' % (counter, name))

        except LookupError:
            self.logger.warn('No Job Records in %s.' % str(collection_name))

    @thread_safe
    def load_tree(self):
        """ method iterates thru all objects older than synergy_start_timeperiod parameter in job collections
        and loads them into this timetable"""
        timeperiod = settings.settings['synergy_start_timeperiod']
        yearly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_YEARLY, timeperiod)
        monthly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_MONTHLY, timeperiod)
        daily_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_DAILY, timeperiod)
        hourly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_HOURLY, timeperiod)

        self._build_tree_by_level(COLLECTION_JOB_HOURLY, since=hourly_timeperiod)
        self._build_tree_by_level(COLLECTION_JOB_DAILY, since=daily_timeperiod)
        self._build_tree_by_level(COLLECTION_JOB_MONTHLY, since=monthly_timeperiod)
        self._build_tree_by_level(COLLECTION_JOB_YEARLY, since=yearly_timeperiod)

    @thread_safe
    def build_trees(self):
        """ method iterates thru all trees and ensures that all time-period nodes are created up till <utc_now>"""
        for tree_name, tree in self.trees.items():
            tree.build_tree()

    @thread_safe
    def validate(self):
        """validates that none of nodes in tree is improperly finalized and that every node has job_record"""
        for tree_name, tree in self.trees.items():
            tree.validate()

    @thread_safe
    def dependent_on_composite_state(self, job_record):
        """ :return instance of <NodesCompositeState> """
        assert isinstance(job_record, Job)
        tree = self.get_tree(job_record.process_name)
        node = tree.get_node(job_record.process_name, job_record.timeperiod)
        return node.dependent_on_composite_state()

    # *** Job manipulation methods ***
    @thread_safe
    def update_job_record(self, job_record, uow, new_state):
        """ method updates job record with a new unit_of_work and new state"""
        job_record.state = new_state
        job_record.related_unit_of_work = uow.db_id
        job_record.start_id = uow.start_id
        job_record.end_id = uow.end_id
        self.job_dao.update(job_record)

        tree = self.get_tree(job_record.process_name)
        tree.update_node(job_record)

        msg = 'Transferred job %s for %s in timeperiod %s to new state %s' \
              % (job_record.db_id, job_record.timeperiod, job_record.process_name, new_state)
        self.logger.info(msg)
        self.add_log_entry(job_record.process_name, job_record.timeperiod, msg)

    @thread_safe
    def failed_on_processing_job_record(self, process_name, timeperiod):
        """method increases node's inner counter of failed processing
        if _skip_node logic returns True - node is set to STATE_SKIP"""
        tree = self.get_tree(process_name)
        node = tree.get_node(process_name, timeperiod)
        node.job_record.number_of_failures += 1
        if tree._skip_the_node(node):
            node.request_skip()
        else:
            # job record is automatically updated in request_skip()
            # so if the node was not skipped - job record has to be updated explicitly
            self.job_dao.update(node.job_record)

    @thread_safe
    def get_next_job_record(self, process_name):
        """returns next job record to work on for given process"""
        if process_name in self.reprocess and len(self.reprocess[process_name]) > 0:
            timeperiod = sorted(self.reprocess[process_name].keys())[0]
            node = self.reprocess[process_name][timeperiod]
            del self.reprocess[process_name][timeperiod]
        else:
            tree = self.get_tree(process_name)
            node = tree.get_next_node(process_name)

        if node.job_record is None:
            node.request_embryo_job_record()
        return node.job_record

    @thread_safe
    def is_job_record_finalizable(self, job_record):
        """ :return True, if the node and all its children are either in STATE_PROCESSED or STATE_SKIPPED"""
        assert isinstance(job_record, Job)
        tree = self.get_tree(job_record.process_name)
        node = tree.get_node(job_record.process_name, job_record.timeperiod)
        return node.is_finalizable()

    @thread_safe
    def add_log_entry(self, process_name, timeperiod, msg):
        """ adds a log entry to the tree node. log entries has no persistence """
        tree = self.get_tree(process_name)
        node = tree.get_node(process_name, timeperiod)
        node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg])
class StateMachineFreerun(object):
    """ State Machine to handle freerun jobs/triggers """

    def __init__(self, logger, name=STATE_MACHINE_FREERUN):
        self.name = name
        self.logger = logger
        self.mq_transmitter = MqTransmitter(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.sfe_dao = FreerunProcessDao(self.logger)

    @with_reconnect
    def _log_message(self, level, freerun_entry, msg):
        """ method performs logging into log file and the freerun_entry """
        self.logger.log(level, msg)

        assert isinstance(freerun_entry, FreerunProcessEntry)
        event_log = freerun_entry.event_log
        if len(event_log) > MAX_NUMBER_OF_EVENTS:
            del event_log[-1]
        event_log.insert(0, msg)
        self.sfe_dao.update(freerun_entry)

    @with_reconnect
    def _insert_uow(self, freerun_entry):
        """ creates unit_of_work and inserts it into the DB
            :raise DuplicateKeyError: if unit_of_work with given parameters already exists """
        current_timeperiod = time_helper.actual_timeperiod(QUALIFIER_REAL_TIME)
        process_entry = context.process_context[freerun_entry.process_name]

        uow = UnitOfWork()
        uow.process_name = freerun_entry.schedulable_name
        uow.timeperiod = current_timeperiod
        uow.start_id = 0
        uow.end_id = 0
        uow.start_timeperiod = current_timeperiod
        uow.end_timeperiod = current_timeperiod
        uow.created_at = datetime.utcnow()
        uow.submitted_at = datetime.utcnow()
        uow.source = process_entry.source if hasattr(process_entry, 'source') else None
        uow.sink = process_entry.sink if hasattr(process_entry, 'sink') else None
        uow.state = unit_of_work.STATE_REQUESTED
        uow.unit_of_work_type = unit_of_work.TYPE_FREERUN
        uow.number_of_retries = 0
        uow.arguments = freerun_entry.arguments
        uow.db_id = self.uow_dao.insert(uow)

        msg = 'Created: UOW {0} for {1}@{2}.' \
              .format(uow.db_id, freerun_entry.schedulable_name, current_timeperiod)
        self._log_message(INFO, freerun_entry, msg)
        return uow

    def _publish_uow(self, freerun_entry, uow):
        self.mq_transmitter.publish_freerun_uow(freerun_entry, uow)
        msg = 'Published: UOW {0} for {1}.'.format(uow.db_id, freerun_entry.schedulable_name)
        self._log_message(INFO, freerun_entry, msg)

    def insert_and_publish_uow(self, freerun_entry):
        try:
            uow = self._insert_uow(freerun_entry)
        except DuplicateKeyError as e:
            msg = 'Duplication of UOW found for {0}. Error msg: {1}'.format(freerun_entry.schedulable_name, e)
            self._log_message(WARNING, freerun_entry, msg)
            uow = self.uow_dao.recover_from_duplicatekeyerror(e)

        if uow is not None:
            # publish the created/caught up unit_of_work
            self._publish_uow(freerun_entry, uow)
            freerun_entry.related_unit_of_work = uow.db_id
            self.sfe_dao.update(freerun_entry)
        else:
            msg = 'PERSISTENT TIER ERROR! Unable to locate UOW for {0}' \
                  .format(freerun_entry.schedulable_name)
            self._log_message(WARNING, freerun_entry, msg)

    def _process_state_embryo(self, freerun_entry):
        """ method creates unit_of_work and associates it with the FreerunProcessEntry """
        self.insert_and_publish_uow(freerun_entry)

    def _process_state_in_progress(self, freerun_entry, uow):
        """ method that takes care of processing unit_of_work records in STATE_REQUESTED or STATE_IN_PROGRESS states"""
        self._publish_uow(freerun_entry, uow)

    def _process_terminal_state(self, freerun_entry, uow):
        """ method that takes care of processing unit_of_work records in
            STATE_PROCESSED, STATE_NOOP, STATE_INVALID, STATE_CANCELED states"""
        msg = 'UOW for {0} found in state {1}.'.format(freerun_entry.schedulable_name, uow.state)
        self._log_message(INFO, freerun_entry, msg)
        self.insert_and_publish_uow(freerun_entry)

    def manage_schedulable(self, freerun_entry):
        """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed
        In case the Scheduler sees that the unit_of_work is pending it will fire another WorkerMqRequest """

        assert isinstance(freerun_entry, FreerunProcessEntry)
        if freerun_entry.related_unit_of_work is None:
            uow = None
        else:
            uow = self.uow_dao.get_one(freerun_entry.related_unit_of_work)

        try:
            if uow is None:
                self._process_state_embryo(freerun_entry)

            elif uow.is_requested or uow.is_in_progress:
                self._process_state_in_progress(freerun_entry, uow)

            elif uow.is_finished or uow.is_invalid:
                self._process_terminal_state(freerun_entry, uow)

            else:
                msg = 'Unknown state {0} of the UOW {1}'.format(uow.state, uow.db_id)
                self._log_message(ERROR, freerun_entry, msg)

        except LookupError as e:
            msg = 'Lookup issue for schedulable: {0} in timeperiod {1}, because of: {2}' \
                  .format(freerun_entry.db_id, uow.timeperiod, e)
            self._log_message(WARNING, freerun_entry, msg)

    def cancel_uow(self, freerun_entry):
        uow_id = freerun_entry.related_unit_of_work
        if uow_id is None:
            msg = 'cancel_uow: no related UOW for {0}'.format(freerun_entry.schedulable_name)
        else:
            uow = self.uow_dao.get_one(uow_id)
            uow.state = unit_of_work.STATE_CANCELED
            self.uow_dao.update(uow)
            msg = 'cancel_uow: canceled UOW {0} for {1}'.format(uow_id, freerun_entry.schedulable_name)
        self._log_message(INFO, freerun_entry, msg)
Пример #6
0
class GarbageCollector(object):
    """ GC is triggered directly by Synergy Scheduler.
        It scans for invalid or stalled unit_of_work and re-triggers them.
        GC is vital for the health of the system.
        Deployment with no GC is considered invalid """

    def __init__(self, scheduler):
        self.logger = get_logger(PROCESS_GC, append_to_console=False, redirect_stdstream=False)
        self.managed_handlers = scheduler.managed_handlers
        self.mq_transmitter = MqTransmitter(self.logger)
        self.timetable = scheduler.timetable

        self.lock = Lock()
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.reprocess_uows = collections.defaultdict(PriorityQueue)
        self.timer = RepeatTimer(settings.settings['gc_run_interval'], self._run)

    @thread_safe
    def scan_uow_candidates(self):
        """ method performs two actions:
            - enlist stale or invalid units of work into reprocessing queue
            - cancel UOWs that are older than 2 days and have been submitted more than 1 hour ago """
        try:
            since = settings.settings['synergy_start_timeperiod']
            uow_list = self.uow_dao.get_reprocessing_candidates(since)
        except LookupError as e:
            self.logger.info('flow: no UOW candidates found for reprocessing: {0}'.format(e))
            return

        for uow in uow_list:
            try:
                if uow.process_name not in self.managed_handlers:
                    self.logger.debug('process {0} is not known to the Synergy Scheduler. Skipping its UOW.'
                                      .format(uow.process_name))
                    continue

                thread_handler = self.managed_handlers[uow.process_name]
                assert isinstance(thread_handler, ManagedThreadHandler)

                if not thread_handler.process_entry.is_on:
                    self.logger.debug('process {0} is inactive. Skipping its UOW.'.format(uow.process_name))
                    continue

                entry = PriorityEntry(uow)
                if entry in self.reprocess_uows[uow.process_name]:
                    # given UOW is already registered in the reprocessing queue
                    continue

                # ASSUMPTION: UOW is re-created by a state machine during reprocessing
                # thus - any UOW older 2 days could be marked as STATE_CANCELED
                if datetime.utcnow() - uow.created_at > timedelta(hours=settings.settings['gc_life_support_hours']):
                    self._cancel_uow(uow)
                    continue

                # if the UOW has been idle for more than 1 hour - resubmit it
                if datetime.utcnow() - uow.submitted_at > timedelta(hours=settings.settings['gc_resubmit_after_hours'])\
                        or uow.is_invalid:
                    # enlist the UOW into the reprocessing queue
                    self.reprocess_uows[uow.process_name].put(entry)

            except Exception as e:
                self.logger.error('flow exception: {0}'.format(e), exc_info=True)

    def _flush_queue(self, q, ignore_priority=False):
        """
        :param q: PriorityQueue instance holding GarbageCollector entries
        :param ignore_priority: If True - all GarbageCollector entries should be resubmitted
                If False - only those entries whose waiting time has expired will be resubmitted
        """
        assert isinstance(q, PriorityQueue)

        current_timestamp = compute_release_time(lag_in_minutes=0)
        for _ in range(len(q)):
            entry = q.pop()
            assert isinstance(entry, PriorityEntry)

            if ignore_priority or entry.release_time < current_timestamp:
                self._resubmit_uow(entry.entry)
            else:
                q.put(entry)
                break

    @thread_safe
    def flush(self, ignore_priority=False):
        """ method iterates over each reprocessing queues and re-submits UOW whose waiting time has expired """
        for process_name, q in self.reprocess_uows.items():
            self._flush_queue(q, ignore_priority)

    @thread_safe
    def validate(self):
        """ method iterates over the reprocessing queue and synchronizes state of every UOW with the DB
            should it change via the MX to STATE_CANCELED - remove the UOW from the queue """
        for process_name, q in self.reprocess_uows.items():
            if not q:
                continue

            invalid_entries = list()
            for entry in q.queue:
                assert isinstance(entry, PriorityEntry)
                uow = self.uow_dao.get_one(entry.entry.db_id)
                if uow.is_canceled:
                    invalid_entries.append(entry)

            for entry in invalid_entries:
                q.queue.remove(entry)

        self.logger.info('reprocessing queue validated')

    @thread_safe
    def flush_one(self, process_name, ignore_priority=False):
        """ method iterates over the reprocessing queue for the given process
            and re-submits UOW whose waiting time has expired """
        q = self.reprocess_uows[process_name]
        self._flush_queue(q, ignore_priority)

    def _resubmit_uow(self, uow):
        # re-read UOW from the DB, in case it was STATE_CANCELLED by MX
        uow = self.uow_dao.get_one(uow.db_id)
        if uow.is_canceled:
            self.logger.info('suppressed re-submission of UOW {0} for {1}@{2} in {3};'
                             .format(uow.db_id, uow.process_name, uow.timeperiod, uow.state))
            return

        if uow.is_invalid:
            uow.number_of_retries += 1

        uow.state = unit_of_work.STATE_REQUESTED
        uow.submitted_at = datetime.utcnow()
        self.uow_dao.update(uow)

        self.mq_transmitter.publish_managed_uow(uow)
        self.logger.info('re-submitted UOW {0} for {1}@{2}; attempt {3}'
                         .format(uow.db_id, uow.process_name, uow.timeperiod, uow.number_of_retries))

    def _cancel_uow(self, uow):
        uow.state = unit_of_work.STATE_CANCELED
        self.uow_dao.update(uow)

        self.mq_transmitter.publish_uow_status(uow)
        self.logger.info('canceled UOW {0} for {1}@{2}; attempt {3}; created at {4}'
                         .format(uow.db_id, uow.process_name, uow.timeperiod, uow.number_of_retries, uow.created_at))

    def _run(self):
        try:
            self.logger.info('run {')

            self.logger.debug('step 1: validate existing queue entries')
            self.validate()

            self.logger.debug('step 2: scan reprocessing candidates')
            self.scan_uow_candidates()

            self.logger.debug('step 3: repost after timeout')
            self.flush()

            self.logger.debug('step 4: timetable housekeeping')
            self.timetable.build_trees()

            self.logger.debug('step 5: timetable validation')
            self.timetable.validate()
        except Exception as e:
            self.logger.error('GC run exception: {0}'.format(e))
        finally:
            self.logger.info('}')

    def start(self):
        self.timer.start()

    def stop(self):
        self.timer.cancel()
class GarbageCollectorWorker(AbstractMqWorker):
    """ GC is triggered by an empty message from RabbitMQ. It scans for invalid or stalled unit_of_work
    and re-triggers them. GC is vital for the health of the system.
    Deployment with no running GC is considered invalid """

    def __init__(self, process_name):
        super(GarbageCollectorWorker, self).__init__(process_name)
        self.lock = Lock()
        self.publishers = PublishersPool(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.managed_dao = ManagedProcessDao(self.logger)
        self.managed_entries = dict()

    def __del__(self):
        try:
            self.logger.info('Closing Flopsy Publishers Pool...')
            self.publishers.close()
        except Exception as e:
            self.logger.error('Exception caught while closing Flopsy Publishers Pool: %s' % str(e))

        super(GarbageCollectorWorker, self).__del__()

    @thread_safe
    def _mq_callback(self, message):
        """ method looks for stale or invalid units of work re-runs them if needed"""
        try:
            managed_entries = self.managed_dao.get_all()
            self._update_managed_entries(managed_entries)

            since = settings.settings['synergy_start_timeperiod']
            uow_list = self.uow_dao.get_reprocessing_candidates(since)
            for uow in uow_list:
                if uow.process_name not in self.managed_entries:
                    self.logger.debug('Process %r is not known to the Synergy Scheduler. Skipping its unit_of_work.'
                                      % uow.process_name)
                    continue

                process_entry = self.managed_entries[uow.process_name]
                assert isinstance(process_entry, ManagedProcessEntry)
                if not process_entry.is_on:
                    self.logger.debug('Process %r is inactive at the Synergy Scheduler. Skipping its unit_of_work.'
                                      % uow.process_name)
                    continue

                self._process_single_document(uow)

        except LookupError as e:
            self.logger.info('Normal behaviour. %r' % e)
        except Exception as e:
            self.logger.error('_mq_callback: %s' % str(e), exc_info=True)
        finally:
            self.consumer.acknowledge(message.delivery_tag)

    def _update_managed_entries(self, managed_entries):
        self.managed_entries = {me.process_name: me for me in managed_entries}

    def _process_single_document(self, uow):
        """ actually inspects UOW retrieved from the database"""
        repost = False
        if uow.is_invalid:
            repost = True

        elif uow.is_in_progress or uow.is_requested:
            last_activity = uow.started_at
            if last_activity is None:
                last_activity = uow.created_at

            if datetime.utcnow() - last_activity > timedelta(hours=REPOST_AFTER_HOURS):
                repost = True

        if repost:
            mq_request = SynergyMqTransmission(process_name=uow.process_name,
                                               unit_of_work_id=uow.db_id)

            if datetime.utcnow() - uow.created_at < timedelta(hours=LIFE_SUPPORT_HOURS):
                uow.state = unit_of_work.STATE_REQUESTED
                uow.number_of_retries += 1
                self.uow_dao.update(uow)

                publisher = self.publishers.get(uow.process_name)
                publisher.publish(mq_request.document)
                publisher.release()

                self.logger.info('UOW marked for re-processing: process %s; timeperiod %s; id %s; attempt %d'
                                 % (uow.process_name, uow.timeperiod, uow.db_id, uow.number_of_retries))
                self.performance_ticker.tracker.increment_success()
            else:
                uow.state = unit_of_work.STATE_CANCELED
                self.uow_dao.update(uow)

                publisher = self.publishers.get(QUEUE_UOW_REPORT)
                publisher.publish(mq_request.document)
                publisher.release()

                self.logger.info('UOW transferred to STATE_CANCELED: process %s; timeperiod %s; id %s; attempt %d'
                                 % (uow.process_name, uow.timeperiod, uow.db_id, uow.number_of_retries))
class AbstractUowAwareWorker(AbstractMqWorker):
    """ Abstract class is inherited by all workers/aggregators
    that are aware of unit_of_work and capable of processing it"""

    def __init__(self, process_name):
        super(AbstractUowAwareWorker, self).__init__(process_name)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.publishers = PublishersPool(self.logger)

    def __del__(self):
        self.publishers.close()
        super(AbstractUowAwareWorker, self).__del__()

    # **************** Abstract Methods ************************
    def _init_performance_ticker(self, logger):
        self.performance_ticker = UowAwareTracker(logger)
        self.performance_ticker.start()

    def _process_uow(self, uow):
        """
        :param uow: unit_of_work to process
        :return: a tuple (number of processed items/documents/etc, desired unit_of_work state) or None
        if None is returned then it is assumed that the return tuple is (0, unit_of_work.STATE_PROCESSED)
        :raise an Exception if the UOW shall be marked as STATE_INVALID
        """
        raise NotImplementedError('_process_uow must be overridden in the child class %s' % self.__class__.__name__)

    def _clean_up(self):
        """ method is called from the *finally* clause and is suppose to clean up after the uow processing """
        pass

    def _mq_callback(self, message):
        try:
            mq_request = SynergyMqTransmission.from_json(message.body)
            uow = self.uow_dao.get_one(mq_request.unit_of_work_id)
            if not uow.is_requested:
                # accept only UOW in STATE_REQUESTED
                self.logger.warn('Skipping unit_of_work: id %s; state %s;' % (str(message.body), uow.state),
                                 exc_info=False)
                self.consumer.acknowledge(message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Can not identify unit_of_work %s' % str(message.body), exc_info=True)
            self.consumer.acknowledge(message.delivery_tag)
            return

        try:
            uow.state = unit_of_work.STATE_IN_PROGRESS
            uow.started_at = datetime.utcnow()
            self.uow_dao.update(uow)
            self.performance_ticker.start_uow(uow)

            result = self._process_uow(uow)
            if result is None or isinstance(result, str):
                number_of_aggregated_objects, target_state = 0, unit_of_work.STATE_PROCESSED
            else:
                number_of_aggregated_objects, target_state = result

            uow.number_of_aggregated_documents = number_of_aggregated_objects
            uow.number_of_processed_documents = self.performance_ticker.per_job
            uow.finished_at = datetime.utcnow()
            uow.state = target_state
            self.uow_dao.update(uow)
            self.performance_ticker.finish_uow()
        except Exception as e:
            fresh_uow = self.uow_dao.get_one(mq_request.unit_of_work_id)
            self.performance_ticker.cancel_uow()
            if fresh_uow.is_canceled:
                self.logger.warn('unit_of_work: id %s was likely marked by MX as SKIPPED. '
                                 'No unit_of_work update is performed.' % str(message.body),
                                 exc_info=False)
            else:
                self.logger.error('Safety fuse while processing unit_of_work %s in timeperiod %s : %r'
                                  % (message.body, uow.timeperiod, e), exc_info=True)
                uow.state = unit_of_work.STATE_INVALID
                self.uow_dao.update(uow)

        finally:
            self.consumer.acknowledge(message.delivery_tag)
            self.consumer.close()
            self._clean_up()

        try:
            publisher = self.publishers.get(QUEUE_UOW_REPORT)
            publisher.publish(mq_request.document)
            publisher.release()
            self.logger.info('Published unit_of_work status report into %s queue' % QUEUE_UOW_REPORT)
        except Exception:
            self.logger.error('Error on unit_of_work status report publishing', exc_info=True)
class FreerunActionHandler(AbstractActionHandler):
    def __init__(self, request, **values):
        super(FreerunActionHandler, self).__init__(request, **values)
        self.process_name = self.request_arguments.get('process_name')
        self.entry_name = self.request_arguments.get('entry_name')
        self.freerun_process_dao = FreerunProcessDao(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.is_request_valid = True if self.process_name and self.entry_name else False

        if self.is_request_valid:
            self.process_name = self.process_name.strip()
            self.entry_name = self.entry_name.strip()
            self.is_requested_state_on = 'is_on' in self.request_arguments and self.request_arguments['is_on']

    @AbstractActionHandler.thread_handler.getter
    def thread_handler(self):
        handler_key = (self.process_name, self.entry_name)
        return self.scheduler.freerun_handlers[handler_key]

    @AbstractActionHandler.process_entry.getter
    def process_entry(self):
        return self.thread_handler.process_entry

    @valid_action_request
    def action_cancel_uow(self):
        uow_id = self.process_entry.related_unit_of_work
        if uow_id is None:
            resp = {'response': 'no related unit_of_work'}
        else:
            uow = self.uow_dao.get_one(uow_id)
            uow.state = unit_of_work.STATE_CANCELED
            self.uow_dao.update(uow)
            resp = {'response': 'updated unit_of_work %r' % uow_id}
        return resp

    @valid_action_request
    def action_get_uow(self):
        uow_id = self.process_entry.related_unit_of_work
        if uow_id is None:
            resp = {'response': 'no related unit_of_work'}
        else:
            resp = self.uow_dao.get_one(uow_id).document
            for key in resp:
                resp[key] = str(resp[key])
        return resp

    @valid_action_request
    def action_get_log(self):
        return {'log': self.process_entry.log}

    @valid_action_request
    def action_update_entry(self):
        if 'insert_button' in self.request_arguments:
            process_entry = FreerunProcessEntry()
            process_entry.process_name = self.process_name
            process_entry.entry_name = self.entry_name

            if self.request_arguments['arguments']:
                arguments = self.request_arguments['arguments'].decode('unicode-escape')
                process_entry.arguments = json.loads(arguments)
            else:
                process_entry.arguments = {}

            process_entry.description = self.request_arguments['description']
            process_entry.is_on = self.is_requested_state_on
            process_entry.trigger_frequency = self.request_arguments['trigger_frequency']
            self.freerun_process_dao.update(process_entry)

            self.scheduler._register_process_entry(process_entry, self.scheduler.fire_freerun_worker)

        elif 'update_button' in self.request_arguments:
            is_interval_changed = self.process_entry.trigger_frequency != self.request_arguments['trigger_frequency']

            if self.request_arguments['arguments']:
                arguments = self.request_arguments['arguments'].decode('unicode-escape')
                self.process_entry.arguments = json.loads(arguments)
            else:
                self.process_entry.arguments = {}

            self.process_entry.description = self.request_arguments['description']
            self.process_entry.is_on = self.is_requested_state_on
            self.process_entry.trigger_frequency = self.request_arguments['trigger_frequency']
            self.freerun_process_dao.update(self.process_entry)

            if is_interval_changed:
                self.action_change_interval()

            if self.process_entry.is_on != self.is_requested_state_on:
                if self.is_requested_state_on:
                    self.action_activate_trigger()
                else:
                    self.action_deactivate_trigger()

        elif 'delete_button' in self.request_arguments:
            handler_key = (self.process_name, self.entry_name)
            self.thread_handler.deactivate()
            self.freerun_process_dao.remove(handler_key)
            del self.scheduler.freerun_handlers[handler_key]

        elif 'cancel_button' in self.request_arguments:
            pass

        else:
            self.logger.error('Unknown action requested by schedulable_form.html')

        return self.reply_ok()