Example #1
    def __init__(self, request, **values):
        super(FlowActionHandler, self).__init__(request, **values)
        self.flow_dao = FlowDao(self.logger)
        self.step_dao = StepDao(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.log_recording_dao = LogRecordingDao(self.logger)

        self.process_name = self.request_arguments.get(ARGUMENT_PROCESS_NAME)
        self.unit_of_work_type = self.request_arguments.get(
            ARGUMENT_UNIT_OF_WORK_TYPE, TYPE_MANAGED)
        self.flow_name = self.request_arguments.get(ARGUMENT_FLOW_NAME)
        if not self.flow_name and self.process_name:
            process_entry = context.process_context[self.process_name]
            self.flow_name = process_entry.arguments.get(ARGUMENT_FLOW_NAME)

        self.step_name = self.request_arguments.get(ARGUMENT_STEP_NAME)
        self.timeperiod = self.request_arguments.get(ARGUMENT_TIMEPERIOD)
        self.is_request_valid = bool(self.flow_name
                                     and self.flow_name in flows.flows
                                     and self.timeperiod)

        if self.is_request_valid:
            self.flow_name = self.flow_name.strip()
            self.timeperiod = self.timeperiod.strip()

        self.run_mode = self.request_arguments.get(ARGUMENT_RUN_MODE, '')
        self.run_mode = self.run_mode.strip()
Example #2
    def setUp(self):
        self.process_name = PROCESS_ALERT_DAILY
        self.logger = get_logger(self.process_name)
        self.uow_id = create_and_insert_unit_of_work(self.process_name,
                                                     'range_start',
                                                     'range_end')
        self.uow_id = str(self.uow_id)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.log_recording_dao = LogRecordingDao(self.logger)
Example #3
def create_and_insert_unit_of_work(process_name, start_id, end_id, state=unit_of_work.STATE_REQUESTED,
                                   timeperiod='INVALID_TIMEPERIOD'):
    """ method creates and inserts a unit_of_work into DB
    :return id of the created object in the db"""
    uow = create_unit_of_work(process_name, start_id, end_id, timeperiod, state)
    logger = get_logger(process_name)
    uow_dao = UnitOfWorkDao(logger)
    uow_id = uow_dao.insert(uow)
    return uow_id
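
For orientation, a minimal usage sketch of this helper, assuming the same module-level imports the tests on this page rely on (get_logger, UnitOfWorkDao, unit_of_work, PROCESS_UNIT_TEST):

# hedged usage sketch; not part of the original example
logger = get_logger(PROCESS_UNIT_TEST)
uow_dao = UnitOfWorkDao(logger)
uow_id = create_and_insert_unit_of_work(PROCESS_UNIT_TEST, 0, 1,
                                        timeperiod='2010123123')
try:
    uow = uow_dao.get_one(uow_id)
    assert uow.state == unit_of_work.STATE_REQUESTED
finally:
    uow_dao.remove(uow_id)  # keep repeated runs idempotent
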
Example #4
    def tearDown(self):
        uow_dao = UnitOfWorkDao(self.worker.logger)
        uow_dao.remove(self.uow_id)

        for instance in self.worker.instances:
            self.conn.stop_instances(instance.id)

        # killing the worker
        del self.worker
Example #5
def create_and_insert_unit_of_work(process_name,
                                   start_id,
                                   end_id,
                                   state=unit_of_work.STATE_REQUESTED,
                                   timeperiod='INVALID_TIMEPERIOD'):
    """ method creates and inserts a unit_of_work into DB
    :return id of the created object in the db"""
    uow = create_unit_of_work(process_name, start_id, end_id, timeperiod,
                              state)
    logger = get_logger(process_name)
    uow_dao = UnitOfWorkDao(logger)
    uow_id = uow_dao.insert(uow)
    return uow_id
Example #6
class LogRecordingHandlerUnitTest(unittest.TestCase):
    """ Test flow:
        1. create a UOW in the database
        2. emulate mq message
        3. call _mq_callback method
        4. validate that all the messages are now found in the uow_log record
        5. remove UOW and uow_log record
    """
    def setUp(self):
        self.process_name = PROCESS_ALERT_DAILY
        self.logger = get_logger(self.process_name)
        self.uow_id = create_and_insert_unit_of_work(self.process_name,
                                                     'range_start',
                                                     'range_end')
        self.uow_id = str(self.uow_id)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.log_recording_dao = LogRecordingDao(self.logger)

    def tearDown(self):
        self.uow_dao.remove(self.uow_id)
        self.log_recording_dao.remove(self.uow_id)

    def test_logging(self):
        self.worker = ChattyWorker(self.process_name)
        message = TestMessage(process_name=self.process_name,
                              uow_id=self.uow_id)
        self.worker._mq_callback(message)

        uow_log = self.log_recording_dao.get_one(self.uow_id)
        messages = INFO_LOG_MESSAGES + WARN_LOG_MESSAGES  # + STD_MESSAGES
        self.assertLessEqual(len(messages), len(uow_log.log))

        for index, message in enumerate(messages):
            self.assertIn(message, uow_log.log[index])

    def test_exception_logging(self):
        self.worker = ExceptionWorker(self.process_name)
        message = TestMessage(process_name=self.process_name,
                              uow_id=self.uow_id)
        self.worker._mq_callback(message)

        uow_log = self.log_recording_dao.get_one(self.uow_id)
        messages = [
            'Exception: Artificially triggered exception to test Uow Exception Logging',
            'Method ExceptionWorker._process_uow returned None. Assuming happy flow.',
            'at INVALID_TIMEPERIOD: Success/Failure 0/0 entries'
        ]

        for index, message in enumerate(messages):
            self.assertIn(message, uow_log.log[index])
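
ChattyWorker and TestMessage are test doubles that this page does not show. Judging by the AbstractUowAwareWorker contract in a later example, ChattyWorker plausibly looks like the sketch below; the constant names match those the test references, but their contents and the rest of the body are assumptions:

# hedged sketch of the ChattyWorker test double; not the project's actual code
INFO_LOG_MESSAGES = ['info message {0}'.format(i) for i in range(3)]
WARN_LOG_MESSAGES = ['warn message {0}'.format(i) for i in range(3)]

class ChattyWorker(AbstractUowAwareWorker):
    """ emits a fixed series of log records while processing a unit_of_work """
    def __init__(self, process_name):
        # perform_db_logging=True routes log records into the uow_log collection
        super(ChattyWorker, self).__init__(process_name, perform_db_logging=True)

    def _process_uow(self, uow):
        for msg in INFO_LOG_MESSAGES:
            self.logger.info(msg)
        for msg in WARN_LOG_MESSAGES:
            self.logger.warning(msg)
        return 0, unit_of_work.STATE_PROCESSED
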
Example #7
    def __init__(self, process_name):
        super(GarbageCollectorWorker, self).__init__(process_name)
        self.lock = Lock()
        self.publishers = PublishersPool(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.managed_dao = ManagedProcessDao(self.logger)
        self.managed_entries = dict()
Example #8
    def __init__(self, logger, timetable, name):
        self.name = name
        self.logger = logger
        self.publishers = PublishersPool(self.logger)
        self.timetable = timetable
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)
Example #9
class FreerunStatements(object):
    def __init__(self, logger, freerun_handlers):
        self.lock = RLock()
        self.logger = logger
        self.freerun_handlers = freerun_handlers
        self.uow_dao = UnitOfWorkDao(self.logger)

    @thread_safe
    def retrieve_records(self, timeperiod, include_running,
                         include_processed, include_noop, include_failed, include_disabled):
        """ method looks for suitable UOW records and returns them as a dict"""
        resp = dict()
        try:
            query = unit_of_work_dao.QUERY_GET_FREERUN_SINCE(timeperiod, include_running,
                                                             include_processed, include_noop, include_failed)
            records_list = self.uow_dao.run_query(query)
            if len(records_list) == 0:
                self.logger.warning('MX: no Freerun UOW records found since {0}.'.format(timeperiod))

            for uow_record in records_list:
                if uow_record.process_name not in self.freerun_handlers:
                    continue

                thread_handler = self.freerun_handlers[uow_record.process_name]
                if not include_disabled and not thread_handler.process_entry.is_on:
                    continue

                resp[uow_record.key] = uow_record.document
        except Exception as e:
            self.logger.error('MX Dashboard FreerunStatements error: {0}'.format(e))
        return resp
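
A hedged sketch of how MX dashboard code might drive retrieve_records; the logger and the scheduler-owned freerun_handlers wiring are assumptions:

# hypothetical caller; scheduler and its attributes are assumed
statements = FreerunStatements(logger, scheduler.freerun_handlers)
records = statements.retrieve_records(timeperiod='2010123100',
                                      include_running=True,
                                      include_processed=False,
                                      include_noop=False,
                                      include_failed=True,
                                      include_disabled=False)
for key, document in records.items():
    # key identifies the UOW record; document is its raw DB form
    print(key, document)
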
Example #10
    def __init__(self, scheduler):
        self.scheduler = scheduler
        self.timetable = scheduler.timetable
        self.logger = scheduler.logger
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.consumer = Consumer(QUEUE_UOW_REPORT)
        self.main_thread = None
Example #11
    def setUp(self):
        self.process_name = PROCESS_ALERT_DAILY
        self.logger = get_logger(self.process_name)
        self.uow_id = create_and_insert_unit_of_work(self.process_name, 'range_start', 'range_end')
        self.uow_id = str(self.uow_id)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.uow_log_dao = UowLogDao(self.logger)
Example #12
    def __init__(self, logger, timetable, name):
        self.name = name
        self.logger = logger
        self.mq_transmitter = MqTransmitter(self.logger)
        self.timetable = timetable
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)
Example #13
class UowLogHandlerUnitTest(unittest.TestCase):
    """ Test flow:
        1. create a UOW in the database
        2. emulate mq message
        3. call _mq_callback method
        4. validate that all the messages are now found in the uow_log record
        5. remove UOW and uow_log record
    """

    def setUp(self):
        self.process_name = PROCESS_ALERT_DAILY
        self.logger = get_logger(self.process_name)
        self.uow_id = create_and_insert_unit_of_work(self.process_name, 'range_start', 'range_end')
        self.uow_id = str(self.uow_id)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.uow_log_dao = UowLogDao(self.logger)

    def tearDown(self):
        self.uow_dao.remove(self.uow_id)
        self.uow_log_dao.remove(self.uow_id)

    def test_logging(self):
        self.worker = ChattyWorker(self.process_name)
        message = TestMessage(process_name=self.process_name, uow_id=self.uow_id)
        self.worker._mq_callback(message)

        uow_log = self.uow_log_dao.get_one(self.uow_id)
        messages = INFO_LOG_MESSAGES + WARN_LOG_MESSAGES  # + STD_MESSAGES
        self.assertLessEqual(len(messages), len(uow_log.log))

        for index, message in enumerate(messages):
            self.assertIn(message, uow_log.log[index])

    def test_exception_logging(self):
        self.worker = ExceptionWorker(self.process_name)
        message = TestMessage(process_name=self.process_name, uow_id=self.uow_id)
        self.worker._mq_callback(message)

        uow_log = self.uow_log_dao.get_one(self.uow_id)
        messages = ['Exception: Artificially triggered exception to test Uow Exception Logging',
                    'method ExceptionWorker._process_uow returned None. Assuming happy flow.',
                    'at INVALID_TIMEPERIOD: Success/Failure 0/0']

        for index, message in enumerate(messages):
            self.assertIn(message, uow_log.log[index])
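
ExceptionWorker, the second test double, is also not shown. Since the test expects the uow_log to contain both the exception text and the 'returned None. Assuming happy flow.' warning, the double plausibly logs the error itself and returns None; a hedged sketch:

# hedged sketch of the ExceptionWorker test double; not the project's actual code
class ExceptionWorker(AbstractUowAwareWorker):
    """ triggers and logs an exception, then falls through to the happy flow """
    def __init__(self, process_name):
        super(ExceptionWorker, self).__init__(process_name, perform_db_logging=True)

    def _process_uow(self, uow):
        try:
            raise Exception('Artificially triggered exception to test Uow Exception Logging')
        except Exception as e:
            self.logger.error('Exception: {0}'.format(e))
        # implicit None return makes the base class assume (0, STATE_PROCESSED)
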
Example #14
    def test_select_reprocessing_candidates(self):
        logger = get_logger(PROCESS_UNIT_TEST)
        uow_dao = UnitOfWorkDao(logger)

        try:
            initial_candidates = uow_dao.get_reprocessing_candidates()
        except LookupError:
            initial_candidates = []

        try:
            initial_positive_candidates = uow_dao.get_reprocessing_candidates(
                '2010123123')
        except LookupError:
            initial_positive_candidates = []

        positive_timeperiods = {
            u'2010123123': PROCESS_SITE_HOURLY,   # hourly time qualifier
            u'2010123100': PROCESS_SITE_DAILY,    # daily time qualifier
            u'2010120000': PROCESS_SITE_MONTHLY,  # monthly time qualifier
            u'2010000000': PROCESS_SITE_YEARLY    # yearly time qualifier
        }

        negative_timeperiods = {
            u'2009123123': PROCESS_SITE_HOURLY,   # hourly time qualifier
            u'2009123100': PROCESS_SITE_DAILY,    # daily time qualifier
            u'2009120000': PROCESS_SITE_MONTHLY,  # monthly time qualifier
            u'2009000000': PROCESS_SITE_YEARLY    # yearly time qualifier
        }

        all_timeperiods = dict()
        all_timeperiods.update(positive_timeperiods)
        all_timeperiods.update(negative_timeperiods)

        created_uow = []
        for timeperiod, process_name in all_timeperiods.items():
            created_uow.append(
                create_and_insert_unit_of_work(
                    process_name,
                    0,
                    1,
                    timeperiod=timeperiod,
                    state=unit_of_work.STATE_INVALID))

        candidates = uow_dao.get_reprocessing_candidates('2010123123')
        self.assertEqual(
            len(candidates) - len(initial_positive_candidates),
            len(positive_timeperiods))

        candidates = uow_dao.get_reprocessing_candidates()
        self.assertEqual(
            len(candidates) - len(initial_candidates), len(all_timeperiods))

        for uow_id in created_uow:
            uow_dao.remove(uow_id)
Example #15
    def __init__(self, scheduler):
        self.logger = get_logger(PROCESS_GC, append_to_console=False, redirect_stdstream=False)
        self.managed_handlers = scheduler.managed_handlers
        self.mq_transmitter = MqTransmitter(self.logger)
        self.timetable = scheduler.timetable

        self.lock = Lock()
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.reprocess_uows = collections.defaultdict(PriorityQueue)
        self.timer = RepeatTimer(settings.settings['gc_run_interval'], self._run)
Example #16
    def __init__(self, logger, message, consumer, performance_ticker):
        self.logger = logger
        self.message = message
        self.mq_request = SynergyMqTransmission.from_json(message.body)
        self.consumer = consumer
        self.performance_ticker = performance_ticker
        self.alive = False
        self.return_code = -1
        self.uow_dao = UnitOfWorkDao(self.logger)

        self.thread_name = str(self.mq_request)
        super(BashRunnable, self).__init__(name=self.thread_name)
Example #17
    def __init__(self, request, **values):
        super(FreerunActionHandler, self).__init__(request, **values)
        self.process_name = self.request_arguments.get('process_name')
        self.entry_name = self.request_arguments.get('entry_name')
        self.freerun_process_dao = FreerunProcessDao(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.is_request_valid = bool(self.process_name and self.entry_name)

        if self.is_request_valid:
            self.process_name = self.process_name.strip()
            self.entry_name = self.entry_name.strip()
            self.is_requested_state_on = 'is_on' in self.request_arguments and self.request_arguments['is_on']
Example #18
    def __init__(self, logger):
        self.lock = RLock()
        self.logger = logger
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)
        self.reprocess = dict()

        # self.trees contains all of the trees and manages much of their life cycle
        # remember to enlist all trees the system is working with
        self.trees = self._construct_trees_from_context()

        self._register_callbacks()
        self._register_dependencies()
        self.load_tree()
        self.build_trees()
        self.validate()
Example #19
    def test_select_reprocessing_candidates(self):
        logger = get_logger(PROCESS_UNIT_TEST)
        uow_dao = UnitOfWorkDao(logger)

        try:
            initial_candidates = uow_dao.get_reprocessing_candidates()
        except LookupError:
            initial_candidates = []

        try:
            initial_positive_candidates = uow_dao.get_reprocessing_candidates('2010123123')
        except LookupError:
            initial_positive_candidates = []

        positive_timeperiods = {u'2010123123': PROCESS_SITE_HOURLY,    # hourly time qualifier
                                u'2010123100': PROCESS_SITE_DAILY,     # daily time qualifier
                                u'2010120000': PROCESS_SITE_MONTHLY,   # monthly time qualifier
                                u'2010000000': PROCESS_SITE_YEARLY}    # yearly time qualifier

        negative_timeperiods = {u'2009123123': PROCESS_SITE_HOURLY,    # hourly time qualifier
                                u'2009123100': PROCESS_SITE_DAILY,     # daily time qualifier
                                u'2009120000': PROCESS_SITE_MONTHLY,   # monthly time qualifier
                                u'2009000000': PROCESS_SITE_YEARLY}    # yearly time qualifier

        all_timeperiods = dict()
        all_timeperiods.update(positive_timeperiods)
        all_timeperiods.update(negative_timeperiods)

        created_uow = []
        for timeperiod, process_name in all_timeperiods.items():
            created_uow.append(create_and_insert_unit_of_work(process_name,
                                                              0,
                                                              1,
                                                              timeperiod=timeperiod,
                                                              state=unit_of_work.STATE_INVALID))

        candidates = uow_dao.get_reprocessing_candidates('2010123123')
        self.assertEqual(len(candidates) - len(initial_positive_candidates), len(positive_timeperiods))

        candidates = uow_dao.get_reprocessing_candidates()
        self.assertEqual(len(candidates) - len(initial_candidates), len(all_timeperiods))

        for uow_id in created_uow:
            uow_dao.remove(uow_id)
Example #20
class FreerunStatements(object):
    def __init__(self, logger):
        self.lock = RLock()
        self.logger = logger
        self.uow_dao = UnitOfWorkDao(self.logger)

    @thread_safe
    def retrieve_records(self, timeperiod, unprocessed_only):
        """ method looks for suitable UOW records and returns them as a dict"""
        resp = dict()
        try:
            query = unit_of_work_dao.QUERY_GET_FREERUN_SINCE(timeperiod, unprocessed_only)
            records_list = self.uow_dao.run_query(query)
            if len(records_list) == 0:
                self.logger.warning('No Freerun UOW records found since {0}.'.format(timeperiod))

            for uow_record in records_list:
                resp[uow_record.key] = uow_record.document
        except Exception as e:
            self.logger.error('DashboardHandler error: {0}'.format(e))
        return resp
Example #21
class GarbageCollectorWorker(AbstractMqWorker):
    """ GC is triggered by an empty message from RabbitMQ. It scans for invalid or stalled unit_of_work
    and re-triggers them. GC is vital for the health of the system.
    Deployment with no running GC is considered invalid """

    def __init__(self, process_name):
        super(GarbageCollectorWorker, self).__init__(process_name)
        self.lock = Lock()
        self.publishers = PublishersPool(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.managed_dao = ManagedProcessDao(self.logger)
        self.managed_entries = dict()

    def __del__(self):
        try:
            self.logger.info('Closing Flopsy Publishers Pool...')
            self.publishers.close()
        except Exception as e:
            self.logger.error('Exception caught while closing Flopsy Publishers Pool: %s' % str(e))

        super(GarbageCollectorWorker, self).__del__()

    @thread_safe
    def _mq_callback(self, message):
        """ method looks for stale or invalid units of work re-runs them if needed"""
        try:
            managed_entries = self.managed_dao.get_all()
            self._update_managed_entries(managed_entries)

            since = settings.settings['synergy_start_timeperiod']
            uow_list = self.uow_dao.get_reprocessing_candidates(since)
            for uow in uow_list:
                if uow.process_name not in self.managed_entries:
                    self.logger.debug('Process %r is not known to the Synergy Scheduler. Skipping its unit_of_work.'
                                      % uow.process_name)
                    continue

                process_entry = self.managed_entries[uow.process_name]
                assert isinstance(process_entry, ManagedProcessEntry)
                if not process_entry.is_on:
                    self.logger.debug('Process %r is inactive at the Synergy Scheduler. Skipping its unit_of_work.'
                                      % uow.process_name)
                    continue

                self._process_single_document(uow)

        except LookupError as e:
            self.logger.info('Normal behaviour. %r' % e)
        except Exception as e:
            self.logger.error('_mq_callback: %s' % str(e), exc_info=True)
        finally:
            self.consumer.acknowledge(message.delivery_tag)

    def _update_managed_entries(self, managed_entries):
        self.managed_entries = {me.process_name: me for me in managed_entries}

    def _process_single_document(self, uow):
        """ actually inspects UOW retrieved from the database"""
        repost = False
        if uow.is_invalid:
            repost = True

        elif uow.is_in_progress or uow.is_requested:
            last_activity = uow.started_at
            if last_activity is None:
                last_activity = uow.created_at

            if datetime.utcnow() - last_activity > timedelta(hours=REPOST_AFTER_HOURS):
                repost = True

        if repost:
            mq_request = SynergyMqTransmission(process_name=uow.process_name,
                                               unit_of_work_id=uow.db_id)

            if datetime.utcnow() - uow.created_at < timedelta(hours=LIFE_SUPPORT_HOURS):
                uow.state = unit_of_work.STATE_REQUESTED
                uow.number_of_retries += 1
                self.uow_dao.update(uow)

                publisher = self.publishers.get(uow.process_name)
                publisher.publish(mq_request.document)
                publisher.release()

                self.logger.info('UOW marked for re-processing: process %s; timeperiod %s; id %s; attempt %d'
                                 % (uow.process_name, uow.timeperiod, uow.db_id, uow.number_of_retries))
                self.performance_ticker.tracker.increment_success()
            else:
                uow.state = unit_of_work.STATE_CANCELED
                self.uow_dao.update(uow)

                publisher = self.publishers.get(QUEUE_UOW_REPORT)
                publisher.publish(mq_request.document)
                publisher.release()

                self.logger.info('UOW transferred to STATE_CANCELED: process %s; timeperiod %s; id %s; attempt %d'
                                 % (uow.process_name, uow.timeperiod, uow.db_id, uow.number_of_retries))
Example #22
class FlowActionHandler(BaseRequestHandler):
    def __init__(self, request, **values):
        super(FlowActionHandler, self).__init__(request, **values)
        self.flow_dao = FlowDao(self.logger)
        self.step_dao = StepDao(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.log_recording_dao = LogRecordingDao(self.logger)

        self.process_name = self.request_arguments.get(ARGUMENT_PROCESS_NAME)
        self.unit_of_work_type = self.request_arguments.get(
            ARGUMENT_UNIT_OF_WORK_TYPE, TYPE_MANAGED)
        self.flow_name = self.request_arguments.get(ARGUMENT_FLOW_NAME)
        if not self.flow_name and self.process_name:
            process_entry = context.process_context[self.process_name]
            self.flow_name = process_entry.arguments.get(ARGUMENT_FLOW_NAME)

        self.step_name = self.request_arguments.get(ARGUMENT_STEP_NAME)
        self.timeperiod = self.request_arguments.get(ARGUMENT_TIMEPERIOD)
        self.is_request_valid = bool(self.flow_name
                                     and self.flow_name in flows.flows
                                     and self.timeperiod)

        if self.is_request_valid:
            self.flow_name = self.flow_name.strip()
            self.timeperiod = self.timeperiod.strip()

        self.run_mode = self.request_arguments.get(ARGUMENT_RUN_MODE, '')
        self.run_mode = self.run_mode.strip()

    def _get_tree_node(self):
        tree = self.scheduler.timetable.get_tree(self.process_name)
        if tree is None:
            raise UserWarning(
                'No Timetable tree is registered for process {0}'.format(
                    self.process_name))

        time_qualifier = context.process_context[
            self.process_name].time_qualifier
        self.timeperiod = time_helper.cast_to_time_qualifier(
            time_qualifier, self.timeperiod)
        node = tree.get_node(self.process_name, self.timeperiod)
        return node

    @property
    def job_record(self):
        node = self._get_tree_node()
        return node.job_record

    @property
    def managed_uow_record(self):
        node = self._get_tree_node()
        uow_id = node.job_record.related_unit_of_work
        if not uow_id:
            return None
        return self.uow_dao.get_one(uow_id)

    @property
    def freerun_uow_records(self):
        valid_freerun_uow = list()
        records_list = find_all_flow_uows(self.uow_dao, self.process_name,
                                          self.flow_name, self.timeperiod)
        if len(records_list) == 0:
            self.logger.warning(
                'MX: no Freerun UOW records found for {0}@{1} -> {2}.'.format(
                    self.process_name, self.timeperiod, self.flow_name))
            return valid_freerun_uow

        for uow_record in records_list:
            # freerun uow.process_name is a composite in format <process_name::entry_name>
            handler_key = split_schedulable_name(uow_record.process_name)
            if handler_key not in self.scheduler.freerun_handlers:
                # skip UOW records that have no active freerun handler
                continue

            valid_freerun_uow.append(uow_record)
        return valid_freerun_uow

    @property
    def freerun_process_entry(self):
        """ :returns run-time only instance of the FreerunProcessEntry """
        entry_name = build_schedulable_name(self.flow_name, self.step_name)
        handler_key = (self.process_name, entry_name)
        if handler_key not in self.scheduler.freerun_handlers:
            classname = context.process_context[self.process_name].classname
            entry = freerun_context_entry(
                process_name=self.process_name,
                entry_name=entry_name,
                classname=classname,
                token=entry_name,
                trigger_frequency='every {0}'.format(SECONDS_IN_CENTURY),
                is_on=False,
                description='Runtime freerun object to facilitate '
                            'CUSTOM RUN MODES for workflow')

            # find uow for workflow step if any
            uow = find_flow_step_uow(self.uow_dao, self.process_name,
                                     self.flow_name, self.step_name,
                                     self.timeperiod)
            if uow:
                entry.related_unit_of_work = uow.db_id

            self.scheduler.freerun_handlers[handler_key] = entry
        return self.scheduler.freerun_handlers[handler_key]

    @property
    def flow_record(self):
        return self.flow_dao.get_one([self.flow_name, self.timeperiod])

    @property
    def step_record(self):
        return self.step_dao.get_one(
            [self.flow_name, self.step_name, self.timeperiod])

    @property
    def flow_graph_obj(self):
        _flow_graph_obj = copy.deepcopy(flows.flows[self.flow_name])
        _flow_graph_obj.context = ExecutionContext(self.flow_name,
                                                   self.timeperiod, None, None,
                                                   settings.settings)

        try:
            flow_entry = self.flow_dao.get_one(
                [self.flow_name, self.timeperiod])
            _flow_graph_obj.context.flow_entry = flow_entry
            _flow_graph_obj.context.start_timeperiod = flow_entry.start_timeperiod
            _flow_graph_obj.context.end_timeperiod = flow_entry.end_timeperiod

            steps = self.step_dao.get_all_by_flow_id(flow_entry.db_id)
            for s in steps:
                assert isinstance(s, Step)
                _flow_graph_obj[s.step_name].step_entry = s
                _flow_graph_obj.yielded.append(s)
        except LookupError:
            pass
        return _flow_graph_obj

    @cached_property
    @valid_action_request
    def flow_details(self):
        rest_model = create_rest_flow(self.flow_graph_obj)
        return rest_model.document

    @cached_property
    def process_name(self):
        return self.process_name

    @cached_property
    def unit_of_work_type(self):
        return self.unit_of_work_type

    @cached_property
    def active_run_mode(self):
        return self.flow_dao.managed_run_mode(self.process_name,
                                              self.flow_name, self.timeperiod)

    @cached_property
    @valid_action_request
    def step_details(self):
        graph_node_obj = self.flow_graph_obj._dict[self.step_name]
        rest_model = create_rest_step(graph_node_obj)
        return rest_model.document

    @valid_action_request
    def set_run_mode(self):
        """
        - set a flag for ProcessEntry.arguments[ARGUMENT_RUN_MODE] = RUN_MODE_RECOVERY
        - trigger standard reprocessing
        """
        if not self.job_record or not self.run_mode:
            return RESPONSE_NOT_OK

        try:
            msg = 'MX: setting RUN MODE for {0}@{1} -> {2} to {3}' \
                .format(self.process_name, self.timeperiod, self.flow_name, self.run_mode)
            self.scheduler.timetable.add_log_entry(self.process_name,
                                                   self.timeperiod, msg)
            self.logger.info(msg + ' {')

            local_record = self.flow_record
            local_record.run_mode = self.run_mode
            self.flow_dao.update(local_record)

            return RESPONSE_OK
        except (ValidationError, LookupError):
            return RESPONSE_NOT_OK
        finally:
            self.logger.info('}')

    def perform_freerun_action(self, run_mode):
        """
        - make sure that the job is finished
          i.e. the job is in [STATE_NOOP, STATE_PROCESSED, STATE_SKIPPED]
        - submit a FREERUN UOW for given (process_name::flow_name::step_name, timeperiod)
        :return RESPONSE_OK if the UOW was submitted and RESPONSE_NOT_OK otherwise
        """
        if not self.job_record or not self.job_record.is_finished:
            return RESPONSE_NOT_OK

        uow = self.managed_uow_record
        if not uow:
            # for skipped job that has no UOW associated with it
            return RESPONSE_NOT_OK

        flow_request = FlowRequest(self.process_name, self.flow_name,
                                   self.step_name, run_mode, self.timeperiod,
                                   uow.start_timeperiod, uow.end_timeperiod)

        state_machine = self.scheduler.timetable.state_machines[
            STATE_MACHINE_FREERUN]
        state_machine.manage_schedulable(self.freerun_process_entry,
                                         flow_request)
        return RESPONSE_OK

    @valid_action_request
    def run_one_step(self):
        return self.perform_freerun_action(RUN_MODE_RUN_ONE)

    @valid_action_request
    def run_from_step(self):
        """
        - make sure that the job is finished
          i.e. the job is in [STATE_NOOP, STATE_PROCESSED, STATE_SKIPPED]
        - submit a FREERUN UOW for given (process_name::flow_name::step_name, timeperiod)
        :return RESPONSE_OK if the UOW was submitted and RESPONSE_NOT_OK otherwise
        """
        return self.perform_freerun_action(RUN_MODE_RUN_FROM)

    @valid_action_request
    @safe_json_response
    def get_step_log(self):
        try:
            resp = self.log_recording_dao.get_one(
                self.step_record.db_id).document
        except (TypeError, LookupError):
            resp = {'response': 'no related step log'}
        return resp

    @valid_action_request
    @safe_json_response
    def get_flow_log(self):
        try:
            resp = self.log_recording_dao.get_one(
                self.flow_record.db_id).document
        except (TypeError, LookupError):
            resp = {'response': 'no related workflow log'}
        return resp
Example #23
class GarbageCollector(object):
    """ GC is triggered directly by Synergy Scheduler.
        It scans for invalid or stalled unit_of_work and re-triggers them.
        GC is vital for the health of the system.
        Deployment with no GC is considered invalid """

    def __init__(self, scheduler):
        self.logger = get_logger(PROCESS_GC, append_to_console=False, redirect_stdstream=False)
        self.managed_handlers = scheduler.managed_handlers
        self.mq_transmitter = MqTransmitter(self.logger)
        self.timetable = scheduler.timetable

        self.lock = Lock()
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.reprocess_uows = collections.defaultdict(PriorityQueue)
        self.timer = RepeatTimer(settings.settings['gc_run_interval'], self._run)

    @thread_safe
    def scan_uow_candidates(self):
        """ method performs two actions:
            - enlist stale or invalid units of work into reprocessing queue
            - cancel UOWs that are older than 2 days and have been submitted more than 1 hour ago """
        try:
            since = settings.settings['synergy_start_timeperiod']
            uow_list = self.uow_dao.get_reprocessing_candidates(since)
        except LookupError as e:
            self.logger.info('flow: no UOW candidates found for reprocessing: {0}'.format(e))
            return

        for uow in uow_list:
            try:
                if uow.process_name not in self.managed_handlers:
                    self.logger.debug('process {0} is not known to the Synergy Scheduler. Skipping its UOW.'
                                      .format(uow.process_name))
                    continue

                thread_handler = self.managed_handlers[uow.process_name]
                assert isinstance(thread_handler, ManagedThreadHandler)

                if not thread_handler.process_entry.is_on:
                    self.logger.debug('process {0} is inactive. Skipping its UOW.'.format(uow.process_name))
                    continue

                entry = PriorityEntry(uow)
                if entry in self.reprocess_uows[uow.process_name]:
                    # given UOW is already registered in the reprocessing queue
                    continue

                # ASSUMPTION: a UOW is re-created by a state machine during reprocessing;
                # thus any UOW older than 2 days can be marked as STATE_CANCELED
                if datetime.utcnow() - uow.created_at > timedelta(hours=settings.settings['gc_life_support_hours']):
                    self._cancel_uow(uow)
                    continue

                # if the UOW has been idle for more than 1 hour - resubmit it
                if datetime.utcnow() - uow.submitted_at > timedelta(hours=settings.settings['gc_resubmit_after_hours'])\
                        or uow.is_invalid:
                    # enlist the UOW into the reprocessing queue
                    self.reprocess_uows[uow.process_name].put(entry)

            except Exception as e:
                self.logger.error('flow exception: {0}'.format(e), exc_info=True)

    def _flush_queue(self, q, ignore_priority=False):
        """
        :param q: PriorityQueue instance holding GarbageCollector entries
        :param ignore_priority: if True, all GarbageCollector entries are resubmitted;
                if False, only those entries whose waiting time has expired are resubmitted
        """
        assert isinstance(q, PriorityQueue)

        current_timestamp = compute_release_time(lag_in_minutes=0)
        for _ in range(len(q)):
            entry = q.pop()
            assert isinstance(entry, PriorityEntry)

            if ignore_priority or entry.release_time < current_timestamp:
                self._resubmit_uow(entry.entry)
            else:
                q.put(entry)
                break

    @thread_safe
    def flush(self, ignore_priority=False):
        """ method iterates over each reprocessing queues and re-submits UOW whose waiting time has expired """
        for process_name, q in self.reprocess_uows.items():
            self._flush_queue(q, ignore_priority)

    @thread_safe
    def validate(self):
        """ method iterates over the reprocessing queue and synchronizes state of every UOW with the DB
            should it change via the MX to STATE_CANCELED - remove the UOW from the queue """
        for process_name, q in self.reprocess_uows.items():
            if not q:
                continue

            invalid_entries = list()
            for entry in q.queue:
                assert isinstance(entry, PriorityEntry)
                uow = self.uow_dao.get_one(entry.entry.db_id)
                if uow.is_canceled:
                    invalid_entries.append(entry)

            for entry in invalid_entries:
                q.queue.remove(entry)

        self.logger.info('reprocessing queue validated')

    @thread_safe
    def flush_one(self, process_name, ignore_priority=False):
        """ method iterates over the reprocessing queue for the given process
            and re-submits UOW whose waiting time has expired """
        q = self.reprocess_uows[process_name]
        self._flush_queue(q, ignore_priority)

    def _resubmit_uow(self, uow):
        # re-read the UOW from the DB, in case it was moved to STATE_CANCELED by the MX
        uow = self.uow_dao.get_one(uow.db_id)
        if uow.is_canceled:
            self.logger.info('suppressed re-submission of UOW {0} for {1}@{2} in {3};'
                             .format(uow.db_id, uow.process_name, uow.timeperiod, uow.state))
            return

        if uow.is_invalid:
            uow.number_of_retries += 1

        uow.state = unit_of_work.STATE_REQUESTED
        uow.submitted_at = datetime.utcnow()
        self.uow_dao.update(uow)

        self.mq_transmitter.publish_managed_uow(uow)
        self.logger.info('re-submitted UOW {0} for {1}@{2}; attempt {3}'
                         .format(uow.db_id, uow.process_name, uow.timeperiod, uow.number_of_retries))

    def _cancel_uow(self, uow):
        uow.state = unit_of_work.STATE_CANCELED
        self.uow_dao.update(uow)

        self.mq_transmitter.publish_uow_status(uow)
        self.logger.info('canceled UOW {0} for {1}@{2}; attempt {3}; created at {4}'
                         .format(uow.db_id, uow.process_name, uow.timeperiod, uow.number_of_retries, uow.created_at))

    def _run(self):
        try:
            self.logger.info('run {')

            self.logger.debug('step 1: validate existing queue entries')
            self.validate()

            self.logger.debug('step 2: scan reprocessing candidates')
            self.scan_uow_candidates()

            self.logger.debug('step 3: repost after timeout')
            self.flush()

            self.logger.debug('step 4: timetable housekeeping')
            self.timetable.build_trees()

            self.logger.debug('step 5: timetable validation')
            self.timetable.validate()
        except Exception as e:
            self.logger.error('GC run exception: {0}'.format(e))
        finally:
            self.logger.info('}')

    def start(self):
        self.timer.start()

    def stop(self):
        self.timer.cancel()
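
A hedged lifecycle sketch; the scheduler object is assumed to expose the managed_handlers and timetable attributes used by the constructor above:

# hypothetical start-up/shutdown wiring inside the Synergy Scheduler
gc = GarbageCollector(scheduler)
gc.start()                          # the RepeatTimer now invokes _run() periodically
try:
    gc.flush(ignore_priority=True)  # force immediate re-submission of queued UOWs
finally:
    gc.stop()                       # cancel the timer on shutdown
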
Example #24
class StateMachineFreerun(object):
    """ State Machine to handle freerun jobs/triggers """

    def __init__(self, logger, name=STATE_MACHINE_FREERUN):
        self.name = name
        self.logger = logger
        self.mq_transmitter = MqTransmitter(self.logger)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.sfe_dao = FreerunProcessDao(self.logger)

    @with_reconnect
    def _log_message(self, level, freerun_entry, msg):
        """ method performs logging into log file and the freerun_entry """
        self.logger.log(level, msg)

        assert isinstance(freerun_entry, FreerunProcessEntry)
        event_log = freerun_entry.event_log
        if len(event_log) > MAX_NUMBER_OF_EVENTS:
            del event_log[-1]
        event_log.insert(0, msg)
        self.sfe_dao.update(freerun_entry)

    @with_reconnect
    def _insert_uow(self, freerun_entry):
        """ creates unit_of_work and inserts it into the DB
            :raise DuplicateKeyError: if unit_of_work with given parameters already exists """
        current_timeperiod = time_helper.actual_timeperiod(QUALIFIER_REAL_TIME)
        process_entry = context.process_context[freerun_entry.process_name]

        uow = UnitOfWork()
        uow.process_name = freerun_entry.schedulable_name
        uow.timeperiod = current_timeperiod
        uow.start_id = 0
        uow.end_id = 0
        uow.start_timeperiod = current_timeperiod
        uow.end_timeperiod = current_timeperiod
        uow.created_at = datetime.utcnow()
        uow.submitted_at = datetime.utcnow()
        uow.source = process_entry.source if hasattr(process_entry, 'source') else None
        uow.sink = process_entry.sink if hasattr(process_entry, 'sink') else None
        uow.state = unit_of_work.STATE_REQUESTED
        uow.unit_of_work_type = unit_of_work.TYPE_FREERUN
        uow.number_of_retries = 0
        uow.arguments = freerun_entry.arguments
        uow.db_id = self.uow_dao.insert(uow)

        msg = 'Created: UOW {0} for {1}@{2}.' \
              .format(uow.db_id, freerun_entry.schedulable_name, current_timeperiod)
        self._log_message(INFO, freerun_entry, msg)
        return uow

    def _publish_uow(self, freerun_entry, uow):
        self.mq_transmitter.publish_freerun_uow(freerun_entry, uow)
        msg = 'Published: UOW {0} for {1}.'.format(uow.db_id, freerun_entry.schedulable_name)
        self._log_message(INFO, freerun_entry, msg)

    def insert_and_publish_uow(self, freerun_entry):
        try:
            uow = self._insert_uow(freerun_entry)
        except DuplicateKeyError as e:
            msg = 'Duplication of UOW found for {0}. Error msg: {1}'.format(freerun_entry.schedulable_name, e)
            self._log_message(WARNING, freerun_entry, msg)
            uow = self.uow_dao.recover_from_duplicatekeyerror(e)

        if uow is not None:
            # publish the created/caught up unit_of_work
            self._publish_uow(freerun_entry, uow)
            freerun_entry.related_unit_of_work = uow.db_id
            self.sfe_dao.update(freerun_entry)
        else:
            msg = 'PERSISTENT TIER ERROR! Unable to locate UOW for {0}' \
                  .format(freerun_entry.schedulable_name)
            self._log_message(WARNING, freerun_entry, msg)

    def _process_state_embryo(self, freerun_entry):
        """ method creates unit_of_work and associates it with the FreerunProcessEntry """
        self.insert_and_publish_uow(freerun_entry)

    def _process_state_in_progress(self, freerun_entry, uow):
        """ method that takes care of processing unit_of_work records in STATE_REQUESTED or STATE_IN_PROGRESS states"""
        self._publish_uow(freerun_entry, uow)

    def _process_terminal_state(self, freerun_entry, uow):
        """ method that takes care of processing unit_of_work records in
            STATE_PROCESSED, STATE_NOOP, STATE_INVALID, STATE_CANCELED states"""
        msg = 'UOW for {0} found in state {1}.'.format(freerun_entry.schedulable_name, uow.state)
        self._log_message(INFO, freerun_entry, msg)
        self.insert_and_publish_uow(freerun_entry)

    def manage_schedulable(self, freerun_entry):
        """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed
        In case the Scheduler sees that the unit_of_work is pending it will fire another WorkerMqRequest """

        assert isinstance(freerun_entry, FreerunProcessEntry)
        if freerun_entry.related_unit_of_work is None:
            uow = None
        else:
            uow = self.uow_dao.get_one(freerun_entry.related_unit_of_work)

        try:
            if uow is None:
                self._process_state_embryo(freerun_entry)

            elif uow.is_requested or uow.is_in_progress:
                self._process_state_in_progress(freerun_entry, uow)

            elif uow.is_finished or uow.is_invalid:
                self._process_terminal_state(freerun_entry, uow)

            else:
                msg = 'Unknown state {0} of the UOW {1}'.format(uow.state, uow.db_id)
                self._log_message(ERROR, freerun_entry, msg)

        except LookupError as e:
            msg = 'Lookup issue for schedulable: {0} in timeperiod {1}, because of: {2}' \
                  .format(freerun_entry.db_id, uow.timeperiod, e)
            self._log_message(WARNING, freerun_entry, msg)

    def cancel_uow(self, freerun_entry):
        uow_id = freerun_entry.related_unit_of_work
        if uow_id is None:
            msg = 'cancel_uow: no related UOW for {0}'.format(freerun_entry.schedulable_name)
        else:
            uow = self.uow_dao.get_one(uow_id)
            uow.state = unit_of_work.STATE_CANCELED
            self.uow_dao.update(uow)
            msg = 'cancel_uow: canceled UOW {0} for {1}'.format(uow_id, freerun_entry.schedulable_name)
        self._log_message(INFO, freerun_entry, msg)
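
A hedged invocation sketch; in the running system, manage_schedulable is presumably called by the Scheduler's freerun thread handler on every trigger:

# hypothetical trigger handling; logger and freerun_entry are supplied by the Scheduler
state_machine = StateMachineFreerun(logger)
state_machine.manage_schedulable(freerun_entry)  # creates or re-publishes a UOW
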
Example #25
    def __init__(self, logger, freerun_handlers):
        self.lock = RLock()
        self.logger = logger
        self.freerun_handlers = freerun_handlers
        self.uow_dao = UnitOfWorkDao(self.logger)
Example #26
    def tearDown(self):
        # cleaning up DB
        self.virtual_tear_down()
        uow_dao = UnitOfWorkDao(self.aggregator.logger)
        uow_dao.remove(self.uow_id)
        del self.aggregator
Example #27
    def __init__(self, request, **values):
        super(AbstractActionHandler, self).__init__(request, **values)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.log_recording_dao = LogRecordingDao(self.logger)
Example #28
class AbstractUowAwareWorker(AbstractMqWorker):
    """ Abstract class is inherited by all workers/aggregators
    that are aware of unit_of_work and capable of processing it """

    def __init__(self, process_name, perform_db_logging=False):
        super(AbstractUowAwareWorker, self).__init__(process_name)
        self.perform_db_logging = perform_db_logging
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.mq_transmitter = MqTransmitter(self.logger)

    def __del__(self):
        del self.mq_transmitter
        super(AbstractUowAwareWorker, self).__del__()

    # **************** Abstract Methods ************************
    def _init_performance_tracker(self, logger):
        self.performance_tracker = UowAwareTracker(logger)
        self.performance_tracker.start()

    def _process_uow(self, uow):
        """
        :param uow: unit_of_work to process
        :return: a tuple (number of processed items/documents/etc, desired unit_of_work state) or None;
                 if None is returned, the return tuple is assumed to be (0, unit_of_work.STATE_PROCESSED)
        :raise Exception: if the UOW shall be marked as STATE_INVALID
        """
        raise NotImplementedError('method _process_uow must be implemented by {0}'.format(self.__class__.__name__))

    def _clean_up(self):
        """ method is called from the *finally* clause and is suppose to clean up after the uow processing """
        pass

    def _mq_callback(self, message):
        try:
            mq_request = MqTransmission.from_json(message.body)
            uow = self.uow_dao.get_one(mq_request.record_db_id)
            if not uow.is_requested:
                # accept only UOW in STATE_REQUESTED
                self.logger.warning('Skipping UOW: id {0}; state {1};'.format(message.body, uow.state),
                                    exc_info=False)
                self.consumer.acknowledge(message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Can not identify UOW {0}'.format(message.body), exc_info=True)
            self.consumer.acknowledge(message.delivery_tag)
            return

        db_log_handler = UowLogHandler(self.logger, uow.db_id)
        try:
            uow.state = unit_of_work.STATE_IN_PROGRESS
            uow.started_at = datetime.utcnow()
            self.uow_dao.update(uow)
            self.performance_tracker.start_uow(uow)

            if self.perform_db_logging:
                db_log_handler.attach()

            result = self._process_uow(uow)
            if result is None:
                self.logger.warning('method {0}._process_uow returned None. Assuming happy flow.'
                                    .format(self.__class__.__name__))
                number_of_aggregated_objects, target_state = 0, unit_of_work.STATE_PROCESSED
            else:
                number_of_aggregated_objects, target_state = result

            uow.number_of_aggregated_documents = number_of_aggregated_objects
            uow.number_of_processed_documents = self.performance_tracker.success_per_job
            uow.finished_at = datetime.utcnow()
            uow.state = target_state
            self.uow_dao.update(uow)

            if uow.is_finished:
                self.performance_tracker.finish_uow()
            else:
                self.performance_tracker.cancel_uow()

        except Exception as e:
            fresh_uow = self.uow_dao.get_one(mq_request.record_db_id)
            self.performance_tracker.cancel_uow()
            if fresh_uow.is_canceled:
                self.logger.warning('UOW {0} for {1}@{2} was likely marked by MX as SKIPPED. No UOW update performed.'
                                    .format(uow.db_id, uow.process_name, uow.timeperiod), exc_info=False)
            else:
                self.logger.error('Safety fuse while processing UOW {0} for {1}@{2}: {3}'
                                  .format(uow.db_id, uow.process_name, uow.timeperiod, e), exc_info=True)
                uow.state = unit_of_work.STATE_INVALID
                self.uow_dao.update(uow)

        finally:
            self.consumer.acknowledge(message.delivery_tag)
            self.consumer.close()
            self._clean_up()
            db_log_handler.detach()

        try:
            self.mq_transmitter.publish_uow_status(uow)
            self.logger.info('UOW *{0}* status report published into MQ'.format(uow.state))
        except Exception:
            self.logger.error('Error on UOW status report publishing', exc_info=True)
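
The _process_uow contract above is easiest to read from a concrete subclass; a minimal illustrative sketch (the worker name and its body are assumptions, not project code):

# illustrative subclass; an exception raised in _process_uow drives the UOW to STATE_INVALID
class RangeCountingWorker(AbstractUowAwareWorker):
    def _process_uow(self, uow):
        processed = int(uow.end_id) - int(uow.start_id)
        self.logger.info('processed {0} entries for {1}@{2}'
                         .format(processed, uow.process_name, uow.timeperiod))
        # returning None here would be interpreted as (0, unit_of_work.STATE_PROCESSED)
        return processed, unit_of_work.STATE_PROCESSED
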
Example #29
class AbstractActionHandler(BaseRequestHandler):
    def __init__(self, request, **values):
        super(AbstractActionHandler, self).__init__(request, **values)
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.log_recording_dao = LogRecordingDao(self.logger)

    @property
    def thread_handler(self):
        raise NotImplementedError('property thread_handler must be implemented by {0}'.format(self.__class__.__name__))

    @property
    def process_entry(self):
        raise NotImplementedError('property process_entry must be implemented by {0}'.format(self.__class__.__name__))

    @property
    def uow_id(self):
        raise NotImplementedError('property uow_id must be implemented by {0}'.format(self.__class__.__name__))

    def get_event_log(self):
        raise NotImplementedError('method get_event_log must be implemented by {0}'
                                  .format(self.__class__.__name__))

    @safe_json_response
    def get_uow(self):
        if self.uow_id is None:
            resp = {'response': 'no related unit_of_work'}
        else:
            resp = self.uow_dao.get_one(self.uow_id).document
        return resp

    @safe_json_response
    def get_uow_log(self):
        try:
            resp = self.log_recording_dao.get_one(self.uow_id).document
        except (TypeError, LookupError):
            resp = {'response': 'no related uow log'}
        return resp

    @valid_action_request
    def change_interval(self):
        resp = dict()
        new_interval = self.request_arguments['interval']
        if new_interval is not None:
            self.thread_handler.change_interval(new_interval)
            msg = 'changed interval for {0} to {1}'.format(self.thread_handler.key, new_interval)
            self.logger.info('MX: {0}'.format(msg))
            resp['status'] = msg

        return resp

    @valid_action_request
    def trigger_now(self):
        self.thread_handler.trigger()
        self.logger.info('MX: triggered thread handler {0}'.format(self.thread_handler.key))
        return self.reply_ok()

    @valid_action_request
    def activate_trigger(self):
        self.thread_handler.activate()
        self.logger.info('MX: activated thread handler {0}'.format(self.thread_handler.key))
        return self.reply_ok()

    @valid_action_request
    def deactivate_trigger(self):
        self.thread_handler.deactivate()
        self.logger.info('MX: deactivated thread handler {0}'.format(self.thread_handler.key))
        return self.reply_ok()
Example #30
class Timetable(object):
    """ Timetable holds all known process trees, where every node presents a timeperiod-driven job"""

    def __init__(self, logger):
        self.lock = RLock()
        self.logger = logger
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)
        self.reprocess = dict()

        # self.trees contains all of the trees and manages much of their life cycle
        # remember to enlist all trees the system is working with
        self.trees = self._construct_trees_from_context()

        self._register_callbacks()
        self._register_dependencies()
        self.load_tree()
        self.build_trees()
        self.validate()

    def _construct_trees_from_context(self):
        trees = dict()
        for tree_name, context_entry in context.timetable_context.items():
            tree = MultiLevelTree(process_names=context_entry.enclosed_processes,
                                  tree_name=tree_name,
                                  mx_name=context_entry.mx_name,
                                  mx_page=context_entry.mx_page)
            trees[tree_name] = tree
        return trees

    def _register_dependencies(self):
        """ register dependencies between trees"""
        for tree_name, context_entry in context.timetable_context.items():
            tree = self.trees[tree_name]
            assert isinstance(tree, MultiLevelTree)
            for dependent_on in context_entry.dependent_on:
                dependent_on_tree = self.trees[dependent_on]
                assert isinstance(dependent_on_tree, MultiLevelTree)
                tree.register_dependent_on(dependent_on_tree)

    def _register_callbacks(self):
        """ register logic that reacts on reprocessing request
        and create embryo timetable record request"""

        # reprocessing request
        for tree_name, tree in self.trees.items():
            tree.register_reprocess_callback(self._callback_reprocess)

        # skip request
        for tree_name, tree in self.trees.items():
            tree.register_skip_callback(self._callback_skip)

        # callbacks register
        for tree_name, tree in self.trees.items():
            tree.register_create_callbacks(self._callback_create_job_record)

    # *** Call-back methods ***
    def _find_dependant_trees(self, tree_obj):
        """ returns list of trees that are dependent_on given tree_obj """
        dependant_trees = []
        for tree_name, tree in self.trees.items():
            if tree_obj in tree.dependent_on:
                dependant_trees.append(tree)
        return dependant_trees

    def _find_dependant_tree_nodes(self, node_a):
        dependant_nodes = set()
        for tree_b in self._find_dependant_trees(node_a.tree):
            node_b = node_a.find_counterpart_in(tree_b)
            if node_b is None:
                continue
            dependant_nodes.add(node_b)
        return dependant_nodes

    def _reprocess_single_tree_node(self, tree_node):
        """ is called from tree to answer reprocessing request.
        It is possible that timetable record will be transferred to STATE_IN_PROGRESS with no related unit_of_work"""
        uow_id = tree_node.job_record.related_unit_of_work
        if uow_id is not None:
            tree_node.job_record.state = job.STATE_IN_PROGRESS
            uow = self.uow_dao.get_one(uow_id)
            uow.state = unit_of_work.STATE_INVALID
            uow.number_of_retries = 0
            uow.created_at = datetime.utcnow()
            self.uow_dao.update(uow)
            msg = 'Transferred job record %s for %s in timeperiod %s to %s; Transferred unit_of_work to %s' \
                  % (tree_node.job_record.db_id,
                     tree_node.process_name,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state,
                     uow.state)

            if tree_node.process_name not in self.reprocess:
                self.reprocess[tree_node.process_name] = dict()
            self.reprocess[tree_node.process_name][tree_node.timeperiod] = tree_node

        else:
            tree_node.job_record.state = job.STATE_EMBRYO
            msg = 'Transferred job record %s for %s in timeperiod %s to %s;' \
                  % (tree_node.job_record.db_id,
                     tree_node.process_name,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state)

        tree_node.job_record.number_of_failures = 0
        self.job_dao.update(tree_node.job_record)
        self.logger.warn(msg)
        tree_node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg])

    @thread_safe
    def _callback_reprocess(self, tree_node):
        """ is called from tree to answer reprocessing request.
        It is possible that job record will be transferred to STATE_IN_PROGRESS with no related unit_of_work"""
        if (tree_node.job_record.is_embryo and tree_node.job_record.number_of_failures == 0) \
            or (tree_node.process_name in self.reprocess
                and tree_node.timeperiod in self.reprocess[tree_node.process_name]):
            # the node has already been marked for re-processing or does not require one
            pass
        else:
            self._reprocess_single_tree_node(tree_node)

        reprocessing_nodes = self._find_dependant_tree_nodes(tree_node)
        for node in reprocessing_nodes:
            node.request_reprocess()

    @thread_safe
    def _callback_skip(self, tree_node):
        """ is called from tree to answer skip request"""
        tree_node.job_record.state = job.STATE_SKIPPED
        uow_id = tree_node.job_record.related_unit_of_work
        if uow_id is not None:
            uow = self.uow_dao.get_one(uow_id)
            uow.state = unit_of_work.STATE_CANCELED
            self.uow_dao.update(uow)
            msg = 'Transferred job record %s in timeperiod %s to %s; Transferred unit_of_work to %s' \
                  % (tree_node.job_record.db_id,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state,
                     uow.state)
        else:
            msg = 'Transferred job record %s in timeperiod %s to %s;' \
                  % (tree_node.job_record.db_id,
                     tree_node.job_record.timeperiod,
                     tree_node.job_record.state)

        self.job_dao.update(tree_node.job_record)
        self.logger.warn(msg)
        tree_node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg])

        if tree_node.process_name in self.reprocess \
                and tree_node.timeperiod in self.reprocess[tree_node.process_name]:
            del self.reprocess[tree_node.process_name][tree_node.timeperiod]

    @thread_safe
    def _callback_create_job_record(self, tree_node):
        """ is called from a tree to create job record in STATE_EMBRYO and bind it to the given tree node"""

        try:
            job_record = self.job_dao.get_one(tree_node.process_name, tree_node.timeperiod)
        except LookupError:
            job_record = Job()
            job_record.state = job.STATE_EMBRYO
            job_record.timeperiod = tree_node.timeperiod
            job_record.process_name = tree_node.process_name

            tr_id = self.job_dao.update(job_record)
            self.logger.info('Created job record %s with timeperiod %s for process %s'
                             % (str(tr_id), tree_node.timeperiod, tree_node.process_name))
        tree_node.job_record = job_record

    # *** Tree-manipulation methods ***
    @thread_safe
    def get_tree(self, process_name):
        """ return tree that is managing time-periods for given process"""
        for tree_name, tree in self.trees.items():
            if process_name in tree:
                return tree

    @thread_safe
    def _build_tree_by_level(self, collection_name, since):
        """ method iterated thru all documents in all job collections and builds a tree of known system state"""
        try:
            unsupported_records = dict()
            job_records = self.job_dao.get_all(collection_name, since)
            for job_record in job_records:
                tree = self.get_tree(job_record.process_name)
                if tree is not None:
                    tree.update_node(job_record)
                else:
                    unsupported_records[job_record.process_name] = \
                        unsupported_records.get(job_record.process_name, 0) + 1

            for name, counter in unsupported_records.items():
                self.logger.warn('Skipping %r Job records for %s as no tree is handling them.' % (counter, name))

        except LookupError:
            self.logger.warn('No Job Records in %s.' % str(collection_name))

    @thread_safe
    def load_tree(self):
        """ method iterates thru all objects older than synergy_start_timeperiod parameter in job collections
        and loads them into this timetable"""
        timeperiod = settings.settings['synergy_start_timeperiod']
        yearly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_YEARLY, timeperiod)
        monthly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_MONTHLY, timeperiod)
        daily_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_DAILY, timeperiod)
        hourly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_HOURLY, timeperiod)

        self._build_tree_by_level(COLLECTION_JOB_HOURLY, since=hourly_timeperiod)
        self._build_tree_by_level(COLLECTION_JOB_DAILY, since=daily_timeperiod)
        self._build_tree_by_level(COLLECTION_JOB_MONTHLY, since=monthly_timeperiod)
        self._build_tree_by_level(COLLECTION_JOB_YEARLY, since=yearly_timeperiod)

    @thread_safe
    def build_trees(self):
        """ method iterates thru all trees and ensures that all time-period nodes are created up till <utc_now>"""
        for tree_name, tree in self.trees.items():
            tree.build_tree()

    @thread_safe
    def validate(self):
        """validates that none of nodes in tree is improperly finalized and that every node has job_record"""
        for tree_name, tree in self.trees.items():
            tree.validate()

    @thread_safe
    def dependent_on_composite_state(self, job_record):
        """ :return instance of <NodesCompositeState> """
        assert isinstance(job_record, Job)
        tree = self.get_tree(job_record.process_name)
        node = tree.get_node(job_record.process_name, job_record.timeperiod)
        return node.dependent_on_composite_state()

    # *** Job manipulation methods ***
    @thread_safe
    def update_job_record(self, job_record, uow, new_state):
        """ method updates job record with a new unit_of_work and new state"""
        job_record.state = new_state
        job_record.related_unit_of_work = uow.db_id
        job_record.start_id = uow.start_id
        job_record.end_id = uow.end_id
        self.job_dao.update(job_record)

        tree = self.get_tree(job_record.process_name)
        tree.update_node(job_record)

        msg = 'Transferred job %s for %s in timeperiod %s to new state %s' \
              % (job_record.db_id, job_record.process_name, job_record.timeperiod, new_state)
        self.logger.info(msg)
        self.add_log_entry(job_record.process_name, job_record.timeperiod, msg)

    @thread_safe
    def failed_on_processing_job_record(self, process_name, timeperiod):
        """method increases node's inner counter of failed processing
        if _skip_node logic returns True - node is set to STATE_SKIP"""
        tree = self.get_tree(process_name)
        node = tree.get_node(process_name, timeperiod)
        node.job_record.number_of_failures += 1
        if tree._skip_the_node(node):
            node.request_skip()
        else:
            # job record is automatically updated in request_skip()
            # so if the node was not skipped - job record has to be updated explicitly
            self.job_dao.update(node.job_record)

    @thread_safe
    def get_next_job_record(self, process_name):
        """returns next job record to work on for given process"""
        if process_name in self.reprocess and len(self.reprocess[process_name]) > 0:
            timeperiod = sorted(self.reprocess[process_name].keys())[0]
            node = self.reprocess[process_name][timeperiod]
            del self.reprocess[process_name][timeperiod]
        else:
            tree = self.get_tree(process_name)
            node = tree.get_next_node(process_name)

        if node.job_record is None:
            node.request_embryo_job_record()
        return node.job_record

    @thread_safe
    def is_job_record_finalizable(self, job_record):
        """ :return True, if the node and all its children are either in STATE_PROCESSED or STATE_SKIPPED"""
        assert isinstance(job_record, Job)
        tree = self.get_tree(job_record.process_name)
        node = tree.get_node(job_record.process_name, job_record.timeperiod)
        return node.is_finalizable()

    @thread_safe
    def add_log_entry(self, process_name, timeperiod, msg):
        """ adds a log entry to the tree node. log entries has no persistence """
        tree = self.get_tree(process_name)
        node = tree.get_node(process_name, timeperiod)
        node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg])
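
# Driver sketch (an assumption, not part of the source): how a scheduler tick
# might pull the next job from the Timetable shown above; `timetable` and
# `process_name` are assumed to be supplied by the caller.
def scheduler_tick(timetable, process_name):
    job_record = timetable.get_next_job_record(process_name)
    if timetable.is_job_record_finalizable(job_record):
        # the node and all its children are processed/skipped - safe to finalize
        return job_record
    return None
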
Example #31

def __init__(self, logger):
    self.lock = RLock()
    self.logger = logger
    self.uow_dao = UnitOfWorkDao(self.logger)
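
# The RLock created in Example #31 is most plausibly what the @thread_safe
# decorator (used throughout the Timetable example above) synchronizes on.
# A minimal sketch of such a decorator, assuming it locks on `self.lock`:
from functools import wraps


def thread_safe(method):
    """ serializes method calls through the owning instance's RLock """
    @wraps(method)
    def _wrapper(self, *args, **kwargs):
        with self.lock:
            return method(self, *args, **kwargs)
    return _wrapper
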
Example #32

def tearDown(self):
    # cleaning up DB
    self.virtual_tear_down()
    uow_dao = UnitOfWorkDao(self.aggregator.logger)
    uow_dao.remove(self.uow_id)
    del self.aggregator
Example #33

def __init__(self, logger, name=STATE_MACHINE_FREERUN):
    self.name = name
    self.logger = logger
    self.mq_transmitter = MqTransmitter(self.logger)
    self.uow_dao = UnitOfWorkDao(self.logger)
    self.sfe_dao = FreerunProcessDao(self.logger)
Example #34

class AbstractStateMachine(object):
    """ Abstract state machine used to host common logic for the rest of registered State Machines """

    def __init__(self, logger, timetable, name):
        self.name = name
        self.logger = logger
        self.mq_transmitter = MqTransmitter(self.logger)
        self.timetable = timetable
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)

    @property
    def run_on_active_timeperiod(self):
        """
        :return: True if given State Machine allows execution on *live* timeperiod, as opposed to the finished one
        """
        raise NotImplementedError('property run_on_active_timeperiod must be implemented by {0}'
                                  .format(self.__class__.__name__))

    def _log_message(self, level, process_name, timeperiod, msg):
        """ method performs logging into log file and Timetable's tree node"""
        self.timetable.add_log_entry(process_name, timeperiod, msg)
        self.logger.log(level, msg)

    @with_reconnect
    def _insert_uow(self, process_name, timeperiod, start_timeperiod, end_timeperiod, start_id, end_id):
        """creates unit_of_work and inserts it into the DB
            :raise DuplicateKeyError: if unit_of_work with given parameters already exists """
        uow = UnitOfWork()
        uow.process_name = process_name
        uow.timeperiod = timeperiod
        uow.start_id = str(start_id)
        uow.end_id = str(end_id)
        uow.start_timeperiod = start_timeperiod
        uow.end_timeperiod = end_timeperiod
        uow.created_at = datetime.utcnow()
        uow.submitted_at = datetime.utcnow()
        uow.source = context.process_context[process_name].source
        uow.sink = context.process_context[process_name].sink
        uow.state = unit_of_work.STATE_REQUESTED
        uow.unit_of_work_type = unit_of_work.TYPE_MANAGED
        uow.number_of_retries = 0
        uow.arguments = context.process_context[process_name].arguments
        uow.db_id = self.uow_dao.insert(uow)

        msg = 'Created: UOW {0} for {1}@{2}.'.format(uow.db_id, process_name, start_timeperiod)
        self._log_message(INFO, process_name, start_timeperiod, msg)
        return uow

    def _publish_uow(self, uow):
        self.mq_transmitter.publish_managed_uow(uow)
        msg = 'Published: UOW {0} for {1}@{2}.'.format(uow.db_id, uow.process_name, uow.start_timeperiod)
        self._log_message(INFO, uow.process_name, uow.start_timeperiod, msg)

    def insert_and_publish_uow(self, job_record, start_id, end_id):
        """ method creates and publishes a unit_of_work. it also handles DuplicateKeyError and attempts recovery
        :return: tuple (uow, is_duplicate)
        :raise UserWarning: if the recovery from DuplicateKeyError was unsuccessful
        """
        process_name = job_record.process_name
        timeperiod = job_record.timeperiod
        start_timeperiod = self.compute_start_timeperiod(job_record.process_name, job_record.timeperiod)
        end_timeperiod = self.compute_end_timeperiod(job_record.process_name, job_record.timeperiod)

        try:
            is_duplicate = False
            uow = self._insert_uow(process_name, timeperiod, start_timeperiod, end_timeperiod, start_id, end_id)
        except DuplicateKeyError as e:
            is_duplicate = True
            msg = 'Catching up with latest UOW {0}@{1}, because of: {2}' \
                  .format(process_name, start_timeperiod, e)
            self._log_message(WARNING, process_name, start_timeperiod, msg)
            uow = self.uow_dao.recover_from_duplicatekeyerror(e)

        if not uow:
            msg = 'PERSISTENT TIER ERROR! Unable to locate UOW for {0}@{1}' \
                  .format(process_name, start_timeperiod)
            self._log_message(WARNING, process_name, start_timeperiod, msg)
            raise UserWarning(msg)

        if uow.is_canceled:
            # this UOW was marked for re-processing. recycle it
            uow.created_at = datetime.utcnow()      # reset created_at to bypass GC cancellation logic
            uow.submitted_at = datetime.utcnow()    # reset submitted_at to allow 1 hour free of GC resubmitting
            del uow.started_at
            del uow.finished_at
            del uow.number_of_aggregated_documents
            del uow.number_of_processed_documents
            uow.state = unit_of_work.STATE_REQUESTED
            self.uow_dao.update(uow)

        # publish the created/recovered/recycled unit_of_work
        self._publish_uow(uow)
        return uow, is_duplicate
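
    # Caller sketch (an assumption): a concrete _process_state_embryo would
    # typically drive the method above roughly like this, with the 0/0 range
    # standing in for whatever start_id/end_id boundaries the process uses:
    #   uow, is_duplicate = self.insert_and_publish_uow(job_record, 0, 0)
    #   self.update_job(job_record, uow, job.STATE_IN_PROGRESS)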

    def notify(self, uow):
        """ method is used by StateMachine's users to provide notifications about change in UOW
        if applicable, method will update job_record state and Timetable tree node state
        :assumptions: uow is in [STATE_NOOP, STATE_CANCELED, STATE_PROCESSED] """
        pass

    def compute_start_timeperiod(self, process_name, timeperiod):
        """ computes lowest *inclusive* timeperiod boundary for job to process
            for process with time_grouping == 1, it returns given timeperiod with no change
            for process with time_grouping != 1, it computes first timeperiod, not processed by the previous job run
            For instance: with time_grouping = 3, QUALIFIER_HOURLY, and timeperiod = 2016042018,
            the start_timeperiod will be = 2016042016 (computed as 2016042018 - 3 + 1)
        """
        time_grouping = context.process_context[process_name].time_grouping
        if time_grouping == 1:
            return timeperiod

        # step 1: translate the given timeperiod to the time-grouped one
        process_hierarchy = self.timetable.get_tree(process_name).process_hierarchy
        timeperiod_dict = process_hierarchy[process_name].timeperiod_dict
        translated_timeperiod = timeperiod_dict._translate_timeperiod(timeperiod)

        # step 2: compute previous grouped period
        # NOTICE: simple `time_helper.increment_timeperiod(time_qualifier, timeperiod)` is insufficient
        #         as it does not address edge cases, such as the last day of the month or the last hour of the day
        # For instance: with time_grouping=3, QUALIFIER_DAILY, and 2016123100
        # the `increment_timeperiod` will yield 2016122800 instead of 2016123100
        time_qualifier = context.process_context[process_name].time_qualifier
        for i in range(1, time_grouping + 1):
            prev_timeperiod = time_helper.increment_timeperiod(time_qualifier, translated_timeperiod, delta=-i)
            if prev_timeperiod == timeperiod_dict._translate_timeperiod(prev_timeperiod):
                # prev_timeperiod is currently at the last grouped timeperiod
                break

        # step 3: compute first exclusive timeperiod after the *prev_timeperiod*,
        # which becomes first inclusive timeperiod for this job run
        over_the_edge_timeperiod = time_helper.increment_timeperiod(time_qualifier, prev_timeperiod, delta=-1)
        if prev_timeperiod != timeperiod_dict._translate_timeperiod(over_the_edge_timeperiod):
            # over_the_edge_timeperiod fell into previous day or month or year
            # *prev_timeperiod* points to the first month, first day of the month or 00 hour
            start_timeperiod = prev_timeperiod
        else:
            start_timeperiod = self.compute_end_timeperiod(process_name, prev_timeperiod)

        return start_timeperiod
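
    # Worked illustration of the docstring example above (hypothetical values):
    #   time_grouping = 3, QUALIFIER_HOURLY, timeperiod = 2016042018
    #   the previous grouped run ended at 2016042015, so this job's inclusive
    #   start is 2016042016 and the run covers hours 16, 17 and 18 of 2016-04-20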

    def compute_end_timeperiod(self, process_name, timeperiod):
        """ computes first *exclusive* timeperiod for job to process """
        time_qualifier = context.process_context[process_name].time_qualifier
        end_timeperiod = time_helper.increment_timeperiod(time_qualifier, timeperiod)
        return end_timeperiod

    def _is_noop_timeperiod(self, process_name, timeperiod):
        """ method verifies if the given timeperiod for given process is valid or falls in-between grouping checkpoints
        :param process_name: name of the process
        :param timeperiod: timeperiod to verify
        :return: False, if given process has no time_grouping set or it is equal to 1.
                 False, if time_grouping is custom but the given timeperiod matches the grouped timeperiod.
                 True, if the timeperiod falls in-between grouping cracks
        """
        time_grouping = context.process_context[process_name].time_grouping
        if time_grouping == 1:
            return False

        process_hierarchy = self.timetable.get_tree(process_name).process_hierarchy
        timeperiod_dict = process_hierarchy[process_name].timeperiod_dict
        return timeperiod_dict._translate_timeperiod(timeperiod) != timeperiod
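
    # Illustration (assumed values): with time_grouping = 3 and QUALIFIER_HOURLY,
    # an in-between period such as 2016042017 translates to the grouped
    # checkpoint 2016042018, so the method returns True (a NOOP period), while
    # 2016042018 translates to itself and the method returns False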

    def _process_noop_timeperiod(self, job_record):
        """ method is valid for processes having time_grouping != 1.
            should a job record fall in-between grouped time milestones,
            its state should be set to STATE_NOOP without any processing """
        job_record.state = job.STATE_NOOP
        self.job_dao.update(job_record)

        time_grouping = context.process_context[job_record.process_name].time_grouping
        msg = 'Job {0}@{1} with time_grouping {2} was transferred to STATE_NOOP' \
              .format(job_record.process_name, job_record.timeperiod, time_grouping)
        self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

    def _process_state_embryo(self, job_record):
        """ method that takes care of processing job records in STATE_EMBRYO state"""
        pass

    def _process_state_in_progress(self, job_record):
        """ method that takes care of processing job records in STATE_IN_PROGRESS state"""
        pass

    def _process_state_final_run(self, job_record):
        """method takes care of processing job records in STATE_FINAL_RUN state"""
        uow = self.uow_dao.get_one(job_record.related_unit_of_work)
        if uow.is_processed:
            self.update_job(job_record, uow, job.STATE_PROCESSED)
        elif uow.is_noop:
            self.update_job(job_record, uow, job.STATE_NOOP)
        elif uow.is_canceled:
            self.update_job(job_record, uow, job.STATE_SKIPPED)
        elif uow.is_invalid:
            msg = 'Job {0}: UOW for {1}@{2} is in {3}; ' \
                  'relying on the Garbage Collector to either recycle or cancel the UOW.' \
                  .format(job_record.db_id, job_record.process_name, job_record.timeperiod, uow.state)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)
        else:
            msg = 'Suppressed creating UOW for {0}@{1}; Job is in {2}; uow is in {3}' \
                  .format(job_record.process_name, job_record.timeperiod, job_record.state, uow.state)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

        timetable_tree = self.timetable.get_tree(job_record.process_name)
        timetable_tree.build_tree()

    def _process_terminal_state(self, job_record):
        """ method logs a warning message notifying that the job is no longer govern by this state machine """
        msg = 'Job {0} for {1}@{2} is in the terminal state {3}, ' \
              'and is no further govern by the State Machine {4}' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod, job_record.state, self.name)
        self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

    def manage_job_with_blocking_children(self, job_record):
        """ method will trigger job processing only if:
            - all children are finished (STATE_PROCESSED, STATE_SKIPPED, STATE_NOOP)
            - all external dependencies are finalized (STATE_PROCESSED, STATE_SKIPPED, STATE_NOOP) """
        is_job_finalizable = self.timetable.is_job_record_finalizable(job_record)
        if is_job_finalizable:
            self.manage_job(job_record)
        else:
            msg = 'Job {0}@{1} is blocked by unprocessed children/dependencies. Waiting another tick' \
                  .format(job_record.process_name, job_record.timeperiod)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

    def manage_job_with_blocking_dependencies(self, job_record):
        """ method will trigger job processing only if:
            - all dependencies are in [STATE_PROCESSED, STATE_NOOP]
            NOTICE: method will transfer current job into STATE_SKIPPED if any dependency is in STATE_SKIPPED """
        composite_state = self.timetable.dependent_on_composite_state(job_record)
        assert isinstance(composite_state, NodesCompositeState)

        if composite_state.all_processed:
            self.manage_job(job_record)
        elif composite_state.skipped_present:
            # As soon as any of the <dependent on> timeperiods is in STATE_SKIPPED
            # there is very little sense in waiting for the rest to become STATE_PROCESSED
            # Skip this timeperiod itself
            job_record.state = job.STATE_SKIPPED
            self.job_dao.update(job_record)
            self.mq_transmitter.publish_job_status(job_record)

            msg = 'Job {0}@{1} is blocked by STATE_SKIPPED dependencies. ' \
                  'Transferred the job to STATE_SKIPPED'.format(job_record.process_name, job_record.timeperiod)
            self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)
        else:
            msg = 'Job {0}@{1} is blocked by unprocessed dependencies. Waiting another tick' \
                  .format(job_record.process_name, job_record.timeperiod)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)
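
    # NodesCompositeState flags consulted above, per their use in this source:
    #   all_processed   - every <dependent on> period is in STATE_PROCESSED or STATE_NOOP
    #   skipped_present - at least one <dependent on> period is in STATE_SKIPPED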

    def manage_job(self, job_record):
        """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed
            In case the State Machine sees that the unit_of_work already exist it could either:
            - update boundaries of the processing; republish
            - wait another tick; idle """
        assert isinstance(job_record, Job)
        try:
            if self._is_noop_timeperiod(job_record.process_name, job_record.timeperiod):
                self._process_noop_timeperiod(job_record)
                return

            if job_record.is_embryo:
                self._process_state_embryo(job_record)

            elif job_record.is_in_progress:
                self._process_state_in_progress(job_record)

            elif job_record.is_final_run:
                self._process_state_final_run(job_record)

            elif job_record.is_skipped:
                self._process_terminal_state(job_record)

            elif job_record.is_processed:
                self._process_terminal_state(job_record)

            elif job_record.is_noop:
                self._process_terminal_state(job_record)

            else:
                msg = 'Unknown state {0} of the job {1}'.format(job_record.state, job_record.db_id)
                self._log_message(ERROR, job_record.process_name, job_record.timeperiod, msg)

        except LookupError as e:
            job_record.number_of_failures += 1
            self.job_dao.update(job_record)
            self.timetable.skip_if_needed(job_record)
            msg = 'Increasing fail counter for Job {0}@{1}, because of: {2}' \
                  .format(job_record.process_name, job_record.timeperiod, e)
            self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

        finally:
            self.mq_transmitter.publish_job_status(job_record)

    def reprocess_job(self, job_record):
        """ method marks given job for reprocessing:
            - if no UOW was bind with the job record - it is transferred to STATE_EMBRYO only
            - if related UOW was not finished - we mark the UOW as STATE_CANCELED
                and pass the job record thru the _process_in_progress routine
            - otherwise job record is passed thru the _process_in_progress routine """
        original_job_state = job_record.state
        job_record.number_of_failures = 0

        if not job_record.related_unit_of_work:
            job_record.state = job.STATE_EMBRYO
            self.job_dao.update(job_record)
        else:
            uow = self.uow_dao.get_one(job_record.related_unit_of_work)
            if not uow.is_finished:
                uow.state = unit_of_work.STATE_CANCELED
                uow.submitted_at = datetime.utcnow()
                self.uow_dao.update(uow)
            self._process_state_in_progress(job_record)

        msg = 'Reprocessed Job {0} for {1}@{2}: state transfer {3} -> {4};' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod,
                      original_job_state, job_record.state)
        self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

    def skip_job(self, job_record):
        """ method transfers:
            - given job into STATE_SKIPPED if it is not not in finished state
            - UOW into STATE_CANCELED if it is not in finished state """
        original_job_state = job_record.state

        if not job_record.is_finished:
            job_record.state = job.STATE_SKIPPED
            self.job_dao.update(job_record)

        if job_record.related_unit_of_work:
            uow = self.uow_dao.get_one(job_record.related_unit_of_work)
            if not uow.is_finished:
                uow.state = unit_of_work.STATE_CANCELED
                uow.submitted_at = datetime.utcnow()
                self.uow_dao.update(uow)

        msg = 'Skipped Job {0} for {1}@{2}: state transfer {3} -> {4};' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod,
                      original_job_state, job_record.state)
        self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)

    def create_job(self, process_name, timeperiod):
        """ method creates a job record in STATE_EMBRYO for given process_name and timeperiod
            :returns: created job record of type <Job>"""
        job_record = Job()
        job_record.state = job.STATE_EMBRYO
        job_record.timeperiod = timeperiod
        job_record.process_name = process_name
        self.job_dao.update(job_record)

        self.logger.info('Created Job {0} for {1}@{2}'
                         .format(job_record.db_id, job_record.process_name, job_record.timeperiod))
        return job_record

    def update_job(self, job_record, uow, new_state):
        """ method updates job record with a new unit_of_work and new state"""
        original_job_state = job_record.state
        job_record.state = new_state
        job_record.related_unit_of_work = uow.db_id
        self.job_dao.update(job_record)

        msg = 'Updated Job {0} for {1}@{2}: state transfer {3} -> {4};' \
              .format(job_record.db_id, job_record.process_name, job_record.timeperiod, original_job_state, new_state)
        self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)
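
# Combined driver sketch (an assumption, not part of the source): one managed
# scheduler tick, wiring the Timetable from the earlier example to the state
# machine above; both objects are assumed to be constructed by the caller.
def managed_tick(timetable, state_machine, process_name):
    job_record = timetable.get_next_job_record(process_name)
    # triggers processing only once children/dependencies are finalized
    state_machine.manage_job_with_blocking_children(job_record)
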
Example #35

class AbstractStateMachine(object):
    """ Abstract state machine used to govern all processes and their states """

    def __init__(self, logger, timetable, name):
        self.name = name
        self.logger = logger
        self.publishers = PublishersPool(self.logger)
        self.timetable = timetable
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.job_dao = JobDao(self.logger)

    def __del__(self):
        try:
            self.logger.info('Closing Flopsy Publishers Pool...')
            self.publishers.close()
        except Exception as e:
            self.logger.error('Exception caught while closing Flopsy Publishers Pool: %s' % str(e))

    def _log_message(self, level, process_name, timeperiod, msg):
        """ method performs logging into log file and Timetable's tree node"""
        self.timetable.add_log_entry(process_name, timeperiod, msg)
        self.logger.log(level, msg)

    @with_reconnect
    def _insert_uow(self, process_name, start_timeperiod, end_timeperiod, start_id, end_id):
        """creates unit_of_work and inserts it into the DB
            :raise DuplicateKeyError: if unit_of_work with given parameters already exists """
        uow = UnitOfWork()
        uow.process_name = process_name
        uow.timeperiod = start_timeperiod
        uow.start_id = str(start_id)
        uow.end_id = str(end_id)
        uow.start_timeperiod = start_timeperiod
        uow.end_timeperiod = end_timeperiod
        uow.created_at = datetime.utcnow()
        uow.source = context.process_context[process_name].source
        uow.sink = context.process_context[process_name].sink
        uow.state = unit_of_work.STATE_REQUESTED
        uow.unit_of_work_type = TYPE_MANAGED
        uow.number_of_retries = 0
        uow.arguments = context.process_context[process_name].arguments
        uow.db_id = self.uow_dao.insert(uow)

        msg = 'Created: UOW %s for %s in timeperiod [%s:%s).' \
              % (uow.db_id, process_name, start_timeperiod, end_timeperiod)
        self._log_message(INFO, process_name, start_timeperiod, msg)
        return uow

    def _publish_uow(self, uow):
        mq_request = SynergyMqTransmission(process_name=uow.process_name, unit_of_work_id=uow.db_id)

        publisher = self.publishers.get(uow.process_name)
        publisher.publish(mq_request.document)
        publisher.release()

        msg = 'Published: UOW %r for %r in timeperiod %r.' % (uow.db_id, uow.process_name, uow.start_timeperiod)
        self._log_message(INFO, uow.process_name, uow.start_timeperiod, msg)

    def insert_and_publish_uow(self, process_name, start_timeperiod, end_timeperiod, start_id, end_id):
        """ method creates and publishes a unit_of_work. it also handles DuplicateKeyError and attempts recovery
        :return: tuple (uow, is_duplicate)
        :raise UserWarning: if the recovery from DuplicateKeyError was unsuccessful
        """
        is_duplicate = False
        try:
            uow = self._insert_uow(process_name, start_timeperiod, end_timeperiod, start_id, end_id)
        except DuplicateKeyError as e:
            is_duplicate = True
            msg = 'Catching up with latest unit_of_work %s in timeperiod %s, because of: %r' \
                  % (process_name, start_timeperiod, e)
            self._log_message(WARNING, process_name, start_timeperiod, msg)
            uow = self.uow_dao.recover_from_duplicatekeyerror(e)

        if uow is None:
            msg = 'MANUAL INTERVENTION REQUIRED! Unable to locate unit_of_work for %s in %s' \
                  % (process_name, start_timeperiod)
            self._log_message(WARNING, process_name, start_timeperiod, msg)
            raise UserWarning(msg)

        # publish the created/caught up unit_of_work
        self._publish_uow(uow)
        return uow, is_duplicate

    def shallow_state_update(self, uow):
        """ method does not trigger any new actions
        if applicable, it will update job_record state and Timetable tree node state
        :assumptions: uow is either in STATE_CANCELED or STATE_PROCESSED """
        pass

    def _process_state_embryo(self, job_record):
        """ method that takes care of processing job records in STATE_EMBRYO state"""
        pass

    def _process_state_in_progress(self, job_record):
        """ method that takes care of processing job records in STATE_IN_PROGRESS state"""
        pass

    def _process_state_final_run(self, job_record):
        """method takes care of processing job records in STATE_FINAL_RUN state"""
        pass

    def _process_state_skipped(self, job_record):
        """method takes care of processing job records in STATE_FINAL_SKIPPED state"""
        pass

    def _process_state_processed(self, job_record):
        """method takes care of processing job records in STATE_FINAL_SKIPPED state"""
        pass

    def manage_job_with_blocking_children(self, job_record, run_on_active_timeperiod):
        """ method will trigger job processing only if all children are in STATE_PROCESSED or STATE_SKIPPED
         and if all external dependencies are finalized (i.e. in STATE_PROCESSED or STATE_SKIPPED) """
        is_job_finalizable = self.timetable.is_job_record_finalizable(job_record)
        composite_state = self.timetable.dependent_on_composite_state(job_record)

        if is_job_finalizable:
            self.manage_job(job_record)
        elif composite_state.all_healthy and run_on_active_timeperiod:
            self.manage_job(job_record)
        else:
            msg = '%s for timeperiod %r is blocked by unprocessed children/dependencies. Waiting another tick' \
                  % (job_record.process_name, job_record.timeperiod)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

    def manage_job_with_blocking_dependencies(self, job_record, run_on_active_timeperiod):
        """ method will trigger job processing only if _all_ dependencies are in STATE_PROCESSED
         method will transfer current job into STATE_SKIPPED if any dependency is in STATE_SKIPPED """
        composite_state = self.timetable.dependent_on_composite_state(job_record)
        assert isinstance(composite_state, NodesCompositeState)

        if composite_state.all_processed:
            self.manage_job(job_record)
        elif composite_state.all_healthy and run_on_active_timeperiod:
            self.manage_job(job_record)
        elif composite_state.skipped_present:
            # As soon as any of the <dependent on> timeperiods is in STATE_SKIPPED
            # there is very little sense in waiting for the rest to become STATE_PROCESSED
            # Skip this timeperiod itself
            job_record.state = job.STATE_SKIPPED
            self.job_dao.update(job_record)
            tree = self.timetable.get_tree(job_record.process_name)
            tree.update_node(job_record)

            msg = '%s for timeperiod %r is blocked by STATE_SKIPPED dependencies. ' \
                  'Transferred the job to STATE_SKIPPED' % (job_record.process_name, job_record.timeperiod)
            self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)
        else:
            msg = '%s for timeperiod %r is blocked by unprocessed dependencies. Waiting another tick' \
                  % (job_record.process_name, job_record.timeperiod)
            self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)

    def manage_job(self, job_record):
        """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed
        In case the Scheduler sees that the unit_of_work is pending it could either update boundaries of the processing
        or wait another tick """
        assert isinstance(job_record, Job)
        try:
            if job_record.is_embryo:
                self._process_state_embryo(job_record)

            elif job_record.is_in_progress:
                self._process_state_in_progress(job_record)

            elif job_record.is_final_run:
                self._process_state_final_run(job_record)

            elif job_record.is_skipped:
                self._process_state_skipped(job_record)

            elif job_record.is_processed:
                self._process_state_processed(job_record)

            else:
                msg = 'Unknown state %s of the job %s' % (job_record.state, job_record.db_id)
                self._log_message(ERROR, job_record.process_name, job_record.timeperiod, msg)

        except LookupError as e:
            self.timetable.failed_on_processing_job_record(job_record.process_name, job_record.timeperiod)
            msg = 'Increasing fail counter for %s in timeperiod %s, because of: %r' \
                  % (job_record.process_name, job_record.timeperiod, e)
            self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)
Example #36

class BashRunnable(threading.Thread):
    """Process starts remote or local bash script job, supervises its execution and updates mq"""

    def __init__(self, logger, message, consumer, performance_ticker):
        self.logger = logger
        self.message = message
        self.mq_request = SynergyMqTransmission.from_json(message.body)
        self.consumer = consumer
        self.performance_ticker = performance_ticker
        self.alive = False
        self.return_code = -1
        self.uow_dao = UnitOfWorkDao(self.logger)

        self.thread_name = str(self.mq_request)
        super(BashRunnable, self).__init__(name=self.thread_name)

    def _poll_process(self):
        return self.alive, self.return_code

    def _start_process(self):
        try:
            uow = self.uow_dao.get_one(self.mq_request.unit_of_work_id)
            if not uow.is_requested:
                # accept only UOW in STATE_REQUESTED
                self.logger.warn('Skipping unit_of_work: id %s; state %s;' % (str(self.message.body), uow.state),
                                 exc_info=False)
                self.consumer.acknowledge(self.message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Can not identify unit_of_work %s' % str(self.message.body), exc_info=True)
            self.consumer.acknowledge(self.message.delivery_tag)
            return

        try:
            self.logger.info('start: %s {' % self.thread_name)
            self.alive = True

            uow.state = unit_of_work.STATE_IN_PROGRESS
            uow.started_at = datetime.utcnow()
            self.uow_dao.update(uow)

            fabric.operations.env.warn_only = True
            fabric.operations.env.abort_on_prompts = True
            fabric.operations.env.use_ssh_config = True
            fabric.operations.env.host_string = uow.arguments[ARGUMENT_CMD_HOST]

            command = os.path.join(uow.arguments[ARGUMENT_CMD_PATH],
                                   uow.arguments[ARGUMENT_CMD_FILE])
            command += ' %s' % uow.arguments[ARGUMENT_CMD_ARGS]

            run_result = fabric.operations.run(command, pty=False)
            if run_result.succeeded:
                self.return_code = 0

            uow.finished_at = datetime.utcnow()
            uow.state = unit_of_work.STATE_PROCESSED
            self.uow_dao.update(uow)

            self.logger.info('Completed %s with result = %r' % (self.thread_name, self.return_code))
        except Exception:
            self.logger.error('Exception on starting: %s' % self.thread_name, exc_info=True)
            uow.state = unit_of_work.STATE_INVALID
            self.uow_dao.update(uow)
        finally:
            self.logger.info('}')
            self.alive = False

    def run(self):
        try:
            self._start_process()
            code = None
            alive = True
            while alive:
                alive, code = self._poll_process()
                time.sleep(0.1)

            if code == 0:
                self.performance_ticker.tracker.increment_success()
            else:
                self.performance_ticker.tracker.increment_failure()

            self.logger.info('BashDriver for %s return code is %r' % (self.thread_name, code))
        except Exception as e:
            self.performance_ticker.tracker.increment_failure()
            self.logger.error('Safety fuse while processing request %r: %r' % (self.message.body, e), exc_info=True)
        finally:
            self.consumer.acknowledge(self.message.delivery_tag)
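
# Dispatch sketch (an assumption): the worker owning the mq consumer would
# typically hand each incoming message to a BashRunnable and let the thread
# drive the unit_of_work through its life-cycle.
def dispatch_bash_job(logger, message, consumer, performance_ticker):
    runnable = BashRunnable(logger, message, consumer, performance_ticker)
    runnable.start()    # executes BashRunnable.run() on a separate thread
    return runnable
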
Example #37

def __init__(self, process_name, perform_db_logging=False):
    super(AbstractUowAwareWorker, self).__init__(process_name)
    self.perform_db_logging = perform_db_logging
    self.uow_dao = UnitOfWorkDao(self.logger)
    self.mq_transmitter = MqTransmitter(self.logger)
Example #38

class StatusBusListener(object):
    """ class instance listens to the QUEUE_UOW_REPORT queue and updates Timetable records correspondingly """

    def __init__(self, scheduler):
        self.scheduler = scheduler
        self.timetable = scheduler.timetable
        self.logger = scheduler.logger
        self.uow_dao = UnitOfWorkDao(self.logger)
        self.consumer = Consumer(QUEUE_UOW_REPORT)
        self.main_thread = None

    def __del__(self):
        try:
            self.logger.info('StatusBusListener: Closing Flopsy Consumer...')
            self.consumer.close()
        except Exception as e:
            self.logger.error('StatusBusListener: Exception caught while closing Flopsy Consumer: %s' % str(e))

    # ********************** thread-related methods ****************************
    def _mq_callback(self, message):
        """ method processes messages from Synergy Worker and updates corresponding Timetable record,
        as well as the job itself
        :param message: <SynergyMqTransmission> mq message """
        try:
            self.logger.info('StatusBusListener {')

            mq_request = SynergyMqTransmission.from_json(message.body)
            uow = self.uow_dao.get_one(mq_request.unit_of_work_id)
            if uow.unit_of_work_type != TYPE_MANAGED:
                self.logger.info('Received transmission from TYPE_FREERUN execution. Ignoring it.')
                return

            tree = self.timetable.get_tree(uow.process_name)
            node = tree.get_node(uow.process_name, uow.timeperiod)

            if uow.db_id != node.job_record.related_unit_of_work:
                self.logger.info('Received transmission is likely outdated. Ignoring it.')
                return

            if not uow.is_finished:
                # rely on Garbage Collector to re-trigger the failing unit_of_work
                self.logger.info('Received unit_of_work status report from %s at %s in non-final state %s. Ignoring it.'
                                 % (uow.process_name, uow.timeperiod, uow.state))
                return

            process_entry = self.scheduler.managed_handlers[uow.process_name].process_entry
            state_machine = self.scheduler.state_machines[process_entry.state_machine_name]
            assert isinstance(state_machine, AbstractStateMachine)

            self.logger.info('Commencing shallow state update for unit_of_work from %s at %s in %s.'
                             % (uow.process_name, uow.timeperiod, uow.state))
            state_machine.shallow_state_update(uow)

        except KeyError:
            self.logger.error('Access error for %s' % str(message.body), exc_info=True)
        except LookupError:
            self.logger.error('Can not perform shallow state update for %s' % str(message.body), exc_info=True)
        except Exception:
            self.logger.error('Unexpected error during shallow state update for %s' % str(message.body), exc_info=True)
        finally:
            self.consumer.acknowledge(message.delivery_tag)
            self.logger.info('StatusBusListener }')

    def _run_mq_listener(self):
        try:
            self.consumer.register(self._mq_callback)
            self.logger.info('StatusBusListener: instantiated and activated.')
            self.consumer.wait()
        except (AMQPError, IOError) as e:
            self.logger.error('StatusBusListener: AMQPError %s' % str(e))
        finally:
            self.__del__()
            self.logger.info('StatusBusListener: Shutting down... All auxiliary threads stopped.')

    def start(self, *_):
        self.main_thread = Thread(target=self._run_mq_listener)
        self.main_thread.daemon = True
        self.main_thread.start()
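
# Start-up sketch (an assumption): the Scheduler would construct the listener
# with itself as the argument and start it during its own start-up; start()
# spawns the daemon thread that runs _run_mq_listener.
def start_status_bus_listener(scheduler):
    listener = StatusBusListener(scheduler)
    listener.start()
    return listener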