def __init__(self, request, **values): super(FlowActionHandler, self).__init__(request, **values) self.flow_dao = FlowDao(self.logger) self.step_dao = StepDao(self.logger) self.uow_dao = UnitOfWorkDao(self.logger) self.log_recording_dao = LogRecordingDao(self.logger) self.process_name = self.request_arguments.get(ARGUMENT_PROCESS_NAME) self.unit_of_work_type = self.request_arguments.get( ARGUMENT_UNIT_OF_WORK_TYPE, TYPE_MANAGED) self.flow_name = self.request_arguments.get(ARGUMENT_FLOW_NAME) if not self.flow_name and self.process_name: process_entry = context.process_context[self.process_name] self.flow_name = process_entry.arguments.get(ARGUMENT_FLOW_NAME) self.step_name = self.request_arguments.get(ARGUMENT_STEP_NAME) self.timeperiod = self.request_arguments.get(ARGUMENT_TIMEPERIOD) self.is_request_valid = True if self.flow_name \ and self.flow_name in flows.flows \ and self.timeperiod \ else False if self.is_request_valid: self.flow_name = self.flow_name.strip() self.timeperiod = self.timeperiod.strip() self.run_mode = self.request_arguments.get(ARGUMENT_RUN_MODE, '') self.run_mode = self.run_mode.strip()
def setUp(self): self.process_name = PROCESS_ALERT_DAILY self.logger = get_logger(self.process_name) self.uow_id = create_and_insert_unit_of_work(self.process_name, 'range_start', 'range_end') self.uow_id = str(self.uow_id) self.uow_dao = UnitOfWorkDao(self.logger) self.log_recording_dao = LogRecordingDao(self.logger)
def create_and_insert_unit_of_work(process_name, start_id, end_id, state=unit_of_work.STATE_REQUESTED, timeperiod='INVALID_TIMEPERIOD'): """ method creates and inserts a unit_of_work into DB :return id of the created object in the db""" uow = create_unit_of_work(process_name, start_id, end_id, timeperiod, state) logger = get_logger(process_name) uow_dao = UnitOfWorkDao(logger) uow_id = uow_dao.insert(uow) return uow_id
def tearDown(self): uow_dao = UnitOfWorkDao(self.worker.logger) uow_dao.remove(self.uow_id) for instance in self.worker.instances: self.conn.stop_instances(instance.id) # killing the worker del self.worker
class LogRecordingHandlerUnitTest(unittest.TestCase): """ Test flow: 1. create a UOW in the database 2. emulate mq message 3. call _mq_callback method 4. validate that all the messages are now found in the uow_log record 5. remove UOW and uow_log record """ def setUp(self): self.process_name = PROCESS_ALERT_DAILY self.logger = get_logger(self.process_name) self.uow_id = create_and_insert_unit_of_work(self.process_name, 'range_start', 'range_end') self.uow_id = str(self.uow_id) self.uow_dao = UnitOfWorkDao(self.logger) self.log_recording_dao = LogRecordingDao(self.logger) def tearDown(self): self.uow_dao.remove(self.uow_id) self.log_recording_dao.remove(self.uow_id) def test_logging(self): self.worker = ChattyWorker(self.process_name) message = TestMessage(process_name=self.process_name, uow_id=self.uow_id) self.worker._mq_callback(message) uow_log = self.log_recording_dao.get_one(self.uow_id) messages = INFO_LOG_MESSAGES + WARN_LOG_MESSAGES # + STD_MESSAGES self.assertLessEqual(len(messages), len(uow_log.log)) for index, message in enumerate(messages): self.assertIn(message, uow_log.log[index]) def test_exception_logging(self): self.worker = ExceptionWorker(self.process_name) message = TestMessage(process_name=self.process_name, uow_id=self.uow_id) self.worker._mq_callback(message) uow_log = self.log_recording_dao.get_one(self.uow_id) messages = [ 'Exception: Artificially triggered exception to test Uow Exception Logging', 'Method ExceptionWorker._process_uow returned None. Assuming happy flow.', 'at INVALID_TIMEPERIOD: Success/Failure 0/0 entries' ] for index, message in enumerate(messages): self.assertIn(message, uow_log.log[index])
def __init__(self, process_name): super(GarbageCollectorWorker, self).__init__(process_name) self.lock = Lock() self.publishers = PublishersPool(self.logger) self.uow_dao = UnitOfWorkDao(self.logger) self.managed_dao = ManagedProcessDao(self.logger) self.managed_entries = dict()
def __init__(self, logger, timetable, name): self.name = name self.logger = logger self.publishers = PublishersPool(self.logger) self.timetable = timetable self.uow_dao = UnitOfWorkDao(self.logger) self.job_dao = JobDao(self.logger)
class FreerunStatements(object): def __init__(self, logger, freerun_handlers): self.lock = RLock() self.logger = logger self.freerun_handlers = freerun_handlers self.uow_dao = UnitOfWorkDao(self.logger) @thread_safe def retrieve_records(self, timeperiod, include_running, include_processed, include_noop, include_failed, include_disabled): """ method looks for suitable UOW records and returns them as a dict""" resp = dict() try: query = unit_of_work_dao.QUERY_GET_FREERUN_SINCE(timeperiod, include_running, include_processed, include_noop, include_failed) records_list = self.uow_dao.run_query(query) if len(records_list) == 0: self.logger.warning('MX: no Freerun UOW records found since {0}.'.format(timeperiod)) for uow_record in records_list: if uow_record.process_name not in self.freerun_handlers: continue thread_handler = self.freerun_handlers[uow_record.process_name] if not include_disabled and not thread_handler.process_entry.is_on: continue resp[uow_record.key] = uow_record.document except Exception as e: self.logger.error('MX Dashboard FreerunStatements error: {0}'.format(e)) return resp
def __init__(self, scheduler): self.scheduler = scheduler self.timetable = scheduler.timetable self.logger = scheduler.logger self.uow_dao = UnitOfWorkDao(self.logger) self.consumer = Consumer(QUEUE_UOW_REPORT) self.main_thread = None
def setUp(self): self.process_name = PROCESS_ALERT_DAILY self.logger = get_logger(self.process_name) self.uow_id = create_and_insert_unit_of_work(self.process_name, 'range_start', 'range_end') self.uow_id = str(self.uow_id) self.uow_dao = UnitOfWorkDao(self.logger) self.uow_log_dao = UowLogDao(self.logger)
def __init__(self, logger, timetable, name): self.name = name self.logger = logger self.mq_transmitter = MqTransmitter(self.logger) self.timetable = timetable self.uow_dao = UnitOfWorkDao(self.logger) self.job_dao = JobDao(self.logger)
class UowLogHandlerUnitTest(unittest.TestCase): """ Test flow: 1. create a UOW in the database 2. emulate mq message 3. call _mq_callback method 4. validate that all the messages are now found in the uow_log record 5. remove UOW and uow_log record """ def setUp(self): self.process_name = PROCESS_ALERT_DAILY self.logger = get_logger(self.process_name) self.uow_id = create_and_insert_unit_of_work(self.process_name, 'range_start', 'range_end') self.uow_id = str(self.uow_id) self.uow_dao = UnitOfWorkDao(self.logger) self.uow_log_dao = UowLogDao(self.logger) def tearDown(self): self.uow_dao.remove(self.uow_id) self.uow_log_dao.remove(self.uow_id) def test_logging(self): self.worker = ChattyWorker(self.process_name) message = TestMessage(process_name=self.process_name, uow_id=self.uow_id) self.worker._mq_callback(message) uow_log = self.uow_log_dao.get_one(self.uow_id) messages = INFO_LOG_MESSAGES + WARN_LOG_MESSAGES # + STD_MESSAGES self.assertLessEqual(len(messages), len(uow_log.log)) for index, message in enumerate(messages): self.assertIn(message, uow_log.log[index]) def test_exception_logging(self): self.worker = ExceptionWorker(self.process_name) message = TestMessage(process_name=self.process_name, uow_id=self.uow_id) self.worker._mq_callback(message) uow_log = self.uow_log_dao.get_one(self.uow_id) messages = ['Exception: Artificially triggered exception to test Uow Exception Logging', 'method ExceptionWorker._process_uow returned None. Assuming happy flow.', 'at INVALID_TIMEPERIOD: Success/Failure 0/0'] for index, message in enumerate(messages): self.assertIn(message, uow_log.log[index])
def test_select_reprocessing_candidates(self): logger = get_logger(PROCESS_UNIT_TEST) uow_dao = UnitOfWorkDao(logger) try: initial_candidates = uow_dao.get_reprocessing_candidates() except: initial_candidates = [] try: initial_positive_candidates = uow_dao.get_reprocessing_candidates( '2010123123') except: initial_positive_candidates = [] positive_timeperiods = { u'2010123123': PROCESS_SITE_HOURLY, # hourly time qualifier u'2010123100': PROCESS_SITE_DAILY, # daily time qualifier u'2010120000': PROCESS_SITE_MONTHLY, # monthly time qualifier u'2010000000': PROCESS_SITE_YEARLY } # yearly time qualifier negative_timeperiods = { u'2009123123': PROCESS_SITE_HOURLY, # hourly time qualifier u'2009123100': PROCESS_SITE_DAILY, # daily time qualifier u'2009120000': PROCESS_SITE_MONTHLY, # monthly time qualifier u'2009000000': PROCESS_SITE_YEARLY } # yearly time qualifier all_timeperiods = dict() all_timeperiods.update(positive_timeperiods) all_timeperiods.update(negative_timeperiods) created_uow = [] for timeperiod, process_name in all_timeperiods.items(): created_uow.append( create_and_insert_unit_of_work( process_name, 0, 1, timeperiod=timeperiod, state=unit_of_work.STATE_INVALID)) candidates = uow_dao.get_reprocessing_candidates('2010123123') self.assertEqual( len(candidates) - len(initial_positive_candidates), len(positive_timeperiods)) candidates = uow_dao.get_reprocessing_candidates() self.assertEqual( len(candidates) - len(initial_candidates), len(all_timeperiods)) for uow_id in created_uow: uow_dao.remove(uow_id)
def __init__(self, scheduler): self.logger = get_logger(PROCESS_GC, append_to_console=False, redirect_stdstream=False) self.managed_handlers = scheduler.managed_handlers self.mq_transmitter = MqTransmitter(self.logger) self.timetable = scheduler.timetable self.lock = Lock() self.uow_dao = UnitOfWorkDao(self.logger) self.reprocess_uows = collections.defaultdict(PriorityQueue) self.timer = RepeatTimer(settings.settings['gc_run_interval'], self._run)
def __init__(self, logger, message, consumer, performance_ticker): self.logger = logger self.message = message self.mq_request = SynergyMqTransmission.from_json(message.body) self.consumer = consumer self.performance_ticker = performance_ticker self.alive = False self.return_code = -1 self.uow_dao = UnitOfWorkDao(self.logger) self.thread_name = str(self.mq_request) super(BashRunnable, self).__init__(name=self.thread_name)
def __init__(self, request, **values): super(FreerunActionHandler, self).__init__(request, **values) self.process_name = self.request_arguments.get('process_name') self.entry_name = self.request_arguments.get('entry_name') self.freerun_process_dao = FreerunProcessDao(self.logger) self.uow_dao = UnitOfWorkDao(self.logger) self.is_request_valid = True if self.process_name and self.entry_name else False if self.is_request_valid: self.process_name = self.process_name.strip() self.entry_name = self.entry_name.strip() self.is_requested_state_on = 'is_on' in self.request_arguments and self.request_arguments['is_on']
def __init__(self, logger): self.lock = RLock() self.logger = logger self.uow_dao = UnitOfWorkDao(self.logger) self.job_dao = JobDao(self.logger) self.reprocess = dict() # self.trees contain all of the trees and manages much of their life cycle # remember to enlist there all trees the system is working with self.trees = self._construct_trees_from_context() self._register_callbacks() self._register_dependencies() self.load_tree() self.build_trees() self.validate()
def test_select_reprocessing_candidates(self): logger = get_logger(PROCESS_UNIT_TEST) uow_dao = UnitOfWorkDao(logger) try: initial_candidates = uow_dao.get_reprocessing_candidates() except: initial_candidates = [] try: initial_positive_candidates = uow_dao.get_reprocessing_candidates('2010123123') except: initial_positive_candidates = [] positive_timeperiods = {u'2010123123': PROCESS_SITE_HOURLY, # hourly time qualifier u'2010123100': PROCESS_SITE_DAILY, # daily time qualifier u'2010120000': PROCESS_SITE_MONTHLY, # monthly time qualifier u'2010000000': PROCESS_SITE_YEARLY} # yearly time qualifier negative_timeperiods = {u'2009123123': PROCESS_SITE_HOURLY, # hourly time qualifier u'2009123100': PROCESS_SITE_DAILY, # daily time qualifier u'2009120000': PROCESS_SITE_MONTHLY, # monthly time qualifier u'2009000000': PROCESS_SITE_YEARLY} # yearly time qualifier all_timeperiods = dict() all_timeperiods.update(positive_timeperiods) all_timeperiods.update(negative_timeperiods) created_uow = [] for timeperiod, process_name in all_timeperiods.items(): created_uow.append(create_and_insert_unit_of_work(process_name, 0, 1, timeperiod=timeperiod, state=unit_of_work.STATE_INVALID)) candidates = uow_dao.get_reprocessing_candidates('2010123123') self.assertEqual(len(candidates) - len(initial_positive_candidates), len(positive_timeperiods)) candidates = uow_dao.get_reprocessing_candidates() self.assertEqual(len(candidates) - len(initial_candidates), len(all_timeperiods)) for uow_id in created_uow: uow_dao.remove(uow_id)
class FreerunStatements(object): def __init__(self, logger): self.lock = RLock() self.logger = logger self.uow_dao = UnitOfWorkDao(self.logger) @thread_safe def retrieve_records(self, timeperiod, unprocessed_only): """ method looks for suitable UOW records and returns them as a dict""" resp = dict() try: query = unit_of_work_dao.QUERY_GET_FREERUN_SINCE(timeperiod, unprocessed_only) records_list = self.uow_dao.run_query(query) if len(records_list) == 0: self.logger.warn('No Freerun UOW records found since {0}.'.format(timeperiod)) for uow_record in records_list: resp[uow_record.key] = uow_record.document except Exception as e: self.logger.error('DashboardHandler error: {0}'.format(e)) return resp
class GarbageCollectorWorker(AbstractMqWorker): """ GC is triggered by an empty message from RabbitMQ. It scans for invalid or stalled unit_of_work and re-triggers them. GC is vital for the health of the system. Deployment with no running GC is considered invalid """ def __init__(self, process_name): super(GarbageCollectorWorker, self).__init__(process_name) self.lock = Lock() self.publishers = PublishersPool(self.logger) self.uow_dao = UnitOfWorkDao(self.logger) self.managed_dao = ManagedProcessDao(self.logger) self.managed_entries = dict() def __del__(self): try: self.logger.info('Closing Flopsy Publishers Pool...') self.publishers.close() except Exception as e: self.logger.error('Exception caught while closing Flopsy Publishers Pool: %s' % str(e)) super(GarbageCollectorWorker, self).__del__() @thread_safe def _mq_callback(self, message): """ method looks for stale or invalid units of work re-runs them if needed""" try: managed_entries = self.managed_dao.get_all() self._update_managed_entries(managed_entries) since = settings.settings['synergy_start_timeperiod'] uow_list = self.uow_dao.get_reprocessing_candidates(since) for uow in uow_list: if uow.process_name not in self.managed_entries: self.logger.debug('Process %r is not known to the Synergy Scheduler. Skipping its unit_of_work.' % uow.process_name) continue process_entry = self.managed_entries[uow.process_name] assert isinstance(process_entry, ManagedProcessEntry) if not process_entry.is_on: self.logger.debug('Process %r is inactive at the Synergy Scheduler. Skipping its unit_of_work.' % uow.process_name) continue self._process_single_document(uow) except LookupError as e: self.logger.info('Normal behaviour. %r' % e) except Exception as e: self.logger.error('_mq_callback: %s' % str(e), exc_info=True) finally: self.consumer.acknowledge(message.delivery_tag) def _update_managed_entries(self, managed_entries): self.managed_entries = {me.process_name: me for me in managed_entries} def _process_single_document(self, uow): """ actually inspects UOW retrieved from the database""" repost = False if uow.is_invalid: repost = True elif uow.is_in_progress or uow.is_requested: last_activity = uow.started_at if last_activity is None: last_activity = uow.created_at if datetime.utcnow() - last_activity > timedelta(hours=REPOST_AFTER_HOURS): repost = True if repost: mq_request = SynergyMqTransmission(process_name=uow.process_name, unit_of_work_id=uow.db_id) if datetime.utcnow() - uow.created_at < timedelta(hours=LIFE_SUPPORT_HOURS): uow.state = unit_of_work.STATE_REQUESTED uow.number_of_retries += 1 self.uow_dao.update(uow) publisher = self.publishers.get(uow.process_name) publisher.publish(mq_request.document) publisher.release() self.logger.info('UOW marked for re-processing: process %s; timeperiod %s; id %s; attempt %d' % (uow.process_name, uow.timeperiod, uow.db_id, uow.number_of_retries)) self.performance_ticker.tracker.increment_success() else: uow.state = unit_of_work.STATE_CANCELED self.uow_dao.update(uow) publisher = self.publishers.get(QUEUE_UOW_REPORT) publisher.publish(mq_request.document) publisher.release() self.logger.info('UOW transferred to STATE_CANCELED: process %s; timeperiod %s; id %s; attempt %d' % (uow.process_name, uow.timeperiod, uow.db_id, uow.number_of_retries))
class FlowActionHandler(BaseRequestHandler): def __init__(self, request, **values): super(FlowActionHandler, self).__init__(request, **values) self.flow_dao = FlowDao(self.logger) self.step_dao = StepDao(self.logger) self.uow_dao = UnitOfWorkDao(self.logger) self.log_recording_dao = LogRecordingDao(self.logger) self.process_name = self.request_arguments.get(ARGUMENT_PROCESS_NAME) self.unit_of_work_type = self.request_arguments.get( ARGUMENT_UNIT_OF_WORK_TYPE, TYPE_MANAGED) self.flow_name = self.request_arguments.get(ARGUMENT_FLOW_NAME) if not self.flow_name and self.process_name: process_entry = context.process_context[self.process_name] self.flow_name = process_entry.arguments.get(ARGUMENT_FLOW_NAME) self.step_name = self.request_arguments.get(ARGUMENT_STEP_NAME) self.timeperiod = self.request_arguments.get(ARGUMENT_TIMEPERIOD) self.is_request_valid = True if self.flow_name \ and self.flow_name in flows.flows \ and self.timeperiod \ else False if self.is_request_valid: self.flow_name = self.flow_name.strip() self.timeperiod = self.timeperiod.strip() self.run_mode = self.request_arguments.get(ARGUMENT_RUN_MODE, '') self.run_mode = self.run_mode.strip() def _get_tree_node(self): tree = self.scheduler.timetable.get_tree(self.process_name) if tree is None: raise UserWarning( 'No Timetable tree is registered for process {0}'.format( self.process_name)) time_qualifier = context.process_context[ self.process_name].time_qualifier self.timeperiod = time_helper.cast_to_time_qualifier( time_qualifier, self.timeperiod) node = tree.get_node(self.process_name, self.timeperiod) return node @property def job_record(self): node = self._get_tree_node() return node.job_record @property def managed_uow_record(self): node = self._get_tree_node() uow_id = node.job_record.related_unit_of_work if not uow_id: return None return self.uow_dao.get_one(uow_id) @property def freerun_uow_records(self): valid_freerun_uow = list() records_list = find_all_flow_uows(self.uow_dao, self.process_name, self.flow_name, self.timeperiod) if len(records_list) == 0: self.logger.warning( 'MX: no Freerun UOW records found for {0}@{1} -> {2}.'.format( self.process_name, self.timeperiod, self.flow_name)) return valid_freerun_uow for uow_record in records_list: # freerun uow.process_name is a composite in format <process_name::entry_name> handler_key = split_schedulable_name(uow_record.process_name) if handler_key not in self.scheduler.freerun_handlers: # skip UOW records that have no active freerun handler continue valid_freerun_uow.append(uow_record) return valid_freerun_uow @property def freerun_process_entry(self): """ :returns run-time only instance of the FreerunProcessEntry """ entry_name = build_schedulable_name(self.flow_name, self.step_name) handler_key = (self.process_name, entry_name) if handler_key not in self.scheduler.freerun_handlers: classname = context.process_context[self.process_name].classname entry = freerun_context_entry( process_name=self.process_name, entry_name=entry_name, classname=classname, token=entry_name, trigger_frequency='every {0}'.format(SECONDS_IN_CENTURY), is_on=False, description= 'Runtime freerun object to facilitate CUSTOM RUN MODES for workflow' ) # find uow for workflow step if any uow = find_flow_step_uow(self.uow_dao, self.process_name, self.flow_name, self.step_name, self.timeperiod) if uow: entry.related_unit_of_work = uow.db_id self.scheduler.freerun_handlers[handler_key] = entry return self.scheduler.freerun_handlers[handler_key] @property def flow_record(self): return self.flow_dao.get_one([self.flow_name, self.timeperiod]) @property def step_record(self): return self.step_dao.get_one( [self.flow_name, self.step_name, self.timeperiod]) @property def flow_graph_obj(self): _flow_graph_obj = copy.deepcopy(flows.flows[self.flow_name]) _flow_graph_obj.context = ExecutionContext(self.flow_name, self.timeperiod, None, None, settings.settings) try: flow_entry = self.flow_dao.get_one( [self.flow_name, self.timeperiod]) _flow_graph_obj.context.flow_entry = flow_entry _flow_graph_obj.context.start_timeperiod = flow_entry.start_timeperiod _flow_graph_obj.context.end_timeperiod = flow_entry.end_timeperiod steps = self.step_dao.get_all_by_flow_id(flow_entry.db_id) for s in steps: assert isinstance(s, Step) _flow_graph_obj[s.step_name].step_entry = s _flow_graph_obj.yielded.append(s) except LookupError: pass return _flow_graph_obj @cached_property @valid_action_request def flow_details(self): rest_model = create_rest_flow(self.flow_graph_obj) return rest_model.document @cached_property def process_name(self): return self.process_name @cached_property def unit_of_work_type(self): return self.unit_of_work_type @cached_property def active_run_mode(self): return self.flow_dao.managed_run_mode(self.process_name, self.flow_name, self.timeperiod) @cached_property @valid_action_request def step_details(self): graph_node_obj = self.flow_graph_obj._dict[self.step_name] rest_model = create_rest_step(graph_node_obj) return rest_model.document @valid_action_request def set_run_mode(self): """ - set a flag for ProcessEntry.arguments[ARGUMENT_RUN_MODE] = RUN_MODE_RECOVERY - trigger standard reprocessing """ if not self.job_record or not self.run_mode: return RESPONSE_NOT_OK try: msg = 'MX: setting RUN MODE for {0}@{1} -> {2} to {3}' \ .format(self.process_name, self.timeperiod, self.flow_name, self.run_mode) self.scheduler.timetable.add_log_entry(self.process_name, self.timeperiod, msg) self.logger.info(msg + ' {') local_record = self.flow_record local_record.run_mode = self.run_mode self.flow_dao.update(local_record) return RESPONSE_OK except (ValidationError, LookupError): return RESPONSE_NOT_OK finally: self.logger.info('}') def perform_freerun_action(self, run_mode): """ - make sure that the job is finished i.e. the job is in [STATE_NOOP, STATE_PROCESSED, STATE_SKIPPED] - submit a FREERUN UOW for given (process_name::flow_name::step_name, timeperiod) :return RESPONSE_OK if the UOW was submitted and RESPONSE_NOT_OK otherwise """ if not self.job_record or not self.job_record.is_finished: return RESPONSE_NOT_OK uow = self.managed_uow_record if not uow: # for skipped job that has no UOW associated with it return RESPONSE_NOT_OK flow_request = FlowRequest(self.process_name, self.flow_name, self.step_name, run_mode, self.timeperiod, uow.start_timeperiod, uow.end_timeperiod) state_machine = self.scheduler.timetable.state_machines[ STATE_MACHINE_FREERUN] state_machine.manage_schedulable(self.freerun_process_entry, flow_request) return RESPONSE_OK @valid_action_request def run_one_step(self): return self.perform_freerun_action(RUN_MODE_RUN_ONE) @valid_action_request def run_from_step(self): """ - make sure that the job is finished i.e. the job is in [STATE_NOOP, STATE_PROCESSED, STATE_SKIPPED] - submit a FREERUN UOW for given (process_name::flow_name::step_name, timeperiod) :return RESPONSE_OK if the UOW was submitted and RESPONSE_NOT_OK otherwise """ return self.perform_freerun_action(RUN_MODE_RUN_FROM) @valid_action_request @safe_json_response def get_step_log(self): try: resp = self.log_recording_dao.get_one( self.step_record.db_id).document except (TypeError, LookupError): resp = {'response': 'no related step log'} return resp @valid_action_request @safe_json_response def get_flow_log(self): try: resp = self.log_recording_dao.get_one( self.flow_record.db_id).document except (TypeError, LookupError): resp = {'response': 'no related workflow log'} return resp
class GarbageCollector(object): """ GC is triggered directly by Synergy Scheduler. It scans for invalid or stalled unit_of_work and re-triggers them. GC is vital for the health of the system. Deployment with no GC is considered invalid """ def __init__(self, scheduler): self.logger = get_logger(PROCESS_GC, append_to_console=False, redirect_stdstream=False) self.managed_handlers = scheduler.managed_handlers self.mq_transmitter = MqTransmitter(self.logger) self.timetable = scheduler.timetable self.lock = Lock() self.uow_dao = UnitOfWorkDao(self.logger) self.reprocess_uows = collections.defaultdict(PriorityQueue) self.timer = RepeatTimer(settings.settings['gc_run_interval'], self._run) @thread_safe def scan_uow_candidates(self): """ method performs two actions: - enlist stale or invalid units of work into reprocessing queue - cancel UOWs that are older than 2 days and have been submitted more than 1 hour ago """ try: since = settings.settings['synergy_start_timeperiod'] uow_list = self.uow_dao.get_reprocessing_candidates(since) except LookupError as e: self.logger.info('flow: no UOW candidates found for reprocessing: {0}'.format(e)) return for uow in uow_list: try: if uow.process_name not in self.managed_handlers: self.logger.debug('process {0} is not known to the Synergy Scheduler. Skipping its UOW.' .format(uow.process_name)) continue thread_handler = self.managed_handlers[uow.process_name] assert isinstance(thread_handler, ManagedThreadHandler) if not thread_handler.process_entry.is_on: self.logger.debug('process {0} is inactive. Skipping its UOW.'.format(uow.process_name)) continue entry = PriorityEntry(uow) if entry in self.reprocess_uows[uow.process_name]: # given UOW is already registered in the reprocessing queue continue # ASSUMPTION: UOW is re-created by a state machine during reprocessing # thus - any UOW older 2 days could be marked as STATE_CANCELED if datetime.utcnow() - uow.created_at > timedelta(hours=settings.settings['gc_life_support_hours']): self._cancel_uow(uow) continue # if the UOW has been idle for more than 1 hour - resubmit it if datetime.utcnow() - uow.submitted_at > timedelta(hours=settings.settings['gc_resubmit_after_hours'])\ or uow.is_invalid: # enlist the UOW into the reprocessing queue self.reprocess_uows[uow.process_name].put(entry) except Exception as e: self.logger.error('flow exception: {0}'.format(e), exc_info=True) def _flush_queue(self, q, ignore_priority=False): """ :param q: PriorityQueue instance holding GarbageCollector entries :param ignore_priority: If True - all GarbageCollector entries should be resubmitted If False - only those entries whose waiting time has expired will be resubmitted """ assert isinstance(q, PriorityQueue) current_timestamp = compute_release_time(lag_in_minutes=0) for _ in range(len(q)): entry = q.pop() assert isinstance(entry, PriorityEntry) if ignore_priority or entry.release_time < current_timestamp: self._resubmit_uow(entry.entry) else: q.put(entry) break @thread_safe def flush(self, ignore_priority=False): """ method iterates over each reprocessing queues and re-submits UOW whose waiting time has expired """ for process_name, q in self.reprocess_uows.items(): self._flush_queue(q, ignore_priority) @thread_safe def validate(self): """ method iterates over the reprocessing queue and synchronizes state of every UOW with the DB should it change via the MX to STATE_CANCELED - remove the UOW from the queue """ for process_name, q in self.reprocess_uows.items(): if not q: continue invalid_entries = list() for entry in q.queue: assert isinstance(entry, PriorityEntry) uow = self.uow_dao.get_one(entry.entry.db_id) if uow.is_canceled: invalid_entries.append(entry) for entry in invalid_entries: q.queue.remove(entry) self.logger.info('reprocessing queue validated') @thread_safe def flush_one(self, process_name, ignore_priority=False): """ method iterates over the reprocessing queue for the given process and re-submits UOW whose waiting time has expired """ q = self.reprocess_uows[process_name] self._flush_queue(q, ignore_priority) def _resubmit_uow(self, uow): # re-read UOW from the DB, in case it was STATE_CANCELLED by MX uow = self.uow_dao.get_one(uow.db_id) if uow.is_canceled: self.logger.info('suppressed re-submission of UOW {0} for {1}@{2} in {3};' .format(uow.db_id, uow.process_name, uow.timeperiod, uow.state)) return if uow.is_invalid: uow.number_of_retries += 1 uow.state = unit_of_work.STATE_REQUESTED uow.submitted_at = datetime.utcnow() self.uow_dao.update(uow) self.mq_transmitter.publish_managed_uow(uow) self.logger.info('re-submitted UOW {0} for {1}@{2}; attempt {3}' .format(uow.db_id, uow.process_name, uow.timeperiod, uow.number_of_retries)) def _cancel_uow(self, uow): uow.state = unit_of_work.STATE_CANCELED self.uow_dao.update(uow) self.mq_transmitter.publish_uow_status(uow) self.logger.info('canceled UOW {0} for {1}@{2}; attempt {3}; created at {4}' .format(uow.db_id, uow.process_name, uow.timeperiod, uow.number_of_retries, uow.created_at)) def _run(self): try: self.logger.info('run {') self.logger.debug('step 1: validate existing queue entries') self.validate() self.logger.debug('step 2: scan reprocessing candidates') self.scan_uow_candidates() self.logger.debug('step 3: repost after timeout') self.flush() self.logger.debug('step 4: timetable housekeeping') self.timetable.build_trees() self.logger.debug('step 5: timetable validation') self.timetable.validate() except Exception as e: self.logger.error('GC run exception: {0}'.format(e)) finally: self.logger.info('}') def start(self): self.timer.start() def stop(self): self.timer.cancel()
class StateMachineFreerun(object): """ State Machine to handle freerun jobs/triggers """ def __init__(self, logger, name=STATE_MACHINE_FREERUN): self.name = name self.logger = logger self.mq_transmitter = MqTransmitter(self.logger) self.uow_dao = UnitOfWorkDao(self.logger) self.sfe_dao = FreerunProcessDao(self.logger) @with_reconnect def _log_message(self, level, freerun_entry, msg): """ method performs logging into log file and the freerun_entry """ self.logger.log(level, msg) assert isinstance(freerun_entry, FreerunProcessEntry) event_log = freerun_entry.event_log if len(event_log) > MAX_NUMBER_OF_EVENTS: del event_log[-1] event_log.insert(0, msg) self.sfe_dao.update(freerun_entry) @with_reconnect def _insert_uow(self, freerun_entry): """ creates unit_of_work and inserts it into the DB :raise DuplicateKeyError: if unit_of_work with given parameters already exists """ current_timeperiod = time_helper.actual_timeperiod(QUALIFIER_REAL_TIME) process_entry = context.process_context[freerun_entry.process_name] uow = UnitOfWork() uow.process_name = freerun_entry.schedulable_name uow.timeperiod = current_timeperiod uow.start_id = 0 uow.end_id = 0 uow.start_timeperiod = current_timeperiod uow.end_timeperiod = current_timeperiod uow.created_at = datetime.utcnow() uow.submitted_at = datetime.utcnow() uow.source = process_entry.source if hasattr(process_entry, 'source') else None uow.sink = process_entry.sink if hasattr(process_entry, 'sink') else None uow.state = unit_of_work.STATE_REQUESTED uow.unit_of_work_type = unit_of_work.TYPE_FREERUN uow.number_of_retries = 0 uow.arguments = freerun_entry.arguments uow.db_id = self.uow_dao.insert(uow) msg = 'Created: UOW {0} for {1}@{2}.' \ .format(uow.db_id, freerun_entry.schedulable_name, current_timeperiod) self._log_message(INFO, freerun_entry, msg) return uow def _publish_uow(self, freerun_entry, uow): self.mq_transmitter.publish_freerun_uow(freerun_entry, uow) msg = 'Published: UOW {0} for {1}.'.format(uow.db_id, freerun_entry.schedulable_name) self._log_message(INFO, freerun_entry, msg) def insert_and_publish_uow(self, freerun_entry): try: uow = self._insert_uow(freerun_entry) except DuplicateKeyError as e: msg = 'Duplication of UOW found for {0}. Error msg: {1}'.format(freerun_entry.schedulable_name, e) self._log_message(WARNING, freerun_entry, msg) uow = self.uow_dao.recover_from_duplicatekeyerror(e) if uow is not None: # publish the created/caught up unit_of_work self._publish_uow(freerun_entry, uow) freerun_entry.related_unit_of_work = uow.db_id self.sfe_dao.update(freerun_entry) else: msg = 'PERSISTENT TIER ERROR! Unable to locate UOW for {0}' \ .format(freerun_entry.schedulable_name) self._log_message(WARNING, freerun_entry, msg) def _process_state_embryo(self, freerun_entry): """ method creates unit_of_work and associates it with the FreerunProcessEntry """ self.insert_and_publish_uow(freerun_entry) def _process_state_in_progress(self, freerun_entry, uow): """ method that takes care of processing unit_of_work records in STATE_REQUESTED or STATE_IN_PROGRESS states""" self._publish_uow(freerun_entry, uow) def _process_terminal_state(self, freerun_entry, uow): """ method that takes care of processing unit_of_work records in STATE_PROCESSED, STATE_NOOP, STATE_INVALID, STATE_CANCELED states""" msg = 'UOW for {0} found in state {1}.'.format(freerun_entry.schedulable_name, uow.state) self._log_message(INFO, freerun_entry, msg) self.insert_and_publish_uow(freerun_entry) def manage_schedulable(self, freerun_entry): """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed In case the Scheduler sees that the unit_of_work is pending it will fire another WorkerMqRequest """ assert isinstance(freerun_entry, FreerunProcessEntry) if freerun_entry.related_unit_of_work is None: uow = None else: uow = self.uow_dao.get_one(freerun_entry.related_unit_of_work) try: if uow is None: self._process_state_embryo(freerun_entry) elif uow.is_requested or uow.is_in_progress: self._process_state_in_progress(freerun_entry, uow) elif uow.is_finished or uow.is_invalid: self._process_terminal_state(freerun_entry, uow) else: msg = 'Unknown state {0} of the UOW {1}'.format(uow.state, uow.db_id) self._log_message(ERROR, freerun_entry, msg) except LookupError as e: msg = 'Lookup issue for schedulable: {0} in timeperiod {1}, because of: {2}' \ .format(freerun_entry.db_id, uow.timeperiod, e) self._log_message(WARNING, freerun_entry, msg) def cancel_uow(self, freerun_entry): uow_id = freerun_entry.related_unit_of_work if uow_id is None: msg = 'cancel_uow: no related UOW for {0}'.format(freerun_entry.schedulable_name) else: uow = self.uow_dao.get_one(uow_id) uow.state = unit_of_work.STATE_CANCELED self.uow_dao.update(uow) msg = 'cancel_uow: canceled UOW {0} for {1}'.format(uow_id, freerun_entry.schedulable_name) self._log_message(INFO, freerun_entry, msg)
def __init__(self, logger, freerun_handlers): self.lock = RLock() self.logger = logger self.freerun_handlers = freerun_handlers self.uow_dao = UnitOfWorkDao(self.logger)
def tearDown(self): # cleaning up DB self.virtual_tear_down() uow_dao = UnitOfWorkDao(self.aggregator.logger) uow_dao.remove(self.uow_id) del self.aggregator
def __init__(self, request, **values): super(AbstractActionHandler, self).__init__(request, **values) self.uow_dao = UnitOfWorkDao(self.logger) self.log_recording_dao = LogRecordingDao(self.logger)
class AbstractUowAwareWorker(AbstractMqWorker): """ Abstract class is inherited by all workers/aggregators that are aware of unit_of_work and capable of processing it """ def __init__(self, process_name, perform_db_logging=False): super(AbstractUowAwareWorker, self).__init__(process_name) self.perform_db_logging = perform_db_logging self.uow_dao = UnitOfWorkDao(self.logger) self.mq_transmitter = MqTransmitter(self.logger) def __del__(self): del self.mq_transmitter super(AbstractUowAwareWorker, self).__del__() # **************** Abstract Methods ************************ def _init_performance_tracker(self, logger): self.performance_tracker = UowAwareTracker(logger) self.performance_tracker.start() def _process_uow(self, uow): """ :param uow: unit_of_work to process :return: a tuple (number of processed items/documents/etc, desired unit_of_work state) or None if None is returned then it is assumed that the return tuple is (0, unit_of_work.STATE_PROCESSED) :raise an Exception if the UOW shall be marked as STATE_INVALID """ raise NotImplementedError('method _process_uow must be implemented by {0}'.format(self.__class__.__name__)) def _clean_up(self): """ method is called from the *finally* clause and is suppose to clean up after the uow processing """ pass def _mq_callback(self, message): try: mq_request = MqTransmission.from_json(message.body) uow = self.uow_dao.get_one(mq_request.record_db_id) if not uow.is_requested: # accept only UOW in STATE_REQUESTED self.logger.warning('Skipping UOW: id {0}; state {1};'.format(message.body, uow.state), exc_info=False) self.consumer.acknowledge(message.delivery_tag) return except Exception: self.logger.error('Safety fuse. Can not identify UOW {0}'.format(message.body), exc_info=True) self.consumer.acknowledge(message.delivery_tag) return db_log_handler = UowLogHandler(self.logger, uow.db_id) try: uow.state = unit_of_work.STATE_IN_PROGRESS uow.started_at = datetime.utcnow() self.uow_dao.update(uow) self.performance_tracker.start_uow(uow) if self.perform_db_logging: db_log_handler.attach() result = self._process_uow(uow) if result is None: self.logger.warning('method {0}._process_uow returned None. Assuming happy flow.' .format(self.__class__.__name__)) number_of_aggregated_objects, target_state = 0, unit_of_work.STATE_PROCESSED else: number_of_aggregated_objects, target_state = result uow.number_of_aggregated_documents = number_of_aggregated_objects uow.number_of_processed_documents = self.performance_tracker.success_per_job uow.finished_at = datetime.utcnow() uow.state = target_state self.uow_dao.update(uow) if uow.is_finished: self.performance_tracker.finish_uow() else: self.performance_tracker.cancel_uow() except Exception as e: fresh_uow = self.uow_dao.get_one(mq_request.record_db_id) self.performance_tracker.cancel_uow() if fresh_uow.is_canceled: self.logger.warning('UOW {0} for {1}@{2} was likely marked by MX as SKIPPED. No UOW update performed.' .format(uow.db_id, uow.process_name, uow.timeperiod), exc_info=False) else: self.logger.error('Safety fuse while processing UOW {0} for {1}@{2}: {3}' .format(uow.db_id, uow.process_name, uow.timeperiod, e), exc_info=True) uow.state = unit_of_work.STATE_INVALID self.uow_dao.update(uow) finally: self.consumer.acknowledge(message.delivery_tag) self.consumer.close() self._clean_up() db_log_handler.detach() try: self.mq_transmitter.publish_uow_status(uow) self.logger.info('UOW *{0}* status report published into MQ'.format(uow.state)) except Exception: self.logger.error('Error on UOW status report publishing', exc_info=True)
class AbstractActionHandler(BaseRequestHandler): def __init__(self, request, **values): super(AbstractActionHandler, self).__init__(request, **values) self.uow_dao = UnitOfWorkDao(self.logger) self.log_recording_dao = LogRecordingDao(self.logger) @property def thread_handler(self): raise NotImplementedError('property thread_handler must be implemented by {0}'.format(self.__class__.__name__)) @property def process_entry(self): raise NotImplementedError('property process_entry must be implemented by {0}'.format(self.__class__.__name__)) @property def uow_id(self): raise NotImplementedError('property uow_id must be implemented by {0}'.format(self.__class__.__name__)) def get_event_log(self): raise NotImplementedError('method action_get_event_log must be implemented by {0}' .format(self.__class__.__name__)) @safe_json_response def get_uow(self): if self.uow_id is None: resp = {'response': 'no related unit_of_work'} else: resp = self.uow_dao.get_one(self.uow_id).document return resp @safe_json_response def get_uow_log(self): try: resp = self.log_recording_dao.get_one(self.uow_id).document except (TypeError, LookupError): resp = {'response': 'no related uow log'} return resp @valid_action_request def change_interval(self): resp = dict() new_interval = self.request_arguments['interval'] if new_interval is not None: self.thread_handler.change_interval(new_interval) msg = 'changed interval for {0} to {1}'.format(self.thread_handler.key, new_interval) self.logger.info('MX: {0}'.format(msg)) resp['status'] = msg return resp @valid_action_request def trigger_now(self): self.thread_handler.trigger() self.logger.info('MX: triggered thread handler {0}'.format(self.thread_handler.key)) return self.reply_ok() @valid_action_request def activate_trigger(self): self.thread_handler.activate() self.logger.info('MX: activated thread handler {0}'.format(self.thread_handler.key)) return self.reply_ok() @valid_action_request def deactivate_trigger(self): self.thread_handler.deactivate() self.logger.info('MX: deactivated thread handler {0}'.format(self.thread_handler.key)) return self.reply_ok()
class Timetable(object): """ Timetable holds all known process trees, where every node presents a timeperiod-driven job""" def __init__(self, logger): self.lock = RLock() self.logger = logger self.uow_dao = UnitOfWorkDao(self.logger) self.job_dao = JobDao(self.logger) self.reprocess = dict() # self.trees contain all of the trees and manages much of their life cycle # remember to enlist there all trees the system is working with self.trees = self._construct_trees_from_context() self._register_callbacks() self._register_dependencies() self.load_tree() self.build_trees() self.validate() def _construct_trees_from_context(self): trees = dict() for tree_name, context_entry in context.timetable_context.items(): tree = MultiLevelTree(process_names=context_entry.enclosed_processes, tree_name=tree_name, mx_name=context_entry.mx_name, mx_page=context_entry.mx_page) trees[tree_name] = tree return trees def _register_dependencies(self): """ register dependencies between trees""" for tree_name, context_entry in context.timetable_context.items(): tree = self.trees[tree_name] assert isinstance(tree, MultiLevelTree) for dependent_on in context_entry.dependent_on: dependent_on_tree = self.trees[dependent_on] assert isinstance(dependent_on_tree, MultiLevelTree) tree.register_dependent_on(dependent_on_tree) def _register_callbacks(self): """ register logic that reacts on reprocessing request and create embryo timetable record request""" # reprocessing request for tree_name, tree in self.trees.items(): tree.register_reprocess_callback(self._callback_reprocess) # skip request for tree_name, tree in self.trees.items(): tree.register_skip_callback(self._callback_skip) # callbacks register for tree_name, tree in self.trees.items(): tree.register_create_callbacks(self._callback_create_job_record) # *** Call-back methods *** def _find_dependant_trees(self, tree_obj): """ returns list of trees that are dependent_on given tree_obj """ dependant_trees = [] for tree_name, tree in self.trees.items(): if tree_obj in tree.dependent_on: dependant_trees.append(tree) return dependant_trees def _find_dependant_tree_nodes(self, node_a): dependant_nodes = set() for tree_b in self._find_dependant_trees(node_a.tree): node_b = node_a.find_counterpart_in(tree_b) if node_b is None: continue dependant_nodes.add(node_b) return dependant_nodes def _reprocess_single_tree_node(self, tree_node): """ is called from tree to answer reprocessing request. It is possible that timetable record will be transferred to STATE_IN_PROGRESS with no related unit_of_work""" uow_id = tree_node.job_record.related_unit_of_work if uow_id is not None: tree_node.job_record.state = job.STATE_IN_PROGRESS uow = self.uow_dao.get_one(uow_id) uow.state = unit_of_work.STATE_INVALID uow.number_of_retries = 0 uow.created_at = datetime.utcnow() self.uow_dao.update(uow) msg = 'Transferred job record %s for %s in timeperiod %s to %s; Transferred unit_of_work to %s' \ % (tree_node.job_record.db_id, tree_node.process_name, tree_node.job_record.timeperiod, tree_node.job_record.state, uow.state) if tree_node.process_name not in self.reprocess: self.reprocess[tree_node.process_name] = dict() self.reprocess[tree_node.process_name][tree_node.timeperiod] = tree_node else: tree_node.job_record.state = job.STATE_EMBRYO msg = 'Transferred job record %s for %s in timeperiod %s to %s;' \ % (tree_node.job_record.db_id, tree_node.process_name, tree_node.job_record.timeperiod, tree_node.job_record.state) tree_node.job_record.number_of_failures = 0 self.job_dao.update(tree_node.job_record) self.logger.warn(msg) tree_node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg]) @thread_safe def _callback_reprocess(self, tree_node): """ is called from tree to answer reprocessing request. It is possible that job record will be transferred to STATE_IN_PROGRESS with no related unit_of_work""" if (tree_node.job_record.is_embryo and tree_node.job_record.number_of_failures == 0) \ or (tree_node.process_name in self.reprocess and tree_node.timeperiod in self.reprocess[tree_node.process_name]): # the node has already been marked for re-processing or does not require one pass else: self._reprocess_single_tree_node(tree_node) reprocessing_nodes = self._find_dependant_tree_nodes(tree_node) for node in reprocessing_nodes: node.request_reprocess() @thread_safe def _callback_skip(self, tree_node): """ is called from tree to answer skip request""" tree_node.job_record.state = job.STATE_SKIPPED uow_id = tree_node.job_record.related_unit_of_work if uow_id is not None: uow = self.uow_dao.get_one(uow_id) uow.state = unit_of_work.STATE_CANCELED self.uow_dao.update(uow) msg = 'Transferred job record %s in timeperiod %s to %s; Transferred unit_of_work to %s' \ % (tree_node.job_record.db_id, tree_node.job_record.timeperiod, tree_node.job_record.state, uow.state) else: msg = 'Transferred job record %s in timeperiod %s to %s;' \ % (tree_node.job_record.db_id, tree_node.job_record.timeperiod, tree_node.job_record.state) self.job_dao.update(tree_node.job_record) self.logger.warn(msg) tree_node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg]) if tree_node.process_name in self.reprocess \ and tree_node.timeperiod in self.reprocess[tree_node.process_name]: del self.reprocess[tree_node.process_name][tree_node.timeperiod] @thread_safe def _callback_create_job_record(self, tree_node): """ is called from a tree to create job record in STATE_EMBRYO and bind it to the given tree node""" try: job_record = self.job_dao.get_one(tree_node.process_name, tree_node.timeperiod) except LookupError: job_record = Job() job_record.state = job.STATE_EMBRYO job_record.timeperiod = tree_node.timeperiod job_record.process_name = tree_node.process_name tr_id = self.job_dao.update(job_record) self.logger.info('Created job record %s, with timeperiod %s for process %s' % (str(tr_id), tree_node.timeperiod, tree_node.process_name)) tree_node.job_record = job_record # *** Tree-manipulation methods *** @thread_safe def get_tree(self, process_name): """ return tree that is managing time-periods for given process""" for tree_name, tree in self.trees.items(): if process_name in tree: return tree @thread_safe def _build_tree_by_level(self, collection_name, since): """ method iterated thru all documents in all job collections and builds a tree of known system state""" try: unsupported_records = dict() job_records = self.job_dao.get_all(collection_name, since) for job_record in job_records: tree = self.get_tree(job_record.process_name) if tree is not None: tree.update_node(job_record) else: unsupported_records[job_record.process_name] = \ unsupported_records.get(job_record.process_name, 0) + 1 for name, counter in unsupported_records.items(): self.logger.warn('Skipping %r Job records for %s as no tree is handling it.' % (counter, name)) except LookupError: self.logger.warn('No Job Records in %s.' % str(collection_name)) @thread_safe def load_tree(self): """ method iterates thru all objects older than synergy_start_timeperiod parameter in job collections and loads them into this timetable""" timeperiod = settings.settings['synergy_start_timeperiod'] yearly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_YEARLY, timeperiod) monthly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_MONTHLY, timeperiod) daily_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_DAILY, timeperiod) hourly_timeperiod = time_helper.cast_to_time_qualifier(QUALIFIER_HOURLY, timeperiod) self._build_tree_by_level(COLLECTION_JOB_HOURLY, since=hourly_timeperiod) self._build_tree_by_level(COLLECTION_JOB_DAILY, since=daily_timeperiod) self._build_tree_by_level(COLLECTION_JOB_MONTHLY, since=monthly_timeperiod) self._build_tree_by_level(COLLECTION_JOB_YEARLY, since=yearly_timeperiod) @thread_safe def build_trees(self): """ method iterates thru all trees and ensures that all time-period nodes are created up till <utc_now>""" for tree_name, tree in self.trees.items(): tree.build_tree() @thread_safe def validate(self): """validates that none of nodes in tree is improperly finalized and that every node has job_record""" for tree_name, tree in self.trees.items(): tree.validate() @thread_safe def dependent_on_composite_state(self, job_record): """ :return instance of <NodesCompositeState> """ assert isinstance(job_record, Job) tree = self.get_tree(job_record.process_name) node = tree.get_node(job_record.process_name, job_record.timeperiod) return node.dependent_on_composite_state() # *** Job manipulation methods *** @thread_safe def update_job_record(self, job_record, uow, new_state): """ method updates job record with a new unit_of_work and new state""" job_record.state = new_state job_record.related_unit_of_work = uow.db_id job_record.start_id = uow.start_id job_record.end_id = uow.end_id self.job_dao.update(job_record) tree = self.get_tree(job_record.process_name) tree.update_node(job_record) msg = 'Transferred job %s for %s in timeperiod %s to new state %s' \ % (job_record.db_id, job_record.timeperiod, job_record.process_name, new_state) self.logger.info(msg) self.add_log_entry(job_record.process_name, job_record.timeperiod, msg) @thread_safe def failed_on_processing_job_record(self, process_name, timeperiod): """method increases node's inner counter of failed processing if _skip_node logic returns True - node is set to STATE_SKIP""" tree = self.get_tree(process_name) node = tree.get_node(process_name, timeperiod) node.job_record.number_of_failures += 1 if tree._skip_the_node(node): node.request_skip() else: # job record is automatically updated in request_skip() # so if the node was not skipped - job record has to be updated explicitly self.job_dao.update(node.job_record) @thread_safe def get_next_job_record(self, process_name): """returns next job record to work on for given process""" if process_name in self.reprocess and len(self.reprocess[process_name]) > 0: timeperiod = sorted(self.reprocess[process_name].keys())[0] node = self.reprocess[process_name][timeperiod] del self.reprocess[process_name][timeperiod] else: tree = self.get_tree(process_name) node = tree.get_next_node(process_name) if node.job_record is None: node.request_embryo_job_record() return node.job_record @thread_safe def is_job_record_finalizable(self, job_record): """ :return True, if the node and all its children are either in STATE_PROCESSED or STATE_SKIPPED""" assert isinstance(job_record, Job) tree = self.get_tree(job_record.process_name) node = tree.get_node(job_record.process_name, job_record.timeperiod) return node.is_finalizable() @thread_safe def add_log_entry(self, process_name, timeperiod, msg): """ adds a log entry to the tree node. log entries has no persistence """ tree = self.get_tree(process_name) node = tree.get_node(process_name, timeperiod) node.add_log_entry([datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), msg])
def __init__(self, logger): self.lock = RLock() self.logger = logger self.uow_dao = UnitOfWorkDao(self.logger)
def __init__(self, logger, name=STATE_MACHINE_FREERUN): self.name = name self.logger = logger self.mq_transmitter = MqTransmitter(self.logger) self.uow_dao = UnitOfWorkDao(self.logger) self.sfe_dao = FreerunProcessDao(self.logger)
class AbstractStateMachine(object): """ Abstract state machine used to host common logic for the rest of registered State Machines """ def __init__(self, logger, timetable, name): self.name = name self.logger = logger self.mq_transmitter = MqTransmitter(self.logger) self.timetable = timetable self.uow_dao = UnitOfWorkDao(self.logger) self.job_dao = JobDao(self.logger) @property def run_on_active_timeperiod(self): """ :return: True if given State Machine allows execution on *live* timeperiod, as opposed to the finished one """ raise NotImplementedError('property run_on_active_timeperiod must be implemented by {0}' .format(self.__class__.__name__)) def _log_message(self, level, process_name, timeperiod, msg): """ method performs logging into log file and Timetable's tree node""" self.timetable.add_log_entry(process_name, timeperiod, msg) self.logger.log(level, msg) @with_reconnect def _insert_uow(self, process_name, timeperiod, start_timeperiod, end_timeperiod, start_id, end_id): """creates unit_of_work and inserts it into the DB :raise DuplicateKeyError: if unit_of_work with given parameters already exists """ uow = UnitOfWork() uow.process_name = process_name uow.timeperiod = timeperiod uow.start_id = str(start_id) uow.end_id = str(end_id) uow.start_timeperiod = start_timeperiod uow.end_timeperiod = end_timeperiod uow.created_at = datetime.utcnow() uow.submitted_at = datetime.utcnow() uow.source = context.process_context[process_name].source uow.sink = context.process_context[process_name].sink uow.state = unit_of_work.STATE_REQUESTED uow.unit_of_work_type = unit_of_work.TYPE_MANAGED uow.number_of_retries = 0 uow.arguments = context.process_context[process_name].arguments uow.db_id = self.uow_dao.insert(uow) msg = 'Created: UOW {0} for {1}@{2}.'.format(uow.db_id, process_name, start_timeperiod) self._log_message(INFO, process_name, start_timeperiod, msg) return uow def _publish_uow(self, uow): self.mq_transmitter.publish_managed_uow(uow) msg = 'Published: UOW {0} for {1}@{2}.'.format(uow.db_id, uow.process_name, uow.start_timeperiod) self._log_message(INFO, uow.process_name, uow.start_timeperiod, msg) def insert_and_publish_uow(self, job_record, start_id, end_id): """ method creates and publishes a unit_of_work. it also handles DuplicateKeyError and attempts recovery :return: tuple (uow, is_duplicate) :raise UserWarning: if the recovery from DuplicateKeyError was unsuccessful """ process_name = job_record.process_name timeperiod = job_record.timeperiod start_timeperiod = self.compute_start_timeperiod(job_record.process_name, job_record.timeperiod) end_timeperiod = self.compute_end_timeperiod(job_record.process_name, job_record.timeperiod) try: is_duplicate = False uow = self._insert_uow(process_name, timeperiod, start_timeperiod, end_timeperiod, start_id, end_id) except DuplicateKeyError as e: is_duplicate = True msg = 'Catching up with latest UOW {0}@{1}, because of: {2}' \ .format(process_name, start_timeperiod, e) self._log_message(WARNING, process_name, start_timeperiod, msg) uow = self.uow_dao.recover_from_duplicatekeyerror(e) if not uow: msg = 'PERSISTENT TIER ERROR! Unable to locate UOW for {0}@{1}' \ .format(process_name, start_timeperiod) self._log_message(WARNING, process_name, start_timeperiod, msg) raise UserWarning(msg) if uow.is_canceled: # this UOW was marked for re-processing. recycle it uow.created_at = datetime.utcnow() # reset created_at to bypass GC cancellation logic uow.submitted_at = datetime.utcnow() # reset submitted_at to allow 1 hour free of GC resubmitting del uow.started_at del uow.finished_at del uow.number_of_aggregated_documents del uow.number_of_processed_documents uow.state = unit_of_work.STATE_REQUESTED self.uow_dao.update(uow) # publish the created/recovered/recycled unit_of_work self._publish_uow(uow) return uow, is_duplicate def notify(self, uow): """ method is used by StateMachine's users to provide notifications about change in UOW if applicable, method will update job_record state and Timetable tree node state :assumptions: uow is in [STATE_NOOP, STATE_CANCELED, STATE_PROCESSED] """ pass def compute_start_timeperiod(self, process_name, timeperiod): """ computes lowest *inclusive* timeperiod boundary for job to process for process with time_grouping == 1, it returns given timeperiod with no change for process with time_grouping != 1, it computes first timeperiod, not processed by the previous job run For instance: with time_grouping = 3, QUALIFIER_HOURLY, and timeperiod = 2016042018, the start_timeperiod will be = 2016042016 (computed as 2016042018 - 3 + 1) """ time_grouping = context.process_context[process_name].time_grouping if time_grouping == 1: return timeperiod # step1: translate given timeperiod to the time grouped one process_hierarchy = self.timetable.get_tree(process_name).process_hierarchy timeperiod_dict = process_hierarchy[process_name].timeperiod_dict translated_timeperiod = timeperiod_dict._translate_timeperiod(timeperiod) # step 2: compute previous grouped period # NOTICE: simple `time_helper.increment_timeperiod(time_qualifier, timeperiod)` is insufficient # as it does not address edge cases, such as the last day of the month or the last hour of the day # For instance: with time_grouping=3, QUALIFIER_DAILY, and 2016123100 # the `increment_timeperiod` will yield 2016122800 instead of 2016123100 time_qualifier = context.process_context[process_name].time_qualifier for i in range(1, time_grouping + 1): prev_timeperiod = time_helper.increment_timeperiod(time_qualifier, translated_timeperiod, delta=-i) if prev_timeperiod == timeperiod_dict._translate_timeperiod(prev_timeperiod): # prev_timeperiod is currently at the last grouped timeperiod break # step 3: compute first exclusive timeperiod after the *prev_timeperiod*, # which becomes first inclusive timeperiod for this job run over_the_edge_timeperiod = time_helper.increment_timeperiod(time_qualifier, prev_timeperiod, delta=-1) if prev_timeperiod != timeperiod_dict._translate_timeperiod(over_the_edge_timeperiod): # over_the_edge_timeperiod fell into previous day or month or year # *prev_timeperiod* points to the first month, first day of the month or 00 hour start_timeperiod = prev_timeperiod else: start_timeperiod = self.compute_end_timeperiod(process_name, prev_timeperiod) return start_timeperiod def compute_end_timeperiod(self, process_name, timeperiod): """ computes first *exclusive* timeperiod for job to process """ time_qualifier = context.process_context[process_name].time_qualifier end_timeperiod = time_helper.increment_timeperiod(time_qualifier, timeperiod) return end_timeperiod def _is_noop_timeperiod(self, process_name, timeperiod): """ method verifies if the given timeperiod for given process is valid or falls in-between grouping checkpoints :param process_name: name of the process :param timeperiod: timeperiod to verify :return: False, if given process has no time_grouping set or it is equal to 1. False, if time_grouping is custom but the given timeperiod matches the grouped timeperiod. True, if the timeperiod falls in-between grouping cracks """ time_grouping = context.process_context[process_name].time_grouping if time_grouping == 1: return False process_hierarchy = self.timetable.get_tree(process_name).process_hierarchy timeperiod_dict = process_hierarchy[process_name].timeperiod_dict return timeperiod_dict._translate_timeperiod(timeperiod) != timeperiod def _process_noop_timeperiod(self, job_record): """ method is valid for processes having time_grouping != 1. should a job record fall in-between grouped time milestones, its state should be set to STATE_NOOP without any processing """ job_record.state = job.STATE_NOOP self.job_dao.update(job_record) time_grouping = context.process_context[job_record.process_name].time_grouping msg = 'Job {0}@{1} with time_grouping {2} was transferred to STATE_NOOP' \ .format(job_record.process_name, job_record.timeperiod, time_grouping) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg) def _process_state_embryo(self, job_record): """ method that takes care of processing job records in STATE_EMBRYO state""" pass def _process_state_in_progress(self, job_record): """ method that takes care of processing job records in STATE_IN_PROGRESS state""" pass def _process_state_final_run(self, job_record): """method takes care of processing job records in STATE_FINAL_RUN state""" uow = self.uow_dao.get_one(job_record.related_unit_of_work) if uow.is_processed: self.update_job(job_record, uow, job.STATE_PROCESSED) elif uow.is_noop: self.update_job(job_record, uow, job.STATE_NOOP) elif uow.is_canceled: self.update_job(job_record, uow, job.STATE_SKIPPED) elif uow.is_invalid: msg = 'Job {0}: UOW for {1}@{2} is in {3}; ' \ 'relying on the Garbage Collector to either recycle or cancel the UOW.' \ .format(job_record.db_id, job_record.process_name, job_record.timeperiod, uow.state) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg) else: msg = 'Suppressed creating UOW for {0}@{1}; Job is in {2}; uow is in {3}' \ .format(job_record.process_name, job_record.timeperiod, job_record.state, uow.state) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg) timetable_tree = self.timetable.get_tree(job_record.process_name) timetable_tree.build_tree() def _process_terminal_state(self, job_record): """ method logs a warning message notifying that the job is no longer govern by this state machine """ msg = 'Job {0} for {1}@{2} is in the terminal state {3}, ' \ 'and is no further govern by the State Machine {4}' \ .format(job_record.db_id, job_record.process_name, job_record.timeperiod, job_record.state, self.name) self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg) def manage_job_with_blocking_children(self, job_record): """ method will trigger job processing only if: - all children are finished (STATE_PROCESSED, STATE_SKIPPED, STATE_NOOP) - all external dependencies are finalized (STATE_PROCESSED, STATE_SKIPPED, STATE_NOOP) """ is_job_finalizable = self.timetable.is_job_record_finalizable(job_record) if is_job_finalizable: self.manage_job(job_record) else: msg = 'Job {0}@{1} is blocked by unprocessed children/dependencies. Waiting another tick' \ .format(job_record.process_name, job_record.timeperiod) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg) def manage_job_with_blocking_dependencies(self, job_record): """ method will trigger job processing only if: - all dependencies are in [STATE_PROCESSED, STATE_NOOP] NOTICE: method will transfer current job into STATE_SKIPPED if any dependency is in STATE_SKIPPED """ composite_state = self.timetable.dependent_on_composite_state(job_record) assert isinstance(composite_state, NodesCompositeState) if composite_state.all_processed: self.manage_job(job_record) elif composite_state.skipped_present: # As soon as among <dependent on> periods are in STATE_SKIPPED # there is very little sense in waiting for them to become STATE_PROCESSED # Skip this timeperiod itself job_record.state = job.STATE_SKIPPED self.job_dao.update(job_record) self.mq_transmitter.publish_job_status(job_record) msg = 'Job {0}@{1} is blocked by STATE_SKIPPED dependencies. ' \ 'Transferred the job to STATE_SKIPPED'.format(job_record.process_name, job_record.timeperiod) self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg) else: msg = 'Job {0}@{1} is blocked by unprocessed dependencies. Waiting another tick' \ .format(job_record.process_name, job_record.timeperiod) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg) def manage_job(self, job_record): """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed In case the State Machine sees that the unit_of_work already exist it could either: - update boundaries of the processing; republish - wait another tick; idle """ assert isinstance(job_record, Job) try: if self._is_noop_timeperiod(job_record.process_name, job_record.timeperiod): self._process_noop_timeperiod(job_record) return if job_record.is_embryo: self._process_state_embryo(job_record) elif job_record.is_in_progress: self._process_state_in_progress(job_record) elif job_record.is_final_run: self._process_state_final_run(job_record) elif job_record.is_skipped: self._process_terminal_state(job_record) elif job_record.is_processed: self._process_terminal_state(job_record) elif job_record.is_noop: self._process_terminal_state(job_record) else: msg = 'Unknown state {0} of the job {1}'.format(job_record.state, job_record.db_id) self._log_message(ERROR, job_record.process_name, job_record.timeperiod, msg) except LookupError as e: job_record.number_of_failures += 1 self.job_dao.update(job_record) self.timetable.skip_if_needed(job_record) msg = 'Increasing fail counter for Job {0}@{1}, because of: {2}' \ .format(job_record.process_name, job_record.timeperiod, e) self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg) finally: self.mq_transmitter.publish_job_status(job_record) def reprocess_job(self, job_record): """ method marks given job for reprocessing: - if no UOW was bind with the job record - it is transferred to STATE_EMBRYO only - if related UOW was not finished - we mark the UOW as STATE_CANCELED and pass the job record thru the _process_in_progress routine - otherwise job record is passed thru the _process_in_progress routine """ original_job_state = job_record.state job_record.number_of_failures = 0 if not job_record.related_unit_of_work: job_record.state = job.STATE_EMBRYO self.job_dao.update(job_record) else: uow = self.uow_dao.get_one(job_record.related_unit_of_work) if not uow.is_finished: uow.state = unit_of_work.STATE_CANCELED uow.submitted_at = datetime.utcnow() self.uow_dao.update(uow) self._process_state_in_progress(job_record) msg = 'Reprocessed Job {0} for {1}@{2}: state transfer {3} -> {4};' \ .format(job_record.db_id, job_record.process_name, job_record.timeperiod, original_job_state, job_record.state) self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg) def skip_job(self, job_record): """ method transfers: - given job into STATE_SKIPPED if it is not not in finished state - UOW into STATE_CANCELED if it is not in finished state """ original_job_state = job_record.state if not job_record.is_finished: job_record.state = job.STATE_SKIPPED self.job_dao.update(job_record) if job_record.related_unit_of_work: uow = self.uow_dao.get_one(job_record.related_unit_of_work) if not uow.is_finished: uow.state = unit_of_work.STATE_CANCELED uow.submitted_at = datetime.utcnow() self.uow_dao.update(uow) msg = 'Skipped Job {0} for {1}@{2}: state transfer {3} -> {4};' \ .format(job_record.db_id, job_record.process_name, job_record.timeperiod, original_job_state, job_record.state) self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg) def create_job(self, process_name, timeperiod): """ method creates a job record in STATE_EMBRYO for given process_name and timeperiod :returns: created job record of type <Job>""" job_record = Job() job_record.state = job.STATE_EMBRYO job_record.timeperiod = timeperiod job_record.process_name = process_name self.job_dao.update(job_record) self.logger.info('Created Job {0} for {1}@{2}' .format(job_record.db_id, job_record.process_name, job_record.timeperiod)) return job_record def update_job(self, job_record, uow, new_state): """ method updates job record with a new unit_of_work and new state""" original_job_state = job_record.state job_record.state = new_state job_record.related_unit_of_work = uow.db_id self.job_dao.update(job_record) msg = 'Updated Job {0} for {1}@{2}: state transfer {3} -> {4};' \ .format(job_record.db_id, job_record.process_name, job_record.timeperiod, original_job_state, new_state) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg)
class AbstractStateMachine(object): """ Abstract state machine used to govern all processes and their states """ def __init__(self, logger, timetable, name): self.name = name self.logger = logger self.publishers = PublishersPool(self.logger) self.timetable = timetable self.uow_dao = UnitOfWorkDao(self.logger) self.job_dao = JobDao(self.logger) def __del__(self): try: self.logger.info('Closing Flopsy Publishers Pool...') self.publishers.close() except Exception as e: self.logger.error('Exception caught while closing Flopsy Publishers Pool: %s' % str(e)) def _log_message(self, level, process_name, timeperiod, msg): """ method performs logging into log file and Timetable's tree node""" self.timetable.add_log_entry(process_name, timeperiod, msg) self.logger.log(level, msg) @with_reconnect def _insert_uow(self, process_name, start_timeperiod, end_timeperiod, start_id, end_id): """creates unit_of_work and inserts it into the DB :raise DuplicateKeyError: if unit_of_work with given parameters already exists """ uow = UnitOfWork() uow.process_name = process_name uow.timeperiod = start_timeperiod uow.start_id = str(start_id) uow.end_id = str(end_id) uow.start_timeperiod = start_timeperiod uow.end_timeperiod = end_timeperiod uow.created_at = datetime.utcnow() uow.source = context.process_context[process_name].source uow.sink = context.process_context[process_name].sink uow.state = unit_of_work.STATE_REQUESTED uow.unit_of_work_type = TYPE_MANAGED uow.number_of_retries = 0 uow.arguments = context.process_context[process_name].arguments uow.db_id = self.uow_dao.insert(uow) msg = 'Created: UOW %s for %s in timeperiod [%s:%s).' \ % (uow.db_id, process_name, start_timeperiod, end_timeperiod) self._log_message(INFO, process_name, start_timeperiod, msg) return uow def _publish_uow(self, uow): mq_request = SynergyMqTransmission(process_name=uow.process_name, unit_of_work_id=uow.db_id) publisher = self.publishers.get(uow.process_name) publisher.publish(mq_request.document) publisher.release() msg = 'Published: UOW %r for %r in timeperiod %r.' % (uow.db_id, uow.process_name, uow.start_timeperiod) self._log_message(INFO, uow.process_name, uow.start_timeperiod, msg) def insert_and_publish_uow(self, process_name, start_timeperiod, end_timeperiod, start_id, end_id): """ method creates and publishes a unit_of_work. it also handles DuplicateKeyError and attempts recovery :return: tuple (uow, is_duplicate) :raise UserWarning: if the recovery from DuplicateKeyError was unsuccessful """ is_duplicate = False try: uow = self._insert_uow(process_name, start_timeperiod, end_timeperiod, start_id, end_id) except DuplicateKeyError as e: is_duplicate = True msg = 'Catching up with latest unit_of_work %s in timeperiod %s, because of: %r' \ % (process_name, start_timeperiod, e) self._log_message(WARNING, process_name, start_timeperiod, msg) uow = self.uow_dao.recover_from_duplicatekeyerror(e) if uow is None: msg = 'MANUAL INTERVENTION REQUIRED! Unable to locate unit_of_work for %s in %s' \ % (process_name, start_timeperiod) self._log_message(WARNING, process_name, start_timeperiod, msg) raise UserWarning(msg) # publish the created/caught up unit_of_work self._publish_uow(uow) return uow, is_duplicate def shallow_state_update(self, uow): """ method does not trigger any new actions if applicable, it will update job_record state and Timetable tree node state :assumptions: uow is either in STATE_CANCELED or STATE_PROCESSED """ pass def _process_state_embryo(self, job_record): """ method that takes care of processing job records in STATE_EMBRYO state""" pass def _process_state_in_progress(self, job_record): """ method that takes care of processing job records in STATE_IN_PROGRESS state""" pass def _process_state_final_run(self, job_record): """method takes care of processing job records in STATE_FINAL_RUN state""" pass def _process_state_skipped(self, job_record): """method takes care of processing job records in STATE_FINAL_SKIPPED state""" pass def _process_state_processed(self, job_record): """method takes care of processing job records in STATE_FINAL_SKIPPED state""" pass def manage_job_with_blocking_children(self, job_record, run_on_active_timeperiod): """ method will trigger job processing only if all children are in STATE_PROCESSED or STATE_SKIPPED and if all external dependencies are finalized (i.e. in STATE_PROCESSED or STATE_SKIPPED) """ is_job_finalizable = self.timetable.is_job_record_finalizable(job_record) composite_state = self.timetable.dependent_on_composite_state(job_record) if is_job_finalizable: self.manage_job(job_record) elif composite_state.all_healthy and run_on_active_timeperiod: self.manage_job(job_record) else: msg = '%s for timeperiod %r is blocked by unprocessed children/dependencies. Waiting another tick' \ % (job_record.process_name, job_record.timeperiod) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg) def manage_job_with_blocking_dependencies(self, job_record, run_on_active_timeperiod): """ method will trigger job processing only if _all_ dependencies are in STATE_PROCESSED method will transfer current job into STATE_SKIPPED if any dependency is in STATE_SKIPPED """ composite_state = self.timetable.dependent_on_composite_state(job_record) assert isinstance(composite_state, NodesCompositeState) if composite_state.all_processed: self.manage_job(job_record) elif composite_state.all_healthy and run_on_active_timeperiod: self.manage_job(job_record) elif composite_state.skipped_present: # As soon as among <dependent on> periods are in STATE_SKIPPED # there is very little sense in waiting for them to become STATE_PROCESSED # Skip this timeperiod itself job_record.state = job.STATE_SKIPPED self.job_dao.update(job_record) tree = self.timetable.get_tree(job_record.process_name) tree.update_node(job_record) msg = '%s for timeperiod %r is blocked by STATE_SKIPPED dependencies. ' \ 'Transferred the job to STATE_SKIPPED' % (job_record.process_name, job_record.timeperiod) self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg) else: msg = '%s for timeperiod %r is blocked by unprocessed dependencies. Waiting another tick' \ % (job_record.process_name, job_record.timeperiod) self._log_message(INFO, job_record.process_name, job_record.timeperiod, msg) def manage_job(self, job_record): """ method main duty - is to _avoid_ publishing another unit_of_work, if previous was not yet processed In case the Scheduler sees that the unit_of_work is pending it could either update boundaries of the processing or wait another tick """ assert isinstance(job_record, Job) try: if job_record.is_embryo: self._process_state_embryo(job_record) elif job_record.is_in_progress: self._process_state_in_progress(job_record) elif job_record.is_final_run: self._process_state_final_run(job_record) elif job_record.is_skipped: self._process_state_skipped(job_record) elif job_record.is_processed: self._process_state_processed(job_record) else: msg = 'Unknown state %s of the job %s' % (job_record.state, job_record.db_id) self._log_message(ERROR, job_record.process_name, job_record.timeperiod, msg) except LookupError as e: self.timetable.failed_on_processing_job_record(job_record.process_name, job_record.timeperiod) msg = 'Increasing fail counter for %s in timeperiod %s, because of: %r' \ % (job_record.process_name, job_record.timeperiod, e) self._log_message(WARNING, job_record.process_name, job_record.timeperiod, msg)
class BashRunnable(threading.Thread): """Process starts remote or local bash script job, supervises its execution and updates mq""" def __init__(self, logger, message, consumer, performance_ticker): self.logger = logger self.message = message self.mq_request = SynergyMqTransmission.from_json(message.body) self.consumer = consumer self.performance_ticker = performance_ticker self.alive = False self.return_code = -1 self.uow_dao = UnitOfWorkDao(self.logger) self.thread_name = str(self.mq_request) super(BashRunnable, self).__init__(name=self.thread_name) def _poll_process(self): return self.alive, self.return_code def _start_process(self): try: uow = self.uow_dao.get_one(self.mq_request.unit_of_work_id) if not uow.is_requested: # accept only UOW in STATE_REQUESTED self.logger.warn('Skipping unit_of_work: id %s; state %s;' % (str(self.message.body), uow.state), exc_info=False) self.consumer.acknowledge(self.message.delivery_tag) return except Exception: self.logger.error('Safety fuse. Can not identify unit_of_work %s' % str(self.message.body), exc_info=True) self.consumer.acknowledge(self.message.delivery_tag) return try: self.logger.info('start: %s {' % self.thread_name) self.alive = True uow.state = unit_of_work.STATE_IN_PROGRESS uow.started_at = datetime.utcnow() self.uow_dao.update(uow) fabric.operations.env.warn_only = True fabric.operations.env.abort_on_prompts = True fabric.operations.env.use_ssh_config = True fabric.operations.env.host_string = uow.arguments[ARGUMENT_CMD_HOST] command = os.path.join(uow.arguments[ARGUMENT_CMD_PATH], uow.arguments[ARGUMENT_CMD_FILE]) command += ' %s' % uow.arguments[ARGUMENT_CMD_ARGS] run_result = fabric.operations.run(command, pty=False) if run_result.succeeded: self.return_code = 0 uow.finished_at = datetime.utcnow() uow.state = unit_of_work.STATE_PROCESSED self.uow_dao.update(uow) self.logger.info('Completed %s with result = %r' % (self.thread_name, self.return_code)) except Exception: self.logger.error('Exception on starting: %s' % self.thread_name, exc_info=True) uow.state = unit_of_work.STATE_INVALID self.uow_dao.update(uow) finally: self.logger.info('}') self.alive = False def run(self): try: self._start_process() code = None alive = True while alive: alive, code = self._poll_process() time.sleep(0.1) if code == 0: self.performance_ticker.tracker.increment_success() else: self.performance_ticker.tracker.increment_failure() self.logger.info('BashDriver for %s return code is %r' % (self.thread_name, code)) except Exception as e: self.performance_ticker.tracker.increment_failure() self.logger.error('Safety fuse while processing request %r: %r' % (self.message.body, e), exc_info=True) finally: self.consumer.acknowledge(self.message.delivery_tag)
def __init__(self, process_name, perform_db_logging=False): super(AbstractUowAwareWorker, self).__init__(process_name) self.perform_db_logging = perform_db_logging self.uow_dao = UnitOfWorkDao(self.logger) self.mq_transmitter = MqTransmitter(self.logger)
class StatusBusListener(object): """ class instance listens to the QUEUE_UOW_REPORT queue and updates Timetable records correspondingly """ def __init__(self, scheduler): self.scheduler = scheduler self.timetable = scheduler.timetable self.logger = scheduler.logger self.uow_dao = UnitOfWorkDao(self.logger) self.consumer = Consumer(QUEUE_UOW_REPORT) self.main_thread = None def __del__(self): try: self.logger.info('StatusBusListener: Closing Flopsy Consumer...') self.consumer.close() except Exception as e: self.logger.error('StatusBusListener: Exception caught while closing Flopsy Consumer: %s' % str(e)) # ********************** thread-related methods **************************** def _mq_callback(self, message): """ method processes messages from Synergy Worker and updates corresponding Timetable record, as well as the job itself :param message: <SynergyMqTransmission> mq message """ try: self.logger.info('StatusBusListener {') mq_request = SynergyMqTransmission.from_json(message.body) uow = self.uow_dao.get_one(mq_request.unit_of_work_id) if uow.unit_of_work_type != TYPE_MANAGED: self.logger.info('Received transmission from TYPE_FREERUN execution. Ignoring it.') return tree = self.timetable.get_tree(uow.process_name) node = tree.get_node(uow.process_name, uow.timeperiod) if uow.db_id != node.job_record.related_unit_of_work: self.logger.info('Received transmission is likely outdated. Ignoring it.') return if not uow.is_finished: # rely on Garbage Collector to re-trigger the failing unit_of_work self.logger.info('Received unit_of_work status report from %s at %s in non-final state %s. Ignoring it.' % (uow.process_name, uow.timeperiod, uow.state)) return process_entry = self.scheduler.managed_handlers[uow.process_name].process_entry state_machine = self.scheduler.state_machines[process_entry.state_machine_name] assert isinstance(state_machine, AbstractStateMachine) self.logger.info('Commencing shallow state update for unit_of_work from %s at %s in %s.' % (uow.process_name, uow.timeperiod, uow.state)) state_machine.shallow_state_update(uow) except KeyError: self.logger.error('Access error for %s' % str(message.body), exc_info=True) except LookupError: self.logger.error('Can not perform shallow state update for %s' % str(message.body), exc_info=True) except Exception: self.logger.error('Unexpected error during shallow state update for %s' % str(message.body), exc_info=True) finally: self.consumer.acknowledge(message.delivery_tag) self.logger.info('StatusBusListener }') def _run_mq_listener(self): try: self.consumer.register(self._mq_callback) self.logger.info('StatusBusListener: instantiated and activated.') self.consumer.wait() except (AMQPError, IOError) as e: self.logger.error('StatusBusListener: AMQPError %s' % str(e)) finally: self.__del__() self.logger.info('StatusBusListener: Shutting down... All auxiliary threads stopped.') def start(self, *_): self.main_thread = Thread(target=self._run_mq_listener) self.main_thread.daemon = True self.main_thread.start()