def _start_process(self, start_timeperiod, end_timeperiod, arguments):
    """Spawns the Sqoop import shell script for the [start_timeperiod, end_timeperiod) slice
    and stores its process handle in self.cli_process."""
    try:
        start_dt = time_helper.synergy_to_datetime(QUALIFIER_HOURLY, start_timeperiod)
        sqoop_slice_starttime = start_dt.strftime(SqoopDriver.SQOOP_DATE_FORMAT)

        end_dt = time_helper.synergy_to_datetime(QUALIFIER_HOURLY, end_timeperiod)
        sqoop_slice_endtime = end_dt.strftime(SqoopDriver.SQOOP_DATE_FORMAT)

        sink_path = ProcessContext.get_sink(self.process_name)

        self.logger.info('start: %s {' % self.process_name)
        p = psutil.Popen([settings['bash_shell'],
                          settings['sqoop_command'],
                          str(sqoop_slice_starttime),
                          str(sqoop_slice_endtime),
                          sink_path + '/' + start_timeperiod],
                         close_fds=True,
                         cwd=settings['process_cwd'],
                         stdin=PIPE,
                         stdout=PIPE,
                         stderr=PIPE)
        self.cli_process = p
        self.logger.info('Started %s with pid = %r' % (self.process_name, p.pid))
    except Exception:
        self.logger.error('Exception on starting: %s' % self.process_name, exc_info=True)
    finally:
        self.logger.info('}')
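# Hedged sketch (not part of the original driver): one way the psutil.Popen handle
# stored in self.cli_process above could be polled for completion. The function name
# and its parameters are illustrative assumptions; psutil.Popen itself exposes the
# standard subprocess.Popen API (poll, communicate, pid, returncode).
def poll_sqoop_process(cli_process, logger):
    """Returns True if the Sqoop slice process has terminated, False if it is still running."""
    return_code = cli_process.poll()
    if return_code is None:
        logger.info('Sqoop process pid=%r is still running' % cli_process.pid)
        return False

    stdout, stderr = cli_process.communicate()
    if return_code == 0:
        logger.info('Sqoop process finished successfully: %s' % stdout)
    else:
        logger.error('Sqoop process failed with return code %r: %s' % (return_code, stderr))
    return True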
def _skip_the_node(self, node):
    """Method is used during _get_next_node calculations. Returns True in case the node shall be _skipped_."""
    # case 1: node processing is complete
    if node.time_record.get_state() == TimeTableEntry.STATE_SKIPPED \
            or node.time_record.get_state() == TimeTableEntry.STATE_PROCESSED:
        return True

    # case 2: this is a daily leaf node. retry this time_period while it is within LIFE_SUPPORT_HOURS
    if node.process_name == self.process_daily:
        if len(node.children) == 0:
            # no children - this is a leaf
            creation_time = time_helper.synergy_to_datetime(node.process_name, node.timestamp)
            if datetime.utcnow() - creation_time < timedelta(hours=LIFE_SUPPORT_HOURS):
                return False
            else:
                return node.time_record.get_number_of_failures() > MAX_NUMBER_OF_RETRIES

    # case 3: here we process process_daily, process_monthly and process_yearly nodes that have children
    # iterate through the children and check if all of them are in STATE_SKIPPED (i.e. no data for the parent to process)
    # if any is still in processing (i.e. has produced some data) - then we cannot skip the parent of that child node
    # case 3': consider the parent as worth processing (i.e. do not skip) if a child's time_record is None
    all_children_spoiled = True
    for key in node.children.keys():
        child = node.children[key]
        if child.time_record is None or \
                (child.time_record.get_number_of_failures() <= MAX_NUMBER_OF_RETRIES
                 and child.time_record.get_state() != TimeTableEntry.STATE_SKIPPED):
            all_children_spoiled = False
            break
    return all_children_spoiled
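# Hedged sketch (illustrative only, not part of the original tree module): "case 3" above
# reduces to "skip the parent only if every child is spoiled", where a child is spoiled once
# it has exceeded MAX_NUMBER_OF_RETRIES or was marked STATE_SKIPPED, and a child with no
# time_record yet is treated as still worth processing. The helper below restates that rule
# over plain (number_of_failures, state) tuples; all names here are assumptions.
def all_children_spoiled(children_status, max_retries, state_skipped):
    """children_status: iterable of (number_of_failures, state) tuples,
    or None entries for children whose time_record is not yet created."""
    for status in children_status:
        if status is None:
            # case 3': child without a time_record - parent is still worth processing
            return False
        failures, state = status
        if failures <= max_retries and state != state_skipped:
            # this child may still produce data - do not skip the parent
            return False
    return True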
def test_catching_up_time_build_tree(self):
    delta = 5 * 24
    new_synergy_start_time = wind_the_time(QUALIFIER_HOURLY, self.initial_synergy_start_time, -delta)
    settings['synergy_start_timeperiod'] = new_synergy_start_time

    self.tree.build_tree()
    self._perform_assertions(new_synergy_start_time, delta / 24)

    new_actual_timeperiod = wind_the_time(QUALIFIER_HOURLY, self.initial_synergy_start_time, delta)
    time_helper.actual_timeperiod = \
        wind_actual_timeperiod(time_helper.synergy_to_datetime(QUALIFIER_HOURLY, new_actual_timeperiod))
    self.tree.build_tree()
    self._perform_assertions(new_synergy_start_time, 2 * delta / 24)
def fire_managed_worker(self, *args):
    """requests vertical aggregator (hourly site, daily variant, etc) to start up"""
    try:
        process_name = args[0]
        scheduler_entry_obj = args[1]
        self.logger.info('%s {' % process_name)

        timetable_record = self.timetable.get_next_job_record(process_name)
        pipeline = self.pipelines[scheduler_entry_obj.state_machine_name]

        run_on_active_timeperiod = ProcessContext.run_on_active_timeperiod(scheduler_entry_obj.process_name)
        if not run_on_active_timeperiod:
            time_qualifier = ProcessContext.get_time_qualifier(process_name)
            incremented_timeperiod = time_helper.increment_timeperiod(time_qualifier, timetable_record.timeperiod)
            dt_record_timestamp = time_helper.synergy_to_datetime(time_qualifier, incremented_timeperiod)
            dt_record_timestamp += timedelta(minutes=LAG_5_MINUTES)

            if datetime.utcnow() <= dt_record_timestamp:
                self.logger.info('Timetable record %s for timeperiod %s will not be triggered until %s.'
                                 % (timetable_record.document['_id'],
                                    timetable_record.timeperiod,
                                    dt_record_timestamp.strftime('%Y-%m-%d %H:%M:%S')))
                return

        process_type = ProcessContext.get_process_type(scheduler_entry_obj.process_name)
        if process_type == TYPE_BLOCKING_DEPENDENCIES:
            pipeline.manage_pipeline_with_blocking_dependencies(process_name, timetable_record)
        elif process_type == TYPE_BLOCKING_CHILDREN:
            pipeline.manage_pipeline_with_blocking_children(process_name, timetable_record)
        elif process_type == TYPE_MANAGED:
            pipeline.manage_pipeline_for_process(process_name, timetable_record)

    except (AMQPError, IOError) as e:
        self.logger.error('AMQPError: %s' % str(e), exc_info=True)
        self.publishers.reset_all(suppress_logging=True)
    except Exception as e:
        self.logger.error('Exception: %s' % str(e), exc_info=True)
    finally:
        self.logger.info('}')
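# Hedged sketch (not from the original scheduler): the gating rule above, restated in
# isolation. A record for timeperiod T is only triggered once the timeperiod *after* T
# has started plus a small lag, so the data for T can be considered complete. The
# function names and the default 5-minute lag mirror the code above but are assumptions.
from datetime import datetime, timedelta

def earliest_trigger_moment(incremented_timeperiod_dt, lag_minutes=5):
    """incremented_timeperiod_dt: datetime of the timeperiod following the record's own."""
    return incremented_timeperiod_dt + timedelta(minutes=lag_minutes)

def may_trigger(incremented_timeperiod_dt, utc_now=None, lag_minutes=5):
    """Mirrors the 'datetime.utcnow() <= dt_record_timestamp' guard above."""
    if utc_now is None:
        utc_now = datetime.utcnow()
    return utc_now > earliest_trigger_moment(incremented_timeperiod_dt, lag_minutes)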
                  QUALIFIER_YEARLY]
    expected = [datetime(year=2010, month=12, day=31, hour=23, minute=12, second=34),
                datetime(year=2010, month=12, day=31, hour=23, minute=0, second=0),
                datetime(year=2010, month=12, day=31, hour=0, minute=0, second=0),
                datetime(year=2010, month=12, day=1, hour=0, minute=0, second=0),
                datetime(year=2010, month=1, day=1, hour=0, minute=0, second=0)]
    params = ['20101231231234', '2010123123', '2010123100', '2010120000', '2010000000']
    for i in range(5):
        assert time_helper.synergy_to_datetime(qualifiers[i], params[i]) == expected[i]

def test_increment_time(self):
    stamps = ['2011010100', '2011010112', '2011010123']
    expected = ['2011010101', '2011010113', '2011010200']
    for i in range(3):
        assert time_helper.increment_timeperiod(QUALIFIER_HOURLY, stamps[i]) == expected[i]

    stamps = ['2011010100', '2011013100', '2010123100']
    expected = ['2011010200', '2011020100', '2011010100']
    for i in range(3):
        assert time_helper.increment_timeperiod(QUALIFIER_DAILY, stamps[i]) == expected[i]

    stamps = ['2011010000', '2011120000', '2011100000']
    expected = ['2011020000', '2012010000', '2011110000']
    for i in range(3):
                 process_context.PROCESS_SITE_YEARLY]
    expected = [datetime(year=2010, month=12, day=31, hour=23, minute=12, second=34),
                datetime(year=2010, month=12, day=31, hour=23, minute=0, second=0),
                datetime(year=2010, month=12, day=31, hour=0, minute=0, second=0),
                datetime(year=2010, month=12, day=1, hour=0, minute=0, second=0),
                datetime(year=2010, month=1, day=1, hour=0, minute=0, second=0)]
    params = ['20101231231234', '2010123123', '2010123100', '2010120000', '2010000000']
    for i in range(5):
        assert time_helper.synergy_to_datetime(processes[i], params[i]) == expected[i]

def test_increment_time(self):
    stamps = ['2011010100', '2011010112', '2011010123']
    expected = ['2011010101', '2011010113', '2011010200']
    for i in range(3):
        assert time_helper.increment_time(process_context.PROCESS_SITE_HOURLY, stamps[i]) == expected[i]

    stamps = ['2011010100', '2011013100', '2010123100']
    expected = ['2011010200', '2011020100', '2011010100']
    for i in range(3):
        assert time_helper.increment_time(process_context.PROCESS_SITE_DAILY, stamps[i]) == expected[i]

    stamps = ['2011010000', '2011120000', '2011100000']
    expected = ['2011020000', '2012010000', '2011110000']