def filter_expired_tmp_jobs(client, job_names):
    """
    Pick out the temporary jobs that are old enough to be removed.

    For every name in ``job_names`` the temporary jobs of its service/instance
    are looked up. The name is reported when such a job:
      - has run at least once (success and failure both count), and
      - last ran more than 24 hours ago.

    NOTE(review): the name is appended once per qualifying temporary job, so
    the same name can show up more than once in the result.

    :param client: Chronos client, passed through to ``chronos_tools``
    :param job_names: iterable of temporary job ids
    :returns: list of job names considered expired
    """
    max_age = datetime.timedelta(days=1)
    stale_names = []
    for name in job_names:
        service, instance = chronos_tools.decompose_job_id(name)
        candidates = chronos_tools.get_temporary_jobs_for_service_instance(
            client=client,
            service=service,
            instance=instance,
        )
        for candidate in candidates:
            finished_at, run_state = chronos_tools.get_status_last_run(candidate)
            if run_state == chronos_tools.LastRunState.NotRun:
                # Never ran, so there is no completion time to compare against.
                continue
            utc_now = datetime.datetime.now(dateutil.tz.tzutc())
            if utc_now - dateutil.parser.parse(finished_at) > max_age:
                stale_names.append(name)
    return stale_names
def last_run_state_for_jobs(jobs):
    """
    Pair every job with the state of its last run.

    ``chronos_tools.get_status_last_run`` hands back ``(time, state)``; only
    the trailing element (the state) is kept here.

    :param jobs: iterable of chronos jobs
    :returns: list of ``(job, LastRunState)`` tuples, in input order
    """
    paired = []
    for job in jobs:
        status = chronos_tools.get_status_last_run(job)
        paired.append((job, status[-1]))
    return paired
def _format_last_result(job):
    """
    Build the human-readable ``(status, time)`` pair for a job's last run.

    The time part is the literal "never" when the job has not run yet;
    otherwise it is the prettified last-run time.
    """
    last_time, last_state = chronos_tools.get_status_last_run(job)
    never_ran = last_state is chronos_tools.LastRunState.NotRun
    when = "never" if never_ran else _prettify_time(last_time)
    return _prettify_status(last_state), when
def filter_expired_tmp_jobs(client, job_names):
    """
    Select the temporary jobs that are ready to be removed.

    A job name qualifies when a temporary job of its service/instance
      - has completed (whether it succeeded or failed), and
      - completed more than 24 hours ago.

    NOTE(review): one entry is appended per qualifying temporary job, so a
    name may be listed several times.

    :param client: Chronos client, passed through to ``chronos_tools``
    :param job_names: iterable of temporary job ids
    :returns: list of job names considered expired
    """

    def _older_than_a_day(iso_time):
        # Age is measured against the current time in UTC.
        age = datetime.datetime.now(dateutil.tz.tzutc()) - dateutil.parser.parse(iso_time)
        return age > datetime.timedelta(days=1)

    expired = []
    for job_name in job_names:
        service, instance = chronos_tools.decompose_job_id(job_name)
        for tmp_job in chronos_tools.get_temporary_jobs_for_service_instance(
            client=client,
            service=service,
            instance=instance,
        ):
            ran_at, state = chronos_tools.get_status_last_run(tmp_job)
            has_run = state != chronos_tools.LastRunState.NotRun
            # Short-circuit keeps the timestamp from being parsed for jobs
            # that never ran.
            if has_run and _older_than_a_day(ran_at):
                expired.append(job_name)
    return expired
def filter_expired_tmp_jobs(client, job_names, cluster, soa_dir):
    """
    Given a list of temporary jobs, find those ready to be removed.

    Their suitability for removal is defined by two things:
      - the job has completed (irrespective of whether it was a success or
        failure)
      - the job completed longer ago than the retention window, which is the
        larger of 24 hours and the job's configured schedule interval

    :param client: Chronos client, passed through to ``chronos_tools``
    :param job_names: iterable of temporary job ids
    :param cluster: cluster name used to load the job's configuration
    :param soa_dir: configuration directory used to load the job's configuration
    :returns: list of job names considered expired; a name is appended once
        per qualifying temporary job, so it may appear more than once
    """
    one_day = datetime.timedelta(days=1)
    expired = []
    for job_name in job_names:
        service, instance = chronos_tools.decompose_job_id(job_name)
        temporary_jobs = chronos_tools.get_temporary_jobs_for_service_instance(
            client=client, service=service, instance=instance
        )
        # The retention window depends only on (service, instance, cluster,
        # soa_dir), not on the individual temporary job, so it is resolved at
        # most once per job_name instead of once per temporary job.
        retention = None
        for job in temporary_jobs:
            last_run_time, last_run_state = chronos_tools.get_status_last_run(job)
            if retention is None:
                try:
                    chronos_job_config = chronos_tools.load_chronos_job_config(
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        soa_dir=soa_dir,
                    )
                    interval = chronos_job_config.get_schedule_interval_in_seconds() or 0
                except NoConfigurationForServiceError:
                    # If we can't get the job's config, default to cleanup
                    # after 1 day.
                    interval = 0
                retention = max(datetime.timedelta(seconds=interval), one_day)
            if last_run_state == chronos_tools.LastRunState.NotRun:
                # Never ran: there is no completion time to measure from.
                continue
            age = datetime.datetime.now(dateutil.tz.tzutc()) - dateutil.parser.parse(last_run_time)
            if age > retention:
                expired.append(job_name)
    return expired
def chronos_instance_status(instance_status, service, instance, verbose):
    """
    Assemble a status dictionary for one Chronos service instance.

    The result carries the desired state, the schedule (or parents) section
    matching the job type, the schedule type, the scheduling/Chronos status,
    the command and the outcome of the last run.

    :param instance_status: not referenced by this function
    :param service: service name
    :param instance: instance name
    :param verbose: when truthy, also report whether mesos tasks are running
    :returns: the populated status dict
    """
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
    )

    # Keys are inserted in a fixed order so the resulting dict keeps a stable
    # layout.
    cstatus = {'desired_state': job_config.get_desired_state()}

    job_type = chronos_tools.get_job_type(job_config.config_dict)
    if job_type == chronos_tools.JobType.Scheduled:
        schedule_type = 'schedule'
        schedule = job_config.get_schedule()
        epsilon = job_config.get_epsilon()
        time_zone = job_config.get_schedule_time_zone()
        if time_zone is None or time_zone == 'null':
            # A missing time zone is reported as UTC.
            time_zone = 'UTC'
        cstatus['schedule'] = {
            'schedule': schedule,
            'epsilon': epsilon,
            'time_zone': time_zone,
        }
    elif job_type == chronos_tools.JobType.Dependent:
        schedule_type = 'parents'
        cstatus['parents'] = job_config.get_parents()
    else:
        schedule_type = 'unknown'
    cstatus['schedule_type'] = schedule_type

    state_info = {}
    cstatus['status'] = state_info
    if verbose:
        running_tasks = select_tasks_by_id(
            a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
            job_config.get_job_name(),
        )
        running_task_count = len(running_tasks)
        state_info['mesos_state'] = 'running' if running_task_count else 'not_running'
    if job_config.get_disabled():
        state_info['disabled_state'] = 'not_scheduled'
    else:
        state_info['disabled_state'] = 'scheduled'
    state_info['chronos_state'] = chronos_tools.get_chronos_status_for_job(client, service, instance)

    cstatus['command'] = job_config.get_cmd()

    last_time, last_status = chronos_tools.get_status_last_run(job_config.config_dict)
    # Ordered scan with == so the first matching state wins, as in an
    # if/elif chain; anything unrecognised becomes the empty string.
    state_labels = (
        (chronos_tools.LastRunState.Success, 'success'),
        (chronos_tools.LastRunState.Fail, 'fail'),
        (chronos_tools.LastRunState.NotRun, 'not_run'),
    )
    result = ''
    for known_state, label in state_labels:
        if last_status == known_state:
            result = label
            break
    if result in ('not_run', ''):
        last_time = 'never'
    cstatus['last_status'] = {'result': result, 'time': last_time}
    return cstatus
def sensu_message_status_for_jobs(chronos_job_config, chronos_job, client):
    """
    Work out the Sensu message and status for a single Chronos job.

    :param chronos_job_config: an instance of ChronosJobConfig
    :param chronos_job: the job as known to Chronos; falsy when Chronos has no
        such job
    :param client: configured Chronos client, forwarded to ``job_is_stuck``
    :returns: tuple of ``(output, sensu_status)``
    """
    # Case 1: Chronos does not know the job at all.
    if not chronos_job:
        if chronos_job_config.get_disabled():
            message = "Job {}{}{} is disabled - ignoring status.".format(
                chronos_job_config.service,
                utils.SPACER,
                chronos_job_config.instance,
            )
            return message, pysensu_yelp.Status.OK
        message = (
            "Warning: %s%s%s isn't in chronos at all, "
            "which means it may not be deployed yet" % (
                chronos_job_config.service,
                utils.SPACER,
                chronos_job_config.instance,
            )
        )
        return message, pysensu_yelp.Status.WARNING

    # Case 2: the job is disabled in Chronos. Temporary jobs do not take this
    # shortcut and are checked like enabled jobs.
    if chronos_job.get("disabled") and not chronos_tools.is_temporary_job(chronos_job):
        message = "Job {}{}{} is disabled - ignoring status.".format(
            chronos_job_config.service,
            utils.SPACER,
            chronos_job_config.instance,
        )
        return message, pysensu_yelp.Status.OK

    # Case 3: judge the job by its last run.
    last_run_time, state = chronos_tools.get_status_last_run(chronos_job)
    interval_in_seconds = chronos_job_config.get_schedule_interval_in_seconds()
    if job_is_stuck(last_run_time, interval_in_seconds, client, chronos_job["name"]):
        message = message_for_stuck_job(
            service=chronos_job_config.service,
            instance=chronos_job_config.instance,
            cluster=chronos_job_config.cluster,
            last_run_iso_time=last_run_time,
            interval_in_seconds=interval_in_seconds,
            schedule=chronos_job_config.get_schedule(),
            schedule_timezone=chronos_job_config.get_schedule_time_zone(),
        )
        return message, pysensu_yelp.Status.CRITICAL

    sensu_status = sensu_event_for_last_run_state(state)
    return message_for_status(sensu_status, chronos_job_config), sensu_status
def sensu_message_status_for_jobs(chronos_job_config, service, instance, cluster, chronos_job):
    """
    Work out the Sensu message and status for a single Chronos job.

    :param chronos_job_config: job configuration object; queried for its
        disabled flag, schedule, schedule time zone and schedule interval
    :param service: service name used in the messages
    :param instance: instance name used in the messages
    :param cluster: cluster name used in the messages
    :param chronos_job: the job as known to Chronos; falsy when Chronos has no
        such job
    :returns: tuple of ``(output, sensu_status)``
    """
    # Case 1: Chronos does not know the job at all.
    if not chronos_job:
        if chronos_job_config.get_disabled():
            message = "Job %s%s%s is disabled - ignoring status." % (
                service, utils.SPACER, instance,
            )
            return message, pysensu_yelp.Status.OK
        message = (
            "Warning: %s%s%s isn't in chronos at all, "
            "which means it may not be deployed yet" % (service, utils.SPACER, instance)
        )
        return message, pysensu_yelp.Status.WARNING

    # Case 2: the job is disabled in Chronos. Temporary jobs do not take this
    # shortcut and are checked like enabled jobs.
    if chronos_job.get('disabled') and not chronos_tools.is_temporary_job(chronos_job):
        message = "Job %s%s%s is disabled - ignoring status." % (
            service, utils.SPACER, instance,
        )
        return message, pysensu_yelp.Status.OK

    # Case 3: judge the job by its last run.
    last_run_time, state = chronos_tools.get_status_last_run(chronos_job)
    interval_in_seconds = chronos_job_config.get_schedule_interval_in_seconds()
    if job_is_stuck(last_run_time, interval_in_seconds):
        message = message_for_stuck_job(
            service=service,
            instance=instance,
            cluster=cluster,
            last_run_iso_time=last_run_time,
            interval_in_seconds=interval_in_seconds,
            schedule=chronos_job_config.get_schedule(),
            schedule_timezone=chronos_job_config.get_schedule_time_zone(),
        )
        return message, pysensu_yelp.Status.CRITICAL

    sensu_status = sensu_event_for_last_run_state(state)
    return message_for_status(sensu_status, service, instance, cluster), sensu_status