def test_old_and_new_ways_load_the_same_chronos_configs(
    mock_chronos_tools_read_extra_service_information,
    mock_read_extra_service_information,
    mock_chronos_tools_load_deployments_json,
    mock_load_deployments_json,
):
    """``instance_configs`` must yield the same chronos job configs as
    calling ``load_chronos_job_config`` directly for each instance."""
    # Feed the same fixtures to both pairs of patched readers.
    mock_read_extra_service_information.return_value = chronos_cluster_config()
    mock_chronos_tools_read_extra_service_information.return_value = chronos_cluster_config()
    mock_load_deployments_json.return_value = deployment_json()
    mock_chronos_tools_load_deployments_json.return_value = deployment_json()

    service_under_test = create_test_service()

    # Configs loaded one by one, in the order the instances are expected.
    expected = [
        load_chronos_job_config(
            service=TEST_SERVICE_NAME,
            instance=instance_name,
            cluster=TEST_CLUSTER_NAME,
            load_deployments=True,
            soa_dir=TEST_SOA_DIR,
        )
        for instance_name in ("example_chronos_job", "example_child_job")
    ]

    actual = list(service_under_test.instance_configs(TEST_CLUSTER_NAME, ChronosJobConfig))
    assert actual == expected
def get_instance_config_for_service(soa_dir, service):
    """Yield a config object for every marathon and chronos instance of a service.

    For each cluster the marathon instances are yielded first, then the
    chronos instances.

    :param soa_dir: The service configuration directory to read from
    :param service: The service name to load instance configs for
    :returns: A generator of the objects returned by
        ``load_marathon_service_config`` / ``load_chronos_job_config``
    """
    for cluster in list_clusters(service=service, soa_dir=soa_dir):
        # soa_dir is passed to the instance listing too, so that the instances
        # are discovered in the same directory their configs are loaded from.
        for _, instance in get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        ):
            yield load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        for _, instance in get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='chronos',
            soa_dir=soa_dir,
        ):
            yield load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
def validate_chronos(service_path):
    """Validate every chronos instance of the service found at service_path.

    Prints one line per instance (valid or invalid, with the failure messages).

    :param service_path: Path that path_to_soa_dir_service splits into soa_dir and service
    :returns: 0 when every instance validates, 1 when at least one does not
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    exit_status = 0

    for cluster in list_clusters(service, soa_dir, instance_type):
        instances = list_all_instances_for_service(
            service=service,
            clusters=[cluster],
            instance_type=instance_type,
            soa_dir=soa_dir,
        )
        for instance in instances:
            job_config = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            passed, messages = job_config.validate()
            # Collapse repeated messages so each is reported once.
            deduped_messages = list(set(messages))

            if passed:
                print(valid_chronos_instance(cluster, instance))
                continue

            print(invalid_chronos_instance(cluster, instance, "\n ".join(deduped_messages)))
            exit_status = 1

    return exit_status
def create_chronos_job_config_object_from_configs(context, instance, service):
    """Load the chronos job config for service/instance and store it on the context.

    The cluster and soa_dir are taken from the context; the loaded object is
    saved as ``context.chronos_job_config_obj``.
    """
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=context.cluster,
        soa_dir=context.soa_dir,
    )
    context.chronos_job_config_obj = job_config
def send_event(service, instance, soa_dir, status, output):
    """Emit a sensu event (through monitoring_tools.send_event) for a chronos job setup.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    overrides = job_config.get_monitoring()

    # Sensu needs to know how often this check fires; that is the frequency
    # of our cron job, which is 10s.
    overrides['check_every'] = '10s'
    # Most deploy_chronos_jobs failures are transient and will probably be
    # fixed eventually, so alert_after is set to suppress extra noise.
    overrides['alert_after'] = '10m'

    monitoring_tools.send_event(
        service=service,
        check_name='setup_chronos_job.%s' % compose_job_id(service, instance),
        overrides=overrides,
        status=status,
        output=output,
        soa_dir=soa_dir,
    )
def filter_expired_tmp_jobs(client, job_names, cluster, soa_dir):
    """Given a list of temporary jobs, find those ready to be removed.

    A temporary job counts as expired when both hold:
      - it has run at least once (irrespective of success or failure)
      - its last run is older than the larger of one day and the schedule
        interval of the configured job it belongs to

    :param client: Chronos client passed to get_temporary_jobs_for_service_instance
    :param job_names: Job ids, each decomposable into (service, instance)
    :param cluster: Cluster whose job configuration is consulted
    :param soa_dir: The service configuration directory to read from
    :returns: A list of the job names that have expired temporary jobs
    """
    # NOTE(review): job_name is appended once per expired temporary job, so a
    # name can appear more than once - confirm callers tolerate duplicates.
    one_day = datetime.timedelta(days=1)
    expired = []
    for job_name in job_names:
        service, instance = chronos_tools.decompose_job_id(job_name)
        temporary_jobs = chronos_tools.get_temporary_jobs_for_service_instance(
            client=client,
            service=service,
            instance=instance,
        )
        # The schedule interval depends only on (service, instance), so it is
        # resolved at most once per job_name instead of once per temporary job.
        interval = None
        for job in temporary_jobs:
            last_run_time, last_run_state = chronos_tools.get_status_last_run(job)
            if interval is None:
                try:
                    chronos_job_config = chronos_tools.load_chronos_job_config(
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        soa_dir=soa_dir,
                    )
                    interval = chronos_job_config.get_schedule_interval_in_seconds() or 0
                except NoConfigurationForServiceError:
                    # If we can't get the job's config, default to cleanup after 1 day
                    interval = 0
            if last_run_state != chronos_tools.LastRunState.NotRun:
                age = (
                    datetime.datetime.now(dateutil.tz.tzutc()) -
                    dateutil.parser.parse(last_run_time)
                )
                if age > max(datetime.timedelta(seconds=interval), one_day):
                    expired.append(job_name)
    return expired
def main(args):
    """Send a sensu event for every configured chronos job in this cluster.

    :param args: Parsed arguments; only ``args.soa_dir`` is read here
    """
    soa_dir = args.soa_dir
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    cluster = utils.load_system_paasta_config().get_cluster()

    # get those jobs listed in configs
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)
    mapping = build_service_job_mapping(client, configured_jobs)

    for key, job_state_pairs in mapping.items():
        service = key[0]
        instance = key[1]
        job_config = load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        sensu_output, sensu_status = sensu_message_status_for_jobs(
            job_config, service, instance, job_state_pairs,
        )
        overrides = compose_monitoring_overrides_for_service(
            chronos_job_config=job_config,
            soa_dir=soa_dir,
        )
        send_event(
            service=service,
            instance=instance,
            monitoring_overrides=overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=soa_dir,
        )
def main():
    """Check each configured chronos job and send a sensu event when a status is produced.

    Jobs without deployments are skipped with a log message. If the Chronos
    API raises ChronosAPIError the error is logged and the process exits
    with status 2.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    cluster = utils.load_system_paasta_config().get_cluster()
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    try:
        mapping = build_service_job_mapping(client, configured_jobs)
        for key, chronos_job in mapping.items():
            service = key[0]
            instance = key[1]
            try:
                job_config = load_chronos_job_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                log.info("Skipping %s because no deployments are available" % service)
                continue

            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=job_config,
                chronos_job=chronos_job,
                client=client,
            )
            # A status of None means there is nothing to report for this job.
            if sensu_status is None:
                continue
            send_event(job_config, sensu_status, sensu_output)
    except chronos.ChronosAPIError as e:
        log.error("CRITICAL: Unable to contact Chronos! Error: %s" % e)
        sys.exit(2)
def chronos_instance_status(instance_status, service, instance, verbose):
    """Build a status dictionary for one chronos job instance.

    :param instance_status: Not used by this function
    :param service: The service name
    :param instance: The instance (job) name
    :param verbose: When truthy, also report whether a matching task is running
    :returns: A dict with the keys desired_state, schedule or parents (by job
        type), schedule_type, status, command and last_status
    """
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
    )

    cstatus = {'desired_state': job_config.get_desired_state()}

    job_type = chronos_tools.get_job_type(job_config.config_dict)
    if job_type == chronos_tools.JobType.Scheduled:
        schedule_type = 'schedule'
        schedule = job_config.get_schedule()
        epsilon = job_config.get_epsilon()
        time_zone = job_config.get_schedule_time_zone()
        # A missing time zone (None or the literal 'null') is reported as UTC.
        if time_zone is None or time_zone == 'null':
            time_zone = 'UTC'
        cstatus['schedule'] = {
            'schedule': schedule,
            'epsilon': epsilon,
            'time_zone': time_zone,
        }
    elif job_type == chronos_tools.JobType.Dependent:
        schedule_type = 'parents'
        cstatus['parents'] = job_config.get_parents()
    else:
        schedule_type = 'unknown'
    cstatus['schedule_type'] = schedule_type

    status = {}
    cstatus['status'] = status
    if verbose:
        matching_tasks = select_tasks_by_id(
            a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
            job_config.get_job_name(),
        )
        if len(matching_tasks):
            status['mesos_state'] = 'running'
        else:
            status['mesos_state'] = 'not_running'
    if job_config.get_disabled():
        status['disabled_state'] = 'not_scheduled'
    else:
        status['disabled_state'] = 'scheduled'
    status['chronos_state'] = chronos_tools.get_chronos_status_for_job(client, service, instance)

    cstatus['command'] = job_config.get_cmd()

    last_time, last_state = chronos_tools.get_status_last_run(job_config.config_dict)
    # Translate the LastRunState value into its string label; anything
    # unrecognised becomes the empty string.
    state_labels = (
        (chronos_tools.LastRunState.Success, 'success'),
        (chronos_tools.LastRunState.Fail, 'fail'),
        (chronos_tools.LastRunState.NotRun, 'not_run'),
    )
    result = ''
    for state, label in state_labels:
        if last_state == state:
            result = label
            break
    if result in ('not_run', ''):
        last_time = 'never'

    cstatus['last_status'] = {
        'result': result,
        'time': last_time,
    }
    return cstatus
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    On top of each job's own ``validate()`` result, every well-formed parent
    reference is checked: a job may not depend on itself and the parent must
    be one of the chronos jobs known for the cluster. One line is printed per
    instance.

    :param service_path: Path that path_to_soa_dir_service splits into soa_dir and service
    :returns: True when every instance is valid, False otherwise (including
        when the service name starts with TMP_JOB_IDENTIFIER)
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER

    returncode = True

    if service.startswith(TMP_JOB_IDENTIFIER):
        paasta_print((
            "Services using scheduled tasks cannot be named %s, as it clashes with the "
            "identifier used for temporary jobs" % TMP_JOB_IDENTIFIER
        ))
        return False

    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(
            cluster=cluster,
            instance_type=instance_type,
            soa_dir=soa_dir,
        )
        valid_services = {
            f"{name}{chronos_spacer}{instance}"
            for name, instance in services_in_cluster
        }
        for instance in list_all_instances_for_service(
            service=service,
            clusters=[cluster],
            instance_type=instance_type,
            soa_dir=soa_dir,
        ):
            cjc = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()

            for parent in parents:
                # Parents that fail the format check are not examined further here.
                if not check_parent_format(parent):
                    continue
                if f"{service}{chronos_spacer}{instance}" == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" % parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" % parent)

            # Remove duplicate check_msgs. Sorting makes the output order
            # reproducible; plain set iteration order changes between runs
            # when string hash randomisation is active.
            unique_check_msgs = sorted(set(check_msgs))

            if not checks_passed:
                paasta_print(
                    invalid_chronos_instance(cluster, instance, "\n ".join(unique_check_msgs)),
                )
                returncode = False
            else:
                paasta_print(valid_chronos_instance(cluster, instance))
    return returncode
def compose_monitoring_overrides_for_service(cluster, service, instance, soa_dir):
    """Return the job's monitoring settings with the check-specific overrides applied.

    ``alert_after`` and ``check_every`` are forced to fixed values and
    ``runbook`` is filled in from monitoring_tools.get_runbook.
    """
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    overrides = job_config.get_monitoring()
    overrides['alert_after'] = '2m'
    overrides['check_every'] = '1m'
    runbook = monitoring_tools.get_runbook(overrides, service, soa_dir=soa_dir)
    overrides['runbook'] = runbook
    return overrides
def compose_monitoring_overrides_for_service(cluster, service, instance, soa_dir):
    """Compose the monitoring overrides used when reporting on a chronos job.

    Starts from the job's own monitoring settings, then sets ``alert_after``,
    ``check_every`` and ``runbook``.
    """
    overrides = chronos_tools.load_chronos_job_config(
        service=service, instance=instance, cluster=cluster, soa_dir=soa_dir,
    ).get_monitoring()

    # Fixed timing values; they are set before the runbook lookup reads the dict.
    for key, value in (('alert_after', '2m'), ('check_every', '1m')):
        overrides[key] = value

    overrides['runbook'] = monitoring_tools.get_runbook(
        overrides,
        service,
        soa_dir=soa_dir,
    )
    return overrides
def main():
    """Report the state of every configured chronos job in this cluster to sensu.

    Jobs without deployments are skipped with a printed notice; jobs whose
    computed status is None produce no event. A ChronosAPIError is printed
    and ends the process with exit status 2.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = utils.load_system_paasta_config().get_cluster()
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    try:
        mapping = build_service_job_mapping(client, configured_jobs)
        for key, job_state_pairs in mapping.items():
            service = key[0]
            instance = key[1]
            try:
                job_config = load_chronos_job_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                skip_notice = "Skipping %s because no deployments are available" % service
                paasta_print(utils.PaastaColors.cyan(skip_notice))
                continue

            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs,
            )
            if sensu_status is None:
                continue

            overrides = compose_monitoring_overrides_for_service(
                chronos_job_config=job_config,
                soa_dir=soa_dir,
            )
            send_event(
                service=service,
                instance=instance,
                monitoring_overrides=overrides,
                status_code=sensu_status,
                message=sensu_output,
                soa_dir=soa_dir,
            )
    except chronos.ChronosAPIError as e:
        failure = "CRITICAL: Unable to contact Chronos! Error: %s" % e
        paasta_print(utils.PaastaColors.red(failure))
        sys.exit(2)
def main():
    """Clone a chronos job for a given execution date and submit the clone.

    Reads ``args.service_instance``, ``args.soa_dir`` and
    ``args.execution_date`` (format ``%Y-%m-%dT%H:%M:%S``).

    :raises NoDeploymentsAvailable, NoDockerImageError: when no deployment is
        found; a message is printed first
    :raises chronos_tools.UnknownChronosJobError: when the job configuration
        cannot be read; a message is printed first
    :raises chronos_tools.InvalidParentError: propagated unchanged from
        create_complete_config
    """
    args = parse_args()

    # Load the system config once and reuse the cluster name everywhere.
    cluster = load_system_paasta_config().get_cluster()
    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    chronos_job_config = chronos_tools.load_chronos_job_config(
        service,
        instance,
        cluster,
        soa_dir=args.soa_dir,
    )

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=args.soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        print(error_msg)
        # Bare raise re-raises the active exception with its original traceback.
        raise
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        print(error_msg)
        raise

    # complete_job_config is a formatted version of the job, so the command
    # is formatted in the context of 'now'. Replace it with the 'original'
    # cmd so it can be re-rendered.
    original_command = chronos_job_config.get_cmd()
    complete_job_config['command'] = original_command

    clone = clone_job(
        complete_job_config,
        datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"),
    )
    client.add(clone)
def get_instance_configs_for_service(service, soa_dir, type_filter=None):
    """Yield the config of each instance of a service, optionally limited by type.

    Per cluster, marathon instances come first, then chronos, then adhoc.
    Configs are loaded with ``load_deployments=False``.

    :param service: The service name
    :param soa_dir: The service configuration directory to read from
    :param type_filter: Container of instance types to include; None means
        marathon, chronos and adhoc
    """
    if type_filter is None:
        type_filter = ['marathon', 'chronos', 'adhoc']

    # (instance type, loader) pairs in the order they must be yielded.
    loaders = (
        ('marathon', load_marathon_service_config),
        ('chronos', load_chronos_job_config),
        ('adhoc', load_adhoc_job_config),
    )

    for cluster in list_clusters(service=service, soa_dir=soa_dir):
        for instance_type, load_config in loaders:
            if instance_type not in type_filter:
                continue
            for _, instance in get_service_instance_list(
                service=service,
                cluster=cluster,
                instance_type=instance_type,
                soa_dir=soa_dir,
            ):
                yield load_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                    load_deployments=False,
                )
def get_chronos_steps(service, soa_dir):
    """Collect the deploy group of every chronos instance of a service.

    This is a kind of funny function: it massages the chronos instances into
    the same form as deploy.yaml's steps, only so the two can be compared
    1-1 for linting.
    """
    deploy_groups = []
    for cluster in list_clusters(service, soa_dir):
        instance_list = get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type="chronos",
            soa_dir=soa_dir,
        )
        deploy_groups.extend(
            load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
                load_deployments=False,
            ).get_deploy_group()
            for _, instance in instance_list
        )
    return deploy_groups
def main():
    """Evaluate each configured chronos job and forward the result to sensu.

    A job with no deployments is skipped (a notice is printed). No event is
    sent when the computed status is None. When Chronos raises
    ChronosAPIError, a message is printed and the process exits with 2.
    """
    args = parse_args()
    soa_dir = args.soa_dir

    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    paasta_config = utils.load_system_paasta_config()
    cluster = paasta_config.get_cluster()

    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    try:
        jobs_by_service_instance = build_service_job_mapping(client, configured_jobs)
        for service_instance, job_state_pairs in jobs_by_service_instance.items():
            service = service_instance[0]
            instance = service_instance[1]

            try:
                job_config = load_chronos_job_config(
                    service=service, instance=instance, cluster=cluster, soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                paasta_print(
                    utils.PaastaColors.cyan("Skipping %s because no deployments are available" % service),
                )
                continue

            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs,
            )

            if sensu_status is not None:
                send_event(
                    service=service,
                    instance=instance,
                    monitoring_overrides=compose_monitoring_overrides_for_service(
                        chronos_job_config=job_config,
                        soa_dir=soa_dir,
                    ),
                    status_code=sensu_status,
                    message=sensu_output,
                    soa_dir=soa_dir,
                )
    except chronos.ChronosAPIError as e:
        paasta_print(utils.PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
        sys.exit(2)
def validate_chronos(service_path):
    """Validate the chronos jobs of a service, including their parent references.

    Prints one result line per instance.

    :param service_path: Path that path_to_soa_dir_service splits into soa_dir and service
    :returns: True if every instance passed, False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    spacer = paasta_tools.chronos_tools.INTERNAL_SPACER
    all_valid = True

    for cluster in list_clusters(service, soa_dir, instance_type):
        cluster_services = get_services_for_cluster(cluster=cluster, instance_type='chronos', soa_dir=soa_dir)
        # Full job names a parent reference may legitimately point at.
        known_jobs = set()
        for name, known_instance in cluster_services:
            known_jobs.add("%s%s%s" % (name, spacer, known_instance))

        instances = list_all_instances_for_service(
            service=service,
            clusters=[cluster],
            instance_type=instance_type,
            soa_dir=soa_dir,
        )
        for instance in instances:
            job_config = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = job_config.get_parents() or []
            passed, messages = job_config.validate()

            for parent in parents:
                if check_parent_format(parent):
                    if "%s%s%s" % (service, spacer, instance) == parent:
                        passed = False
                        messages.append("Job %s cannot depend on itself" % parent)
                    elif parent not in known_jobs:
                        passed = False
                        messages.append("Parent job %s could not be found" % parent)

            # Report each distinct message only once.
            distinct_messages = list(set(messages))

            if passed:
                print(valid_chronos_instance(cluster, instance))
            else:
                print(invalid_chronos_instance(cluster, instance, "\n ".join(distinct_messages)))
                all_valid = False

    return all_valid
def main():
    """Submit a clone of a chronos job, rendered for ``args.execution_date``.

    ``args.service_instance`` names the job, ``args.soa_dir`` is where its
    configuration is read from and ``args.execution_date`` must match
    ``%Y-%m-%dT%H:%M:%S``.

    :raises NoDeploymentsAvailable, NoDockerImageError: after printing a
        "No deployment found" message
    :raises chronos_tools.UnknownChronosJobError: after printing a
        "Could not read chronos configuration file" message
    :raises chronos_tools.InvalidParentError: passed through from
        create_complete_config without any handling
    """
    args = parse_args()
    # One load of the system config is enough; the cluster name is reused below.
    cluster = load_system_paasta_config().get_cluster()
    service, instance = chronos_tools.decompose_job_id(args.service_instance)
    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    chronos_job_config = chronos_tools.load_chronos_job_config(
        service, instance, cluster, soa_dir=args.soa_dir,
    )

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=args.soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        print(
            "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
                args.service_instance, cluster)
        )
        # Re-raise with a bare raise so the original traceback is kept.
        raise
    except chronos_tools.UnknownChronosJobError as e:
        print(
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e)
        )
        raise

    # complete_job_config holds the job as formatted for 'now', so its command
    # has already been rendered. Put the 'original' cmd back so that it can
    # be re-rendered for the requested date.
    complete_job_config['command'] = chronos_job_config.get_cmd()

    execution_date = datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S")
    client.add(clone_job(complete_job_config, execution_date))
def get_instance_configs_for_service(service, soa_dir):
    """Yield the config of every marathon, chronos and adhoc instance of a service.

    Within each cluster the order is marathon, chronos, adhoc. Every config
    is loaded with ``load_deployments=False``.

    :param service: The service name
    :param soa_dir: The service configuration directory to read from
    """
    # Loader to use for each instance type, in yield order.
    loader_for_type = (
        ('marathon', load_marathon_service_config),
        ('chronos', load_chronos_job_config),
        ('adhoc', load_adhoc_job_config),
    )
    for cluster in list_clusters(service=service, soa_dir=soa_dir):
        for instance_type, loader in loader_for_type:
            instance_list = get_service_instance_list(
                service=service,
                cluster=cluster,
                instance_type=instance_type,
                soa_dir=soa_dir,
            )
            for _, instance in instance_list:
                yield loader(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                    load_deployments=False,
                )
def main():
    """Send a sensu event for every configured chronos job in this cluster.

    If Chronos cannot be reached (ServerNotFoundError, ChronosAPIError or a
    socket error) a message is printed and the process exits with status 2.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = utils.load_system_paasta_config().get_cluster()
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    try:
        mapping = build_service_job_mapping(client, configured_jobs)
        for key, job_state_pairs in mapping.items():
            service = key[0]
            instance = key[1]
            job_config = load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs,
            )
            overrides = compose_monitoring_overrides_for_service(
                chronos_job_config=job_config,
                soa_dir=soa_dir,
            )
            send_event(
                service=service,
                instance=instance,
                monitoring_overrides=overrides,
                status_code=sensu_status,
                message=sensu_output,
                soa_dir=soa_dir,
            )
    except (ServerNotFoundError, chronos.ChronosAPIError, socket_error) as e:
        failure = "CRITICAL: Unable to contact Chronos! Error: %s" % e
        print(utils.PaastaColors.red(failure))
        sys.exit(2)
def validate_chronos(service_path):
    """Validate a service's chronos jobs and their parent references.

    A service whose name starts with TMP_JOB_IDENTIFIER is rejected outright.
    One result line is printed per instance.

    :param service_path: Path that path_to_soa_dir_service splits into soa_dir and service
    :returns: True if everything is valid, False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    spacer = paasta_tools.chronos_tools.INTERNAL_SPACER
    result = True

    if service.startswith(TMP_JOB_IDENTIFIER):
        print("Services using scheduled tasks cannot be named %s, as it clashes with the"
              " identifier used for temporary jobs" % TMP_JOB_IDENTIFIER)
        return False

    for cluster in list_clusters(service, soa_dir, instance_type):
        cluster_services = get_services_for_cluster(cluster=cluster, instance_type='chronos', soa_dir=soa_dir)
        # Job names that exist in this cluster and may be used as parents.
        known_jobs = set(
            "%s%s%s" % (name, spacer, known_instance)
            for name, known_instance in cluster_services
        )

        for instance in list_all_instances_for_service(
            service=service, clusters=[cluster], instance_type=instance_type, soa_dir=soa_dir,
        ):
            job_config = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = job_config.get_parents() or []
            passed, messages = job_config.validate()

            for parent in parents:
                if not check_parent_format(parent):
                    continue
                depends_on_itself = "%s%s%s" % (service, spacer, instance) == parent
                if depends_on_itself:
                    passed = False
                    messages.append("Job %s cannot depend on itself" % parent)
                elif parent not in known_jobs:
                    passed = False
                    messages.append("Parent job %s could not be found" % parent)

            # Drop repeated messages before printing.
            distinct_messages = list(set(messages))

            if not passed:
                report = invalid_chronos_instance(cluster, instance, "\n ".join(distinct_messages))
                print(report)
                result = False
            else:
                print(valid_chronos_instance(cluster, instance))

    return result
def get_chronos_steps(service, soa_dir):
    """Return the deploy group of each chronos instance of a service.

    This is a kind of funny function: its output is shaped like deploy.yaml's
    steps, only so that the two can be compared 1-1 when linting.
    """
    steps = []
    for cluster in list_clusters(service, soa_dir):
        chronos_instances = get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='chronos',
            soa_dir=soa_dir,
        )
        for _, instance in chronos_instances:
            job_config = load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
                load_deployments=False,
            )
            deploy_group = job_config.get_deploy_group()
            steps.append(deploy_group)
    return steps
def validate_chronos(service_path):
    """Run chronos validation, including parent checks, for one service.

    Prints a valid/invalid line for each instance.

    :param service_path: Path that path_to_soa_dir_service splits into soa_dir and service
    :returns: True when no instance failed, False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = "chronos"
    spacer = paasta_tools.chronos_tools.INTERNAL_SPACER
    everything_ok = True

    for cluster in list_clusters(service, soa_dir, instance_type):
        in_cluster = get_services_for_cluster(cluster=cluster, instance_type="chronos", soa_dir=soa_dir)
        # Names of the chronos jobs present in this cluster.
        known_jobs = set(["%s%s%s" % (svc, spacer, inst) for svc, inst in in_cluster])

        for instance in list_all_instances_for_service(
            service=service,
            clusters=[cluster],
            instance_type=instance_type,
            soa_dir=soa_dir,
        ):
            config = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = config.get_parents() or []
            ok, problems = config.validate()

            # Only parents that pass the format check are examined.
            for parent in (p for p in parents if check_parent_format(p)):
                if "%s%s%s" % (service, spacer, instance) == parent:
                    ok = False
                    problems.append("Job %s cannot depend on itself" % parent)
                elif parent not in known_jobs:
                    ok = False
                    problems.append("Parent job %s could not be found" % parent)

            # Remove duplicate messages.
            distinct_problems = list(set(problems))

            if ok:
                print(valid_chronos_instance(cluster, instance))
            else:
                print(invalid_chronos_instance(cluster, instance, "\n ".join(distinct_problems)))
                everything_ok = False

    return everything_ok
def main():
    """Set up one chronos job and report the outcome through send_event.

    Exits 1 when ``args.service_instance`` is not a valid job id. In every
    other handled case the process exits 0, because the event has been sent
    to the responsible team.
    """
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir
    log.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error(
            "Invalid service instance '%s' specified. Format is service%sinstance." %
            (args.service_instance, SPACER),
        )
        sys.exit(1)

    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    cluster = load_system_paasta_config().get_cluster()

    try:
        job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        send_event(
            service=service,
            instance=None,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)
    except chronos_tools.InvalidChronosConfigError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        log.error(error_msg)
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)

    complete_job_config = chronos_tools.create_complete_config(
        service=service,
        job_name=instance,
        soa_dir=soa_dir,
    )
    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        chronos_job_config=job_config,
        complete_job_config=complete_job_config,
        client=client,
    )

    # A truthy setup status is reported as CRITICAL, a falsy one as OK.
    if status:
        sensu_status = pysensu_yelp.Status.CRITICAL
    else:
        sensu_status = pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
def main():
    """Deploy a single chronos job and send a sensu event describing the result.

    An invalid ``args.service_instance`` ends the process with exit status 1.
    Configuration problems are reported as CRITICAL events and, like the
    normal path, end with exit status 0.
    """
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir

    log_level = logging.WARNING
    if args.verbose:
        log_level = logging.DEBUG
    log.setLevel(log_level)

    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        invalid_msg = "Invalid service instance '%s' specified. Format is service%sinstance." % (
            args.service_instance, SPACER)
        log.error(invalid_msg)
        sys.exit(1)

    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    system_config = load_system_paasta_config()
    cluster = system_config.get_cluster()

    try:
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=service, instance=instance, cluster=cluster, soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        send_event(
            service=service,
            instance=None,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)
    except chronos_tools.InvalidChronosConfigError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        log.error(error_msg)
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)

    complete_job_config = chronos_tools.create_complete_config(
        service=service, job_name=instance, soa_dir=soa_dir,
    )
    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        chronos_job_config=chronos_job_config,
        complete_job_config=complete_job_config,
        client=client,
    )
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
def paasta_rerun(args):
    """Reruns a Chronos job.

    Iterates over the clusters named in ``args.clusters`` (comma separated)
    or, when that is None, over the clusters the service is deployed to.
    A cluster that fails one of the checks below gets a printed warning and
    is skipped; otherwise the rerun is triggered on the remote master.

    :param args: argparse.Namespace obj created from sys.args by cli
    """
    system_paasta_config = load_system_paasta_config()
    soa_dir = args.soa_dir
    # exit with an error if the service doesn't exist
    service = figure_out_service_name(args, soa_dir)

    # Keep the user-supplied date separate from the lazily computed default:
    # otherwise the default chosen for the first cluster would make the
    # "uses time variables" check below pass silently for later clusters.
    requested_execution_date = args.execution_date or None
    default_execution_date = None

    all_clusters = list_clusters(soa_dir=soa_dir)
    actual_deployments = get_actual_deployments(service, soa_dir)  # cluster.instance: sha

    # Bind these unconditionally so a service without any deployment yields
    # the regular warnings instead of an UnboundLocalError.
    deployed_clusters = []
    deployed_cluster_instance = {}
    if actual_deployments:
        deploy_pipeline = list(get_planned_deployments(service, soa_dir))  # cluster.instance
        deployed_clusters = list_deployed_clusters(deploy_pipeline, actual_deployments)
        deployed_cluster_instance = _get_cluster_instance(actual_deployments.keys())

    if args.clusters is not None:
        clusters = args.clusters.split(",")
    else:
        clusters = deployed_clusters

    for cluster in clusters:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("cluster: %s" % cluster)

        if cluster not in all_clusters:
            print(" Warning: \"%s\" does not look like a valid cluster." % cluster)
            continue
        if cluster not in deployed_clusters:
            print(" Warning: service \"%s\" has not been deployed to \"%s\" yet." % (service, cluster))
            continue
        if not deployed_cluster_instance[cluster].get(args.instance, False):
            print(
                " Warning: instance \"%s\" is either invalid "
                "or has not been deployed to \"%s\" yet." % (args.instance, cluster)
            )
            continue

        try:
            chronos_job_config = chronos_tools.load_chronos_job_config(
                service, args.instance, cluster, load_deployments=False, soa_dir=soa_dir,
            )
            if chronos_tools.uses_time_variables(chronos_job_config) and requested_execution_date is None:
                print(
                    " Warning: \"%s\" uses time variables interpolation, "
                    "please supply a `--execution_date` argument." % args.instance
                )
                continue
        except chronos_tools.UnknownChronosJobError as e:
            # Format the exception itself; ``e.message`` does not exist on Python 3.
            print(" Warning: %s" % e)
            continue

        if requested_execution_date is not None:
            run_date = requested_execution_date
        else:
            # Compute the default once and reuse it for every cluster.
            if default_execution_date is None:
                default_execution_date = _get_default_execution_date()
            run_date = default_execution_date

        rc, output = execute_chronos_rerun_on_remote_master(
            service=service,
            instancename=args.instance,
            cluster=cluster,
            verbose=args.verbose,
            execution_date=run_date.strftime(chronos_tools.EXECUTION_DATE_FORMAT),
            system_paasta_config=system_paasta_config,
        )
        if rc == 0:
            print(PaastaColors.green(' successfully created job'))
        else:
            print(PaastaColors.red(' error'))
            print(output)
def perform_command(command, service, instance, cluster, verbose, soa_dir):
    """Performs a start/stop/restart/status on an instance

    :param command: String of start, stop, restart or status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: soa configuration directory handed to the config loaders
    :returns: A unix-style return code (always 0 when no exception is raised)
    :raises NotImplementedError: if ``command`` is none of the commands above
    """
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    # Loaded once here and shared by every command, including "status".
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_job_config = chronos_tools.create_complete_config(service, instance, soa_dir=soa_dir)
    job_id = complete_job_config["name"]

    if command == "start":
        start_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "stop":
        # Only "stop" asks the lookup to include temporary jobs as well.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
            include_temporary=True,
        )
        stop_chronos_job(service, instance, client, cluster, matching_jobs, emergency=True)
    elif command == "restart":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        restart_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            matching_jobs=matching_jobs,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "status":
        # Verbose mode shows previous versions.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        sorted_matching_jobs = chronos_tools.sort_jobs(matching_jobs)
        # job_config from the top of the function is reused; loading it a
        # second time with identical arguments only repeated the work.
        paasta_print(status_chronos_jobs(client, sorted_matching_jobs, job_config, verbose))
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
def get_instance_configs_for_service(
    service: str,
    soa_dir: str,
    type_filter: Optional[Sequence[str]] = None,
) -> Iterable[InstanceConfig]:
    """Yield the config of every instance of ``service`` across its clusters.

    Within each cluster, configs are yielded grouped by instance type in
    the order marathon, chronos, adhoc, kubernetes. All configs are loaded
    with ``load_deployments=False``.

    :param service: service name
    :param soa_dir: soa configuration directory
    :param type_filter: instance types to include; None means all four
    """
    # (instance type, loader) pairs, in the order they are yielded.
    loaders = (
        ('marathon', load_marathon_service_config),
        ('chronos', load_chronos_job_config),
        ('adhoc', load_adhoc_job_config),
        ('kubernetes', load_kubernetes_service_config),
    )
    if type_filter is None:
        type_filter = ['marathon', 'chronos', 'adhoc', 'kubernetes']

    for cluster in list_clusters(service=service, soa_dir=soa_dir):
        for instance_type, load_config in loaders:
            if instance_type not in type_filter:
                continue
            service_instances = get_service_instance_list(
                service=service,
                cluster=cluster,
                instance_type=instance_type,
                soa_dir=soa_dir,
            )
            for _, instance in service_instances:
                yield load_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                    load_deployments=False,
                )
def paasta_rerun(args):
    """Reruns a Chronos job.

    Iterates over the clusters named in ``args.clusters`` (comma separated)
    or, when that is None, over the clusters the service is deployed to.
    A cluster that fails one of the checks below gets a printed warning and
    is skipped; otherwise the rerun is triggered on the remote master.

    :param args: argparse.Namespace obj created from sys.args by cli
    """
    soa_dir = args.soa_dir
    # exit with an error if the service doesn't exist
    service = figure_out_service_name(args, soa_dir)

    # The user-supplied date is never overwritten: reusing one variable for
    # both it and the computed default would suppress the "uses time
    # variables" warning for every cluster after the first.
    user_date = args.execution_date or None
    fallback_date = None

    all_clusters = list_clusters(soa_dir=soa_dir)
    actual_deployments = get_actual_deployments(service, soa_dir)  # cluster.instance: sha

    # Defaults keep the names bound when the service has no deployments,
    # avoiding an UnboundLocalError further down.
    deployed_clusters = []
    deployed_cluster_instance = {}
    if actual_deployments:
        deploy_pipeline = list(get_planned_deployments(service, soa_dir))  # cluster.instance
        deployed_clusters = list_deployed_clusters(deploy_pipeline, actual_deployments)
        deployed_cluster_instance = _get_cluster_instance(actual_deployments.keys())

    clusters = deployed_clusters if args.clusters is None else args.clusters.split(",")

    for cluster in clusters:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("cluster: %s" % cluster)

        if cluster not in all_clusters:
            print(" Warning: \"%s\" does not look like a valid cluster." % cluster)
            continue
        if cluster not in deployed_clusters:
            print(" Warning: service \"%s\" has not been deployed to \"%s\" yet." % (service, cluster))
            continue
        if not deployed_cluster_instance[cluster].get(args.instance, False):
            print(
                " Warning: instance \"%s\" is either invalid "
                "or has not been deployed to \"%s\" yet." % (args.instance, cluster)
            )
            continue

        try:
            chronos_job_config = chronos_tools.load_chronos_job_config(
                service, args.instance, cluster, load_deployments=False, soa_dir=soa_dir,
            )
            if chronos_tools.uses_time_variables(chronos_job_config) and user_date is None:
                print(
                    " Warning: \"%s\" uses time variables interpolation, "
                    "please supply a `--execution_date` argument." % args.instance
                )
                continue
        except chronos_tools.UnknownChronosJobError as e:
            # Format the exception itself; ``e.message`` does not exist on Python 3.
            print(" Warning: %s" % e)
            continue

        if user_date is not None:
            run_date = user_date
        else:
            # Compute the default once and share it between clusters.
            if fallback_date is None:
                fallback_date = _get_default_execution_date()
            run_date = fallback_date

        rc, output = execute_chronos_rerun_on_remote_master(
            service=service,
            instancename=args.instance,
            cluster=cluster,
            verbose=args.verbose,
            execution_date=run_date.strftime(chronos_tools.EXECUTION_DATE_FORMAT),
        )
        if rc == 0:
            print(PaastaColors.green(' successfully created job'))
        else:
            print(PaastaColors.red(' error'))
            print(output)
def main():
    """Clone the requested Chronos job(s) for ``args.execution_date`` and add them.

    With ``args.run_all_related_jobs`` every job returned by
    ``get_related_jobs_configs`` is cloned, in the order given by
    ``topological_sort_related_jobs``; otherwise only the requested job is
    cloned. All clones are built before the first one is added to Chronos.

    :raises NoDeploymentsAvailable: if no related job is found, or
        re-raised from ``create_complete_config``
    :raises NoDockerImageError: re-raised from ``create_complete_config``
    :raises NoConfigurationForServiceError: re-raised from ``create_complete_config``
    """
    args = parse_args()
    cluster = load_system_paasta_config().get_cluster()
    service, instance = chronos_tools.decompose_job_id(args.service_instance)
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    requested_job = (service, instance)

    related_jobs = chronos_tools.get_related_jobs_configs(cluster, service, instance, soa_dir=args.soa_dir)
    if not related_jobs:
        paasta_print(
            "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
                args.service_instance, cluster,
            )
        )
        raise NoDeploymentsAvailable

    if not args.run_all_related_jobs:
        # Strip all the configuration for the related services
        # those information will not be used by the rest of the flow
        related_jobs = {requested_job: related_jobs[requested_job]}

    complete_job_configs = {}
    for (srv, inst) in related_jobs:
        try:
            complete_job_configs[(srv, inst)] = chronos_tools.create_complete_config(
                service=srv,
                job_name=inst,
                soa_dir=args.soa_dir,
            )
        except (NoDeploymentsAvailable, NoDockerImageError) as e:
            paasta_print(
                "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
                    chronos_tools.compose_job_id(srv, inst), cluster,
                )
            )
            raise e
        except NoConfigurationForServiceError as e:
            paasta_print(
                "Could not read chronos configuration file for {} in cluster {}\nError was: {}".format(
                    chronos_tools.compose_job_id(srv, inst), cluster, str(e),
                )
            )
            raise e
        except chronos_tools.InvalidParentError as e:
            # Propagated unchanged, without printing anything.
            raise e

    if args.run_all_related_jobs:
        sorted_jobs = chronos_tools.topological_sort_related_jobs(
            cluster, service, instance, soa_dir=args.soa_dir,
        )
    else:
        sorted_jobs = [requested_job]

    timestamp = datetime.datetime.utcnow().isoformat()

    chronos_to_add = []
    for (job_service, job_instance) in sorted_jobs:
        # complete_job_config is a formatted version of the job,
        # so the command is formatted in the context of 'now'
        # replace it with the 'original' cmd so it can be re rendered
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=job_service,
            instance=job_instance,
            cluster=cluster,
            soa_dir=args.soa_dir,
        )
        complete_job_config = complete_job_configs[(job_service, job_instance)]
        complete_job_config['command'] = chronos_job_config.get_cmd()

        clone = clone_job(
            chronos_job=complete_job_config,
            timestamp=timestamp,
            force_disabled=args.force_disabled,
        )
        # modify the command to run commands for a given date
        clone = modify_command_for_date(
            chronos_job=clone,
            date=datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"),
            verbose=args.verbose,
        )

        is_dependent = chronos_tools.get_job_type(clone) == chronos_tools.JobType.Dependent \
            if not args.run_all_related_jobs else False
        if is_dependent:
            # If the job is a dependent job and we want to re-run only the specific instance
            # remove the parents and update the schedule to start the job as soon as possible
            clone = set_default_schedule(remove_parents(clone))

        chronos_to_add.append(clone)

    for job_to_add in chronos_to_add:
        client.add(job_to_add)
def paasta_rerun(args):
    """Reruns a Chronos job.

    Iterates over the clusters named in ``args.clusters`` (comma separated)
    or, when that is None, over the clusters the service is deployed to.
    A cluster that fails one of the checks below gets a printed warning and
    is skipped. If the instance has dependency relations with other jobs
    and no ``args.rerun_type`` was given, an error is printed and the whole
    command returns without processing the remaining clusters.

    :param args: argparse.Namespace obj created from sys.args by cli
    """
    system_paasta_config = load_system_paasta_config()
    soa_dir = args.soa_dir
    # exit with an error if the service doesn't exist
    service = figure_out_service_name(args, soa_dir)

    # Keep the user-supplied date separate from the lazily computed default:
    # otherwise the default chosen for the first cluster would make the
    # "uses time variables" check below pass silently for later clusters.
    requested_execution_date = args.execution_date or None
    default_execution_date = None

    all_clusters = list_clusters(soa_dir=soa_dir)
    actual_deployments = get_actual_deployments(service, soa_dir)  # cluster.instance: sha

    # Bind these unconditionally so a service without any deployment yields
    # the regular warnings instead of an UnboundLocalError.
    deployed_clusters = []
    deployed_cluster_instance = {}
    if actual_deployments:
        deploy_pipeline = list(get_planned_deployments(service, soa_dir))  # cluster.instance
        deployed_clusters = list_deployed_clusters(deploy_pipeline, actual_deployments)
        deployed_cluster_instance = _get_cluster_instance(actual_deployments.keys())

    if args.clusters is not None:
        clusters = args.clusters.split(",")
    else:
        clusters = deployed_clusters

    for cluster in clusters:
        paasta_print("cluster: %s" % cluster)

        if cluster not in all_clusters:
            paasta_print(" Warning: \"%s\" does not look like a valid cluster." % cluster)
            continue
        if cluster not in deployed_clusters:
            paasta_print(
                f" Warning: service \"{service}\" has not been deployed to \"{cluster}\" yet."
            )
            continue
        if not deployed_cluster_instance[cluster].get(args.instance, False):
            paasta_print(
                " Warning: instance \"%s\" is either invalid "
                "or has not been deployed to \"%s\" yet." % (args.instance, cluster)
            )
            continue

        try:
            chronos_job_config = chronos_tools.load_chronos_job_config(
                service, args.instance, cluster, load_deployments=False, soa_dir=soa_dir,
            )
            if chronos_tools.uses_time_variables(chronos_job_config) and requested_execution_date is None:
                paasta_print(
                    " Warning: \"%s\" uses time variables interpolation, "
                    "please supply a `--execution_date` argument." % args.instance
                )
                continue
        except NoConfigurationForServiceError as e:
            paasta_print(" Warning: %s" % e)
            continue

        if requested_execution_date is not None:
            run_date = requested_execution_date
        else:
            # Compute the default once and reuse it for every cluster.
            if default_execution_date is None:
                default_execution_date = _get_default_execution_date()
            run_date = default_execution_date

        # Pass soa_dir so the dependency lookup reads the same configuration
        # directory as every other call in this function.
        related_job_configs = get_related_jobs_configs(
            cluster, service, args.instance, soa_dir=soa_dir,
        )
        if not args.rerun_type and len(related_job_configs) > 1:
            # List every related job except the requested one.
            instance_names = sorted(
                f'- {srv}{chronos_tools.INTERNAL_SPACER}{inst}'
                for srv, inst in related_job_configs
                if srv != service or inst != args.instance
            )
            paasta_print(PaastaColors.red(' error'))
            paasta_print(
                'Instance {instance} has dependency relations with the following jobs:\n'
                '{relations}\n'
                '\n'
                'Please specify the rerun policy via --rerun-type argument'.format(
                    instance=args.instance,
                    relations='\n'.join(instance_names),
                ),
            )
            return

        rc, output = execute_chronos_rerun_on_remote_master(
            service=service,
            instancename=args.instance,
            cluster=cluster,
            verbose=args.verbose,
            execution_date=run_date.strftime(chronos_tools.EXECUTION_DATE_FORMAT),
            system_paasta_config=system_paasta_config,
            run_all_related_jobs=args.rerun_type == 'graph',
            force_disabled=args.force_disabled,
        )
        if rc == 0:
            paasta_print(PaastaColors.green(' successfully created job'))
        else:
            paasta_print(PaastaColors.red(' error'))
            paasta_print(output)
def perform_command(command, service, instance, cluster, verbose, soa_dir):
    """Performs a start/stop/restart/status on an instance

    :param command: String of start, stop, restart or status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: soa configuration directory handed to the config loaders
    :returns: A unix-style return code (always 0 when no exception is raised)
    :raises NotImplementedError: if ``command`` is none of the commands above
    """
    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    # Read the job configuration a single time; every branch below,
    # "status" included, works from this one object.
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_job_config = chronos_tools.create_complete_config(service, instance, soa_dir=soa_dir)
    job_id = complete_job_config["name"]

    if command == "start":
        start_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "stop":
        # Only "stop" asks the lookup to include temporary jobs as well.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
            include_temporary=True,
        )
        stop_chronos_job(service, instance, client, cluster, matching_jobs, emergency=True)
    elif command == "restart":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        restart_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            matching_jobs=matching_jobs,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "status":
        # Verbose mode shows previous versions.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        sorted_matching_jobs = chronos_tools.sort_jobs(matching_jobs)
        # The configuration was previously reloaded here with the exact same
        # arguments as above; the already-loaded job_config is used instead.
        paasta_print(
            status_chronos_jobs(client, sorted_matching_jobs, job_config, verbose)
        )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0