def main():
    """Check for Docker containers whose Mesos task is no longer running.

    Prints a CRIT line and exits 1 when orphaned containers are found
    (also killing them when --force was given); otherwise prints OK and
    exits 0.
    """
    args = parse_args()
    docker_client = get_docker_client()
    # '' matches every app id, so this is the full set of running task ids.
    # Use a set: membership is tested once per container below.
    running_mesos_task_ids = {
        task["id"]
        for task in mesos_tools.filter_running_tasks(
            mesos_tools.get_running_tasks_from_active_frameworks(''))
    }
    running_mesos_docker_containers = get_running_mesos_docker_containers()
    orphaned_containers = []
    for container in running_mesos_docker_containers:
        mesos_task_id = mesos_tools.get_mesos_id_from_container(
            container=container, client=docker_client)
        if mesos_task_id not in running_mesos_task_ids:
            orphaned_containers.append(
                (container["Names"][0].strip("/"), mesos_task_id))
    if orphaned_containers:
        # print() call form works under both Python 2 and 3, unlike the
        # original print statement.
        print("CRIT: Docker containers are orphaned: %s%s" % (", ".join(
            "%s (%s)" % (container_name, mesos_task_id)
            for container_name, mesos_task_id in orphaned_containers
        ), " and will be killed" if args.force else ""))
        if args.force:
            for container_name, mesos_task_id in orphaned_containers:
                docker_client.kill(container_name)
        sys.exit(1)
    else:
        print("OK: All mesos task IDs accounted for")
        sys.exit(0)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Run one autoscaling pass over eligible marathon instances.

    Holds the cluster-wide autoscaling lock for the duration; if another
    run already holds it, this pass is skipped silently.

    :param soa_dir: soa-configs directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only instances with a max_instances bound that are meant
                # to be running participate in autoscaling.
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                all_marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        # 'bespoke' instances manage their own scaling.
                        if config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                            try:
                                job_id = format_job_id(config.service, config.instance)
                                marathon_tasks = {
                                    task.id: task
                                    for task in all_marathon_tasks
                                    if job_id == get_short_job_id(task.id) and task.health_check_results
                                }
                                if not marathon_tasks:
                                    raise MetricsProviderNoDataError(
                                        "Couldn't find any healthy marathon tasks")
                                mesos_tasks = [
                                    task for task in all_mesos_tasks
                                    if task['id'] in marathon_tasks
                                ]
                                autoscale_marathon_instance(
                                    config, list(marathon_tasks.values()), mesos_tasks)
                            except Exception as e:
                                # BUG FIX: the original did `raise e` here,
                                # which aborted the whole run on the first
                                # failing instance and made this log line
                                # unreachable dead code. Log and continue so
                                # one broken instance doesn't block the rest.
                                write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
def marathon_job_status(mstatus, client, job_config):
    """Populate *mstatus* in place with the Marathon state of one job.

    Writes 'error_message' and returns early when no docker image has been
    deployed yet; otherwise fills in app id, slave hostnames, expected and
    running instance counts, deploy status, and (when delayed) the backoff.
    """
    try:
        app_id = job_config.format_marathon_app_dict()['id']
    except NoDockerImageError:
        mstatus['error_message'] = "Docker image is not in deployments.json."
        return

    mstatus['app_id'] = app_id
    hostnames = {task.slave['hostname']
                 for task in get_running_tasks_from_active_frameworks(app_id)}
    mstatus['slaves'] = list(hostnames)
    mstatus['expected_instance_count'] = job_config.get_instances()

    deploy_status = marathon_tools.get_marathon_app_deploy_status(client, app_id)
    mstatus['deploy_status'] = marathon_tools.MarathonDeployStatus.tostring(deploy_status)

    # Callers compare running count against expected count to derive the
    # Healthy / Warning / Critical state, so report 0 explicitly when the
    # app is not running at all.
    if deploy_status == marathon_tools.MarathonDeployStatus.NotRunning:
        running_count = 0
    else:
        running_count = client.get_app(app_id).tasks_running
    mstatus['running_instance_count'] = running_count

    if deploy_status == marathon_tools.MarathonDeployStatus.Delayed:
        _, backoff_seconds = marathon_tools.get_app_queue_status(client, app_id)
        mstatus['backoff_seconds'] = backoff_seconds
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Run one autoscaling pass over all services eligible for scaling.

    Holds the cluster-wide autoscaling lock for the duration; if another
    run already holds it, this pass is skipped with a warning.

    :param soa_dir: soa-configs directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids

                # PERF: memoize get_app() per app id so the "does this app
                # define healthchecks" lookup costs one API call per app
                # instead of one per task.
                health_checks_by_app = {}

                def _app_has_healthchecks(app_id):
                    # One-line purpose: cached check for whether the marathon
                    # app defines any healthchecks.
                    if app_id not in health_checks_by_app:
                        health_checks_by_app[app_id] = bool(marathon_client.get_app(app_id).health_checks)
                    return health_checks_by_app[app_id]

                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or
                                               not _app_has_healthchecks(task.app_id))}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            # Log and continue so one broken instance doesn't
                            # block the rest of the run.
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Removed the redundant `pass` that followed this warning.
        log.warning("Skipping autoscaling run for services because the lock is held")
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Run one autoscaling pass over every eligible marathon instance.

    Silently does nothing when another run already holds the autoscaling
    lock.

    :param soa_dir: soa-configs directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            service_instances = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in service_instances:
                config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only instances with a max_instances bound that are meant
                # to be running participate in autoscaling.
                wants_autoscaling = (
                    config.get_max_instances() and
                    config.get_desired_state() == 'start'
                )
                if wants_autoscaling:
                    configs.append(config)
            if configs:
                marathon_config = load_marathon_config()
                client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                )
                marathon_tasks = client.list_tasks()
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        # Log and keep going: one broken instance must not
                        # block the others.
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        pass
def status_mesos_tasks(service, instance, normal_instance_count):
    """Return a colored one-line summary of running Mesos tasks for a job.

    Healthy when at least normal_instance_count tasks are running, Critical
    when none are, Warning otherwise.
    """
    job_id = marathon_tools.format_job_id(service, instance)
    # BUG FIX: filter on "job_id<spacer>" so that service.main does not also
    # match tasks belonging to service.main_foo (see the sibling version of
    # this function that already does this).
    filter_string = "%s%s" % (job_id, marathon_tools.MESOS_TASK_SPACER)
    running_and_active_tasks = get_running_tasks_from_active_frameworks(filter_string)
    count = len(running_and_active_tasks)
    if count >= normal_instance_count:
        status = PaastaColors.green("Healthy")
        count = PaastaColors.green("(%d/%d)" % (count, normal_instance_count))
    elif count == 0:
        status = PaastaColors.red("Critical")
        count = PaastaColors.red("(%d/%d)" % (count, normal_instance_count))
    else:
        status = PaastaColors.yellow("Warning")
        count = PaastaColors.yellow("(%d/%d)" % (count, normal_instance_count))
    running_string = PaastaColors.bold('TASK_RUNNING')
    return "Mesos: %s - %s tasks in the %s state." % (status, count, running_string)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Run one autoscaling pass over all non-bespoke marathon instances.

    Holds the cluster-wide autoscaling lock for the duration; if another
    run already holds it, this pass is skipped silently.

    :param soa_dir: soa-configs directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Autoscale only started instances with a max_instances bound
                # whose scaling is not managed externally ('bespoke').
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids

                # PERF: memoize get_app() per app id so the "does this app
                # define healthchecks" lookup costs one API call per app
                # instead of one per task.
                health_checks_by_app = {}

                def _app_has_healthchecks(app_id):
                    # One-line purpose: cached check for whether the marathon
                    # app defines any healthchecks.
                    if app_id not in health_checks_by_app:
                        health_checks_by_app[app_id] = bool(marathon_client.get_app(app_id).health_checks)
                    return health_checks_by_app[app_id]

                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or
                                               not _app_has_healthchecks(task.app_id))}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            # Log and continue so one broken instance doesn't
                            # block the rest of the run.
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
def status_chronos_jobs(jobs, job_config, verbose):
    """Return a formatted status string for a list of chronos jobs.

    :param jobs: list of dicts of chronos job info as returned by the chronos
        client
    :param job_config: configuration for these jobs as provided by
        chronos_tools.load_chronos_job_config()
    :param verbose: whether to include extra detail per job
    """
    # Idiomatic truthiness check instead of `jobs == []`; an empty list means
    # the job has not been set up yet.
    if not jobs:
        return "%s: chronos job is not set up yet" % PaastaColors.yellow("Warning")
    output = []
    desired_state = job_config.get_desired_state_human()
    for job in jobs:
        running_tasks = get_running_tasks_from_active_frameworks(job["name"])
        output.append(format_chronos_job_status(job, desired_state, running_tasks, verbose))
    return "\n".join(output)
def status_mesos_tasks(service, instance, normal_instance_count):
    """Return a colored one-line summary of TASK_RUNNING tasks for a job.

    Healthy when at least normal_instance_count tasks are running, Critical
    when none are, Warning otherwise.
    """
    job_id = marathon_tools.format_job_id(service, instance)
    # We have to add a spacer at the end to make sure we only return
    # things for service.main and not service.main_foo
    filter_string = "%s%s" % (job_id, marathon_tools.MESOS_TASK_SPACER)
    tasks = get_running_tasks_from_active_frameworks(filter_string)
    num_running = len(tasks)
    # Bind the color once; both the status word and the count share it.
    if num_running >= normal_instance_count:
        colorize = PaastaColors.green
        status = colorize("Healthy")
    elif num_running == 0:
        colorize = PaastaColors.red
        status = colorize("Critical")
    else:
        colorize = PaastaColors.yellow
        status = colorize("Warning")
    count = colorize("(%d/%d)" % (num_running, normal_instance_count))
    running_string = PaastaColors.bold('TASK_RUNNING')
    return "Mesos: %s - %s tasks in the %s state." % (status, count, running_string)
def status_chronos_jobs(jobs, job_config, verbose):
    """Return a formatted status string for a list of chronos jobs.

    :param jobs: list of dicts of chronos job info as returned by the chronos
        client
    :param job_config: job configuration object as provided by
        chronos_tools.load_chronos_job_config() (DOC FIX: the original
        docstring called this a dict, but the code calls its
        get_desired_state_human() method)
    :param verbose: whether to include extra detail per job
    """
    # Idiomatic truthiness check instead of `jobs == []`; an empty list means
    # the job has not been set up yet.
    if not jobs:
        return "%s: chronos job is not set up yet" % PaastaColors.yellow("Warning")
    output = []
    desired_state = job_config.get_desired_state_human()
    output.append("Desired: %s" % desired_state)
    for job in jobs:
        running_tasks = get_running_tasks_from_active_frameworks(job["name"])
        output.append(format_chronos_job_status(job, running_tasks, verbose))
    return "\n".join(output)