Example no. 1
def create_complete_app(context):
    with contextlib.nested(
        mock.patch('paasta_tools.marathon_tools.create_complete_config'),
        mock.patch('paasta_tools.marathon_tools.load_marathon_config', return_value=context.marathon_config),
        mock.patch('paasta_tools.marathon_tools.load_system_paasta_config', return_value=context.system_paasta_config),
        mock.patch('paasta_tools.bounce_lib.load_system_paasta_config', return_value=context.system_paasta_config),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config',
                   return_value=context.system_paasta_config),
    ) as (
        mock_create_complete_config,
        _,
        _,
        _,
        mock_load_system_paasta_config,
    ):
        mock_create_complete_config.return_value = fake_service_config
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        print marathon_tools.load_marathon_config()
        return_tuple = setup_marathon_job.setup_service(
            service=fake_service_name,
            instance=fake_instance_name,
            client=context.marathon_client,
            marathon_config=context.marathon_config,
            service_marathon_config=fake_service_marathon_config,
            soa_dir=None,
        )
        assert return_tuple[0] == 0
        assert 'deployed' in return_tuple[1]
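Note (not from the source): a minimal sketch of how the behave `context` fixture this step reads could be stubbed in a standalone test. Only the attributes used above are set, the values are placeholders, and the real MarathonConfig/SystemPaastaConfig constructors are avoided so nothing is assumed about their signatures.

import mock

context = mock.Mock()
context.cluster = 'test-cluster'
context.marathon_client = mock.Mock()
# The accessors mirror the calls the examples on this page make on the real config objects.
context.marathon_config = mock.Mock(**{
    'get_url.return_value': 'http://marathon.example.com:8080',
    'get_username.return_value': 'marathon',
    'get_password.return_value': 'secret',
})
context.system_paasta_config = mock.Mock(**{
    'get_cluster.return_value': 'test-cluster',
})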
Example no. 2
def service_instance_status_error(context, error_code, job_id):
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )
    settings.cluster = load_system_paasta_config().get_cluster()
    settings.soa_dir = context.soa_dir

    (service, instance, _, __) = decompose_job_id(job_id)

    request = testing.DummyRequest()
    request.matchdict = {'service': service, 'instance': instance}

    response = None
    try:
        response = instance_status(request)
    except InstanceFailure as exc:
        print exc.msg
        assert exc.err == int(error_code)
    except:
        raise

    assert not response
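Note (hedged aside, the import path is an assumption): decompose_job_id(), used above, is the inverse of the compose_job_id() helper seen in later examples and yields a (service, instance, git_hash, config_hash) tuple.

from paasta_tools.utils import compose_job_id, decompose_job_id  # assumed location

job_id = compose_job_id('example_service', 'main')
service, instance, _, __ = decompose_job_id(job_id)
assert (service, instance) == ('example_service', 'main')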
Example no. 3
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks; we assume tasks with no healthcheck
                            # defined are healthy, and tasks with a defined healthcheck but no
                            # healthcheck results are unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
        pass
Example no. 4
def main():

    args = parse_args()
    soa_dir = args.soa_dir

    logging.basicConfig()
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    cluster = load_system_paasta_config().get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster, instance_type='marathon', soa_dir=args.soa_dir)

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(config.get_url(), config.get_username(), config.get_password())
    for service, instance in service_instances:

        check_service_replication(
            client=client,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
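Note (cluster and soa_dir values are placeholders): get_services_for_cluster() returns (service, instance) pairs, which is why the loop above unpacks two names per entry.

service_instances = get_services_for_cluster(
    cluster='example-cluster',
    instance_type='marathon',
    soa_dir='/path/to/soa_configs',
)
for service, instance in service_instances:
    print '{0}.{1}'.format(service, instance)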
Example no. 5
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        pass
Example no. 6
def paasta_sysdig(args):
    if not args.local:
        mesos_master = get_any_mesos_master(cluster=args.cluster)
        ssh_cmd = 'ssh -At -o LogLevel=QUIET {0} "sudo paasta {1} --local"'.format(mesos_master, ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            print output
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(shlex.split("ssh -tA {0} '{1}'".format(slave, command.strip())))
        return
    status = get_status_for_instance(cluster=args.cluster,
                                     service=args.service,
                                     instance=args.instance)
    slave = pick_slave_from_status(status=status,
                                   host=args.host)
    marathon_config = load_marathon_config()
    marathon_url = marathon_config.get_url()[0]
    marathon_user = marathon_config.get_username()
    marathon_pass = marathon_config.get_password()
    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    marathon_creds_url = marathon_parsed_url._replace(netloc="{0}:{1}@{2}".format(marathon_user, marathon_pass,
                                                                                  marathon_parsed_url.netloc))
    print format_mesos_command(slave, status.marathon.app_id, mesos_url, marathon_creds_url.geturl())
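Note (standard-library sketch with placeholder credentials): the netloc rewrite above is how the Marathon username and password get embedded directly in the URL.

from urlparse import urlparse  # Python 2, as in these examples

parsed = urlparse('http://marathon.example.com:8080')
with_creds = parsed._replace(netloc='{0}:{1}@{2}'.format('user', 'secret', parsed.netloc))
print with_creds.geturl()  # http://user:[email protected]:8080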
Example no. 7
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    for app_id in running_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
Example no. 8
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance
    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: bool if the output should be verbose or not
    :returns: A unix-style return code
    """
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = marathon_tools.create_complete_config(service, instance, marathon_config, soa_dir=soa_dir)['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        # Setting up transparent cache for http API calls
        requests_cache.install_cache('paasta_serviceinit', backend='memory')

        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose:
            print status_mesos_tasks_verbose(app_id, get_short_task_id)
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose,
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
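Note (hedged usage sketch, argument values are hypothetical): perform_command() returns a unix-style code, so a thin CLI wrapper can hand the result straight to sys.exit().

import sys

returncode = perform_command(
    command='status',
    service='example_service',
    instance='main',
    cluster='example-cluster',
    verbose=False,
    soa_dir='/path/to/soa_configs',
)
sys.exit(returncode)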
Example no. 9
def main(argv=None):
    args = parse_paasta_api_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)

    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)
Example no. 10
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, cluster, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            print status_mesos_tasks_verbose(
                job_id=app_id,
                get_short_task_id=get_short_task_id,
                tail_lines=tail_lines,
            )
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
Example no. 11
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)
    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose,
                                     humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [('marathon is not configured to run here', True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_results = [('chronos is not configured to run here', True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not all([mesos_ok, marathon_ok, chronos_ok]):
        sys.exit(2)
    else:
        sys.exit(0)
Example no. 12
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks; we assume tasks with no healthcheck
                            # defined are healthy, and tasks with a defined healthcheck but no
                            # healthcheck results are unhealthy.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
Example no. 13
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    running_apps = []
    for app_id in running_app_ids:
        try:
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        above_kill_threshold = float(len(apps_to_kill)) / float(len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical("Paasta was about to kill more than %s of the running services; this "
                         "is probably a BAD mistake! Run again with --force if you "
                         "really need to destroy everything." % kill_threshold)
            raise DontKillEverythingError
    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
Example no. 14
def main():
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(config.get_url(), config.get_username(), config.get_password())

    for deployment in client.list_deployments():
        delete_deployment_if_too_old(
            client=client,
            deployment=deployment,
            max_date=args.age,
            dry_run=args.dry_run,
        )
Example no. 15
def service_instance_status(context, app_count, job_id):
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )
    settings.cluster = load_system_paasta_config().get_cluster()
    settings.soa_dir = context.soa_dir

    (service, instance, _, __) = decompose_job_id(job_id)

    request = testing.DummyRequest()
    request.matchdict = {'service': service, 'instance': instance}
    response = instance_status(request)

    assert response['app_count'] == int(app_count), response
    assert response['marathon']['running_instance_count'] == response['marathon']['expected_instance_count'], response
Example no. 16
def setup_paasta_api():
    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)
Example no. 17
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        marathon_config = load_marathon_config()
        marathon_client = get_marathon_client(
            url=marathon_config.get_url(),
            user=marathon_config.get_username(),
            passwd=marathon_config.get_password(),
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_client=marathon_client, soa_dir=soa_dir)
    else:
        instances = get_services_for_cluster(cluster=cluster,
                                             instance_type='marathon',
                                             soa_dir=soa_dir)
        service_instances = []
        for name, instance in instances:
            service_instances.append(compose_job_id(name, instance))
    print '\n'.join(service_instances)
    sys.exit(0)
Example no. 18
def get_deployments():
    marathon_config = load_marathon_config()
    marathon_client = get_marathon_client(marathon_config)
    deployments = marathon_client.list_deployments()
    return deployments
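Note (hedged): unlike marathon_tools.get_marathon_client(url, user, passwd) used in most examples on this page, the helper called here is handed the whole config object, matching the metastatus-style get_marathon_client seen in examples no. 11, 21 and 22.

deployments = get_deployments()
print 'active marathon deployments: %d' % len(deployments)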
Example no. 19
def get_main_marathon_config():
    log.debug("Reading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Marathon config is: %s", marathon_config)
    return marathon_config
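Note (placeholder values; a hedged summary of what the rest of this page relies on): the object returned by load_marathon_config() exposes the three accessors below. Example no. 6 indexes get_url(), so depending on the paasta_tools version it may return a list of Marathon endpoints rather than a single URL.

config = get_main_marathon_config()
print config.get_url()       # single URL or list of URLs, depending on version
print config.get_username()
print config.get_password()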
Example no. 20
def main():
    args = parse_args()
    full_appid = args.appname.lstrip('/')
    soa_dir = args.soa_dir
    marathon_config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        url=marathon_config.get_url(),
        user=marathon_config.get_username(),
        passwd=marathon_config.get_password(),
    )

    if not marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        print("Couldn't find an app named {0}".format(full_appid))
        sys.exit(1)

    service, instance, _, __ = (s.replace('--', '_') for s in decompose_job_id(full_appid))
    complete_config = marathon_tools.create_complete_config(service, instance, marathon_config)
    cluster = load_system_paasta_config().get_cluster()
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func('down')

    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks([app_to_kill], drain_method)
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_tasks=old_app_live_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            serviceinstance="{0}.{1}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )

        print "Sleeping for 10 seconds to give the tasks time to drain"
        time.sleep(10)

    print("Sucessfully killed {0}".format(full_appid))
Example no. 21
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print (PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(mesos_state=mesos_state)
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [HealthCheckResult(message="Marathon is not configured to run here", healthy=True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except PaastaNotConfiguredError:
        chronos_results = [HealthCheckResult(message="Chronos is not configured to run here", healthy=True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print (PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print (PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    elif args.verbose == 2:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"], free=resource_info_dict["free"]
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"], free=resource_info_dict["free"]
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print_with_indent("Per Slave Utilization", 2)
        slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave["hostname"], mesos_state)
        all_rows = [["Hostname", "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]

        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = resource_utillizations_from_resource_info(
                total=resource_info_dict["total"], free=resource_info_dict["free"]
            )
            healthcheck_utilization_pairs = [
                healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(
                get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
            )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example no. 22
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(mesos_metrics=metrics,
                                                                                mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here',
                                                             healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here',
                                                            healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    print "Master paasta_tools version: {0}".format(__version__)
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(grouping_function,
                                                                                     mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(lambda slave: slave['hostname'],
                                                                                      mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example no. 23
def get_marathon_client_from_config():
    marathon_config = load_marathon_config()
    marathon_client = get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                          marathon_config.get_password())
    return marathon_client
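Note (hedged usage sketch, assuming paasta_tools.marathon_tools is imported as marathon_tools): the returned client can be combined with list_all_marathon_app_ids(), which the cleanup examples above use to enumerate running apps.

client = get_marathon_client_from_config()
for app_id in marathon_tools.list_all_marathon_app_ids(client):
    print app_id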
Example no. 24
def perform_command(command,
                    service,
                    instance,
                    cluster,
                    verbose,
                    soa_dir,
                    app_id=None,
                    delta=None):
    """Performs a start/stop/restart/status/scale on an instance
    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service,
                                                             instance,
                                                             cluster,
                                                             soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
        service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service,
                                                            instance,
                                                            cluster,
                                                            soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count,
                           client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count,
                             client, cluster)
    elif command == 'status':
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id,
                                  normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_stdstreams = verbose > 1
            print status_mesos_tasks_verbose(app_id, get_short_task_id,
                                             tail_stdstreams)
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.
                get_synapse_haproxy_url_format(),
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
Example no. 25
def get_main_marathon_config():
    log.debug("Reading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Marathon config is: %s", marathon_config)
    return marathon_config