Example #1
def delete_app(app_id, client):
    """Deletes a marathon app safely and logs to notify the user that it
    happened"""
    log.warn("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(service=service,
                 component='deploy',
                 level='event',
                 cluster=load_system_paasta_config().get_cluster(),
                 instance=instance,
                 line=log_line)
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(service=service,
                 component='deploy',
                 level='debug',
                 cluster=load_system_paasta_config().get_cluster(),
                 instance=instance,
                 line=logline)
        raise
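For context, a rough sketch of how a cleanup pass might drive this helper; the client construction follows the pattern used in the later examples, and the stale app id is an invented placeholder:

config = marathon_tools.load_marathon_config()
client = marathon_tools.get_marathon_client(config.get_url(), config.get_username(), config.get_password())
# Normally the cleanup script computes this list by diffing running apps against expected ones.
stale_app_ids = ['example_service.main.gitdeadbeef.configcafe']
for app_id in stale_app_ids:
    delete_app(app_id, client)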
Example #2
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config):
    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug("No deployments found for %s.%s in cluster %s. Skipping." %
                  (service, instance, load_system_paasta_config().get_cluster()))
        return 0
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                    (service, instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        return 1

    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        return 0
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        return 1
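Because the function returns a shell-style status code, a hypothetical driver (service and instance names invented; client and marathon_config built as in Example #6) can hand the result straight to sys.exit:

ret = deploy_marathon_service('example_service', 'main', client, DEFAULT_SOA_DIR, marathon_config)
sys.exit(ret)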
Example #3
def create_complete_config(service, instance, marathon_config, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create an app on Marathon"""
    system_paasta_config = load_system_paasta_config()
    partial_id = format_job_id(service=service, instance=instance)
    instance_config = load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=load_system_paasta_config().get_cluster(),
        soa_dir=soa_dir,
    )
    docker_url = get_docker_url(system_paasta_config.get_docker_registry(), instance_config.get_docker_image())
    service_namespace_config = load_service_namespace_config(
        service=service,
        namespace=instance_config.get_nerve_namespace(),
    )
    docker_volumes = system_paasta_config.get_volumes() + instance_config.get_extra_volumes()

    complete_config = instance_config.format_marathon_app_dict(
        app_id=partial_id,
        docker_url=docker_url,
        docker_volumes=docker_volumes,
        service_namespace_config=service_namespace_config,
    )
    code_sha = get_code_sha_from_dockerurl(docker_url)
    config_hash = get_config_hash(
        complete_config,
        force_bounce=instance_config.get_force_bounce(),
    )
    full_id = format_job_id(service, instance, code_sha, config_hash)
    complete_config['id'] = full_id
    return complete_config
Example #4
def create_complete_config(service, instance, marathon_config, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create an app on Marathon"""
    # A set of config attributes that don't get included in the hash of the config.
    # These should be things that PaaSTA/Marathon knows how to change without requiring a bounce.
    CONFIG_HASH_BLACKLIST = set(['instances', 'backoff_seconds'])

    system_paasta_config = load_system_paasta_config()
    partial_id = format_job_id(service=service, instance=instance)
    instance_config = load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=load_system_paasta_config().get_cluster(),
        soa_dir=soa_dir,
    )
    docker_url = get_docker_url(system_paasta_config.get_docker_registry(), instance_config.get_docker_image())
    service_namespace_config = load_service_namespace_config(
        service=service,
        namespace=instance_config.get_nerve_namespace(),
    )
    docker_volumes = system_paasta_config.get_volumes() + instance_config.get_extra_volumes()

    complete_config = instance_config.format_marathon_app_dict(
        app_id=partial_id,
        docker_url=docker_url,
        docker_volumes=docker_volumes,
        service_namespace_config=service_namespace_config,
    )
    code_sha = get_code_sha_from_dockerurl(docker_url)
    config_hash = get_config_hash(
        {key: value for key, value in complete_config.items() if key not in CONFIG_HASH_BLACKLIST},
        force_bounce=instance_config.get_force_bounce(),
    )
    full_id = format_job_id(service, instance, code_sha, config_hash)
    complete_config['id'] = full_id
    return complete_config
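The only difference from Example #3 is the dict comprehension that drops CONFIG_HASH_BLACKLIST keys before hashing, so changes to fields like 'instances' leave the hash, and therefore the app id, untouched and do not force a bounce. A minimal illustration of that filtering:

config_a = {'instances': 3, 'cpus': 0.5}
config_b = {'instances': 8, 'cpus': 0.5}  # differs only in a blacklisted key
blacklist = {'instances', 'backoff_seconds'}
filtered = lambda cfg: {k: v for k, v in cfg.items() if k not in blacklist}
assert filtered(config_a) == filtered(config_b)  # identical hash input either way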
Example #5
def test_load_system_paasta_config_file_non_existent_dir():
    fake_path = "/var/dir_of_fake"
    with contextlib.nested(mock.patch("os.path.isdir", return_value=False)) as (isdir_patch,):
        with raises(utils.PaastaNotConfiguredError) as excinfo:
            utils.load_system_paasta_config(fake_path)
        expected = "Could not find system paasta configuration directory: %s" % fake_path
        assert str(excinfo.value) == expected
Example #6
def main():
    """Attempt to set up the marathon service instance given.
    Exits 1 if the deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Load the service instance's configuration
    - Create the complete marathon job configuration
    - Deploy/bounce the service
    - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
        sys.exit(1)

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())

    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug("No deployments found for %s in cluster %s. Skipping." % (args.service_instance,
                                                                            load_system_paasta_config().get_cluster()))
        sys.exit(0)
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s in cluster %s" % \
            (args.service_instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        sys.exit(1)

    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
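decompose_job_id splits the combined identifier on SPACER (defined as '.' in paasta_tools.utils) into four parts; a quick illustration with an invented job id:

service, instance, git_hash, config_hash = decompose_job_id(
    'example_service.main.gitdeadbeef.configcafe')
# service == 'example_service', instance == 'main'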
Example #7
def delete_app(app_id, client, soa_dir):
    """Deletes a marathon app safely and logs to notify the user that it
    happened"""
    log.warn("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        send_event(
            service=service,
            check_name='check_marathon_services_replication.%s' % short_app_id,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.OK,
            overrides={},
            output="This instance was removed and is no longer running",
        )
        send_event(
            service=service,
            check_name='setup_marathon_job.%s' % short_app_id,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.OK,
            overrides={},
            output="This instance was removed and is no longer running",
        )
        send_event(
            service=service,
            check_name='paasta_bounce_progress.%s' % short_app_id,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.OK,
            overrides={},
            output="This instance was removed and is no longer running",
        )
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=logline,
            )
        raise
Example #8
def test_load_system_paasta_config_file_dne():
    fake_path = "/var/dir_of_fake"
    with contextlib.nested(
        mock.patch("os.path.isdir", return_value=True),
        mock.patch("os.access", return_value=True),
        mock.patch("paasta_tools.utils.open", create=True, side_effect=IOError(2, "a", "b")),
        mock.patch("paasta_tools.utils.get_readable_files_in_glob", autospec=True, return_value=[fake_path]),
    ) as (isdir_patch, access_patch, open_patch, mock_get_readable_files_in_glob):
        with raises(utils.PaastaNotConfiguredError) as excinfo:
            utils.load_system_paasta_config(fake_path)
        assert str(excinfo.value) == "Could not load system paasta config file b: a"
Example #9
def test_load_system_paasta_config_file_non_readable_dir():
    fake_path = '/var/dir_of_fake'
    with contextlib.nested(
        mock.patch('os.path.isdir', return_value=True),
        mock.patch('os.access', return_value=False),
    ) as (
        isdir_patch,
        access_patch,
    ):
        with raises(utils.PaastaNotConfiguredError) as excinfo:
            utils.load_system_paasta_config(fake_path)
        expected = "Could not read from system paasta configuration directory: %s" % fake_path
        assert str(excinfo.value) == expected
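Taken together, Examples #5, #8, and #9 pin down three failure modes of load_system_paasta_config. A condensed sketch of the checks those tests imply (not the real implementation, which lives in paasta_tools.utils; assumes os and json are imported):

def load_system_paasta_config_sketch(path):
    if not os.path.isdir(path):
        raise utils.PaastaNotConfiguredError(
            "Could not find system paasta configuration directory: %s" % path)
    if not os.access(path, os.R_OK):
        raise utils.PaastaNotConfiguredError(
            "Could not read from system paasta configuration directory: %s" % path)
    config = {}
    try:
        for file_path in utils.get_readable_files_in_glob("%s/*.json" % path):
            with open(file_path) as f:
                config.update(json.load(f))
    except IOError as e:
        raise utils.PaastaNotConfiguredError(
            "Could not load system paasta config file %s: %s" % (e.filename, e.strerror))
    return utils.SystemPaastaConfig(config, path)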
Example #10
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        pass
Example #11
def service_instance_status_error(context, error_code, job_id):
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )
    settings.cluster = load_system_paasta_config().get_cluster()
    settings.soa_dir = context.soa_dir

    (service, instance, _, __) = decompose_job_id(job_id)

    request = testing.DummyRequest()
    request.matchdict = {'service': service, 'instance': instance}

    response = None
    try:
        response = instance_status(request)
    except InstanceFailure as exc:
        print exc.msg
        assert exc.err == int(error_code)
    except:
        raise

    assert not response
Example #12
def create_complete_config(service, job_name, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create a job on Chronos"""
    system_paasta_config = load_system_paasta_config()
    chronos_job_config = load_chronos_job_config(service, job_name, system_paasta_config.get_cluster(), soa_dir=soa_dir)
    docker_url = get_docker_url(system_paasta_config.get_docker_registry(), chronos_job_config.get_docker_image())
    docker_volumes = system_paasta_config.get_volumes() + chronos_job_config.get_extra_volumes()

    complete_config = chronos_job_config.format_chronos_job_dict(docker_url, docker_volumes)
    code_sha = get_code_sha_from_dockerurl(docker_url)
    config_hash = get_config_hash(complete_config)

    # Chronos clears the history for a job whenever it is updated, so we use a new job name for each revision
    # so that we can keep history of old job revisions rather than just the latest version
    full_id = compose_job_id(service, job_name, code_sha, config_hash)
    complete_config["name"] = full_id
    desired_state = chronos_job_config.get_desired_state()

    # If the job was previously stopped, we should stop the new job as well
    # NOTE this clobbers the 'disabled' param specified in the config file!
    if desired_state == "start":
        complete_config["disabled"] = False
    elif desired_state == "stop":
        complete_config["disabled"] = True

    log.debug("Complete configuration for instance is: %s" % complete_config)
    return complete_config
Example #13
def get_marathon_services_running_here_for_nerve(cluster, soa_dir):
    if not cluster:
        try:
            cluster = load_system_paasta_config().get_cluster()
        # In the cases where there is *no* cluster or in the case
        # where there isn't a Paasta configuration file at *all*, then
        # there must be no marathon services running here, so we catch
        # these custom exceptions and return [].
        except (PaastaNotConfiguredError):
            return []
    # When a cluster is defined in mesos, let's iterate through marathon services
    marathon_services = marathon_services_running_here()
    nerve_list = []
    for name, instance, port in marathon_services:
        try:
            namespace = read_namespace_for_service_instance(name, instance, cluster, soa_dir)
            nerve_dict = load_service_namespace_config(name, namespace, soa_dir)
            if not nerve_dict.is_in_smartstack():
                continue
            nerve_dict['port'] = port
            nerve_name = compose_job_id(name, namespace)
            nerve_list.append((nerve_name, nerve_dict))
        except KeyError:
            continue  # SOA configs got deleted for this app, it'll get cleaned up
    return nerve_list
Example #14
def send_event(service, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    monitoring_overrides = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    ).get_monitoring()
    # In order to let sensu know how often to expect this check to fire,
    # we need to set the ``check_every`` to the frequency of our cron job, which
    # is 10s.
    monitoring_overrides['check_every'] = '10s'
    # Most deploy_chronos_jobs failures are transient and represent issues
    # that will probably be fixed eventually, so we set an alert_after
    # to suppress extra noise
    monitoring_overrides['alert_after'] = '10m'
    check_name = 'setup_chronos_job.%s' % compose_job_id(service, instance)
    monitoring_tools.send_event(
        service=service,
        check_name=check_name,
        overrides=monitoring_overrides,
        status=status,
        output=output,
        soa_dir=soa_dir,
    )
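A hypothetical call site, mirroring how the setup_chronos_job script would report a result (service and instance names invented):

send_event('example_service', 'nightly_batch', DEFAULT_SOA_DIR,
           pysensu_yelp.Status.OK, 'Chronos job configured successfully')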
Example #15
def paasta_emergency_start(args):
    """Performs an emergency start on a given service instance on a given cluster

    Warning: This command is not magic and cannot actually get a service to start if it couldn't
    run before. This includes configurations that prevent the service from running,
    such as 'instances: 0' (for Marathon apps).

    All it does for Marathon apps is ask Marathon to resume normal operation by scaling up to
    the instance count defined in the service's config.
    All it does for Chronos jobs is send the latest version of the job config to Chronos and run it immediately.
    """
    system_paasta_config = load_system_paasta_config()
    service = figure_out_service_name(args, soa_dir=args.soa_dir)
    print "Performing an emergency start on %s..." % compose_job_id(service, args.instance)
    output = execute_paasta_serviceinit_on_remote_master(
        subcommand="start",
        cluster=args.cluster,
        service=service,
        instances=args.instance,
        system_paasta_config=system_paasta_config,
    )
    print "%s" % "\n".join(paasta_emergency_start.__doc__.splitlines()[-8:])
    print "Output: %s" % PaastaColors.grey(output)
    print "Run this command to see the status:"
    print "paasta status --service %s --clusters %s" % (service, args.cluster)
Example #16
def main():

    args = parse_args()
    soa_dir = args.soa_dir

    logging.basicConfig()
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    cluster = load_system_paasta_config().get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster, instance_type='marathon', soa_dir=args.soa_dir)

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(config.get_url(), config.get_username(), config.get_password())
    for service, instance in service_instances:

        check_service_replication(
            client=client,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
Example #17
def create_complete_config(service, job_name, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create a job on Chronos"""
    system_paasta_config = load_system_paasta_config()
    chronos_job_config = load_chronos_job_config(
        service, job_name, system_paasta_config.get_cluster(), soa_dir=soa_dir)
    docker_url = get_docker_url(
        system_paasta_config.get_docker_registry(), chronos_job_config.get_docker_image())
    docker_volumes = system_paasta_config.get_volumes() + chronos_job_config.get_extra_volumes()

    complete_config = chronos_job_config.format_chronos_job_dict(
        docker_url,
        docker_volumes,
        system_paasta_config.get_dockercfg_location(),
    )

    complete_config['name'] = compose_job_id(service, job_name)

    # resolve conflicts between the 'desired_state' and soa_configs disabled
    # flag.
    desired_state = chronos_job_config.get_desired_state()
    soa_disabled_state = complete_config['disabled']

    resolved_disabled_state = determine_disabled_state(desired_state,
                                                       soa_disabled_state)
    complete_config['disabled'] = resolved_disabled_state

    # we use the undocumented description field to store a hash of the chronos config.
    # this makes it trivial to compare configs and know when to bounce.
    complete_config['description'] = get_config_hash(complete_config)

    log.debug("Complete configuration for instance is: %s" % complete_config)
    return complete_config
Example #18
File: api.py Project: gstarnberger/paasta
def main(argv=None):
    args = parse_paasta_api_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)

    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)
Example #19
    def run(self):
        self.setup_logging()
        all_service_config = read_services_configuration()
        system_config = load_system_paasta_config()
        service_replication = self.get_service_replication(
            all_services=all_service_config.keys(),
            synapse_host=system_config.get_default_synapse_host(),
            synapse_port=system_config.get_synapse_port(),
            synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
        )

        checked_services = []
        for service, service_config in all_service_config.iteritems():
            do_monitoring, monitoring_config = extract_replication_info(
                service_config
            )

            if do_monitoring:
                self.log.debug("Checking {0}".format(service))
                replication = service_replication.get('%s.main' % service, 0)
                event = do_replication_check(service, monitoring_config,
                                             replication)
                checked_services.append(service)
                self.log.debug("Result for {0}: {1}".format(service,
                                                            event['output']))
                report_event(event)
            else:
                self.log.debug("Not checking {0}".format(service))

        self.ok("Finished checking services: {0}".format(checked_services))
Example #20
def load_performance_check_config():
    try:
        return load_system_paasta_config().get_performance_check_config()
    except PaastaNotConfiguredError as e:
        print "No performance check config to use. Safely bailing."
        print e.strerror
        sys.exit(0)
Example #21
def create_complete_config(service, job_name, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create a job on Chronos"""
    system_paasta_config = load_system_paasta_config()
    chronos_job_config = load_chronos_job_config(
        service, job_name, system_paasta_config.get_cluster(), soa_dir=soa_dir)
    docker_url = get_docker_url(
        system_paasta_config.get_docker_registry(), chronos_job_config.get_docker_image())
    docker_volumes = system_paasta_config.get_volumes() + chronos_job_config.get_extra_volumes()

    complete_config = chronos_job_config.format_chronos_job_dict(
        docker_url,
        docker_volumes,
    )

    complete_config['name'] = compose_job_id(service, job_name)
    desired_state = chronos_job_config.get_desired_state()

    # If the job was previously stopped, we should stop the new job as well
    # NOTE this clobbers the 'disabled' param specified in the config file!
    if desired_state == 'start':
        complete_config['disabled'] = False
    elif desired_state == 'stop':
        complete_config['disabled'] = True

    # we use the undocumented description field to store a hash of the chronos config.
    # this makes it trivial to compare configs and know when to bounce.
    complete_config['description'] = get_config_hash(complete_config)

    log.debug("Complete configuration for instance is: %s" % complete_config)
    return complete_config
Example #22
def main(args):
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()

    # get those jobs listed in configs
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)

    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service, instance = service_instance[0], service_instance[1]
        sensu_output, sensu_status = sensu_message_status_for_jobs(service, instance, job_state_pairs)
        monitoring_overrides = compose_monitoring_overrides_for_service(
            cluster=system_paasta_config.get_cluster(),
            service=service,
            instance=instance,
            soa_dir=args.soa_dir
        )
        send_event_to_sensu(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=args.soa_dir,
        )
Example #23
File: client.py Project: oktopuz/paasta
def get_paasta_api_client(cluster=None, system_paasta_config=None):
    if not system_paasta_config:
        system_paasta_config = load_system_paasta_config()

    if not cluster:
        cluster = system_paasta_config.get_cluster()

    api_endpoints = system_paasta_config.get_api_endpoints()
    if cluster not in api_endpoints:
        log.error('Cluster %s not in paasta-api endpoints config', cluster)
        return None

    url = str(api_endpoints[cluster])
    parsed = urlparse(url)
    if not parsed:
        log.error('Unsupported paasta-api url %s', url)
        return None
    api_server = parsed.netloc

    # Get swagger spec from file system instead of the api server
    paasta_api_path = os.path.dirname(sys.modules['paasta_tools.api'].__file__)
    swagger_file = os.path.join(paasta_api_path, 'api_docs/swagger.json')
    if not os.path.isfile(swagger_file):
        log.error('paasta-api swagger spec %s does not exist', swagger_file)
        return None

    with open(swagger_file) as f:
        spec_dict = json.load(f)
    # replace localhost in swagger.json with actual api server
    spec_dict['host'] = api_server
    return SwaggerClient.from_spec(spec_dict=spec_dict)
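Usage then looks roughly like the sketch below; the resource and operation names are defined by swagger.json, so treat them as assumptions rather than a documented contract:

client = get_paasta_api_client(cluster='example-cluster')
if client is not None:
    status = client.service.status_instance(service='example_service', instance='main').result()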
Example #24
def main(argv):
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    drivers = []
    schedulers = []
    for service, instance in get_paasta_native_jobs_for_cluster(cluster=cluster, soa_dir=args.soa_dir):
        scheduler = PaastaScheduler(
            service_name=service,
            instance_name=instance,
            cluster=cluster,
            system_paasta_config=system_paasta_config,
            soa_dir=args.soa_dir,
        )
        schedulers.append(scheduler)

        driver = create_driver(
            service=service,
            instance=instance,
            scheduler=scheduler,
            system_paasta_config=system_paasta_config,
        )
        driver.start()
        drivers.append(driver)

    end_time = time.time() + args.stay_alive_seconds
    while time.time() < end_time:
        sleep(args.periodic_interval)
        for scheduler, driver in zip(schedulers, drivers):
            scheduler.periodic(driver)

    return schedulers
Example #25
File: status.py Project: RedCobbler/paasta
def paasta_status(args):
    """Print the status of a Yelp service running on PaaSTA.
    :param args: argparse.Namespace obj created from sys.args by cli"""
    soa_dir = args.soa_dir
    service = figure_out_service_name(args, soa_dir)
    actual_deployments = get_actual_deployments(service, soa_dir)
    system_paasta_config = load_system_paasta_config()

    if args.clusters is not None:
        cluster_whitelist = args.clusters.split(",")
    else:
        cluster_whitelist = []
    if args.instances is not None:
        instance_whitelist = args.instances.split(",")
    else:
        instance_whitelist = []

    if actual_deployments:
        deploy_pipeline = list(get_planned_deployments(service, soa_dir))
        report_status(
            service=service,
            deploy_pipeline=deploy_pipeline,
            actual_deployments=actual_deployments,
            cluster_whitelist=cluster_whitelist,
            instance_whitelist=instance_whitelist,
            system_paasta_config=system_paasta_config,
            verbose=args.verbose,
        )
    else:
        print missing_deployments_message(service)
Example #26
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
        pass
Example #27
def test_load_system_paasta_config():
    json_load_return_value = {'foo': 'bar'}
    expected = utils.SystemPaastaConfig(json_load_return_value, '/some/fake/dir')
    file_mock = mock.MagicMock(spec=file)
    with contextlib.nested(
        mock.patch('os.path.isdir', return_value=True),
        mock.patch('os.access', return_value=True),
        mock.patch('paasta_tools.utils.open', create=True, return_value=file_mock),
        mock.patch('paasta_tools.utils.get_readable_files_in_glob', autospec=True,
                   return_value=['/some/fake/dir/some_file.json']),
        mock.patch('paasta_tools.utils.json.load', autospec=True, return_value=json_load_return_value)
    ) as (
        os_is_dir_patch,
        os_access_patch,
        open_file_patch,
        mock_get_readable_files_in_glob,
        json_patch,
    ):
        actual = utils.load_system_paasta_config()
        assert actual == expected
        # Kinda weird but without this load_system_paasta_config() can (and
        # did! during development) return a plain dict without the test
        # complaining.
        assert actual.__class__ == expected.__class__
        open_file_patch.assert_any_call('/some/fake/dir/some_file.json')
        json_patch.assert_any_call(file_mock.__enter__())
        assert json_patch.call_count == 1
Example #28
def start_paasta_native_framework(context, reconcile_backoff):
    clear_mesos_tools_cache()
    system_paasta_config = load_system_paasta_config()
    system_paasta_config['docker_registry'] = 'docker.io'  # so busybox runs.

    context.scheduler = PaastaScheduler(
        service_name=context.service,
        instance_name=context.instance,
        cluster=context.cluster,
        system_paasta_config=system_paasta_config,
        service_config=context.new_config,
        reconcile_backoff=int(reconcile_backoff),
    )

    context.driver = create_driver(
        service=context.service,
        instance=context.instance,
        scheduler=context.scheduler,
        system_paasta_config=system_paasta_config,
    )

    context.driver.start()

    if not hasattr(context, 'framework_ids'):
        context.framework_ids = []

    for _ in xrange(10):
        if context.scheduler.framework_id:
            context.framework_ids.append(context.scheduler.framework_id)
            break
        time.sleep(1)
    else:
        raise Exception("Expected scheduler to successfully register before timeout")
Example #29
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, cluster, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            print status_mesos_tasks_verbose(
                job_id=app_id,
                get_short_task_id=get_short_task_id,
                tail_lines=tail_lines,
            )
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
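A hypothetical CLI handler would translate its parsed arguments and exit with the returned code, for example:

ret = perform_command('status', 'example_service', 'main', 'example-cluster',
                      verbose=1, soa_dir=DEFAULT_SOA_DIR)
sys.exit(ret)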
Example #30
def paasta_emergency_stop(args):
    """Performs an emergency stop on a given service instance on a given cluster

    Warning: This command does not permanently stop the service. The next time the service is updated
    (config change, deploy, bounce, etc.), those settings will override the emergency stop.

    If you want this stop to be permanant, adjust the relevant config file to reflect that.
    For example, this can be done for Marathon apps by setting 'instances: 0', or
    for Chronos jobs by setting 'disabled: True'. Alternatively, remove the config yaml entirely.
    """
    system_paasta_config = load_system_paasta_config()
    service = figure_out_service_name(args, soa_dir=args.soa_dir)
    print "Performing an emergency stop on %s..." % compose_job_id(service, args.instance)
    output = execute_paasta_serviceinit_on_remote_master(
        subcommand="stop",
        cluster=args.cluster,
        service=service,
        instances=args.instance,
        system_paasta_config=system_paasta_config,
        app_id=args.appid,
    )
    print "Output: %s" % output
    print "%s" % "\n".join(paasta_emergency_stop.__doc__.splitlines()[-7:])
    print "To start this service again asap, run:"
    print "paasta emergency-start --service %s --instance %s --cluster %s" % (service, args.instance, args.cluster)
Example #31
def main():

    args = parse_args()
    soa_dir = args.soa_dir

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    running_jobs = set(deployed_job_names(client))

    expected_service_jobs = {
        chronos_tools.compose_job_id(*job)
        for job in chronos_tools.get_chronos_jobs_for_cluster(
            soa_dir=args.soa_dir)
    }

    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(
        filter_expired_tmp_jobs(client,
                                all_tmp_jobs,
                                cluster=cluster,
                                soa_dir=soa_dir))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            try:
                (service,
                 instance) = chronos_tools.decompose_job_id(response[0])
                monitoring_tools.send_event(
                    check_name=check_chronos_job_name(service, instance),
                    service=service,
                    overrides={},
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    output="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # Just move on, no need to send any kind of paasta event.
                pass

    if len(to_delete) == 0:
        paasta_print('No Chronos Jobs to remove')
    else:
        if len(task_successes) > 0:
            paasta_print(
                format_list_output(
                    "Successfully Removed Tasks (if any were running) for:",
                    [job[0] for job in task_successes],
                ))

        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            paasta_print(
                format_list_output("Failed to Delete Tasks for:",
                                   [job[0] for job in task_failures]))

        if len(job_successes) > 0:
            paasta_print(
                format_list_output("Successfully Removed Jobs:",
                                   [job[0] for job in job_successes]))

        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            paasta_print(
                format_list_output("Failed to Delete Jobs:",
                                   [job[0] for job in job_failures]))

        if len(job_failures) > 0 or len(task_failures) > 0:
            sys.exit(1)
Example #32
def send_event(service,
               check_name,
               overrides,
               status,
               output,
               soa_dir,
               ttl=None,
               cluster=None):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param check_name: The name of the check as it appears in Sensu
    :param overrides: A dictionary containing overrides for monitoring options
                      (e.g. notification_email, ticket, page)
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    :param soa_dir: The service directory to read monitoring information from
    :param ttl: TTL for the event (optional)
    :param cluster: The cluster name (optional)
    """
    # This function assumes the input is a string like "mumble.main"
    team = get_team(overrides, service, soa_dir)
    if not team:
        return

    system_paasta_config = load_system_paasta_config()
    if cluster is None:
        try:
            cluster = system_paasta_config.get_cluster()
        except PaastaNotConfiguredError:
            cluster = "localhost"

    alert_after = overrides.get("alert_after", "5m")
    result_dict = {
        "name": check_name,
        "runbook": overrides.get("runbook", "http://y/paasta-troubleshooting"),
        "status": status,
        "output": output,
        "team": team,
        "page": get_page(overrides, service, soa_dir),
        "tip": get_tip(overrides, service, soa_dir),
        "notification_email": get_notification_email(overrides, service, soa_dir),
        "check_every": overrides.get("check_every", "1m"),
        "realert_every": overrides.get("realert_every", monitoring_defaults("realert_every")),
        "alert_after": f"{alert_after}s" if isinstance(alert_after, int) else alert_after,
        "irc_channels": get_irc_channels(overrides, service, soa_dir),
        "slack_channels": get_slack_channels(overrides, service, soa_dir),
        "ticket": get_ticket(overrides, service, soa_dir),
        "project": get_project(overrides, service, soa_dir),
        "priority": get_priority(overrides, service, soa_dir),
        "source": "paasta-%s" % cluster,
        "tags": get_tags(overrides, service, soa_dir),
        "ttl": ttl,
        "sensu_host": system_paasta_config.get_sensu_host(),
        "sensu_port": system_paasta_config.get_sensu_port(),
        "component": get_component(overrides, service, soa_dir),
        "description": get_description(overrides, service, soa_dir),
    }

    if result_dict.get("sensu_host"):
        pysensu_yelp.send_event(**result_dict)
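A sketch of a caller that pages the owning team on a critical result; the override keys follow the docstring, and all values are invented:

send_event(
    service='example_service',
    check_name='setup_marathon_job.example_service.main',
    overrides={'page': True, 'alert_after': '10m'},
    status=pysensu_yelp.Status.CRITICAL,
    output='deploy failed; see logs',
    soa_dir=DEFAULT_SOA_DIR,
)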
Example #33
import os
import sys

from paasta_tools.smartstack_tools import are_services_up_on_ip_port
from paasta_tools.utils import load_system_paasta_config

system_paasta_config = load_system_paasta_config()
synapse_port = system_paasta_config.get_synapse_port()
synapse_host = '169.254.255.254'
synapse_haproxy_url_format = system_paasta_config.get_synapse_haproxy_url_format()
host_ip = os.environ['PAASTA_POD_IP']
port = sys.argv[1]
services = sys.argv[2:]

# The source snippet is cut off below; closing the call and exiting with a
# shell-style status is an assumed reconstruction of the tail.
if are_services_up_on_ip_port(
        synapse_host=synapse_host,
        synapse_port=synapse_port,
        synapse_haproxy_url_format=synapse_haproxy_url_format,
        services=services,
        host_ip=host_ip,
        host_port=int(port),
):
    sys.exit(0)
else:
    sys.exit(1)
Example #34
def set_boost_factor(
    zk_boost_path: str,
    region: str='',
    pool: str='',
    send_clusterman_metrics: bool=False,
    factor: float=DEFAULT_BOOST_FACTOR,
    duration_minutes: int=DEFAULT_BOOST_DURATION,
    override: bool=False,
) -> bool:
    """
    Set a boost factor for a path in zk

    Can be used to boost either cluster or service autoscalers.
    If using for cluster you must specify region, pool and set
    send_clusterman_metrics=True so that clusterman metrics are updated

    otherwise just zk_boost_path is enough.
    """
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    if factor > MAX_BOOST_FACTOR:
        log.warning('Boost factor {} does not sound reasonable. Defaulting to {}'.format(
            factor,
            MAX_BOOST_FACTOR,
        ))
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning('Boost duration of {} minutes is too much. Falling back to {}.'.format(
            duration_minutes,
            MAX_BOOST_DURATION,
        ))
        duration_minutes = MAX_BOOST_DURATION

    current_time = get_time()
    end_time = current_time + 60 * duration_minutes

    if clusterman_metrics and send_clusterman_metrics:
        cluster = load_system_paasta_config().get_cluster()
        metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(region_name=region, app_identifier='default')
        with metrics_client.get_writer(clusterman_metrics.APP_METRICS) as writer:
            metrics_key = clusterman_metrics.generate_key_with_dimensions(
                'boost_factor',
                {'cluster': cluster, 'pool': pool},
            )
            writer.send((metrics_key, current_time, factor))
            if duration_minutes > 0:
                writer.send((metrics_key, end_time, 1.0))

    zk_end_time_path = zk_boost_path + '/end_time'
    zk_factor_path = zk_boost_path + '/factor'
    zk_expected_load_path = zk_boost_path + '/expected_load'

    with ZookeeperPool() as zk:
        if (
            not override and
            current_time < get_boost_values(zk_boost_path, zk).end_time
        ):
            log.error('Boost already active. Not overriding.')
            return False

        try:
            zk.ensure_path(zk_end_time_path)
            zk.ensure_path(zk_factor_path)
            zk.ensure_path(zk_expected_load_path)
            zk.set(zk_end_time_path, str(end_time).encode('utf-8'))
            zk.set(zk_factor_path, str(factor).encode('utf-8'))
            zk.set(zk_expected_load_path, '0'.encode('utf-8'))
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        log.info('Load boost: Set capacity boost factor {} at path {} until {}'.format(
            factor,
            zk_boost_path,
            datetime.fromtimestamp(end_time).strftime('%c'),
        ))

        # Let's check that this factor has been properly written to zk
        return get_boost_values(zk_boost_path, zk) == BoostValues(
            end_time=end_time,
            boost_factor=factor,
            expected_load=0,
        )
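For instance, boosting a pool's capacity by 1.5x for 40 minutes might look like this (path, region, and pool values invented):

set_boost_factor(
    zk_boost_path='/paasta/boost/example-cluster/default',
    region='us-west-1',
    pool='default',
    send_clusterman_metrics=True,
    factor=1.5,
    duration_minutes=40,
)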
Example #35
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :returns: A tuple of (status, output) to be used with send_sensu_event"""
    def log_deploy_error(errormsg, level='event'):
        return _log(service=service,
                    line=errormsg,
                    component='deploy',
                    level=level,
                    cluster=cluster,
                    instance=instance)

    short_id = marathon_tools.format_job_id(service, instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service,
                                                     instance,
                                                     client,
                                                     embed_failures=True)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" %
                             len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service,
                                                     nerve_ns,
                                                     system_paasta_config,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id,
                             instances=config['instances'],
                             force=True)
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) -
                config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks),
                                         num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks),
                                        num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks),
                                      num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]
        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )

        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
Example #36
def get_mesos_config_path():
    """
    Determine where to find the configuration for mesos-cli.
    """
    return (load_system_paasta_config().get_mesos_cli_config().get(
        "path", DEFAULT_MESOS_CLI_CONFIG_LOCATION))
Example #37
def get_zookeeper_host_path():
    zk_url = "zk://%s" % load_system_paasta_config().get_zk_hosts()
    parsed = urlparse(zk_url)
    return ZookeeperHostPath(host=parsed.netloc, path=parsed.path)
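
A hedged sketch of how the returned ZookeeperHostPath might be consumed (it also applies to the variant in Example #39), assuming the kazoo client library is available; list_zk_children is a hypothetical helper:

from kazoo.client import KazooClient

def list_zk_children():
    # Connect to the hosts portion and read children under the path portion.
    zk_host_path = get_zookeeper_host_path()
    client = KazooClient(hosts=zk_host_path.host, read_only=True)
    client.start()
    try:
        return client.get_children(zk_host_path.path or "/")
    finally:
        client.stop()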
Example #38
def load_chronos_config():
    try:
        return ChronosConfig(load_system_paasta_config().get_chronos_config())
    except PaastaNotConfiguredError:
        raise ChronosNotConfigured(
            "Could not find chronos_config in configuration directory")
Example #39
def get_zookeeper_host_path():
    zk_url = load_system_paasta_config()['zookeeper']
    parsed = urlparse(zk_url)
    return ZookeeperHostPath(host=parsed.netloc, path=parsed.path)
Example #40
def get_tron_dashboard_for_cluster(cluster: str):
    dashboards = load_system_paasta_config().get_dashboard_links()[cluster]
    if 'Tron' not in dashboards:
        raise Exception(
            f"tron api endpoint is not defined for cluster {cluster}")
    return dashboards['Tron']
Example #41
def paasta_local_run(args):
    if args.action == 'pull' and os.geteuid() != 0:
        paasta_print("Re-executing paasta local-run --pull with sudo..")
        os.execvp("sudo", ["sudo", "-H"] + sys.argv)
    if args.action == 'build' and not makefile_responds_to('cook-image'):
        paasta_print(
            "A local Makefile with a 'cook-image' target is required for --build",
            file=sys.stderr)
        paasta_print(
            "If you meant to pull the docker image from the registry, explicitly pass --pull",
            file=sys.stderr)
        return 1

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        paasta_print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
                "PaaSTA is not configured locally on this host, and local-run may not behave"
                "the same way it would behave on a server configured for PaaSTA.",
            ),
            sep='\n',
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []},
                                                  '/etc/paasta')

    local_run_config = system_paasta_config.get_local_run_config()

    service = figure_out_service_name(args, soa_dir=args.yelpsoa_config_root)
    if args.cluster:
        cluster = args.cluster
    else:
        try:
            cluster = local_run_config['default_cluster']
        except KeyError:
            paasta_print(
                PaastaColors.red(
                    "PaaSTA on this machine has not been configured with a default cluster."
                    "Please pass one to local-run using '-c'.", ),
                sep='\n',
                file=sys.stderr,
            )
            return 1
    instance = args.instance
    docker_client = get_docker_client()

    if args.action == 'build':
        default_tag = 'paasta-local-run-%s-%s' % (service, get_username())
        tag = os.environ.get('DOCKER_TAG', default_tag)
        os.environ['DOCKER_TAG'] = tag
        pull_image = False
        cook_return = paasta_cook_image(args=None,
                                        service=service,
                                        soa_dir=args.yelpsoa_config_root)
        if cook_return != 0:
            return cook_return
    elif args.action == 'dry_run':
        pull_image = False
        tag = None
    else:
        pull_image = True
        tag = None

    try:
        return configure_and_run_docker_container(
            docker_client=docker_client,
            docker_hash=tag,
            service=service,
            instance=instance,
            cluster=cluster,
            args=args,
            pull_image=pull_image,
            system_paasta_config=system_paasta_config,
            dry_run=args.action == 'dry_run',
        )
    except errors.APIError as e:
        paasta_print(
            'Can\'t run Docker container. Error: %s' % str(e),
            file=sys.stderr,
        )
        return 1
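
makefile_responds_to is referenced above but not shown. A plausible sketch, not the real helper: "make --question" exits with status 2 when the named target does not exist, so a non-2 exit suggests the Makefile responds to the target:

import subprocess

def makefile_responds_to_sketch(target):
    # Illustrative approximation of makefile_responds_to('cook-image').
    result = subprocess.run(
        ['make', '--question', target],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return result.returncode != 2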
Example #42
def load_tron_config():
    return TronConfig(load_system_paasta_config().get_tron_config())
Example #43
def deploy_marathon_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
) -> Tuple[int, float]:
    """deploy the service instance given and proccess return code
    if there was an error we send a sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_apps: A list of all marathon app objects
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                log.debug(
                    "No deployments found for %s.%s in cluster %s. Skipping." %
                    (service, instance,
                     load_system_paasta_config().get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                            (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1, None

            try:
                with a_sync.idle_event_loop():
                    status, output, bounce_again_in_seconds = setup_service(
                        service=service,
                        instance=instance,
                        clients=clients,
                        job_config=service_instance_config,
                        marathon_apps_with_clients=marathon_apps_with_clients,
                        soa_dir=soa_dir,
                    )
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig,
                    NoSlavesAvailableError):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir,
                           pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
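
A hypothetical caller sketch showing how paasta-deployd could consume the (status, bounce_in_seconds) return value; the priority-queue layout here is an assumption, not deployd's actual implementation:

import time
from queue import PriorityQueue

def requeue_bounce(work_queue, service_instance, bounce_in_seconds):
    # None signals a steady state: nothing more to schedule for this instance.
    if bounce_in_seconds is not None:
        work_queue.put((time.time() + bounce_in_seconds, service_instance))

work_queue = PriorityQueue()
requeue_bounce(work_queue, 'example_service.main', 60)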
Example #44
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    # Only 'marathon' and 'kubernetes' instances are supported by wait_for_deployment
    # because they are the only instance types worth waiting on.
    service_configs = PaastaServiceConfigLoader(service=service,
                                                soa_dir=soa_dir,
                                                load_deployments=False)

    total_instances = 0
    clusters_data = []
    api_endpoints = load_system_paasta_config().get_api_endpoints()
    for cluster in service_configs.clusters:
        if cluster not in api_endpoints:
            paasta_print(
                PaastaColors.red(
                    'Cluster %s is NOT in paasta-api endpoints config.' %
                    cluster, ))
            raise NoSuchCluster

        instances_queue = Queue()
        for instance_config in service_configs.instance_configs(
                cluster=cluster,
                instance_type_class=MarathonServiceConfig,
        ):
            if instance_config.get_deploy_group() == deploy_group:
                instances_queue.put(instance_config)
                total_instances += 1
        for instance_config in service_configs.instance_configs(
                cluster=cluster,
                instance_type_class=KubernetesDeploymentConfig,
        ):
            if instance_config.get_deploy_group() == deploy_group:
                instances_queue.put(instance_config)
                total_instances += 1

        if not instances_queue.empty():
            clusters_data.append(
                ClusterData(
                    cluster=cluster,
                    service=service,
                    git_sha=git_sha,
                    instances_queue=instances_queue,
                ))

    if not clusters_data:
        _log(
            service=service,
            component='deploy',
            line=("Couldn't find any marathon instances for service {} in deploy group {}. Exiting."
                  .format(service, deploy_group)),
            level='event',
        )
        return

    paasta_print("Waiting for deployment of {} for '{}' to complete...".format(
        git_sha, deploy_group))

    deadline = time.time() + timeout
    green_light = Event()
    green_light.set()

    with progressbar.ProgressBar(maxval=total_instances) as bar:
        while time.time() < deadline:
            _query_clusters(clusters_data, green_light)
            if not green_light.is_set():
                raise KeyboardInterrupt

            bar.update(total_instances - sum((c.instances_queue.qsize()
                                              for c in clusters_data)))

            if all((cluster.instances_queue.empty()
                    for cluster in clusters_data)):
                sys.stdout.flush()
                return 0
            else:
                time.sleep(min(60, timeout))
            sys.stdout.flush()

    _log(
        service=service,
        component='deploy',
        line=compose_timeout_message(clusters_data, timeout, deploy_group,
                                     service, git_sha),
        level='event',
    )
    raise TimeoutError
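
A hypothetical caller sketch for the two exceptions wait_for_deployment raises; the exit-code mapping is an assumption:

def wait_for_deployment_exit_code(service, deploy_group, git_sha, soa_dir, timeout):
    try:
        wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout)
    except (NoSuchCluster, TimeoutError):
        # Either a cluster is missing from the paasta-api endpoints config,
        # or the deadline elapsed with instances still waiting.
        return 1
    return 0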
Example #45
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs['use_mesos_cache'] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(
                mesos_master=master,
                mesos_state=mesos_state,
            )
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL:  %s" % '\n'.join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(
            message='Marathon is not configured to run here',
            healthy=True,
        )]
        all_mesos_results = [metastatus_lib.HealthCheckResult(
            message='Mesos is not configured to run here',
            healthy=True,
        )]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [metastatus_lib.HealthCheckResult(
            message='Kubernetes is not configured to run here',
            healthy=True,
        )]

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            raise FatalError(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check("Kubernetes", kube_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, kube_ok, chronos_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent('Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = [headers] + [[str(x) for x in asi] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent('Service-Instance stats:' + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok, kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent('Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent('Per Node Utilization', 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent('Service-Instance stats:' + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        raise FatalError(2)
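
status_for_results is assumed to project each HealthCheckResult onto its healthy flag, so the all(...) calls above pass only when every check is healthy. A minimal sketch of that assumption:

from typing import NamedTuple

class HealthCheckResult(NamedTuple):
    message: str
    healthy: bool

def status_for_results(results):
    # One boolean per result; all(...) then means "no failing checks".
    return [result.healthy for result in results]

assert all(status_for_results([HealthCheckResult('ok', True)]))
assert not all(status_for_results([HealthCheckResult('down', False)]))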
Example #46
def run_capacity_check():
    options = parse_capacity_check_options()
    system_paasta_config = load_system_paasta_config()
    cluster = options.cluster if options.cluster is not None else system_paasta_config.get_cluster()
    value_to_check = options.type

    client = get_paasta_api_client(cluster=cluster)
    if client is None:
        paasta_print('UNKNOWN Failed to load paasta api client')
        sys.exit(3)

    overrides = read_overrides(options.overrides)

    attributes = options.attributes.split(',')

    try:
        resource_use = client.resources.resources(
            groupings=attributes).result()
    except HTTPError as e:
        paasta_print("UNKNOWN recieved exception from paasta api:\n\t%s" % e)
        sys.exit(3)

    default_check = {
        'warn': {
            'cpus': options.warn,
            'mem': options.warn,
            'disk': options.warn,
        },
        'crit': {
            'cpus': options.crit,
            'mem': options.crit,
            'disk': options.crit,
        },
    }

    failures = defaultdict(list)
    for usage_value in resource_use:
        check = get_check_from_overrides(overrides, default_check,
                                         usage_value['groupings'])
        usage_percent = calc_percent_usage(usage_value, value_to_check)
        for c in ['crit', 'warn']:
            if usage_percent > check[c][value_to_check]:
                failures[c].append({
                    'attrs': [{'attr': a, 'value': v}
                              for a, v in usage_value['groupings'].items()],
                    'maximum': check[c][value_to_check],
                    'current': usage_percent,
                })
                break

    return_value = [0]
    if len(failures['crit']) > 0:
        result = error_message(failures['crit'], 'CRITICAL', cluster,
                               value_to_check)
        paasta_print(result)
        return_value.append(2)
    if len(failures['warn']) > 0:
        result = error_message(failures['warn'], 'WARNING', cluster,
                               value_to_check)
        paasta_print(result)
        return_value.append(1)

    if max(return_value) == 0:
        paasta_print(
            f"OK cluster {cluster} is below critical capacity in {value_to_check}"
        )

    sys.exit(max(return_value))
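
calc_percent_usage is referenced above but not shown. A hedged sketch under the assumption that each usage_value carries 'total' and 'free' counters per resource:

def calc_percent_usage_sketch(usage_value, resource):
    # Hypothetical key layout: {'total': {'cpus': ...}, 'free': {'cpus': ...}}
    total = usage_value['total'][resource]
    if total == 0:
        return 0
    used = total - usage_value['free'][resource]
    return 100 * used / total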
Example #47
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
                "PaaSTA is not configured locally on this host, and local-run may not behave"
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []},
                                                  "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=args.cluster,
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s."
                "You can generate this by running:"
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {
                    "soa_dir": args.yelpsoa_config_root,
                    "service": args.service
                }),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())
    needs_docker_cfg = not args.build and not args.image
    user_spark_opts = _parse_user_spark_args(args.spark_args)
    paasta_instance = get_smart_paasta_instance_name(args)
    spark_conf = get_spark_conf(
        cluster_manager="mesos",
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
    )
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
    )
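
_parse_user_spark_args is referenced above but not shown. A plausible sketch that splits the --spark-args string into key=value pairs (the real helper may validate further):

def parse_user_spark_args_sketch(spark_args):
    conf = {}
    for pair in (spark_args or '').split():
        key, sep, value = pair.partition('=')
        if not sep:
            raise ValueError('Spark option %s is not in key=value format' % pair)
        conf[key] = value
    return conf

assert parse_user_spark_args_sketch('spark.executor.cores=4') == {'spark.executor.cores': '4'}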
Example #48
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event"""
    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        # Honor the requested level so the level='debug' calls below
        # aren't logged as events.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )

    new_client = clients.get_current_client_for_service(job_config)

    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        if len(new_apps_with_clients_list) != 1:
            raise ValueError(
                "Only expected one app per ID per shard; found %d" %
                len(new_apps_with_clients_list))
        new_app, new_client = new_apps_with_clients_list[0]
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app,
            service,
            nerve_ns,
            system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(
            new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s up from %d to %d instances." %
                     (new_app.id, new_app.instances,
                      config['instances'] + num_at_risk_tasks))
            new_client.scale_app(app_id=new_app.id,
                                 instances=config['instances'] +
                                 num_at_risk_tasks,
                                 force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) -
                config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks),
                                         num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks),
                                        num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy], )
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks),
                                        num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks),
                                      num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents the extra instances that are configured in
            # marathon but don't have a launched task yet. When scaling down we
            # want to reduce this slack so marathon doesn't get a chance to
            # launch a new task in that space that we will then have to drain
            # and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client),
                                                      []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (
            service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time",
                None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (
            service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
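
take_up_slack is assumed to shrink an app's configured instance count down to the number of tasks it has actually launched, so Marathon cannot start fresh tasks mid-bounce. A hedged sketch of that behavior:

def take_up_slack_sketch(client, app):
    # "Slack" is configured instances with no corresponding running task.
    slack = max(app.instances - len(app.tasks), 0)
    if slack > 0:
        client.scale_app(app_id=app.id, instances=len(app.tasks), force=True)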
Example #49
def add_subparser(subparsers):
    list_parser = subparsers.add_parser(
        "spark-run",
        help="Run Spark on the PaaSTA cluster",
        description=(
            "'paasta spark-run' launches a Spark cluster on PaaSTA. "
            "It analyzes soa-configs and command line arguments to invoke "
            "a 'docker run'. By default, it will pull the Spark service "
            "image from the registry unless the --build option is used.\n\n"),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    group = list_parser.add_mutually_exclusive_group()
    group.add_argument(
        "-b",
        "--build",
        help=
        "Build the docker image from scratch using the local Makefile's cook-image target.",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-I",
        "--image",
        help="Use the provided image to start the Spark driver and executors.",
    )

    list_parser.add_argument(
        "--docker-registry",
        help="Docker registry to push the Spark image built.",
        default=DEFAULT_SPARK_DOCKER_REGISTRY,
    )

    list_parser.add_argument(
        "-s",
        "--service",
        help="The name of the service from which the Spark image is built.",
        default=DEFAULT_SPARK_SERVICE,
    ).completer = lazy_choices_completer(list_services)

    list_parser.add_argument(
        "-i",
        "--instance",
        help=("Start a docker run for a particular instance of the service."),
        default="adhoc",
    ).completer = lazy_choices_completer(list_instances)

    try:
        system_paasta_config = load_system_paasta_config()
        spark_run_config = system_paasta_config.get_spark_run_config()
        default_spark_cluster = spark_run_config.get("default_cluster")
        default_spark_pool = spark_run_config.get("default_pool")
    except PaastaNotConfiguredError:
        default_spark_cluster = "pnw-devc"
        default_spark_pool = "batch"

    list_parser.add_argument(
        "-c",
        "--cluster",
        help=("The name of the cluster you wish to run Spark on."),
        default=default_spark_cluster,
    )

    list_parser.add_argument(
        "-p",
        "--pool",
        help="Name of the resource pool to run the Spark job.",
        default=default_spark_pool,
    )

    list_parser.add_argument(
        "-w",
        "--work-dir",
        default="{}:{}".format(os.getcwd(), DEFAULT_SPARK_WORK_DIR),
        help=
        "The read-write volume to mount in format local_abs_dir:container_abs_dir",
    )

    list_parser.add_argument(
        "-y",
        "--yelpsoa-config-root",
        dest="yelpsoa_config_root",
        help="A directory from which yelpsoa-configs should be read from.",
        default=DEFAULT_SOA_DIR,
    )

    list_parser.add_argument(
        "-C",
        "--cmd",
        help=
        "Run the spark-shell, pyspark, spark-submit, jupyter-lab, or history-server command.",
    )

    list_parser.add_argument(
        "-d",
        "--dry-run",
        help="Shows the arguments supplied to docker as json.",
        action="store_true",
        default=False,
    )

    list_parser.add_argument(
        "--spark-args",
        help=
        "Spark configurations documented in https://spark.apache.org/docs/latest/configuration.html. "
        r'For example, --spark-args "spark.mesos.constraints=pool:default\;instance_type:m4.10xlarge '
        'spark.executor.cores=4".',
    )

    list_parser.add_argument(
        "--nvidia",
        help=
        "Use nvidia docker runtime for Spark driver process (requires GPU)",
        action="store_true",
        default=False,
    )

    list_parser.add_argument(
        "--mrjob",
        help=
        "Pass Spark arguments to invoked command in the format expected by mrjobs",
        action="store_true",
        default=False,
    )

    if clusterman_metrics:
        list_parser.add_argument(
            "--suppress-clusterman-metrics-errors",
            help=
            "Continue even if sending resource requirements to Clusterman fails. This may result in the job "
            "failing to acquire resources.",
            action="store_true",
        )

    list_parser.add_argument("-j",
                             "--jars",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    list_parser.add_argument("--executor-memory",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    list_parser.add_argument("--executor-cores",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    list_parser.add_argument("--max-cores",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    list_parser.add_argument("--driver-max-result-size",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    list_parser.add_argument("--driver-memory",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    list_parser.add_argument("--driver-cores",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    aws_group = list_parser.add_argument_group(
        title="AWS credentials options",
        description="If --aws-credentials-yaml is specified, it overrides all "
        "other options. Otherwise, if -s/--service is specified, spark-run "
        "looks for service credentials in /etc/boto_cfg/[service].yaml. If "
        "it does not find the service credentials or no service is "
        "specified, spark-run falls back to the boto default behavior "
        "(checking ~/.aws/credentials, ~/.boto, etc).",
    )

    aws_group.add_argument(
        "--aws-credentials-yaml",
        help="Load aws keys from the provided yaml file. The yaml file must "
        "have keys for aws_access_key_id and aws_secret_access_key.",
    )

    aws_group.add_argument(
        "--aws-profile",
        help="Name of the AWS profile to load credentials from. Only used when "
        "--aws-credentials-yaml is not specified and --service is either "
        "not specified or the service does not have credentials in "
        "/etc/boto_cfg",
        default="default",
    )

    aws_group.add_argument(
        "--no-aws-credentials",
        help="Do not load any AWS credentials; allow the Spark job to use its "
        "own logic to load credentials",
        action="store_true",
        default=False,
    )

    aws_group.add_argument(
        "--aws-region",
        help=f"Specify an aws region. If the region is not specified, we will"
        f"default to using {DEFAULT_AWS_REGION}.",
        default=DEFAULT_AWS_REGION,
    )

    jupyter_group = list_parser.add_argument_group(
        title="Jupyter kernel culling options",
        description="Idle kernels will be culled by default. Idle "
        "kernels with connections can be overridden not to be culled.",
    )

    jupyter_group.add_argument(
        "--cull-idle-timeout",
        type=int,
        default=7200,
        help="Timeout (in seconds) after which a kernel is considered idle and "
        "ready to be culled.",
    )

    jupyter_group.add_argument(
        "--not-cull-connected",
        action="store_true",
        default=False,
        help="By default, connected idle kernels are culled after timeout. "
        "They can be skipped if not-cull-connected is specified.",
    )

    list_parser.set_defaults(command=paasta_spark_run)
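
A hypothetical wiring sketch showing how add_subparser plugs into the top-level CLI; the set_defaults call above makes args.command dispatch to paasta_spark_run:

import argparse

parser = argparse.ArgumentParser(prog='paasta')
subparsers = parser.add_subparsers()
add_subparser(subparsers)

args = parser.parse_args(['spark-run', '--dry-run'])
args.command(args)  # illustrative dispatch to paasta_spark_run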
Example #50
def main():
    """Attempt to set up the marathon service instance given.
    Exits 1 if the deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Load the service instance's configuration
    - Create the complete marathon job configuration
    - Deploy/bounce the service
    - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error(
            "Invalid service instance specified. Format is service%sinstance."
            % SPACER)
        sys.exit(1)

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())

    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug(
            "No deployments found for %s in cluster %s. Skipping." %
            (args.service_instance, load_system_paasta_config().get_cluster()))
        sys.exit(0)
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s in cluster %s" % \
            (args.service_instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        sys.exit(1)

    try:
        status, output = setup_service(service, instance, client,
                                       marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL,
                   error_str)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
Example #51
def perform_command(command,
                    service,
                    instance,
                    cluster,
                    verbose,
                    soa_dir,
                    app_id=None,
                    delta=None,
                    client=None):
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param client: MarathonClient or CachingMarathonClient
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    job_config = marathon_tools.load_marathon_service_config(service,
                                                             instance,
                                                             cluster,
                                                             soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % job_id)
            return 1

    normal_instance_count = job_config.get_instances()
    proxy_port = marathon_tools.get_proxy_port_for_instance(service,
                                                            instance,
                                                            cluster,
                                                            soa_dir=soa_dir)

    if client is None:
        marathon_config = marathon_tools.load_marathon_config()
        client = marathon_tools.get_marathon_client(
            marathon_config.get_url(),
            marathon_config.get_username(),
            marathon_config.get_password(),
        )

    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        paasta_print(
            status_desired_state(service, instance, client, job_config))
        paasta_print(
            status_marathon_job(service, instance, app_id,
                                normal_instance_count, client))
        tasks, out = status_marathon_job_verbose(service, instance, client,
                                                 cluster, soa_dir)
        if verbose > 0:
            paasta_print(out)
        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count))
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            paasta_print(
                status_mesos_tasks_verbose(
                    job_id=app_id,
                    get_short_task_id=get_short_task_id,
                    tail_lines=tail_lines,
                ))
        if proxy_port is not None:
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service,
                instance,
                cluster,
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(),
                ))
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
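
calculate_tail_lines is referenced above but not shown. A plausible sketch where each extra verbosity level tails ten times more log lines:

def calculate_tail_lines_sketch(verbose_level):
    if verbose_level <= 1:
        return 0
    return 10 ** (verbose_level - 1)

assert calculate_tail_lines_sketch(2) == 10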
Example #52
    def format_marathon_app_dict(self):
        """Create the configuration that will be passed to the Marathon REST API.

        Currently compiles the following keys into one nice dict:

        - id: the ID of the image in Marathon
        - container: a dict containing the docker url and docker launch options. Needed by deimos.
        - uris: blank.
        - ports: an array containing the port.
        - env: environment variables for the container.
        - mem: the amount of memory required.
        - cpus: the number of cpus required.
        - disk: the amount of disk space required.
        - constraints: the constraints on the Marathon app.
        - instances: the number of instances required.
        - cmd: the command to be executed.
        - args: an alternative to cmd that requires the docker container to have an entrypoint.

        The last 7 keys are retrieved using the get_<key> functions defined above.

        The docker url, docker volumes, and service namespace config used here
        are derived from the system and service configuration rather than being
        passed in as parameters.

        :returns: A dict containing all of the keys listed above"""

        # A set of config attributes that don't get included in the hash of the config.
        # These should be things that PaaSTA/Marathon knows how to change without requiring a bounce.
        CONFIG_HASH_BLACKLIST = set(['instances', 'backoff_seconds'])

        system_paasta_config = load_system_paasta_config()
        docker_url = get_docker_url(system_paasta_config.get_docker_registry(),
                                    self.get_docker_image())
        service_namespace_config = load_service_namespace_config(
            service=self.service,
            namespace=self.get_nerve_namespace(),
        )
        docker_volumes = system_paasta_config.get_volumes(
        ) + self.get_extra_volumes()

        net = get_mesos_network_for_net(self.get_net())

        complete_config = {
            'container': {
                'docker': {
                    'image': docker_url,
                    'network': net,
                    'parameters': [
                        {'key': 'memory-swap', 'value': self.get_mem_swap()},
                    ],
                },
                'type': 'DOCKER',
                'volumes': docker_volumes,
            },
            'uris': [
                system_paasta_config.get_dockercfg_location(),
            ],
            'backoff_seconds': self.get_backoff_seconds(),
            'backoff_factor': 2,
            'health_checks': self.get_healthchecks(service_namespace_config),
            'env': self.get_env(),
            'mem': float(self.get_mem()),
            'cpus': float(self.get_cpus()),
            'disk': float(self.get_disk()),
            'constraints': self.get_constraints(service_namespace_config),
            'instances': self.get_instances(),
            'cmd': self.get_cmd(),
            'args': self.get_args(),
        }

        if net == 'BRIDGE':
            complete_config['container']['docker']['portMappings'] = [
                {
                    'containerPort': CONTAINER_PORT,
                    'hostPort': 0,
                    'protocol': 'tcp',
                },
            ]

        accepted_resource_roles = self.get_accepted_resource_roles()
        if accepted_resource_roles is not None:
            complete_config[
                'accepted_resource_roles'] = accepted_resource_roles

        code_sha = get_code_sha_from_dockerurl(docker_url)

        config_hash = get_config_hash(
            {
                key: value
                for key, value in complete_config.items()
                if key not in CONFIG_HASH_BLACKLIST
            },
            force_bounce=self.get_force_bounce(),
        )
        complete_config['id'] = format_job_id(self.service, self.instance,
                                              code_sha, config_hash)

        log.debug("Complete configuration for instance is: %s",
                  complete_config)
        return complete_config
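
get_config_hash is assumed to produce a stable digest of the non-blacklisted config (plus any force_bounce token), so that a meaningful config change yields a new app ID and therefore a bounce. A minimal sketch of that idea:

import hashlib
import json

def get_config_hash_sketch(config, force_bounce=None):
    hashable = {'config': config, 'force_bounce': force_bounce}
    digest = hashlib.md5(
        json.dumps(hashable, sort_keys=True, default=str).encode('utf-8')
    ).hexdigest()
    return 'config%s' % digest[:8]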
Example #53
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance."
                  % (args.service_instance, SPACER))
        sys.exit(1)

    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster,
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoConfigurationForServiceError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e)
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoSlavesAvailableError as e:
        error_msg = (
            "There are no PaaSTA slaves that can run %s in cluster %s\n" % (args.service_instance, cluster) +
            "Double check the cluster and the configured constratints/pool/whitelist.\n"
            "Error was: %s" % str(e)
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        log.warn("Skipping %s.%s: Parent job could not be found" % (service, instance))
        sys.exit(0)

    modified_config = config_with_historical_stats(
        chronos_client=client,
        service=service,
        instance=instance,
        job_config=complete_job_config,
    )

    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=modified_config,
        client=client,
    )

    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
Example #54
def get_marathon_clients_from_config() -> MarathonClients:
    system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)
    return marathon_clients
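
A quick usage sketch for the helper above; get_all_clients() as the accessor
on MarathonClients is an assumption here, as is list_apps() returning
MarathonApp objects:

clients = get_marathon_clients_from_config()
for client in clients.get_all_clients():
    # Each Marathon shard gets its own client; iterate them all to see every app.
    for app in client.list_apps():
        print(app.id)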
Example #55
def main():
    args = parse_args()

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    related_jobs = chronos_tools.get_related_jobs_configs(cluster,
                                                          service,
                                                          instance,
                                                          soa_dir=args.soa_dir)
    if not related_jobs:
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance,
            cluster,
        )
        paasta_print(error_msg)
        raise NoDeploymentsAvailable

    if not args.run_all_related_jobs:
        # Strip all the configuration for the related services;
        # that information will not be used by the rest of the flow.
        related_jobs = {
            (service, instance): related_jobs[(service, instance)],
        }

    complete_job_configs = {}
    for (srv, inst) in related_jobs:
        try:
            complete_job_configs[(srv, inst)] = chronos_tools.create_complete_config(
                service=srv,
                job_name=inst,
                soa_dir=args.soa_dir,
            )
        except (NoDeploymentsAvailable, NoDockerImageError) as e:
            error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
                chronos_tools.compose_job_id(srv, inst),
                cluster,
            )
            paasta_print(error_msg)
            raise e
        except NoConfigurationForServiceError as e:
            error_msg = (
                "Could not read chronos configuration file for %s in cluster %s\nError was: %s"
                % (
                    chronos_tools.compose_job_id(srv, inst),
                    cluster,
                    str(e),
                ))
            paasta_print(error_msg)
            raise e
        except chronos_tools.InvalidParentError:
            raise

    if not args.run_all_related_jobs:
        sorted_jobs = [(service, instance)]
    else:
        sorted_jobs = chronos_tools.topological_sort_related_jobs(
            cluster, service, instance, soa_dir=args.soa_dir)

    timestamp = datetime.datetime.utcnow().isoformat()

    chronos_to_add = []
    for (service, instance) in sorted_jobs:
        # complete_job_config is a formatted version of the job, so the
        # command is formatted in the context of 'now'; replace it with the
        # 'original' cmd so it can be re-rendered.
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=args.soa_dir,
        )
        original_command = chronos_job_config.get_cmd()
        complete_job_config = complete_job_configs[(service, instance)]
        complete_job_config['command'] = original_command
        clone = clone_job(
            chronos_job=complete_job_config,
            date=datetime.datetime.strptime(args.execution_date,
                                            "%Y-%m-%dT%H:%M:%S"),
            timestamp=timestamp,
            force_disabled=args.force_disabled,
        )

        if not args.run_all_related_jobs and chronos_tools.get_job_type(
                clone) == chronos_tools.JobType.Dependent:
            # If the job is a dependent job and we want to re-run only the specific instance
            # remove the parents and update the schedule to start the job as soon as possible
            clone = set_default_schedule(remove_parents(clone))

        chronos_to_add.append(clone)

    for job_to_add in chronos_to_add:
        client.add(job_to_add)
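
remove_parents() and set_default_schedule() are not shown in this example. A
plausible reconstruction, based on how Chronos separates scheduled jobs (a
'schedule' field) from dependent jobs (a 'parents' field); both bodies are
assumptions, not the actual implementations:

def remove_parents(job):
    # Dependent Chronos jobs carry a 'parents' list instead of a schedule;
    # dropping it turns the clone into a standalone job.
    job.pop('parents', None)
    return job

def set_default_schedule(job):
    # ISO 8601 repeating interval: run once (R1), starting immediately
    # (empty start time), with a nominal one-minute period.
    job['schedule'] = 'R1//PT1M'
    return job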
Example #56
    def my_init(self, filewatcher):
        self.filewatcher = filewatcher
        self.public_config = load_system_paasta_config()
        self.marathon_client = get_marathon_client_from_config()
Example #57
        raise ValueError(
            f"{metric} hasn't been configured as a gauge or counter")
    print(f"Sent {path}: {value} to meteorite")


def report_all_metrics_to_meteorite(csv, paasta_cluster):
    for row in csv:
        if row['svname'] == 'BACKEND':
            for metric in GUAGES + COUNTERS:
                report_metric_to_meteorite(
                    backend=row['# pxname'],
                    metric=metric,
                    value=row[metric],
                    paasta_cluster=paasta_cluster,
                )


if __name__ == '__main__':
    system_paasta_config = utils.load_system_paasta_config()
    csv = retrieve_haproxy_csv(
        synapse_host=system_paasta_config.get_default_synapse_host(),
        synapse_port=system_paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=system_paasta_config.get_synapse_haproxy_url_format(),
    )
    report_all_metrics_to_meteorite(
        csv=csv,
        paasta_cluster=system_paasta_config.get_local_run_config().get(
            'default_cluster'),
    )
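
The row['# pxname'] lookup above is not a typo: HAProxy's stats CSV begins its
header line with "# pxname", so csv.DictReader keeps the "# " prefix in the
key. A self-contained sketch of the shape retrieve_haproxy_csv is assumed to
return:

import csv
import io

SAMPLE = "# pxname,svname,scur,smax\nmy_service.main,BACKEND,3,10\n"

rows = list(csv.DictReader(io.StringIO(SAMPLE)))
assert rows[0]['# pxname'] == 'my_service.main'
assert rows[0]['svname'] == 'BACKEND'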
Example #58
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(
            mesos_config_path=get_mesos_config_path(system_paasta_config),
            **master_kwargs,
        )

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(
                mesos_master=master, mesos_state=mesos_state
            )
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            print(PaastaColors.red("CRITICAL:  %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here", healthy=True
            )
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True
            )
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here", healthy=True
            )
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok
    )
    kube_summary = metastatus_lib.generate_summary_for_check("Kubernetes", kube_ok)

    healthy_exit = all([mesos_ok, marathon_ok])

    print(f"Master paasta_tools version: {__version__}")
    print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(
        mesos_summary, mesos_ok, all_mesos_results, args.verbose
    )
    if args.verbose > 1 and mesos_available:
        print_with_indent("Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings, threshold=args.threshold, mesos_state=mesos_state
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster
            )
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2
                )
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(
        marathon_summary, marathon_ok, marathon_results, args.verbose
    )
    metastatus_lib.print_results_for_healthchecks(
        kube_summary, kube_ok, kube_results, args.verbose
    )
    if args.verbose > 1 and kube_available:
        print_with_indent("Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings, threshold=args.threshold, kube_client=kube_client
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster
            )
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2
                )
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if not healthy_exit:
        raise FatalError(2)
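
Each subsystem above is reduced to a list of HealthCheckResult values whose
healthy flags are ANDed together. A sketch of that aggregation; treating
HealthCheckResult as a two-field named tuple matches how it is constructed
above, but the real metastatus_lib definitions may differ:

from typing import List, NamedTuple

class HealthCheckResult(NamedTuple):
    message: str
    healthy: bool

def status_for_results(results: List[HealthCheckResult]) -> List[bool]:
    # Mirrors metastatus_lib.status_for_results: one bool per check.
    return [result.healthy for result in results]

checks = [
    HealthCheckResult(message="Mesos leader reachable", healthy=True),
    HealthCheckResult(message="Quorum size 2/5", healthy=False),
]
print(all(status_for_results(checks)))  # False -> FatalError(2) upstream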
Example #59
def main():
    args = parse_args()
    full_appid = args.appname.lstrip('/')
    soa_dir = args.soa_dir
    marathon_config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        url=marathon_config.get_url(),
        user=marathon_config.get_username(),
        passwd=marathon_config.get_password(),
    )

    if not marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        paasta_print("Couldn't find an app named {}".format(full_appid))
        sys.exit(1)

    service, instance, _, __ = (s.replace('--', '_')
                                for s in decompose_job_id(full_appid))
    cluster = load_system_paasta_config().get_cluster()
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_config = service_instance_config.format_marathon_app_dict()
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(
        service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(
            service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func('down')

    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        (
            old_app_live_happy_tasks,
            old_app_live_unhappy_tasks,
            old_app_draining_tasks,
            old_app_at_risk_tasks,
        ) = get_tasks_by_state(
            other_apps=[app_to_kill],
            drain_method=drain_method,
            service=service,
            nerve_ns=nerve_ns,
            bounce_health_params=service_instance_config.get_bounce_health_params(
                service_namespace_config,
            ),
        )
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            serviceinstance="{}.{}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )

        paasta_print("Sleeping for 10 seconds to give the tasks time to drain")
        time.sleep(10)

    paasta_print("Sucessfully killed {}".format(full_appid))
Example #60
def paasta_start_or_stop(args, desired_state):
    """Requests a change of state to start or stop given branches of a service."""
    soa_dir = args.soa_dir

    pargs = apply_args_filters(args)
    if len(pargs) == 0:
        return 1

    affected_services = {
        s for service_list in pargs.values() for s in service_list.keys()
    }
    if len(affected_services) > 1:
        paasta_print(
            PaastaColors.red(
                "Warning: trying to start/stop/restart multiple services:"))

        for cluster, services_instances in pargs.items():
            paasta_print("Cluster %s:" % cluster)
            for service, instances in services_instances.items():
                paasta_print("    Service %s:" % service)
                paasta_print("        Instances %s" %
                             ",".join(instances.keys()))

        if sys.stdin.isatty():
            confirm = choice.Binary("Are you sure you want to continue?",
                                    False).ask()
        else:
            confirm = False
        if not confirm:
            paasta_print()
            paasta_print("exiting")
            return 1

    invalid_deploy_groups = []
    marathon_message_printed = False
    affected_flinks = []

    if args.clusters is None or args.instances is None:
        if confirm_to_continue(pargs.items(), desired_state) is False:
            paasta_print()
            paasta_print("exiting")
            return 1

    for cluster, services_instances in pargs.items():
        for service, instances in services_instances.items():
            for instance in instances.keys():
                service_config = get_instance_config(
                    service=service,
                    cluster=cluster,
                    instance=instance,
                    soa_dir=soa_dir,
                    load_deployments=False,
                )
                if isinstance(service_config, FlinkDeploymentConfig):
                    affected_flinks.append(service_config)
                    continue

                try:
                    remote_refs = get_remote_refs(service, soa_dir)
                except remote_git.LSRemoteException as e:
                    msg = (
                        "Error talking to the git server: %s\n"
                        "This PaaSTA command requires access to the git server to operate.\n"
                        "The git server may be down or not reachable from here.\n"
                        "Try again from somewhere where the git server can be reached, "
                        "like your developer environment.") % str(e)
                    paasta_print(msg)
                    return 1

                deploy_group = service_config.get_deploy_group()
                deploy_tag, _ = get_latest_deployment_tag(remote_refs, deploy_group)

                if deploy_tag not in remote_refs:
                    invalid_deploy_groups.append(deploy_group)
                else:
                    force_bounce = utils.format_timestamp(
                        datetime.datetime.utcnow())
                    if (isinstance(service_config, MarathonServiceConfig)
                            and not marathon_message_printed):
                        print_marathon_message(desired_state)
                        marathon_message_printed = True

                    issue_state_change_for_service(
                        service_config=service_config,
                        force_bounce=force_bounce,
                        desired_state=desired_state,
                    )

    return_val = 0

    # TODO: Refactor to discover if set_state is available for given
    #       instance_type in API
    if affected_flinks:
        print_flink_message(desired_state)
        csi = defaultdict(lambda: defaultdict(list))
        for service_config in affected_flinks:
            csi[service_config.cluster][service_config.service].append(
                service_config.instance)

        system_paasta_config = load_system_paasta_config()
        for cluster, services_instances in csi.items():
            client = get_paasta_api_client(cluster, system_paasta_config)
            if not client:
                paasta_print("Cannot get a paasta-api client")
                exit(1)

            for service, instances in services_instances.items():
                for instance in instances:
                    try:
                        client.service.instance_set_state(
                            service=service,
                            instance=instance,
                            desired_state=desired_state,
                        ).result()
                    except HTTPError as exc:
                        paasta_print(exc.response.text)
                        return exc.status_code

                return_val = 0

    if invalid_deploy_groups:
        paasta_print(
            f"No deploy tags found for {', '.join(invalid_deploy_groups)}.")
        paasta_print(f"Has {service} been deployed there yet?")
        return_val = 1

    return return_val
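
force_bounce, set above via utils.format_timestamp(), is just a timestamp
token that gets folded into the service's config hash, so issuing a state
change forces a new app id even when nothing else changed. A sketch of the
mechanism; the exact strftime format is an assumption:

import datetime

def format_timestamp(dt):
    # Compact UTC timestamp, safe for use inside a Marathon app id.
    return dt.strftime('%Y%m%dT%H%M%S')

token = format_timestamp(datetime.datetime.utcnow())
print(token)  # e.g. '20240101T120000'; a new token => a new config hash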