Example #1
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: Optional[MarathonClients] = None,
    system_paasta_config: Optional[SystemPaastaConfig] = None,
) -> Marathon_Dashboard:
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(
        system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(
            marathon_servers=marathon_servers, cached=False)
    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        dashboard_links: Dict = system_paasta_config.get_dashboard_links()
        shard_url: str = client.servers[0]
        if 'Marathon RO' in dashboard_links[cluster]:
            marathon_links = dashboard_links[cluster]['Marathon RO']
            if isinstance(marathon_links, list):
                for shard_number, shard in enumerate(marathon_servers.current):
                    if shard.url[0] == shard_url:
                        shard_url = marathon_links[shard_number]
            elif isinstance(marathon_links, str):
                shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
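
A minimal, self-contained sketch of the shard-URL resolution above. The shapes here are assumptions that mirror the code: each entry in marathon_servers.current exposes a one-element url list, and dashboard_links[cluster]['Marathon RO'] is either a list with one link per shard or a single string shared by all shards.

from typing import List, NamedTuple, Optional, Union


class FakeShard(NamedTuple):
    url: List[str]


def resolve_shard_url(shard_url: str, servers: List[FakeShard],
                      links: Optional[Union[List[str], str]]) -> str:
    # A list maps shard N's raw IP url to the N-th dashboard link;
    # a plain string is shared by every shard (first whitespace-separated token).
    if isinstance(links, list):
        for shard_number, shard in enumerate(servers):
            if shard.url[0] == shard_url:
                return links[shard_number]
    elif isinstance(links, str):
        return links.split(' ')[0]
    return shard_url


servers = [FakeShard(['http://10.0.0.1:5052']), FakeShard(['http://10.0.0.2:5052'])]
links = ['http://marathon-ro-0.example.com', 'http://marathon-ro-1.example.com']
assert resolve_shard_url('http://10.0.0.2:5052', servers, links) == links[1]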
Example #2
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param marathon_apps_with_clients: A collection of (MarathonApp, MarathonClient) pairs to match against
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: A dictionary of options for the drain method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: The SOA configuration directory to read from.
    :param job_config: The MarathonServiceConfig for the instance being deployed.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event"""
    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )

    new_client = clients.get_current_client_for_service(job_config)

    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        new_app, new_client = new_apps_with_clients_list[0]
        if len(new_apps_with_clients_list) != 1:
            raise ValueError(
                "Only expected one app per ID per shard; found %d" %
                len(new_apps_with_clients_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app,
            service,
            nerve_ns,
            system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(
            new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s up from %d to %d instances." %
                     (new_app.id, new_app.instances,
                      config['instances'] + num_at_risk_tasks))
            new_client.scale_app(app_id=new_app.id,
                                 instances=config['instances'] +
                                 num_at_risk_tasks,
                                 force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) -
                config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks),
                                         num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks),
                                        num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy], )
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks),
                                        num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks),
                                      num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents the extra instances that are configured in marathon
            # but don't have a launched task yet. When scaling down we want to
            # reduce this slack so marathon doesn't get a chance to launch a new
            # task in that space that we will then have to drain and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client),
                                                      []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (
            service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time",
                None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (
            service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
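
The scale-down branch above sheds excess tasks in a fixed priority order: draining tasks first, then unhappy, then at-risk, and happy tasks only as a last resort. A minimal sketch of that bucketing arithmetic, detached from marathon (all names illustrative):

from typing import List, Tuple


def bucket_tasks_to_scale_down(
    num_tasks_to_scale: int,
    draining: List[str],
    unhappy: List[str],
    at_risk: List[str],
    happy: List[str],
) -> Tuple[List[str], List[str]]:
    """Return (tasks_moved_to_old_buckets, tasks_that_stay_happy)."""
    moved: List[str] = []
    for bucket in (draining, unhappy, at_risk):
        take = min(len(bucket), num_tasks_to_scale)
        moved.extend(bucket[:take])
        num_tasks_to_scale -= take
    take = min(len(happy), num_tasks_to_scale)
    moved.extend(happy[:take])
    return moved, happy[take:]


# With 3 tasks to shed, draining and unhappy tasks are sacrificed before happy ones:
moved, still_happy = bucket_tasks_to_scale_down(3, ['d1'], ['u1'], [], ['h1', 'h2'])
assert moved == ['d1', 'u1', 'h1'] and still_happy == ['h2']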
Example #3
def do_bounce(
    bounce_func: bounce_lib.BounceMethod,
    drain_method: drain_lib.DrainMethod,
    config: marathon_tools.FormattedMarathonAppDict,
    new_app_running: bool,
    happy_new_tasks: List[Tuple[MarathonTask, MarathonClient]],
    old_app_live_happy_tasks: Dict[Tuple[str, MarathonClient],
                                   Set[MarathonTask]],
    old_app_live_unhappy_tasks: Dict[Tuple[str, MarathonClient],
                                     Set[MarathonTask]],
    old_app_draining_tasks: Dict[Tuple[str, MarathonClient],
                                 Set[MarathonTask]],
    old_app_at_risk_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    service: str,
    bounce_method: str,
    serviceinstance: str,
    cluster: str,
    instance: str,
    marathon_jobid: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Optional[float]:
    def log_bounce_action(line: str, level: str = 'debug') -> None:
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    # log if we're not in a steady state.
    if any([
        not new_app_running,
        old_app_live_happy_tasks.keys(),
    ]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' %
                (bounce_method, serviceinstance),
                'New marathon app %s %s.' %
                (marathon_jobid,
                 ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' %
                (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' %
                len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' %
                len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' %
                len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old tasks at risk.' %
                len(bounce_lib.flatten_tasks(old_app_at_risk_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        log.debug("Nothing to do, bounce is in a steady state")

    new_client = clients.get_current_client_for_service(job_config)

    old_non_draining_tasks = (
        list(old_app_tasks_to_task_client_pairs(old_app_live_happy_tasks)) +
        list(old_app_tasks_to_task_client_pairs(old_app_live_unhappy_tasks)) +
        list(old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks))
    )

    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_non_draining_tasks=old_non_draining_tasks,
        margin_factor=bounce_margin_factor,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(line='%s bounce creating new app with app_id %s' %
                          (bounce_method, marathon_jobid), )
        with requests_cache.disabled():
            try:
                bounce_lib.create_marathon_app(
                    app_id=marathon_jobid,
                    config=config,
                    client=new_client,
                )
            except MarathonHttpError as e:
                if e.status_code == 409:
                    log.warning(
                        "Failed to create, app %s already exists. This means another bounce beat us to it."
                        " Skipping the rest of the bounce for this run" %
                        marathon_jobid, )
                    return 60
                raise

    tasks_to_kill = drain_tasks_and_find_tasks_to_kill(
        tasks_to_drain=actions['tasks_to_drain'],
        already_draining_tasks=old_app_tasks_to_task_client_pairs(
            old_app_draining_tasks),
        drain_method=drain_method,
        log_bounce_action=log_bounce_action,
        bounce_method=bounce_method,
        at_risk_tasks=old_app_tasks_to_task_client_pairs(
            old_app_at_risk_tasks),
    )

    tasks_to_kill_by_client: Dict[MarathonClient,
                                  List[MarathonTask]] = defaultdict(list)
    for task, client in tasks_to_kill:
        tasks_to_kill_by_client[client].append(task)

    for client, tasks in tasks_to_kill_by_client.items():
        kill_given_tasks(client=client,
                         task_ids=[task.id for task in tasks],
                         scale=True)

    # tasks_to_kill holds (task, client) pairs; compare tasks against the task half.
    killed_tasks = {task for task, _ in tasks_to_kill}
    for task in bounce_lib.flatten_tasks(old_app_at_risk_tasks):
        if task in killed_tasks:
            hostname = task.host
            try:
                reserve_all_resources([hostname])
            except HTTPError:
                log.warning("Failed to reserve resources on %s" % hostname)

    apps_to_kill: List[Tuple[str, MarathonClient]] = []
    for app, client in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid or client != new_client:
            live_happy_tasks = old_app_live_happy_tasks[(app, client)]
            live_unhappy_tasks = old_app_live_unhappy_tasks[(app, client)]
            draining_tasks = old_app_draining_tasks[(app, client)]
            at_risk_tasks = old_app_at_risk_tasks[(app, client)]

            remaining_tasks = (live_happy_tasks | live_unhappy_tasks
                               | draining_tasks | at_risk_tasks)
            for task, _ in tasks_to_kill:
                remaining_tasks.discard(task)

            if not remaining_tasks:
                apps_to_kill.append((app, client))

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' % (
                bounce_method,
                ', '.join([app for app, client in apps_to_kill]),
            ), )
        with requests_cache.disabled():
            for app_id, client in apps_to_kill:
                bounce_lib.kill_old_ids([app_id], client)

    all_old_tasks: Set[MarathonTask] = set()
    all_old_tasks = set.union(all_old_tasks,
                              *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks,
                              *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_at_risk_tasks.values())

    if all_old_tasks or (not new_app_running):
        # Still have more work to do; try again in 60 seconds.
        return 60
    else:
        # log if we appear to be finished
        if all([
            (apps_to_kill or tasks_to_kill),
            apps_to_kill == list(old_app_live_happy_tasks),
            killed_tasks == all_old_tasks,
        ]):
            log_bounce_action(
                line='%s bounce on %s finishing. Now running %s' % (
                    bounce_method,
                    serviceinstance,
                    marathon_jobid,
                ),
                level='event',
            )
        return None
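
do_bounce repeatedly unions per-(app, client) task sets to decide whether any old tasks remain. A minimal sketch of that flattening, assuming bounce_lib.flatten_tasks simply unions the values of such a dict (the real implementation may differ):

from typing import Dict, Set, Tuple


def flatten_tasks(tasks_by_app_client: Dict[Tuple[str, str], Set[str]]) -> Set[str]:
    # Unioning onto a fresh empty set also handles the empty-dict case.
    return set.union(set(), *tasks_by_app_client.values())


old_happy = {('/app1', 'client_a'): {'t1', 't2'}, ('/app2', 'client_b'): {'t3'}}
assert flatten_tasks(old_happy) == {'t1', 't2', 't3'}
assert flatten_tasks({}) == set()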
Example #4
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: Optional[MarathonClients] = None,
    system_paasta_config: Optional[SystemPaastaConfig] = None,
) -> Marathon_Dashboard:
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)

    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # e.g. 'http://10.64.97.75:5052': 'http://marathon-norcal-prod.yelpcorp.com'
    shard_url_to_marathon_link_dict: Dict[str, str] = {}
    if isinstance(marathon_links, list):
        # Sanity check and log error if necessary
        if len(marathon_links) != len(marathon_servers.current):
            log.error('len(marathon_links) != len(marathon_servers.current). This may be a cause of concern')
        for shard_number, shard in enumerate(marathon_servers.current):
            shard_url_to_marathon_link_dict[shard.url[0]] = marathon_links[shard_number]
    elif isinstance(marathon_links, str):
        # In this case, the shard url will be the same for every service instance
        static_shard_url = marathon_links.split(' ')[0]
        return {cluster: [{'service': si[0], 'instance': si[1], 'shard_url': static_shard_url} for si in instances]}

    # Key by service, since we will instantiate one PaastaServiceConfigLoader (PSCL) per service
    service_instances_dict: Dict[str, Set[str]] = defaultdict(set)
    for si in instances:
        service, instance = si[0], si[1]
        service_instances_dict[service].add(instance)

    for service, instance_set in service_instances_dict.items():
        pscl = PaastaServiceConfigLoader(
            service=service,
            soa_dir=soa_dir,
            load_deployments=False,
        )
        for marathon_service_config in pscl.instance_configs(cluster, MarathonServiceConfig):
            if marathon_service_config.get_instance() in instance_set:
                client: MarathonClient = marathon_clients.get_current_client_for_service(
                    job_config=marathon_service_config)
                ip_url: str = client.servers[0]
                # Convert to a marathon link if possible; otherwise fall back to the original IP address
                shard_url: str = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
                service_info: Marathon_Dashboard_Item = {
                    'service': service,
                    'instance': marathon_service_config.get_instance(),
                    'shard_url': shard_url,
                }
                dashboard[cluster].append(service_info)
    return dashboard
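
The grouping step above exists so that only one PaastaServiceConfigLoader is built per service rather than one per (service, instance) pair. A self-contained sketch of that defaultdict collapse:

from collections import defaultdict
from typing import Dict, List, Set, Tuple

instance_pairs: List[Tuple[str, str]] = [
    ('web', 'main'), ('web', 'canary'), ('batch', 'main'),
]
grouped: Dict[str, Set[str]] = defaultdict(set)
for service, instance in instance_pairs:
    grouped[service].add(instance)

# Three (service, instance) pairs collapse to two services, so only two
# config loaders would be instantiated.
assert grouped == {'web': {'main', 'canary'}, 'batch': {'main'}}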
Example #5
def perform_command(
    command: str,
    service: str,
    instance: str,
    cluster: str,
    verbose: int,
    soa_dir: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    app_id: Optional[str] = None,
) -> int:
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param client: MarathonClient or CachingMarathonClient
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()["id"]
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % job_id
            )
            return 1

    normal_instance_count = job_config.get_instances()

    current_client = clients.get_current_client_for_service(job_config)

    if command == "restart":
        restart_marathon_job(service, instance, app_id, current_client, cluster)
    elif command == "status":
        paasta_print(
            status_desired_state(service, instance, current_client, job_config)
        )
        dashboards = get_marathon_dashboard_links(clients, system_config)
        tasks, out = status_marathon_job(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            normal_instance_count=normal_instance_count,
            clients=clients,
            job_config=job_config,
            desired_app_id=app_id,
            verbose=verbose,
        )
        paasta_print(out)
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=job_config.get_nerve_namespace(), soa_dir=soa_dir
        )

        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count, verbose)
        )

        proxy_port = service_namespace_config.get("proxy_port")
        if proxy_port is not None:
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service, instance, cluster
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    service_namespace_config=service_namespace_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(),
                )
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
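
perform_command is a dispatcher that returns a unix-style code and raises on commands the parser should have rejected. A minimal, self-contained sketch of that contract (the handler bodies are placeholders, not the real restart/status logic):

from typing import Callable, Dict, Optional


def dispatch(command: str, app_id: Optional[str]) -> int:
    if not app_id:
        # Loosely mirrors the early exit above when no app id can be derived.
        return 1
    handlers: Dict[str, Callable[[], int]] = {
        'restart': lambda: 0,
        'status': lambda: 0,
    }
    if command not in handlers:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return handlers[command]()


assert dispatch('status', '/example_service.main') == 0
assert dispatch('restart', None) == 1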