def get_service_instances_needing_update(
    marathon_clients: MarathonClients,
    instances: Collection[Tuple[str, str]],
    cluster: str,
) -> List[Tuple[str, str, MarathonServiceConfig, str]]:
    """Return (service, instance, config, app_id) tuples for every instance
    whose marathon app is either missing or running a different number of
    instances than its configuration asks for."""
    existing_apps = {}
    for client in marathon_clients.get_all_clients():
        for app in get_all_marathon_apps(client):
            existing_apps[app.id] = app

    needs_update = []
    for service, instance in instances:
        try:
            config = load_marathon_service_config_no_cache(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=DEFAULT_SOA_DIR,
            )
            config_app = config.format_marathon_app_dict()
            app_id = "/{}".format(config_app["id"])
        # Not ideal but we rely on a lot of user input to create the app dict
        # and we really can't afford to bail if just one app definition is malformed
        except Exception as e:
            print("ERROR: Skipping {}.{} because: '{}'".format(service, instance, str(e)))
            continue

        running_app = existing_apps.get(app_id)
        if running_app is None or running_app.instances != config_app["instances"]:
            needs_update.append((service, instance, config, app_id))
    return needs_update
def test_get_service_instances_that_need_bouncing_at_risk():
    """Even when the running app matches the desired instance count, a
    nonzero at-risk task count should mark the instance for bouncing."""
    with mock.patch(
        "paasta_tools.list_marathon_service_instances.get_desired_marathon_configs",
        autospec=True,
    ) as mock_get_desired_marathon_configs, mock.patch(
        "paasta_tools.list_marathon_service_instances.get_num_at_risk_tasks",
        autospec=True,
    ) as mock_get_num_at_risk_tasks, mock.patch(
        "paasta_tools.list_marathon_service_instances.get_draining_hosts",
        autospec=True,
    ):
        desired_counts = {"fake--service.fake--instance.sha.config": {"instances": 5}}
        desired_configs = {
            "fake--service.fake--instance.sha.config": mock.Mock(
                get_marathon_shard=mock.Mock(return_value=None)
            )
        }
        mock_get_desired_marathon_configs.return_value = (desired_counts, desired_configs)
        running_apps = [
            mock.MagicMock(instances=5, id="/fake--service.fake--instance.sha.config")
        ]
        mock_client = mock.MagicMock(list_apps=mock.MagicMock(return_value=running_apps))
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        # One at-risk task forces a bounce despite matching instance counts.
        mock_get_num_at_risk_tasks.return_value = 1

        result = list_marathon_service_instances.get_service_instances_that_need_bouncing(
            marathon_clients=fake_clients, soa_dir="/fake/soa/dir"
        )
        assert set(result) == {"fake_service.fake_instance"}
def get_service_instances_needing_update(
    marathon_clients: MarathonClients,
    instances: Collection[Tuple[str, str]],
    cluster: str,
) -> List[Tuple[str, str]]:
    """Return the (service, instance) pairs whose marathon app is absent or
    whose running instance count differs from the configured count."""
    apps_by_id = {}
    for client in marathon_clients.get_all_clients():
        for app in get_all_marathon_apps(client):
            apps_by_id[app.id] = app

    stale = []
    for service, instance in instances:
        try:
            config = load_marathon_service_config_no_cache(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=DEFAULT_SOA_DIR,
            )
            config_app = config.format_marathon_app_dict()
            app_id = '/{}'.format(config_app['id'])
        except (NoDockerImageError, InvalidJobNameError, NoDeploymentsAvailable) as e:
            # Expected per-instance failures: skip this instance, keep going.
            print("DEBUG: Skipping %s.%s because: '%s'" % (service, instance, str(e)))
            continue

        running_app = apps_by_id.get(app_id)
        if running_app is None or running_app.instances != config_app['instances']:
            stale.append((service, instance))
    return stale
def test_create_marathon_dashboard(
    mock_get_services_for_cluster, mock_pscl, mock_load_system_paasta_config
):
    """create_marathon_dashboard should emit one entry per service instance,
    pointing at the current client's first server."""
    soa_dir = "/fake/soa/dir"
    cluster = "fake_cluster"
    mock_load_system_paasta_config.return_value = SystemPaastaConfig(
        {"dashboard_links": {}}, "fake_directory"
    )
    fake_instances = [("fake_service", "foo"), ("fake_service", "bar")]
    mock_get_services_for_cluster.return_value = fake_instances
    mock_pscl.return_value.instance_configs.return_value = [
        MarathonServiceConfig("fake_service", "fake_cluster", name, {}, {}, soa_dir)
        for _, name in fake_instances
    ]
    mock_client = mock.Mock(servers=["hi"])
    mock_clients = MarathonClients(current=[mock_client], previous=[mock_client])

    result = marathon_dashboard.create_marathon_dashboard(
        cluster=cluster, soa_dir=soa_dir, marathon_clients=mock_clients
    )
    assert result == {
        "fake_cluster": [
            {"service": "fake_service", "instance": "foo", "shard_url": "hi"},
            {"service": "fake_service", "instance": "bar", "shard_url": "hi"},
        ]
    }
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    """Map every marathon service instance in `cluster` to the dashboard
    (shard) URL that serves it.

    :param cluster: cluster to build the dashboard for
    :param soa_dir: path to yelpsoa configs
    :param marathon_clients: optional pre-built MarathonClients
    :param system_paasta_config: optional pre-loaded SystemPaastaConfig
    :returns: {cluster: [{'service', 'instance', 'shard_url'}, ...]}
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)
    # Hoisted out of the loop: the dashboard links do not change per instance.
    # BUGFIX: use .get() so a cluster with no dashboard links entry no longer
    # raises KeyError (previously `dashboard_links[cluster]`).
    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')
    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        shard_url: str = client.servers[0]
        # A list maps each shard's raw URL to its friendly link; a plain
        # string is a single link shared by every shard.
        if isinstance(marathon_links, list):
            for shard_number, shard in enumerate(marathon_servers.current):
                if shard.url[0] == shard_url:
                    shard_url = marathon_links[shard_number]
        elif isinstance(marathon_links, str):
            shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
def status_marathon_job_verbose(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    cluster: str,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    dashboards: Dict[marathon_tools.MarathonClient, str],
) -> Tuple[List[MarathonTask], str]:
    """Returns detailed information about a marathon apps for a service
    and instance. Does not make assumptions about what the *exact*
    appid is, but instead does a fuzzy match on any marathon apps
    that match the given service.instance"""
    collected_tasks: List[MarathonTask] = []
    report_lines: List[str] = []
    # For verbose mode, we want to see *any* matching app. As it may
    # not be the one that we think should be deployed. For example
    # during a bounce we want to see the old and new ones.
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
    )
    autoscaling_info = get_autoscaling_info(clients, job_config)
    if autoscaling_info:
        report_lines.append(" Autoscaling Info:")
        headers = [
            field.replace("_", " ").capitalize()
            for field in ServiceAutoscalingInfo._fields
        ]
        table = [headers, autoscaling_info]
        report_lines.append('\n'.join(" %s" % line for line in format_table(table)))

    matching_apps = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients)
    for app, client in matching_apps:
        tasks, output = get_verbose_status_of_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
        )
        collected_tasks.extend(tasks)
        report_lines.append(output)
    return collected_tasks, "\n".join(report_lines)
def test_get_service_instances_that_need_bouncing():
    """Only the instance whose running app id differs from the desired config
    should be reported; exact matches stay quiet."""
    with mock.patch(
        'paasta_tools.list_marathon_service_instances.get_desired_marathon_configs',
        autospec=True,
    ) as mock_get_desired_marathon_configs, mock.patch(
        'paasta_tools.list_marathon_service_instances.get_num_at_risk_tasks',
        autospec=True,
    ) as mock_get_num_at_risk_tasks, mock.patch(
        'paasta_tools.list_marathon_service_instances.get_draining_hosts',
        autospec=True,
    ):
        desired_counts = {
            'fake--service.fake--instance.sha.config': {'instances': 5},
            'fake--service2.fake--instance.sha.config': {'instances': 5},
        }
        desired_configs = {
            app_id: mock.Mock(get_marathon_shard=mock.Mock(return_value=None))
            for app_id in desired_counts
        }
        mock_get_desired_marathon_configs.return_value = (desired_counts, desired_configs)
        # fake--service runs under a *different* config suffix (…config2), so
        # it needs a bounce; fake--service2 matches exactly.
        running_apps = [
            mock.MagicMock(instances=5, id='/fake--service.fake--instance.sha.config2'),
            mock.MagicMock(instances=5, id='/fake--service2.fake--instance.sha.config'),
        ]
        mock_client = mock.MagicMock(list_apps=mock.MagicMock(return_value=running_apps))
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        mock_get_num_at_risk_tasks.return_value = 0

        result = list_marathon_service_instances.get_service_instances_that_need_bouncing(
            marathon_clients=fake_clients,
            soa_dir='/fake/soa/dir',
        )
        assert set(result) == {'fake_service.fake_instance'}
def test_create_marathon_dashboard(mock_get_services_for_cluster, mock_pscl,
                                   mock_load_system_paasta_config):
    """Each service instance should appear in the dashboard with the current
    client's first server as its shard url."""
    soa_dir = '/fake/soa/dir'
    cluster = 'fake_cluster'
    mock_load_system_paasta_config.return_value = SystemPaastaConfig(
        {
            'dashboard_links': {},
        },
        'fake_directory',
    )
    fake_instances = [
        ('fake_service', 'foo'),
        ('fake_service', 'bar'),
    ]
    mock_get_services_for_cluster.return_value = fake_instances
    mock_pscl.return_value.instance_configs.return_value = [
        MarathonServiceConfig('fake_service', 'fake_cluster', name, {}, {}, soa_dir)
        for _, name in fake_instances
    ]
    mock_client = mock.Mock(servers=['hi'])
    mock_clients = MarathonClients(current=[mock_client], previous=[mock_client])

    result = marathon_dashboard.create_marathon_dashboard(
        cluster=cluster,
        soa_dir=soa_dir,
        marathon_clients=mock_clients,
    )
    assert result == {
        'fake_cluster': [
            {
                'service': 'fake_service',
                'instance': 'foo',
                'shard_url': 'hi',
            },
            {
                'service': 'fake_service',
                'instance': 'bar',
                'shard_url': 'hi',
            },
        ],
    }
def deploy_marathon_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    marathon_apps_with_clients: Optional[Collection[Tuple[MarathonApp, MarathonClient]]],
) -> Tuple[int, float]:
    """deploy the service instance given and proccess return code
    if there was an error we send a sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_apps: A list of all marathon app objects
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        # Hold a per-instance ZooKeeper lock so only one deployd worker
        # bounces this service instance at a time.
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                # Nothing deployed yet for this instance: not an error.
                log.debug(
                    "No deployments found for %s.%s in cluster %s. Skipping."
                    % (service, instance, load_system_paasta_config().get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                    (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1, None

            # Callers may pass a pre-fetched app list to avoid re-querying
            # marathon; fetch it ourselves only when they did not.
            if marathon_apps_with_clients is None:
                marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
                    clients=clients.get_all_clients_for_service(
                        job_config=service_instance_config),
                    embed_tasks=True,
                )

            try:
                with a_sync.idle_event_loop():
                    status, output, bounce_again_in_seconds = setup_service(
                        service=service,
                        instance=instance,
                        clients=clients,
                        job_config=service_instance_config,
                        marathon_apps_with_clients=marathon_apps_with_clients,
                        soa_dir=soa_dir,
                    )
                # Nonzero status from setup_service becomes a CRITICAL sensu
                # event, but this function still returns 0 (the alert is the
                # error channel here).
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig, NoSlavesAvailableError):
                # Known failure modes during setup: alert and report failure.
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        # Another worker holds the bounce lock; skip quietly.
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event"""

    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        # Small helper so every error goes through the same _log channel.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )
    new_client = clients.get_current_client_for_service(job_config)
    # Split matching apps into "the app we want" (same id on the current
    # shard) and everything else (old apps to be bounced away).
    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        new_app, new_client = new_apps_with_clients_list[0]
        if len(new_apps_with_clients_list) != 1:
            raise ValueError("Only expected one app per ID per shard; found %d" % len(new_apps_with_clients_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app,
            service,
            nerve_ns,
            system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        # Transient mesos API timeout: ask the caller to retry in 60s.
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            # Scale up to cover both the configured count and replacements
            # for tasks on at-risk (draining) hosts.
            log.info("Scaling %s up from %d to %d instances." %
                     (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            new_client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            # Choose which surplus tasks to remove, preferring (in order):
            # already-draining, unhappy, at-risk, and finally happy tasks.
            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy],
            )
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents remaining the extra remaining instances that are configured
            # in marathon that don't have a launched task yet. When scaling down we want to
            # reduce this slack so marathon doesn't get a chance to launch a new task in
            # that space that we will then have to drain and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client), []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time", None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise
    # At-risk tasks or a still-oversized app mean we are not in a steady
    # state yet: ask the caller to bounce again in 60 seconds.
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
def do_bounce(
    bounce_func: bounce_lib.BounceMethod,
    drain_method: drain_lib.DrainMethod,
    config: marathon_tools.FormattedMarathonAppDict,
    new_app_running: bool,
    happy_new_tasks: List[Tuple[MarathonTask, MarathonClient]],
    old_app_live_happy_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_live_unhappy_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_draining_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_at_risk_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    service: str,
    bounce_method: str,
    serviceinstance: str,
    cluster: str,
    instance: str,
    marathon_jobid: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Optional[float]:
    """Execute one step of a bounce: ask `bounce_func` what to do, then
    create the new app, drain/kill old tasks, and remove old empty apps.

    :returns: seconds until the next bounce attempt, or None when the bounce
        appears to be in a steady state.
    """

    def log_bounce_action(line: str, level: str = 'debug') -> None:
        # Route all bounce progress messages through the shared _log channel.
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    # log if we're not in a steady state.
    if any([
        (not new_app_running),
        old_app_live_happy_tasks.keys(),
    ]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' % (bounce_method, serviceinstance),
                'New marathon app %s %s.' % (marathon_jobid, ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' % (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' % len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' % len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' % len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old tasks at risk.' % len(bounce_lib.flatten_tasks(old_app_at_risk_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        log.debug("Nothing to do, bounce is in a steady state")

    new_client = clients.get_current_client_for_service(job_config)

    # Everything except already-draining tasks is a candidate for the bounce
    # method to drain.
    old_non_draining_tasks = list(
        old_app_tasks_to_task_client_pairs(old_app_live_happy_tasks),
    ) + list(
        old_app_tasks_to_task_client_pairs(old_app_live_unhappy_tasks),
    ) + list(
        old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks),
    )

    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_non_draining_tasks=old_non_draining_tasks,
        margin_factor=bounce_margin_factor,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(
            line='%s bounce creating new app with app_id %s' % (bounce_method, marathon_jobid),
        )
        with requests_cache.disabled():
            try:
                bounce_lib.create_marathon_app(
                    app_id=marathon_jobid,
                    config=config,
                    client=new_client,
                )
            except MarathonHttpError as e:
                # 409 Conflict: another bounce created the app first.
                if e.status_code == 409:
                    log.warning(
                        "Failed to create, app %s already exists. This means another bounce beat us to it."
                        " Skipping the rest of the bounce for this run" % marathon_jobid,
                    )
                    return 60
                raise

    tasks_to_kill = drain_tasks_and_find_tasks_to_kill(
        tasks_to_drain=actions['tasks_to_drain'],
        already_draining_tasks=old_app_tasks_to_task_client_pairs(old_app_draining_tasks),
        drain_method=drain_method,
        log_bounce_action=log_bounce_action,
        bounce_method=bounce_method,
        at_risk_tasks=old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks),
    )

    # Group kills per shard client so each client gets one batched call.
    tasks_to_kill_by_client: Dict[MarathonClient, List[MarathonTask]] = defaultdict(list)
    for task, client in tasks_to_kill:
        tasks_to_kill_by_client[client].append(task)

    for client, tasks in tasks_to_kill_by_client.items():
        kill_given_tasks(client=client, task_ids=[task.id for task in tasks], scale=True)

    # For killed at-risk tasks, try to reserve their host's resources so
    # nothing new is scheduled there; best-effort on HTTP failure.
    for task in bounce_lib.flatten_tasks(old_app_at_risk_tasks):
        if task in tasks_to_kill:
            hostname = task.host
            try:
                reserve_all_resources([hostname])
            except HTTPError:
                log.warning("Failed to reserve resources on %s" % hostname)

    # An old app can be deleted once every one of its tasks is being killed.
    apps_to_kill: List[Tuple[str, MarathonClient]] = []
    for app, client in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid or client != new_client:
            live_happy_tasks = old_app_live_happy_tasks[(app, client)]
            live_unhappy_tasks = old_app_live_unhappy_tasks[(app, client)]
            draining_tasks = old_app_draining_tasks[(app, client)]
            at_risk_tasks = old_app_at_risk_tasks[(app, client)]
            remaining_tasks = (live_happy_tasks | live_unhappy_tasks | draining_tasks | at_risk_tasks)
            for task, _ in tasks_to_kill:
                remaining_tasks.discard(task)
            if 0 == len(remaining_tasks):
                apps_to_kill.append((app, client))

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' % (
                bounce_method,
                ', '.join([app for app, client in apps_to_kill]),
            ),
        )
        with requests_cache.disabled():
            for app_id, client in apps_to_kill:
                bounce_lib.kill_old_ids([app_id], client)

    all_old_tasks: Set[MarathonTask] = set()
    all_old_tasks = set.union(all_old_tasks, *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_at_risk_tasks.values())

    if all_old_tasks or (not new_app_running):
        # Still have work more work to do, try again in 60 seconds
        return 60
    else:
        # log if we appear to be finished
        if all([
            (apps_to_kill or tasks_to_kill),
            apps_to_kill == list(old_app_live_happy_tasks),
            tasks_to_kill == all_old_tasks,
        ]):
            log_bounce_action(
                line='%s bounce on %s finishing. Now running %s' % (
                    bounce_method,
                    serviceinstance,
                    marathon_jobid,
                ),
                level='event',
            )
        return None
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str=DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients=None,
    system_paasta_config: SystemPaastaConfig=None,
) -> Marathon_Dashboard:
    """Map every marathon service instance in `cluster` to the dashboard
    (shard) URL serving it.

    :param cluster: cluster to build the dashboard for
    :param soa_dir: path to yelpsoa configs
    :param marathon_clients: optional pre-built MarathonClients
    :param system_paasta_config: optional pre-loaded SystemPaastaConfig
    :returns: {cluster: [{'service', 'instance', 'shard_url'}, ...]}
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)
    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # e.g. 'http://10.64.97.75:5052': 'http://marathon-norcal-prod.yelpcorp.com'
    shard_url_to_marathon_link_dict: Dict[str, str] = {}
    if isinstance(marathon_links, list):
        # Sanity check and log error if necessary
        if len(marathon_links) != len(marathon_servers.current):
            log.error('len(marathon_links) != len(marathon_servers.current). This may be a cause of concern')
        for shard_number, shard in enumerate(marathon_servers.current):
            shard_url_to_marathon_link_dict[shard.url[0]] = marathon_links[shard_number]
    elif isinstance(marathon_links, str):
        # In this case, the shard url will be the same for every service instance
        static_shard_url = marathon_links.split(' ')[0]
        return {cluster: [{'service': si[0], 'instance': si[1], 'shard_url': static_shard_url} for si in instances]}

    # Setup with service as key since will instantiate 1 PSCL per service
    service_instances_dict: Dict[str, Set[str]] = defaultdict(set)
    for si in instances:
        service, instance = si[0], si[1]
        service_instances_dict[service].add(instance)

    for service, instance_set in service_instances_dict.items():
        pscl = PaastaServiceConfigLoader(
            service=service,
            soa_dir=soa_dir,
            load_deployments=False,
        )
        for marathon_service_config in pscl.instance_configs(cluster, MarathonServiceConfig):
            if marathon_service_config.get_instance() in instance_set:
                client: MarathonClient = \
                    marathon_clients.get_current_client_for_service(job_config=marathon_service_config)
                ip_url: str = client.servers[0]
                # Convert to a marathon link if possible else default to the original IP address
                shard_url: str = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
                service_info: Marathon_Dashboard_Item = {
                    'service': service,
                    # BUGFIX: previously read the stale `instance` variable
                    # left over from the loop above, which labelled every
                    # entry with the last instance iterated there.
                    'instance': marathon_service_config.get_instance(),
                    'shard_url': shard_url,
                }
                dashboard[cluster].append(service_info)
    return dashboard
def test_get_service_instances_needing_update():
    """Exercise get_service_instances_needing_update against two fake running
    apps: the happy path (id mismatch), an instance-count drift, and each of
    the exception types raised while formatting an app dict."""
    with mock.patch(
        "paasta_tools.deployd.common.get_all_marathon_apps", autospec=True
    ) as mock_get_marathon_apps, mock.patch(
        "paasta_tools.deployd.common.load_marathon_service_config_no_cache",
        autospec=True,
    ) as mock_load_marathon_service_config:
        # Two apps currently "running" in marathon, 2 instances each.
        mock_marathon_apps = [
            mock.Mock(id="/universe.c137.c1.g1", instances=2),
            mock.Mock(id="/universe.c138.c1.g1", instances=2),
        ]
        mock_get_marathon_apps.return_value = mock_marathon_apps
        mock_service_instances = [("universe", "c137"), ("universe", "c138")]
        # c137 formats to the same app id + count as its running app, so it's
        # up to date; c138 formats to a different id, so it needs an update.
        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c137.c1.g1",
                "instances": 2
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients, mock_service_instances, "westeros-prod")
        assert mock_get_marathon_apps.called
        calls = [
            mock.call(
                service="universe",
                instance="c137",
                cluster="westeros-prod",
                soa_dir=DEFAULT_SOA_DIR,
            ),
            mock.call(
                service="universe",
                instance="c138",
                cluster="westeros-prod",
                soa_dir=DEFAULT_SOA_DIR,
            ),
        ]
        mock_load_marathon_service_config.assert_has_calls(calls)
        assert ret == [("universe", "c138", mock.ANY)]

        # Same ids, but c137's desired instance count (3) no longer matches
        # the running app's count (2): both instances now need an update.
        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c137.c1.g1",
                "instances": 3
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients, mock_service_instances, "westeros-prod")
        assert ret == [("universe", "c137", mock.ANY), ("universe", "c138", mock.ANY)]

        # Each of the following blocks makes c137's config raise while being
        # formatted; the instance should be skipped, leaving only c138.
        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoDockerImageError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients, mock_service_instances, "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoSlavesAvailableError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients, mock_service_instances, "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=InvalidJobNameError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients, mock_service_instances, "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoDeploymentsAvailable)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients, mock_service_instances, "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        # A completely unexpected exception is also swallowed per-instance.
        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=Exception)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients, mock_service_instances, "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]
def perform_command(
    command: str,
    service: str,
    instance: str,
    cluster: str,
    verbose: int,
    soa_dir: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    app_id: str = None,
) -> int:
    """Perform a command against a single marathon service instance.

    Only "restart" and "status" are actually handled here; any other command
    raises NotImplementedError (start/stop are presumably dispatched before
    reaching this function — confirm against the command parser).

    :param command: one of "restart" or "status" (see note above)
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: SOA configuration directory to read configs from
    :param clients: MarathonClients object holding all marathon clients
    :param job_config: the MarathonServiceConfig for this service instance
    :param app_id: marathon app id; derived from job_config when not given
    :returns: a unix-style return code (0 on success, 1 on failure)
    """
    system_config = load_system_paasta_config()
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()["id"]
        except NoDockerImageError:
            # Without a deployed docker image we cannot compute the app id,
            # so bail out with a user-facing hint instead of crashing.
            job_id = compose_job_id(service, instance)
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % job_id
            )
            return 1

    normal_instance_count = job_config.get_instances()

    current_client = clients.get_current_client_for_service(job_config)

    if command == "restart":
        restart_marathon_job(service, instance, app_id, current_client, cluster)
    elif command == "status":
        # Print desired state first, then the detailed per-app status report.
        paasta_print(
            status_desired_state(service, instance, current_client, job_config)
        )
        dashboards = get_marathon_dashboard_links(clients, system_config)
        tasks, out = status_marathon_job(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            normal_instance_count=normal_instance_count,
            clients=clients,
            job_config=job_config,
            desired_app_id=app_id,
            verbose=verbose,
        )
        paasta_print(out)
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=job_config.get_nerve_namespace(), soa_dir=soa_dir
        )

        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count, verbose)
        )

        # SmartStack backend status only applies when the namespace exposes a
        # proxy_port (i.e. the service is registered in SmartStack).
        proxy_port = service_namespace_config.get("proxy_port")
        if proxy_port is not None:
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service, instance, cluster
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    service_namespace_config=service_namespace_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(),
                )
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
def status_marathon_job(
    service: str,
    instance: str,
    cluster: str,
    soa_dir: str,
    dashboards: Dict[marathon_tools.MarathonClient, str],
    normal_instance_count: int,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    desired_app_id: str,
    verbose: int,
) -> Tuple[List[MarathonTask], str]:
    """Build a status report for every marathon app matching service.instance.

    :returns: a tuple of (all tasks across the matching apps, the full
        human-readable report joined into one newline-separated string)
    """
    apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
        service_name=service,
    )
    collected_tasks: List[MarathonTask] = []
    # Slot 0 is a placeholder; it is replaced with the human-readable summary
    # once the per-app totals below have been accumulated.
    output_lines = [""]
    total_running = 0

    if verbose > 0:
        autoscaling_info = get_autoscaling_info(apps_with_clients, job_config)
        if autoscaling_info:
            output_lines.append(" Autoscaling Info:")
            headers = [
                field.replace("_", " ").capitalize()
                for field in ServiceAutoscalingInfo._fields
            ]
            table = [headers, humanize_autoscaling_info(autoscaling_info)]
            output_lines.append(
                "\n".join(" %s" % line for line in format_table(table))
            )

    desired_status_str = "Waiting for bounce"
    matching = marathon_tools.get_matching_apps_with_clients(
        service, instance, apps_with_clients
    )
    for app, client in matching:
        collected_tasks.extend(app.tasks)
        app_status, app_running, app_report = status_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            verbose=verbose,
        )
        # Only the app we expect to end up with after the bounce drives the
        # headline deploy status.
        if app.id.lstrip("/") == desired_app_id.lstrip("/"):
            desired_status_str = marathon_tools.MarathonDeployStatus.tostring(
                app_status
            )
        total_running += app_running
        output_lines.append(app_report)

    output_lines[0] = status_marathon_job_human(
        service=service,
        instance=instance,
        deploy_status=desired_status_str,
        desired_app_id=desired_app_id,
        app_count=len(matching),
        running_instances=total_running,
        normal_instance_count=normal_instance_count,
    )
    return collected_tasks, "\n".join(output_lines)
def test_get_service_instances_needing_update():
    """An instance needs an update when its formatted app id is not running,
    or when the running app's instance count differs from the config.

    ``get_service_instances_needing_update`` returns tuples of
    ``(service, instance, config, app_id)``; instances whose configs fail to
    load or format are skipped entirely.
    """
    with mock.patch(
        "paasta_tools.deployd.common.get_all_marathon_apps",
        autospec=True,
    ) as mock_get_marathon_apps, mock.patch(
        "paasta_tools.deployd.common.load_marathon_service_config_no_cache",
        autospec=True,
    ) as mock_load_marathon_service_config:
        mock_get_marathon_apps.return_value = [
            mock.Mock(id="/universe.c137.c1.g1", instances=2),
            mock.Mock(id="/universe.c138.c1.g1", instances=2),
        ]
        mock_service_instances = [("universe", "c137"), ("universe", "c138")]
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client], previous=[mock_client])

        # c137 matches a running app exactly; c138's desired app id is not
        # running, so only c138 needs an update.
        mock_load_marathon_service_config.side_effect = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c137.c1.g1",
                "instances": 2,
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2,
            })),
        ]
        ret = get_service_instances_needing_update(
            fake_clients, mock_service_instances, "westeros-prod"
        )
        assert mock_get_marathon_apps.called
        mock_load_marathon_service_config.assert_has_calls([
            mock.call(
                service="universe",
                instance="c137",
                cluster="westeros-prod",
                soa_dir=DEFAULT_SOA_DIR,
            ),
            mock.call(
                service="universe",
                instance="c138",
                cluster="westeros-prod",
                soa_dir=DEFAULT_SOA_DIR,
            ),
        ])
        # Returned tuples are (service, instance, config, app_id).
        assert ret == [("universe", "c138", mock.ANY, "/universe.c138.c2.g2")]

        # c137's desired instance count now differs from the running app, so
        # both instances need an update.
        mock_load_marathon_service_config.side_effect = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c137.c1.g1",
                "instances": 3,
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2,
            })),
        ]
        ret = get_service_instances_needing_update(
            fake_clients, mock_service_instances, "westeros-prod"
        )
        assert ret == [
            ("universe", "c137", mock.ANY, "/universe.c137.c1.g1"),
            ("universe", "c138", mock.ANY, "/universe.c138.c2.g2"),
        ]

        # Whatever the reason a config fails to format, that instance is
        # skipped and the remaining instances are still evaluated.
        for config_error in (
            NoDockerImageError,
            InvalidJobNameError,
            NoDeploymentsAvailable,
        ):
            mock_load_marathon_service_config.side_effect = [
                mock.Mock(format_marathon_app_dict=mock.Mock(
                    side_effect=config_error)),
                mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                    "id": "universe.c138.c2.g2",
                    "instances": 2,
                })),
            ]
            ret = get_service_instances_needing_update(
                fake_clients, mock_service_instances, "westeros-prod"
            )
            assert ret == [("universe", "c138", mock.ANY, "/universe.c138.c2.g2")]