def delete_app(app_id, client):
    """Deletes a marathon app safely and logs to notify the user that it happened

    :param app_id: the full marathon app id to delete
    :param client: a marathon client object
    :raises: re-raises any unexpected exception after logging it line-by-line
    """
    log.warn("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    # Hoisted: the cluster cannot change mid-call, so read the system config once
    # instead of re-loading it in the exception handler below.
    cluster = load_system_paasta_config().get_cluster()
    try:
        # The zookeeper lock guards against racing a concurrent bounce of the same job.
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
                line=log_line,
            )
    except IOError:
        # NOTE(review): presumably bounce_lock_zookeeper signals a held lock via
        # IOError -- confirm against bounce_lib.
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config):
    """Deploy (or bounce) a single marathon service instance and report to sensu.

    :param service: the service name
    :param instance: the instance name
    :param client: a marathon client object
    :param soa_dir: the service configuration directory
    :param marathon_config: a MarathonConfig object
    :returns: 0 on success or skip, 1 on failure (unix-style return code)
    """
    # Hoisted: the original re-read the system config three times per call.
    cluster = load_system_paasta_config().get_cluster()
    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            cluster,
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        # Nothing deployed yet for this instance: not an error, just nothing to do.
        log.debug("No deployments found for %s.%s in cluster %s. Skipping." %
                  (service, instance, cluster))
        return 0
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
            (service, instance, cluster)
        log.error(error_msg)
        return 1
    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        return 0
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        # Configuration-shaped failures: alert the owning team via sensu instead
        # of crashing the whole deploy run.
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        return 1
def create_complete_config(service, instance, marathon_config, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create an app on Marathon

    :param service: the service name
    :param instance: the instance name
    :param marathon_config: a MarathonConfig object (kept for interface compatibility)
    :param soa_dir: the soa configuration directory to read from
    :returns: a dict ready for the Marathon API, with 'id' set to the fully
        qualified job id (service, instance, code sha, config hash)
    """
    system_paasta_config = load_system_paasta_config()
    partial_id = format_job_id(service=service, instance=instance)
    instance_config = load_marathon_service_config(
        service=service,
        instance=instance,
        # Reuse the already-loaded system config instead of re-reading it from disk.
        cluster=system_paasta_config.get_cluster(),
        soa_dir=soa_dir,
    )
    docker_url = get_docker_url(system_paasta_config.get_docker_registry(), instance_config.get_docker_image())
    service_namespace_config = load_service_namespace_config(
        service=service,
        namespace=instance_config.get_nerve_namespace(),
    )
    docker_volumes = system_paasta_config.get_volumes() + instance_config.get_extra_volumes()
    complete_config = instance_config.format_marathon_app_dict(
        app_id=partial_id,
        docker_url=docker_url,
        docker_volumes=docker_volumes,
        service_namespace_config=service_namespace_config,
    )
    code_sha = get_code_sha_from_dockerurl(docker_url)
    # The config hash (plus force_bounce) changes the app id whenever the config
    # changes, which is what triggers a bounce.
    config_hash = get_config_hash(
        complete_config,
        force_bounce=instance_config.get_force_bounce(),
    )
    full_id = format_job_id(service, instance, code_sha, config_hash)
    complete_config['id'] = full_id
    return complete_config
def create_complete_config(service, instance, marathon_config, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create an app on Marathon

    :param service: the service name
    :param instance: the instance name
    :param marathon_config: a MarathonConfig object (kept for interface compatibility)
    :param soa_dir: the soa configuration directory to read from
    :returns: a dict ready for the Marathon API, with 'id' set to the fully
        qualified job id (service, instance, code sha, config hash)
    """
    # A set of config attributes that don't get included in the hash of the config.
    # These should be things that PaaSTA/Marathon knows how to change without requiring a bounce.
    # frozenset: this is a constant and should never be mutated.
    CONFIG_HASH_BLACKLIST = frozenset(['instances', 'backoff_seconds'])
    system_paasta_config = load_system_paasta_config()
    partial_id = format_job_id(service=service, instance=instance)
    instance_config = load_marathon_service_config(
        service=service,
        instance=instance,
        # Reuse the already-loaded system config instead of re-reading it from disk.
        cluster=system_paasta_config.get_cluster(),
        soa_dir=soa_dir,
    )
    docker_url = get_docker_url(system_paasta_config.get_docker_registry(), instance_config.get_docker_image())
    service_namespace_config = load_service_namespace_config(
        service=service,
        namespace=instance_config.get_nerve_namespace(),
    )
    docker_volumes = system_paasta_config.get_volumes() + instance_config.get_extra_volumes()
    complete_config = instance_config.format_marathon_app_dict(
        app_id=partial_id,
        docker_url=docker_url,
        docker_volumes=docker_volumes,
        service_namespace_config=service_namespace_config,
    )
    code_sha = get_code_sha_from_dockerurl(docker_url)
    # Blacklisted keys are excluded so changing them doesn't force a bounce.
    config_hash = get_config_hash(
        {key: value for key, value in complete_config.items() if key not in CONFIG_HASH_BLACKLIST},
        force_bounce=instance_config.get_force_bounce(),
    )
    full_id = format_job_id(service, instance, code_sha, config_hash)
    complete_config['id'] = full_id
    return complete_config
def test_load_system_paasta_config_file_non_existent_dir():
    """A missing config directory should raise PaastaNotConfiguredError."""
    fake_path = "/var/dir_of_fake"
    # contextlib.nested is deprecated and unnecessary for a single context
    # manager; use mock.patch directly.
    with mock.patch("os.path.isdir", return_value=False):
        with raises(utils.PaastaNotConfiguredError) as excinfo:
            utils.load_system_paasta_config(fake_path)
        expected = "Could not find system paasta configuration directory: %s" % fake_path
        assert str(excinfo.value) == expected
def main():
    """Attempt to set up the marathon service instance given.
    Exits 1 if the deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Load the service instance's configuration
    - Create the complete marathon job configuration
    - Deploy/bounce the service
    - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
        sys.exit(1)
    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    # Hoisted: the original re-read the system config three times.
    cluster = load_system_paasta_config().get_cluster()
    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            cluster,
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug("No deployments found for %s in cluster %s. Skipping." %
                  (args.service_instance, cluster))
        sys.exit(0)
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s in cluster %s" % \
            (args.service_instance, cluster)
        log.error(error_msg)
        sys.exit(1)
    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
def delete_app(app_id, client, soa_dir):
    """Deletes a marathon app safely and logs to notify the user that it happened

    :param app_id: the full marathon app id to delete
    :param client: a marathon client object
    :param soa_dir: the service configuration directory (used for sensu events)
    :raises: re-raises any unexpected exception after logging it line-by-line
    """
    log.warn("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        # The zookeeper lock guards against racing a concurrent bounce of this job.
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
            # Resolve every sensu check tied to this instance. The payloads are
            # identical apart from the check name, so emit them in a loop
            # instead of three copy-pasted calls.
            for check_prefix in (
                'check_marathon_services_replication',
                'setup_marathon_job',
                'paasta_bounce_progress',
            ):
                send_event(
                    service=service,
                    check_name='%s.%s' % (check_prefix, short_app_id),
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    overrides={},
                    output="This instance was removed and is no longer running",
                )
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
                line=log_line,
            )
    except IOError:
        # NOTE(review): presumably bounce_lock_zookeeper signals a held lock via
        # IOError -- confirm against bounce_lib.
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                # Fixed: reuse the cluster loaded above rather than re-reading
                # the system config inside exception handling.
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def test_load_system_paasta_config_file_dne():
    """A config file that fails to open should surface the IOError details
    wrapped in a PaastaNotConfiguredError."""
    fake_path = "/var/dir_of_fake"
    # Multi-context ``with`` (valid since 2.7) instead of contextlib.nested;
    # the patch objects were never used, so the ``as`` bindings are dropped.
    with mock.patch("os.path.isdir", return_value=True), \
            mock.patch("os.access", return_value=True), \
            mock.patch("paasta_tools.utils.open", create=True, side_effect=IOError(2, "a", "b")), \
            mock.patch("paasta_tools.utils.get_readable_files_in_glob",
                       autospec=True, return_value=[fake_path]):
        with raises(utils.PaastaNotConfiguredError) as excinfo:
            utils.load_system_paasta_config(fake_path)
        assert str(excinfo.value) == "Could not load system paasta config file b: a"
def test_load_system_paasta_config_file_non_readable_dir():
    """An unreadable config directory should raise PaastaNotConfiguredError."""
    fake_path = '/var/dir_of_fake'
    # Multi-context ``with`` instead of contextlib.nested; the patch objects
    # were never used, so the ``as`` bindings are dropped.
    with mock.patch('os.path.isdir', return_value=True), \
            mock.patch('os.access', return_value=False):
        with raises(utils.PaastaNotConfiguredError) as excinfo:
            utils.load_system_paasta_config(fake_path)
        expected = "Could not read from system paasta configuration directory: %s" % fake_path
        assert str(excinfo.value) == expected
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon service instance in this cluster.

    Eligible means the instance declares ``max_instances`` and its desired
    state is 'start'. Silently returns if another autoscaling run holds the lock.
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            service_instances = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            loaded_configs = [
                load_marathon_service_config(
                    service=svc,
                    instance=inst,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                for svc, inst in service_instances
            ]
            configs = [
                cfg for cfg in loaded_configs
                if cfg.get_max_instances() and cfg.get_desired_state() == 'start'
            ]
            if configs:
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        # Another run is in progress; skipping is the intended behavior.
        pass
def service_instance_status_error(context, error_code, job_id): marathon_config = marathon_tools.load_marathon_config() settings.marathon_client = marathon_tools.get_marathon_client( marathon_config.get_url(), marathon_config.get_username(), marathon_config.get_password() ) settings.cluster = load_system_paasta_config().get_cluster() settings.soa_dir = context.soa_dir (service, instance, _, __) = decompose_job_id(job_id) request = testing.DummyRequest() request.matchdict = {'service': service, 'instance': instance} response = None try: response = instance_status(request) except InstanceFailure as exc: print exc.msg assert exc.err == int(error_code) except: raise assert not response
def create_complete_config(service, job_name, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create a job on Chronos"""
    system_paasta_config = load_system_paasta_config()
    chronos_job_config = load_chronos_job_config(
        service, job_name, system_paasta_config.get_cluster(), soa_dir=soa_dir)
    docker_url = get_docker_url(
        system_paasta_config.get_docker_registry(), chronos_job_config.get_docker_image())
    volumes = system_paasta_config.get_volumes() + chronos_job_config.get_extra_volumes()
    complete_config = chronos_job_config.format_chronos_job_dict(docker_url, volumes)
    code_sha = get_code_sha_from_dockerurl(docker_url)
    config_hash = get_config_hash(complete_config)
    # Chronos clears the history for a job whenever it is updated, so we use a new job name for each revision
    # so that we can keep history of old job revisions rather than just the latest version
    complete_config["name"] = compose_job_id(service, job_name, code_sha, config_hash)
    # If the job was previously stopped, we should stop the new job as well
    # NOTE this clobbers the 'disabled' param specified in the config file!
    desired_state = chronos_job_config.get_desired_state()
    disabled_by_state = {"start": False, "stop": True}
    if desired_state in disabled_by_state:
        complete_config["disabled"] = disabled_by_state[desired_state]
    log.debug("Complete configuration for instance is: %s" % complete_config)
    return complete_config
def get_marathon_services_running_here_for_nerve(cluster, soa_dir):
    """Return (nerve_name, nerve_config) pairs for every marathon service on
    this host that is registered in smartstack."""
    if not cluster:
        try:
            cluster = load_system_paasta_config().get_cluster()
        # In the cases where there is *no* cluster or in the case
        # where there isn't a Paasta configuration file at *all*, then
        # there must be no marathon services running here, so we catch
        # these custom exceptions and return [].
        except PaastaNotConfiguredError:
            return []
    # When a cluster is defined in mesos, let's iterate through marathon services
    nerve_list = []
    for name, instance, port in marathon_services_running_here():
        try:
            namespace = read_namespace_for_service_instance(name, instance, cluster, soa_dir)
            nerve_dict = load_service_namespace_config(name, namespace, soa_dir)
            if not nerve_dict.is_in_smartstack():
                continue
            nerve_dict['port'] = port
            nerve_list.append((compose_job_id(name, namespace), nerve_dict))
        except KeyError:
            continue  # SOA configs got deleted for this app, it'll get cleaned up
    return nerve_list
def send_event(service, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    monitoring_overrides = job_config.get_monitoring()
    # check_every tells sensu how often to expect this check to fire; our cron
    # job runs every 10s. Most deploy_chronos_jobs failures are transient and
    # represent issues that will probably be fixed eventually, so alert_after
    # suppresses the extra noise.
    monitoring_overrides.update({
        'check_every': '10s',
        'alert_after': '10m',
    })
    monitoring_tools.send_event(
        service=service,
        check_name='setup_chronos_job.%s' % compose_job_id(service, instance),
        overrides=monitoring_overrides,
        status=status,
        output=output,
        soa_dir=soa_dir,
    )
def paasta_emergency_start(args): """Performs an emergency start on a given service instance on a given cluster Warning: This command is not magic and cannot actually get a service to start if it couldn't run before. This includes configurations that prevent the service from running, such as 'instances: 0' (for Marathon apps). All it does for Marathon apps is ask Marathon to resume normal operation by scaling up to the instance count defined in the service's config. All it does for Chronos jobs is send the latest version of the job config to Chronos and run it immediately. """ system_paasta_config = load_system_paasta_config() service = figure_out_service_name(args, soa_dir=args.soa_dir) print "Performing an emergency start on %s..." % compose_job_id(service, args.instance) output = execute_paasta_serviceinit_on_remote_master( subcommand="start", cluster=args.cluster, service=service, instances=args.instance, system_paasta_config=system_paasta_config, ) print "%s" % "\n".join(paasta_emergency_start.__doc__.splitlines()[-8:]) print "Output: %s" % PaastaColors.grey(output) print "Run this command to see the status:" print "paasta status --service %s --clusters %s" % (service, args.cluster)
def main():
    """Check replication for every marathon service instance in this cluster."""
    args = parse_args()
    soa_dir = args.soa_dir
    logging.basicConfig()
    log.setLevel(logging.DEBUG if args.verbose else logging.WARNING)
    cluster = load_system_paasta_config().get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster, instance_type='marathon', soa_dir=args.soa_dir)
    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        config.get_url(), config.get_username(), config.get_password())
    for service, instance in service_instances:
        check_service_replication(
            client=client,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
def create_complete_config(service, job_name, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create a job on Chronos"""
    system_paasta_config = load_system_paasta_config()
    chronos_job_config = load_chronos_job_config(
        service, job_name, system_paasta_config.get_cluster(), soa_dir=soa_dir)
    docker_url = get_docker_url(
        system_paasta_config.get_docker_registry(), chronos_job_config.get_docker_image())
    volumes = system_paasta_config.get_volumes() + chronos_job_config.get_extra_volumes()
    complete_config = chronos_job_config.format_chronos_job_dict(
        docker_url,
        volumes,
        system_paasta_config.get_dockercfg_location(),
    )
    complete_config['name'] = compose_job_id(service, job_name)
    # resolve conflicts between the 'desired_state' and soa_configs disabled
    # flag.
    complete_config['disabled'] = determine_disabled_state(
        chronos_job_config.get_desired_state(),
        complete_config['disabled'],
    )
    # we use the undocumented description field to store a hash of the chronos config.
    # this makes it trivial to compare configs and know when to bounce.
    complete_config['description'] = get_config_hash(complete_config)
    log.debug("Complete configuration for instance is: %s" % complete_config)
    return complete_config
def main(argv=None):
    """Start the paasta-api WSGI server and serve until interrupted."""
    args = parse_paasta_api_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.WARNING)
    if args.soa_dir:
        settings.soa_dir = args.soa_dir
    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()
    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )
    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)
    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)
def run(self):
    """Check replication for every configured service and report each result
    as a monitoring event."""
    self.setup_logging()
    all_service_config = read_services_configuration()
    system_config = load_system_paasta_config()
    service_replication = self.get_service_replication(
        all_services=all_service_config.keys(),
        synapse_host=system_config.get_default_synapse_host(),
        synapse_port=system_config.get_synapse_port(),
        synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
    )
    checked_services = []
    for service, service_config in all_service_config.iteritems():
        do_monitoring, monitoring_config = extract_replication_info(service_config)
        # Guard clause: services without monitoring config are skipped outright.
        if not do_monitoring:
            self.log.debug("Not checking {0}".format(service))
            continue
        self.log.debug("Checking {0}".format(service))
        replication = service_replication.get('%s.main' % service, 0)
        event = do_replication_check(service, monitoring_config, replication)
        checked_services.append(service)
        self.log.debug("Result for {0}: {1}".format(service, event['output']))
        report_event(event)
    self.ok("Finished checking services: {0}".format(checked_services))
def load_performance_check_config():
    """Load the performance-check section of the system paasta config.

    Exits 0 (deliberately non-fatal) when paasta is not configured on this box.
    """
    try:
        return load_system_paasta_config().get_performance_check_config()
    except PaastaNotConfiguredError as e:
        print "No performance check config to use. Safely bailing."
        # NOTE(review): ``strerror`` is an attribute of EnvironmentError
        # subclasses; unless PaastaNotConfiguredError defines it, this line
        # would raise AttributeError instead of printing -- confirm against
        # the exception's definition.
        print e.strerror
        sys.exit(0)
def create_complete_config(service, job_name, soa_dir=DEFAULT_SOA_DIR):
    """Generates a complete dictionary to be POST'ed to create a job on Chronos"""
    system_paasta_config = load_system_paasta_config()
    chronos_job_config = load_chronos_job_config(
        service, job_name, system_paasta_config.get_cluster(), soa_dir=soa_dir)
    docker_url = get_docker_url(
        system_paasta_config.get_docker_registry(), chronos_job_config.get_docker_image())
    volumes = system_paasta_config.get_volumes() + chronos_job_config.get_extra_volumes()
    complete_config = chronos_job_config.format_chronos_job_dict(
        docker_url,
        volumes,
    )
    complete_config['name'] = compose_job_id(service, job_name)
    # If the job was previously stopped, we should stop the new job as well
    # NOTE this clobbers the 'disabled' param specified in the config file!
    desired_state = chronos_job_config.get_desired_state()
    disabled_by_state = {'start': False, 'stop': True}
    if desired_state in disabled_by_state:
        complete_config['disabled'] = disabled_by_state[desired_state]
    # we use the undocumented description field to store a hash of the chronos config.
    # this makes it trivial to compare configs and know when to bounce.
    complete_config['description'] = get_config_hash(complete_config)
    log.debug("Complete configuration for instance is: %s" % complete_config)
    return complete_config
def main(args):
    """Check the state of every configured chronos job and report to sensu."""
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    # get those jobs listed in configs
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)
    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service = service_instance[0]
        instance = service_instance[1]
        sensu_output, sensu_status = sensu_message_status_for_jobs(service, instance, job_state_pairs)
        monitoring_overrides = compose_monitoring_overrides_for_service(
            cluster=cluster,
            service=service,
            instance=instance,
            soa_dir=args.soa_dir
        )
        send_event_to_sensu(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=args.soa_dir,
        )
def get_paasta_api_client(cluster=None, system_paasta_config=None):
    """Build a SwaggerClient pointed at the paasta-api server for a cluster.

    :param cluster: cluster name; defaults to the local cluster
    :param system_paasta_config: a SystemPaastaConfig; loaded from disk if omitted
    :returns: a SwaggerClient, or None if the endpoint or swagger spec cannot
        be resolved
    """
    if not system_paasta_config:
        system_paasta_config = load_system_paasta_config()
    if not cluster:
        cluster = system_paasta_config.get_cluster()
    api_endpoints = system_paasta_config.get_api_endpoints()
    if cluster not in api_endpoints:
        log.error('Cluster %s not in paasta-api endpoints config', cluster)
        return None
    url = str(api_endpoints[cluster])
    parsed = urlparse(url)
    # Fixed: urlparse() always returns a (truthy) 6-field result, so the old
    # ``if not parsed`` check could never fire. Reject URLs with no host instead.
    if not parsed.netloc:
        log.error('Unsupported paasta-api url %s', url)
        return None
    api_server = parsed.netloc
    # Get swagger spec from file system instead of the api server
    paasta_api_path = os.path.dirname(sys.modules['paasta_tools.api'].__file__)
    swagger_file = os.path.join(paasta_api_path, 'api_docs/swagger.json')
    if not os.path.isfile(swagger_file):
        log.error('paasta-api swagger spec %s does not exist', swagger_file)
        return None
    with open(swagger_file) as f:
        spec_dict = json.load(f)
    # replace localhost in swagger.json with actual api server
    spec_dict['host'] = api_server
    return SwaggerClient.from_spec(spec_dict=spec_dict)
def main(argv):
    """Launch a scheduler+driver pair for every paasta-native job in this
    cluster, run them for --stay-alive-seconds, then return the schedulers."""
    args = parse_args(argv)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    # Keep each scheduler paired with its driver so the periodic loop below
    # can walk them together.
    scheduler_driver_pairs = []
    for service, instance in get_paasta_native_jobs_for_cluster(cluster=cluster, soa_dir=args.soa_dir):
        scheduler = PaastaScheduler(
            service_name=service,
            instance_name=instance,
            cluster=cluster,
            system_paasta_config=system_paasta_config,
            soa_dir=args.soa_dir,
        )
        driver = create_driver(
            service=service,
            instance=instance,
            scheduler=scheduler,
            system_paasta_config=system_paasta_config,
        )
        driver.start()
        scheduler_driver_pairs.append((scheduler, driver))
    deadline = time.time() + args.stay_alive_seconds
    while time.time() < deadline:
        sleep(args.periodic_interval)
        for scheduler, driver in scheduler_driver_pairs:
            scheduler.periodic(driver)
    return [scheduler for scheduler, _ in scheduler_driver_pairs]
def paasta_status(args): """Print the status of a Yelp service running on PaaSTA. :param args: argparse.Namespace obj created from sys.args by cli""" soa_dir = args.soa_dir service = figure_out_service_name(args, soa_dir) actual_deployments = get_actual_deployments(service, soa_dir) system_paasta_config = load_system_paasta_config() if args.clusters is not None: cluster_whitelist = args.clusters.split(",") else: cluster_whitelist = [] if args.instances is not None: instance_whitelist = args.instances.split(",") else: instance_whitelist = [] if actual_deployments: deploy_pipeline = list(get_planned_deployments(service, soa_dir)) report_status( service=service, deploy_pipeline=deploy_pipeline, actual_deployments=actual_deployments, cluster_whitelist=cluster_whitelist, instance_whitelist=instance_whitelist, system_paasta_config=system_paasta_config, verbose=args.verbose, ) else: print missing_deployments_message(service)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon service instance in this cluster.

    Fetches all marathon/mesos task state once up front, then for each
    scalable config builds the set of healthy tasks and hands it to
    autoscale_marathon_instance. Per-service failures are logged and do not
    abort the run; a held lock skips the whole run.
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            # NOTE: get_app() is called per task here, so each
                            # iteration can hit the marathon API.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            # Only consider mesos tasks that correspond to a healthy marathon task.
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            # Best-effort per service: log and continue with the next config.
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
        pass
def test_load_system_paasta_config():
    """load_system_paasta_config should build a SystemPaastaConfig from the
    json files found in the config directory."""
    json_load_return_value = {'foo': 'bar'}
    expected = utils.SystemPaastaConfig(json_load_return_value, '/some/fake/dir')
    file_mock = mock.MagicMock(spec=file)
    # Multi-context ``with`` instead of contextlib.nested; only the patches we
    # assert on keep their ``as`` bindings.
    with mock.patch('os.path.isdir', return_value=True), \
            mock.patch('os.access', return_value=True), \
            mock.patch('paasta_tools.utils.open', create=True,
                       return_value=file_mock) as open_file_patch, \
            mock.patch('paasta_tools.utils.get_readable_files_in_glob', autospec=True,
                       return_value=['/some/fake/dir/some_file.json']), \
            mock.patch('paasta_tools.utils.json.load', autospec=True,
                       return_value=json_load_return_value) as json_patch:
        actual = utils.load_system_paasta_config()
        assert actual == expected
        # Kinda weird but without this load_system_paasta_config() can (and
        # did! during development) return a plain dict without the test
        # complaining.
        assert actual.__class__ == expected.__class__
        open_file_patch.assert_any_call('/some/fake/dir/some_file.json')
        json_patch.assert_any_call(file_mock.__enter__())
        assert json_patch.call_count == 1
def start_paasta_native_framework(context, reconcile_backoff):
    """Start a paasta-native scheduler/driver pair on the behave context and
    wait up to ~10 seconds for it to register a framework id."""
    clear_mesos_tools_cache()
    system_paasta_config = load_system_paasta_config()
    # Point at the public registry so busybox runs.
    system_paasta_config['docker_registry'] = 'docker.io'
    context.scheduler = PaastaScheduler(
        service_name=context.service,
        instance_name=context.instance,
        cluster=context.cluster,
        system_paasta_config=system_paasta_config,
        service_config=context.new_config,
        reconcile_backoff=int(reconcile_backoff),
    )
    context.driver = create_driver(
        service=context.service,
        instance=context.instance,
        scheduler=context.scheduler,
        system_paasta_config=system_paasta_config,
    )
    context.driver.start()
    if not hasattr(context, 'framework_ids'):
        context.framework_ids = []
    # Poll once a second for up to ten attempts; the for-else raises if the
    # scheduler never registers.
    for attempt in xrange(10):
        if context.scheduler.framework_id:
            context.framework_ids.append(context.scheduler.framework_id)
            break
        time.sleep(1)
    else:
        raise Exception("Expected scheduler to successfully register before timeout")
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param app_id: the marathon app id; derived from the job config if omitted
    :param delta: unused in this function -- TODO confirm callers rely on it
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            # No image in deployments.json means nothing has been deployed yet.
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1
    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, cluster, soa_dir=soa_dir)
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        # Print each status section in turn; verbose levels add more detail.
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            print status_mesos_tasks_verbose(
                job_id=app_id,
                get_short_task_id=get_short_task_id,
                tail_lines=tail_lines,
            )
        # proxy_port is only set for smartstack-registered instances.
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
def paasta_emergency_stop(args):
    """Performs an emergency stop on a given service instance on a given cluster

    Warning: This command does not permanently stop the service. The next time the
    service is updated (config change, deploy, bounce, etc.), those settings will
    override the emergency stop. If you want this stop to be permanant, adjust the
    relevant config file to reflect that. For example, this can be done for Marathon
    apps by setting 'instances: 0', or for Chronos jobs by setting 'disabled: True'.
    Alternatively, remove the config yaml entirely.
    """
    # NOTE: the tail of this docstring is printed to the operator below via
    # __doc__.splitlines()[-7:], so do not reflow, reword, or fix typos in it
    # without also checking that output.
    system_paasta_config = load_system_paasta_config()
    service = figure_out_service_name(args, soa_dir=args.soa_dir)
    print "Performing an emergency stop on %s..." % compose_job_id(service, args.instance)
    # Perform the actual 'stop' via serviceinit on the remote cluster master.
    output = execute_paasta_serviceinit_on_remote_master(
        subcommand="stop",
        cluster=args.cluster,
        service=service,
        instances=args.instance,
        system_paasta_config=system_paasta_config,
        app_id=args.appid,
    )
    print "Output: %s" % output
    # Echo the docstring's warning so the operator knows the stop is temporary.
    print "%s" % "\n".join(paasta_emergency_stop.__doc__.splitlines()[-7:])
    print "To start this service again asap, run:"
    print "paasta emergency-start --service %s --instance %s --cluster %s" % (service, args.instance, args.cluster)
def main():
    # Remove Chronos jobs (and their running tasks) that PaaSTA no longer
    # expects to exist in this cluster.
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    # Jobs currently registered in Chronos.
    running_jobs = set(deployed_job_names(client))

    # Jobs that should exist according to the soa-configs for this cluster.
    expected_service_jobs = {
        chronos_tools.compose_job_id(*job)
        for job in chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)
    }

    # Temporary ("tmp") jobs are allowed to linger until they expire.
    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(
        filter_expired_tmp_jobs(client, all_tmp_jobs, cluster=cluster, soa_dir=soa_dir))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    # Anything running that is neither expected nor a still-valid tmp job is removed.
    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        # Each response tuple ends with an Exception object on failure.
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            try:
                (service, instance) = chronos_tools.decompose_job_id(response[0])
                # Resolve the Sensu check for the removed job so it stops alerting.
                monitoring_tools.send_event(
                    check_name=check_chronos_job_name(service, instance),
                    service=service,
                    overrides={},
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    output="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # Just move on, no need to send any kind of paasta event.
                pass

    if len(to_delete) == 0:
        paasta_print('No Chronos Jobs to remove')
    else:
        if len(task_successes) > 0:
            paasta_print(
                format_list_output(
                    "Successfully Removed Tasks (if any were running) for:",
                    [job[0] for job in task_successes],
                ))

        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            paasta_print(
                format_list_output("Failed to Delete Tasks for:",
                                   [job[0] for job in task_failures]))

        if len(job_successes) > 0:
            paasta_print(
                format_list_output("Successfully Removed Jobs:",
                                   [job[0] for job in job_successes]))

        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            paasta_print(
                format_list_output("Failed to Delete Jobs:",
                                   [job[0] for job in job_failures]))

    if len(job_failures) > 0 or len(task_failures) > 0:
        sys.exit(1)
def send_event(service, check_name, overrides, status, output, soa_dir, ttl=None, cluster=None):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param check_name: The name of the check as it appears in Sensu
    :param overrides: A dictionary containing overrides for monitoring options
                      (e.g. notification_email, ticket, page)
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    :param soa_dir: The service directory to read monitoring information from
    :param ttl: Optional time-to-live to set on the Sensu event
    :param cluster: The cluster name (optional)
    """
    # This function assumes the input is a string like "mumble.main"
    team = get_team(overrides, service, soa_dir)
    if not team:
        # No owning team configured: nobody to notify, so drop the event.
        return

    system_paasta_config = load_system_paasta_config()
    if cluster is None:
        try:
            cluster = system_paasta_config.get_cluster()
        except PaastaNotConfiguredError:
            # Fall back for hosts without a full PaaSTA configuration.
            cluster = "localhost"

    alert_after = overrides.get("alert_after", "5m")
    result_dict = {
        "name": check_name,
        "runbook": overrides.get("runbook", "http://y/paasta-troubleshooting"),
        "status": status,
        "output": output,
        "team": team,
        "page": get_page(overrides, service, soa_dir),
        "tip": get_tip(overrides, service, soa_dir),
        "notification_email": get_notification_email(overrides, service, soa_dir),
        "check_every": overrides.get("check_every", "1m"),
        "realert_every": overrides.get("realert_every", monitoring_defaults("realert_every")),
        # A bare int override is treated as seconds; strings like "5m" pass through.
        "alert_after": f"{alert_after}s" if isinstance(alert_after, int) else alert_after,
        "irc_channels": get_irc_channels(overrides, service, soa_dir),
        "slack_channels": get_slack_channels(overrides, service, soa_dir),
        "ticket": get_ticket(overrides, service, soa_dir),
        "project": get_project(overrides, service, soa_dir),
        "priority": get_priority(overrides, service, soa_dir),
        "source": "paasta-%s" % cluster,
        "tags": get_tags(overrides, service, soa_dir),
        "ttl": ttl,
        "sensu_host": system_paasta_config.get_sensu_host(),
        "sensu_port": system_paasta_config.get_sensu_port(),
        "component": get_component(overrides, service, soa_dir),
        "description": get_description(overrides, service, soa_dir),
    }

    # Only emit when a sensu host is configured for this box.
    if result_dict.get("sensu_host"):
        pysensu_yelp.send_event(**result_dict)
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys from paasta_tools.smartstack_tools import are_services_up_on_ip_port from paasta_tools.utils import load_system_paasta_config system_paasta_config = load_system_paasta_config() synapse_port = system_paasta_config.get_synapse_port() synapse_host = '169.254.255.254' synapse_haproxy_url_format = system_paasta_config.get_synapse_haproxy_url_format( ) host_ip = os.environ['PAASTA_POD_IP'] port = sys.argv[1] services = sys.argv[2:] if are_services_up_on_ip_port( synapse_host=synapse_host, synapse_port=synapse_port, synapse_haproxy_url_format=synapse_haproxy_url_format, services=services, host_ip=host_ip, host_port=int(port),
def set_boost_factor(
    zk_boost_path: str,
    region: str='',
    pool: str='',
    send_clusterman_metrics: bool=False,
    factor: float=DEFAULT_BOOST_FACTOR,
    duration_minutes: int=DEFAULT_BOOST_DURATION,
    override: bool=False,
) -> bool:
    """
    Set a boost factor for a path in zk

    Can be used to boost either cluster or service autoscalers.
    If using for cluster you must specify region, pool and set
    send_clusterman_metrics=True so that clusterman metrics are updated

    otherwise just zk_boost_path is enough.

    :returns: True if the boost was written to zk and read back intact,
        False if the factor was invalid or a boost is already active.
    """
    # Reject/clamp unreasonable inputs rather than writing them to zk.
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    if factor > MAX_BOOST_FACTOR:
        log.warning('Boost factor {} does not sound reasonable. Defaulting to {}'.format(
            factor,
            MAX_BOOST_FACTOR,
        ))
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning('Boost duration of {} minutes is too much. Falling back to {}.'.format(
            duration_minutes,
            MAX_BOOST_DURATION,
        ))
        duration_minutes = MAX_BOOST_DURATION

    current_time = get_time()
    end_time = current_time + 60 * duration_minutes

    # Mirror the boost into clusterman's metrics store when requested
    # (the cluster-autoscaler case).
    if clusterman_metrics and send_clusterman_metrics:
        cluster = load_system_paasta_config().get_cluster()
        metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(region_name=region, app_identifier='default')
        with metrics_client.get_writer(clusterman_metrics.APP_METRICS) as writer:
            metrics_key = clusterman_metrics.generate_key_with_dimensions(
                'boost_factor',
                {'cluster': cluster, 'pool': pool},
            )
            writer.send((metrics_key, current_time, factor))
            if duration_minutes > 0:
                # Also record the factor returning to 1.0 when the boost expires.
                writer.send((metrics_key, end_time, 1.0))

    zk_end_time_path = zk_boost_path + '/end_time'
    zk_factor_path = zk_boost_path + '/factor'
    zk_expected_load_path = zk_boost_path + '/expected_load'

    with ZookeeperPool() as zk:
        # Refuse to stomp on a still-active boost unless explicitly overridden.
        if (
            not override
            and current_time < get_boost_values(zk_boost_path, zk).end_time
        ):
            log.error('Boost already active. Not overriding.')
            return False

        try:
            zk.ensure_path(zk_end_time_path)
            zk.ensure_path(zk_factor_path)
            zk.ensure_path(zk_expected_load_path)
            zk.set(zk_end_time_path, str(end_time).encode('utf-8'))
            zk.set(zk_factor_path, str(factor).encode('utf-8'))
            zk.set(zk_expected_load_path, '0'.encode('utf-8'))
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        log.info('Load boost: Set capacity boost factor {} at path {} until {}'.format(
            factor,
            zk_boost_path,
            datetime.fromtimestamp(end_time).strftime('%c'),
        ))

        # Let's check that this factor has been properly written to zk
        return get_boost_values(zk_boost_path, zk) == BoostValues(
            end_time=end_time,
            boost_factor=factor,
            expected_load=0,
        )
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: A dictionary of parameters for the drain method
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: The path to the yelpsoa-configs directory
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        # NOTE(review): the `level` parameter is accepted but not forwarded --
        # _log is always called with level='event', so the level='debug' calls
        # below still log as 'event'. Looks like a bug; confirm before changing.
        return _log(service=service, line=errormsg, component='deploy', level='event', cluster=cluster, instance=instance)

    short_id = marathon_tools.format_job_id(service, instance)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    # Partition running apps into "the one we want" and everything else.
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, system_paasta_config, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            # Scale up directly; the bounce machinery only manages scale-downs.
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            # Scaling down: pick tasks to retire in priority order --
            # already-draining first, then unhappy, then happy.
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            # These tasks must stay draining -- exempt them from the undrain loop below.
            protected_draining_tasks.update(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            # Serialize bounces for this service.instance across workers.
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def get_mesos_config_path():
    """Return the path to the mesos-cli configuration file.

    Uses the "path" entry of the mesos_cli section of the system paasta
    config, falling back to DEFAULT_MESOS_CLI_CONFIG_LOCATION when absent.
    """
    mesos_cli_config = load_system_paasta_config().get_mesos_cli_config()
    return mesos_cli_config.get("path", DEFAULT_MESOS_CLI_CONFIG_LOCATION)
def get_zookeeper_host_path():
    """Split the configured ZooKeeper hosts into a ZookeeperHostPath tuple.

    The hosts string from the system paasta config is parsed as a zk:// URL
    so the host list and chroot path come out as netloc and path.
    """
    parsed = urlparse("zk://%s" % load_system_paasta_config().get_zk_hosts())
    return ZookeeperHostPath(host=parsed.netloc, path=parsed.path)
def load_chronos_config():
    """Build a ChronosConfig from the system paasta config.

    :raises ChronosNotConfigured: when the system paasta config has no
        chronos_config section.
    """
    try:
        chronos_config_dict = load_system_paasta_config().get_chronos_config()
    except PaastaNotConfiguredError:
        raise ChronosNotConfigured(
            "Could not find chronos_config in configuration directory")
    return ChronosConfig(chronos_config_dict)
def get_zookeeper_host_path():
    """Parse the 'zookeeper' URL from the system paasta config.

    Returns a ZookeeperHostPath with the host list (netloc) and chroot path.
    """
    parsed_zk_url = urlparse(load_system_paasta_config()['zookeeper'])
    return ZookeeperHostPath(host=parsed_zk_url.netloc, path=parsed_zk_url.path)
def get_tron_dashboard_for_cluster(cluster: str):
    """Return the Tron dashboard link configured for the given cluster.

    :raises Exception: when the cluster's dashboard links have no 'Tron' entry.
    """
    cluster_dashboards = load_system_paasta_config().get_dashboard_links()[cluster]
    try:
        return cluster_dashboards['Tron']
    except KeyError:
        raise Exception(
            f"tron api endpoint is not defined for cluster {cluster}")
def paasta_local_run(args):
    # Pulling images needs root access to docker; re-exec this whole command
    # under sudo (execvp replaces the current process).
    if args.action == 'pull' and os.geteuid() != 0:
        paasta_print("Re-executing paasta local-run --pull with sudo..")
        os.execvp("sudo", ["sudo", "-H"] + sys.argv)
    if args.action == 'build' and not makefile_responds_to('cook-image'):
        paasta_print(
            "A local Makefile with a 'cook-image' target is required for --build",
            file=sys.stderr)
        paasta_print(
            "If you meant to pull the docker image from the registry, explicitly pass --pull",
            file=sys.stderr)
        return 1
    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        # NOTE(review): these implicitly-concatenated fragments have no
        # separating spaces ("indicatesPaaSTA", "behavethe") -- likely missing
        # trailing spaces; confirm intended wording before changing the literal.
        paasta_print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
                "PaaSTA is not configured locally on this host, and local-run may not behave"
                "the same way it would behave on a server configured for PaaSTA.",
            ),
            sep='\n',
        )
        # Fall back to a minimal config so local-run can still proceed.
        system_paasta_config = SystemPaastaConfig({"volumes": []}, '/etc/paasta')
    local_run_config = system_paasta_config.get_local_run_config()

    service = figure_out_service_name(args, soa_dir=args.yelpsoa_config_root)
    if args.cluster:
        cluster = args.cluster
    else:
        try:
            cluster = local_run_config['default_cluster']
        except KeyError:
            paasta_print(
                PaastaColors.red(
                    "PaaSTA on this machine has not been configured with a default cluster."
                    "Please pass one to local-run using '-c'.",
                ),
                sep='\n',
                file=sys.stderr,
            )
            return 1
    instance = args.instance
    docker_client = get_docker_client()

    if args.action == 'build':
        # Build locally via the service's Makefile ('cook-image' target) and
        # tag the result; DOCKER_TAG is exported for the Makefile to use.
        default_tag = 'paasta-local-run-%s-%s' % (service, get_username())
        tag = os.environ.get('DOCKER_TAG', default_tag)
        os.environ['DOCKER_TAG'] = tag
        pull_image = False
        cook_return = paasta_cook_image(args=None, service=service, soa_dir=args.yelpsoa_config_root)
        if cook_return != 0:
            return cook_return
    elif args.action == 'dry_run':
        pull_image = False
        tag = None
    else:
        # Default: pull the deployed image from the registry.
        pull_image = True
        tag = None

    try:
        return configure_and_run_docker_container(
            docker_client=docker_client,
            docker_hash=tag,
            service=service,
            instance=instance,
            cluster=cluster,
            args=args,
            pull_image=pull_image,
            system_paasta_config=system_paasta_config,
            dry_run=args.action == 'dry_run',
        )
    except errors.APIError as e:
        paasta_print(
            'Can\'t run Docker container. Error: %s' % str(e),
            file=sys.stderr,
        )
        return 1
def load_tron_config():
    """Wrap the tron section of the system paasta config in a TronConfig."""
    system_paasta_config = load_system_paasta_config()
    return TronConfig(system_paasta_config.get_tron_config())
def deploy_marathon_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
) -> Tuple[int, float]:
    """deploy the service instance given and process return code
    if there was an error we send a sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_apps_with_clients: A list of all marathon app objects
        paired with the client they were fetched from
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        # Hold the bounce lock for the whole deploy so concurrent deployd
        # workers can't bounce the same service.instance at once.
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                # Nothing deployed yet; not an error for this worker.
                log.debug("No deployments found for %s.%s in cluster %s. Skipping."
                          % (service, instance, load_system_paasta_config().get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                    (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1, None

            try:
                with a_sync.idle_event_loop():
                    status, output, bounce_again_in_seconds = setup_service(
                        service=service,
                        instance=instance,
                        clients=clients,
                        job_config=service_instance_config,
                        marathon_apps_with_clients=marathon_apps_with_clients,
                        soa_dir=soa_dir,
                    )
                # Any non-zero setup status raises a CRITICAL sensu event.
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig,
                    NoSlavesAvailableError):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        # Another worker is already bouncing this instance; treat as success.
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    # Currently only 'marathon' instances are supported for wait_for_deployment because they
    # are the only thing that are worth waiting on.
    # NOTE(review): the loop below also queues KubernetesDeploymentConfig
    # instances, so the comment above may be stale -- confirm.
    service_configs = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir, load_deployments=False)

    total_instances = 0
    clusters_data = []
    api_endpoints = load_system_paasta_config().get_api_endpoints()
    for cluster in service_configs.clusters:
        if cluster not in api_endpoints:
            paasta_print(
                PaastaColors.red(
                    'Cluster %s is NOT in paasta-api endpoints config.' % cluster,
                ))
            raise NoSuchCluster

        # Queue every instance in this deploy_group; instances are removed from
        # the queue (by _query_clusters) as the target sha becomes live.
        instances_queue = Queue()
        for instance_config in service_configs.instance_configs(
            cluster=cluster,
            instance_type_class=MarathonServiceConfig,
        ):
            if instance_config.get_deploy_group() == deploy_group:
                instances_queue.put(instance_config)
                total_instances += 1
        for instance_config in service_configs.instance_configs(
            cluster=cluster,
            instance_type_class=KubernetesDeploymentConfig,
        ):
            if instance_config.get_deploy_group() == deploy_group:
                instances_queue.put(instance_config)
                total_instances += 1

        if not instances_queue.empty():
            clusters_data.append(
                ClusterData(
                    cluster=cluster,
                    service=service,
                    git_sha=git_sha,
                    instances_queue=instances_queue,
                ))

    if not clusters_data:
        # Nothing matches this deploy_group anywhere -- nothing to wait for.
        _log(
            service=service,
            component='deploy',
            line=("Couldn't find any marathon instances for service {} in deploy group {}. Exiting."
                  .format(service, deploy_group)),
            level='event',
        )
        return

    paasta_print("Waiting for deployment of {} for '{}' to complete...".format(
        git_sha, deploy_group))

    deadline = time.time() + timeout
    green_light = Event()
    green_light.set()

    with progressbar.ProgressBar(maxval=total_instances) as bar:
        while time.time() < deadline:
            _query_clusters(clusters_data, green_light)
            if not green_light.is_set():
                # A worker cleared the green light (operator interrupt); propagate it.
                raise KeyboardInterrupt
            # Progress = instances confirmed deployed (removed from their queues).
            bar.update(total_instances - sum((c.instances_queue.qsize() for c in clusters_data)))
            if all((cluster.instances_queue.empty() for cluster in clusters_data)):
                sys.stdout.flush()
                return 0
            else:
                time.sleep(min(60, timeout))
    sys.stdout.flush()

    # Deadline passed with instances still pending.
    _log(
        service=service,
        component='deploy',
        line=compose_timeout_message(clusters_data, timeout, deploy_group, service, git_sha),
        level='event',
    )
    raise TimeoutError
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    """Run the metastatus healthchecks (Mesos/Marathon/Kubernetes/Chronos)
    and print their results, with optional verbose utilization tables.

    :param argv: optional argument list passed through to parse_args
    :raises FatalError: when a master is unreachable or checks are unhealthy
    """
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs['use_mesos_cache'] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(
                mesos_master=master,
                mesos_state=mesos_state,
            )
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL: %s" % '\n'.join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        # Mesos/Marathon not configured: report healthy placeholders.
        marathon_results = [metastatus_lib.HealthCheckResult(
            message='Marathon is not configured to run here',
            healthy=True,
        )]
        all_mesos_results = [metastatus_lib.HealthCheckResult(
            message='Mesos is not configured to run here',
            healthy=True,
        )]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [metastatus_lib.HealthCheckResult(
            message='Kubernetes is not configured to run here',
            healthy=True,
        )]

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            raise FatalError(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check("Kubernetes", kube_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    # NOTE(review): kube_ok is not included here, so Kubernetes failures alone
    # do not flip healthy_exit (it can still be overwritten by the utilization
    # tables below) -- confirm whether that is intentional.
    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent('Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = [headers] + [[str(x) for x in asi]
                                 for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent('Service-Instance stats:' + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )

            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)

    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok, kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent('Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent('Per Node Utilization', 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent('Service-Instance stats:' + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )

            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)
    if not healthy_exit:
        raise FatalError(2)
def run_capacity_check():
    """Check cluster resource (cpus/mem/disk) utilization against thresholds.

    Queries the paasta API for utilization grouped by the requested
    attributes, applies per-group overrides on top of the default warn/crit
    thresholds, prints a Nagios-style status line, and exits with
    0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN).
    """
    options = parse_capacity_check_options()
    system_paasta_config = load_system_paasta_config()
    cluster = options.cluster if options.cluster is not None else system_paasta_config.get_cluster()
    value_to_check = options.type

    client = get_paasta_api_client(cluster=cluster)
    if client is None:
        paasta_print('UNKNOWN Failed to load paasta api client')
        sys.exit(3)

    overrides = read_overrides(options.overrides)

    attributes = options.attributes.split(',')

    try:
        resource_use = client.resources.resources(groupings=attributes).result()
    except HTTPError as e:
        # Fixed typo in user-facing message: "recieved" -> "received".
        paasta_print("UNKNOWN received exception from paasta api:\n\t%s" % e)
        sys.exit(3)

    default_check = {
        'warn': {
            'cpus': options.warn,
            'mem': options.warn,
            'disk': options.warn,
        },
        'crit': {
            'cpus': options.crit,
            'mem': options.crit,
            'disk': options.crit,
        },
    }

    failures = defaultdict(list)
    for usage_value in resource_use:
        check = get_check_from_overrides(overrides, default_check, usage_value['groupings'])
        usage_percent = calc_percent_usage(usage_value, value_to_check)
        # Record only the most severe breach per group: crit wins over warn.
        for c in ['crit', 'warn']:
            if usage_percent > check[c][value_to_check]:
                failures[c].append({
                    'attrs': [{'attr': a, 'value': v} for a, v in usage_value['groupings'].items()],
                    'maximum': check[c][value_to_check],
                    'current': usage_percent,
                })
                break

    # Exit with the highest severity seen across all groups.
    return_value = [0]
    if len(failures['crit']) > 0:
        result = error_message(failures['crit'], 'CRITICAL', cluster, value_to_check)
        paasta_print(result)
        return_value.append(2)
    if len(failures['warn']) > 0:
        result = error_message(failures['warn'], 'WARNING', cluster, value_to_check)
        paasta_print(result)
        return_value.append(1)

    if max(return_value) == 0:
        paasta_print(
            f"OK cluster {cluster} is below critical capacity in {value_to_check}"
        )

    sys.exit(max(return_value))
def paasta_spark_run(args):
    """Entry point for 'paasta spark-run': build the Spark configuration and
    launch the Spark driver in a docker container.

    :param args: parsed argparse namespace from add_subparser
    :returns: a shell-style return code (0 on success, 1 on error), or the
              return value of configure_and_run_docker_container
    """
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        # Fixed the warning text: the adjacent string literals were missing
        # separating spaces ("indicatesPaaSTA", "behavethe same way").
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and local-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=args.cluster,
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        # Fixed the error text: the adjacent string literals were missing
        # separating spaces ("%(service)s.You can generate this by
        # running:generate_deployments_for_service").
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s. "
                "You can generate this by running: "
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
            ),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())

    # A prebuilt or locally-built image does not need the docker config file.
    needs_docker_cfg = not args.build and not args.image
    user_spark_opts = _parse_user_spark_args(args.spark_args)

    paasta_instance = get_smart_paasta_instance_name(args)
    spark_conf = get_spark_conf(
        cluster_manager="mesos",
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
    )
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
    )
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param marathon_apps_with_clients: All currently-known (app, client) pairs
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Options passed to the drain method
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: The soa-configs directory
    :param job_config: The service instance's MarathonServiceConfig
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event"""

    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        # BUG FIX: previously hard-coded level='event', silently ignoring the
        # caller-supplied level (e.g. the level='debug' calls below).
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )
    new_client = clients.get_current_client_for_service(job_config)
    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    # Partition the running apps into "the app we are deploying" vs everything
    # else (old configs / other shards) that will need to be bounced away.
    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        new_app, new_client = new_apps_with_clients_list[0]
        if len(new_apps_with_clients_list) != 1:
            raise ValueError(
                "Only expected one app per ID per shard; found %d" %
                len(new_apps_with_clients_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app,
            service,
            nerve_ns,
            system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        # Transient failure: ask to be retried in 60 seconds.
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(
            new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s up from %d to %d instances." %
                     (new_app.id, new_app.instances,
                      config['instances'] + num_at_risk_tasks))
            new_client.scale_app(
                app_id=new_app.id,
                instances=config['instances'] + num_at_risk_tasks,
                force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) -
                config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            # Pick tasks to shed in order of preference: already-draining,
            # unhappy, at-risk, and only then happy tasks.
            tasks_to_move_draining = min(len(scaling_app_draining_tasks),
                                         num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks),
                                        num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy],
            )
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks),
                                        num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks),
                                      num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents remaining the extra remaining instances that are configured
            # in marathon that don't have a launched task yet. When scaling down we want to
            # reduce this slack so marathon doesn't get a chance to launch a new task in
            # that space that we will then have to drain and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client),
                                                      []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (
            service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time",
                None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (
            service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise
    # Come back soon if there is still work to do (at-risk hosts to evacuate,
    # or a scale-down still in progress).
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
def add_subparser(subparsers):
    """Register the 'spark-run' subcommand and all of its arguments.

    :param subparsers: the argparse subparsers object from the top-level
        paasta CLI parser
    """
    list_parser = subparsers.add_parser(
        "spark-run",
        help="Run Spark on the PaaSTA cluster",
        description=(
            "'paasta spark-run' launches a Spark cluster on PaaSTA. "
            "It analyzes soa-configs and command line arguments to invoke "
            "a 'docker run'. By default, it will pull the Spark service "
            "image from the registry unless the --build option is used.\n\n"),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # --build and --image are mutually exclusive ways to pick the docker image.
    group = list_parser.add_mutually_exclusive_group()
    group.add_argument(
        "-b",
        "--build",
        help="Build the docker image from scratch using the local Makefile's cook-image target.",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-I",
        "--image",
        help="Use the provided image to start the Spark driver and executors.",
    )

    list_parser.add_argument(
        "--docker-registry",
        help="Docker registry to push the Spark image built.",
        default=DEFAULT_SPARK_DOCKER_REGISTRY,
    )
    list_parser.add_argument(
        "-s",
        "--service",
        help="The name of the service from which the Spark image is built.",
        default=DEFAULT_SPARK_SERVICE,
    ).completer = lazy_choices_completer(list_services)
    list_parser.add_argument(
        "-i",
        "--instance",
        help=("Start a docker run for a particular instance of the service."),
        default="adhoc",
    ).completer = lazy_choices_completer(list_instances)

    # Fall back to hard-coded defaults when no system config is present
    # (e.g. running the CLI on a laptop).
    try:
        system_paasta_config = load_system_paasta_config()
        default_spark_cluster = system_paasta_config.get_spark_run_config(
        ).get("default_cluster")
        default_spark_pool = system_paasta_config.get_spark_run_config().get(
            "default_pool")
    except PaastaNotConfiguredError:
        default_spark_cluster = "pnw-devc"
        default_spark_pool = "batch"

    list_parser.add_argument(
        "-c",
        "--cluster",
        help=("The name of the cluster you wish to run Spark on."),
        default=default_spark_cluster,
    )
    list_parser.add_argument(
        "-p",
        "--pool",
        help="Name of the resource pool to run the Spark job.",
        default=default_spark_pool,
    )
    list_parser.add_argument(
        "-w",
        "--work-dir",
        default="{}:{}".format(os.getcwd(), DEFAULT_SPARK_WORK_DIR),
        help="The read-write volume to mount in format local_abs_dir:container_abs_dir",
    )
    list_parser.add_argument(
        "-y",
        "--yelpsoa-config-root",
        dest="yelpsoa_config_root",
        help="A directory from which yelpsoa-configs should be read from.",
        default=DEFAULT_SOA_DIR,
    )
    list_parser.add_argument(
        "-C",
        "--cmd",
        help="Run the spark-shell, pyspark, spark-submit, jupyter-lab, or history-server command.",
    )
    list_parser.add_argument(
        "-d",
        "--dry-run",
        help="Shows the arguments supplied to docker as json.",
        action="store_true",
        default=False,
    )
    list_parser.add_argument(
        "--spark-args",
        help="Spark configurations documented in https://spark.apache.org/docs/latest/configuration.html. "
        r'For example, --spark-args "spark.mesos.constraints=pool:default\;instance_type:m4.10xlarge '
        'spark.executor.cores=4".',
    )
    list_parser.add_argument(
        "--nvidia",
        help="Use nvidia docker runtime for Spark driver process (requires GPU)",
        action="store_true",
        default=False,
    )
    list_parser.add_argument(
        "--mrjob",
        help="Pass Spark arguments to invoked command in the format expected by mrjobs",
        action="store_true",
        default=False,
    )

    if clusterman_metrics:
        list_parser.add_argument(
            "--suppress-clusterman-metrics-errors",
            help="Continue even if sending resource requirements to Clusterman fails. This may result in the job "
            "failing to acquire resources.",
            action="store_true",
        )

    # Deprecated flags: hidden from --help and routed through DeprecatedAction.
    list_parser.add_argument("-j",
                             "--jars",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)
    list_parser.add_argument("--executor-memory",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)
    list_parser.add_argument("--executor-cores",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)
    list_parser.add_argument("--max-cores",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)
    list_parser.add_argument("--driver-max-result-size",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)
    list_parser.add_argument("--driver-memory",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)
    list_parser.add_argument("--driver-cores",
                             help=argparse.SUPPRESS,
                             action=DeprecatedAction)

    aws_group = list_parser.add_argument_group(
        title="AWS credentials options",
        description="If --aws-credentials-yaml is specified, it overrides all "
        "other options. Otherwise, if -s/--service is specified, spark-run "
        "looks for service credentials in /etc/boto_cfg/[service].yaml. If "
        "it does not find the service credentials or no service is "
        "specified, spark-run falls back to the boto default behavior "
        "(checking ~/.aws/credentials, ~/.boto, etc).",
    )
    aws_group.add_argument(
        "--aws-credentials-yaml",
        help="Load aws keys from the provided yaml file. The yaml file must "
        "have keys for aws_access_key_id and aws_secret_access_key.",
    )
    aws_group.add_argument(
        "--aws-profile",
        help="Name of the AWS profile to load credentials from. Only used when "
        "--aws-credentials-yaml is not specified and --service is either "
        "not specified or the service does not have credentials in "
        "/etc/boto_cfg",
        default="default",
    )
    aws_group.add_argument(
        "--no-aws-credentials",
        help="Do not load any AWS credentials; allow the Spark job to use its "
        "own logic to load credentials",
        action="store_true",
        default=False,
    )
    aws_group.add_argument(
        "--aws-region",
        # Fixed missing space between the two string pieces: the rendered help
        # previously read "we willdefault to using ...".
        help="Specify an aws region. If the region is not specified, we will "
        f"default to using {DEFAULT_AWS_REGION}.",
        default=DEFAULT_AWS_REGION,
    )

    jupyter_group = list_parser.add_argument_group(
        title="Jupyter kernel culling options",
        description="Idle kernels will be culled by default. Idle "
        "kernels with connections can be overridden not to be culled.",
    )
    jupyter_group.add_argument(
        "--cull-idle-timeout",
        type=int,
        default=7200,
        help="Timeout (in seconds) after which a kernel is considered idle and "
        "ready to be culled.",
    )
    jupyter_group.add_argument(
        "--not-cull-connected",
        action="store_true",
        default=False,
        help="By default, connected idle kernels are culled after timeout. "
        "They can be skipped if not-cull-connected is specified.",
    )

    list_parser.set_defaults(command=paasta_spark_run)
def main():
    """Attempt to set up the marathon service instance given.

    Exits 1 on bad input or missing configuration.  The flow is:
      - parse the command line
      - connect to marathon
      - load the service instance's configuration
      - create and deploy/bounce the complete marathon job configuration
      - emit a sensu event about the deployment
    """
    args = parse_args()
    soa_dir = args.soa_dir
    log.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error(
            "Invalid service instance specified. Format is service%sinstance."
            % SPACER)
        sys.exit(1)

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        # Nothing deployed yet for this instance; not an error.
        log.debug("No deployments found for %s in cluster %s. Skipping." %
                  (args.service_instance,
                   load_system_paasta_config().get_cluster()))
        sys.exit(0)
    except NoConfigurationForServiceError:
        msg = "Could not read marathon configuration file for %s in cluster %s" % \
            (args.service_instance, load_system_paasta_config().get_cluster())
        log.error(msg)
        sys.exit(1)

    try:
        status, output = setup_service(service, instance, client,
                                       marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        tb = traceback.format_exc()
        log.error(tb)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL,
                   tb)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
def perform_command(command,
                    service,
                    instance,
                    cluster,
                    verbose,
                    soa_dir,
                    app_id=None,
                    delta=None,
                    client=None):
    """Performs a start/stop/restart/status on an instance

    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: directory to read soa-configs from
    :param app_id: marathon app id; computed from the job config when not given
    :param delta: unused here; accepted for interface compatibility with callers
    :param client: MarathonClient or CachingMarathonClient
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    job_config = marathon_tools.load_marathon_service_config(service,
                                                             instance,
                                                             cluster,
                                                             soa_dir=soa_dir)
    # Derive the app id from the formatted app dict unless one was supplied.
    # Formatting fails when there is no docker image deployed yet.
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % job_id)
            return 1

    normal_instance_count = job_config.get_instances()
    proxy_port = marathon_tools.get_proxy_port_for_instance(service,
                                                            instance,
                                                            cluster,
                                                            soa_dir=soa_dir)

    # Build a client lazily so callers that already have one avoid re-reading
    # the marathon config.
    if client is None:
        marathon_config = marathon_tools.load_marathon_config()
        client = marathon_tools.get_marathon_client(
            marathon_config.get_url(),
            marathon_config.get_username(),
            marathon_config.get_password(),
        )

    # NOTE(review): only 'restart' and 'status' are dispatched here; 'start'
    # and 'stop' from the docstring hit the NotImplementedError below —
    # presumably they are handled by a different code path. Verify callers.
    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        paasta_print(
            status_desired_state(service, instance, client, job_config))
        paasta_print(
            status_marathon_job(service, instance, app_id,
                                normal_instance_count, client))
        tasks, out = status_marathon_job_verbose(service, instance, client,
                                                 cluster, soa_dir)
        if verbose > 0:
            paasta_print(out)
        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count))
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            paasta_print(
                status_mesos_tasks_verbose(
                    job_id=app_id,
                    get_short_task_id=get_short_task_id,
                    tail_lines=tail_lines,
                ))
        # A proxy port means the instance is registered in smartstack, so we
        # can also report on its load-balancer backends.
        if proxy_port is not None:
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service,
                instance,
                cluster,
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.
                    get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(
                    ),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(
                    ),
                ))
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
    def format_marathon_app_dict(self):
        """Create the configuration that will be passed to the Marathon REST API.

        Currently compiles the following keys into one nice dict:

        - id: the ID of the image in Marathon
        - container: a dict containing the docker url and docker launch options. Needed by deimos.
        - uris: the dockercfg location from the system config.
        - env: environment variables for the container.
        - mem: the amount of memory required.
        - cpus: the number of cpus required.
        - disk: the amount of disk space required.
        - constraints: the constraints on the Marathon app.
        - instances: the number of instances required.
        - cmd: the command to be executed.
        - args: an alternative to cmd that requires the docker container to have an entrypoint.

        Most values are retrieved via the get_<key> accessors on this config
        object; the docker url and volumes come from the system PaaSTA config.

        :returns: A dict containing all of the keys listed above"""
        # A set of config attributes that don't get included in the hash of the config.
        # These should be things that PaaSTA/Marathon knows how to change without requiring a bounce.
        CONFIG_HASH_BLACKLIST = set(['instances', 'backoff_seconds'])

        system_paasta_config = load_system_paasta_config()
        docker_url = get_docker_url(system_paasta_config.get_docker_registry(),
                                    self.get_docker_image())
        service_namespace_config = load_service_namespace_config(
            service=self.service,
            namespace=self.get_nerve_namespace(),
        )
        # System-wide volumes plus any instance-specific extra volumes.
        docker_volumes = system_paasta_config.get_volumes(
        ) + self.get_extra_volumes()

        net = get_mesos_network_for_net(self.get_net())

        complete_config = {
            'container': {
                'docker': {
                    'image': docker_url,
                    'network': net,
                    "parameters": [
                        {
                            "key": "memory-swap",
                            "value": self.get_mem_swap()
                        },
                    ]
                },
                'type': 'DOCKER',
                'volumes': docker_volumes,
            },
            'uris': [
                system_paasta_config.get_dockercfg_location(),
            ],
            'backoff_seconds': self.get_backoff_seconds(),
            'backoff_factor': 2,
            'health_checks': self.get_healthchecks(service_namespace_config),
            'env': self.get_env(),
            'mem': float(self.get_mem()),
            'cpus': float(self.get_cpus()),
            'disk': float(self.get_disk()),
            'constraints': self.get_constraints(service_namespace_config),
            'instances': self.get_instances(),
            'cmd': self.get_cmd(),
            'args': self.get_args(),
        }

        # Bridge networking needs an explicit port mapping; Marathon assigns
        # the host port (hostPort=0 means "pick one").
        if net == 'BRIDGE':
            complete_config['container']['docker']['portMappings'] = [
                {
                    'containerPort': CONTAINER_PORT,
                    'hostPort': 0,
                    'protocol': 'tcp',
                },
            ]

        accepted_resource_roles = self.get_accepted_resource_roles()
        if accepted_resource_roles is not None:
            complete_config[
                'accepted_resource_roles'] = accepted_resource_roles

        # The app id embeds both the code sha and a hash of the (blacklist-
        # filtered) config, so any meaningful config change yields a new app
        # id and therefore triggers a bounce.
        code_sha = get_code_sha_from_dockerurl(docker_url)

        config_hash = get_config_hash(
            {
                key: value
                for key, value in complete_config.items()
                if key not in CONFIG_HASH_BLACKLIST
            },
            force_bounce=self.get_force_bounce(),
        )
        complete_config['id'] = format_job_id(self.service, self.instance,
                                              code_sha, config_hash)

        log.debug("Complete configuration for instance is: %s",
                  complete_config)
        return complete_config
def main():
    """Set up (create or update) the chronos job for the given service instance.

    Parses the command line, builds the complete chronos job configuration,
    submits it via setup_job, and emits a sensu event with the result.
    Exits 1 only for un-parseable input; all other outcomes exit 0 because the
    sensu event carries any failure to the right team.
    """
    args = parse_args()
    soa_dir = args.soa_dir

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(
            args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
    except InvalidJobNameError:
        log.error(
            "Invalid service instance '%s' specified. Format is service%sinstance."
            % (args.service_instance, SPACER))
        sys.exit(1)

    client = chronos_tools.get_chronos_client(
        chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance,
            cluster,
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoConfigurationForServiceError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n"
            % (args.service_instance, cluster) + "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoSlavesAvailableError as e:
        # Fixed typo in the message: "constratints" -> "constraints".
        error_msg = (
            "There are no PaaSTA slaves that can run %s in cluster %s\n" %
            (args.service_instance, cluster) +
            "Double check the cluster and the configured constraints/pool/whitelist.\n"
            "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        # log.warn is a deprecated alias of log.warning.
        log.warning("Skipping %s.%s: Parent job could not be found" %
                    (service, instance))
        sys.exit(0)

    modified_config = config_with_historical_stats(
        chronos_client=client,
        service=service,
        instance=instance,
        job_config=complete_job_config,
    )

    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=modified_config,
        client=client,
    )

    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
def get_marathon_clients_from_config() -> MarathonClients:
    """Build a MarathonClients object from the system-wide PaaSTA config."""
    servers = get_marathon_servers(load_system_paasta_config())
    return get_marathon_clients(servers)
def main():
    """Clone and re-run a chronos job for a given execution date.

    Depending on --run-all-related-jobs, either just the requested instance
    or the whole topologically-sorted family of related jobs is cloned and
    submitted to chronos.  Raises NoDeploymentsAvailable (or re-raises config
    errors) when the job(s) cannot be built.
    """
    args = parse_args()

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    related_jobs = chronos_tools.get_related_jobs_configs(
        cluster, service, instance, soa_dir=args.soa_dir)
    if not related_jobs:
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance,
            cluster,
        )
        paasta_print(error_msg)
        raise NoDeploymentsAvailable

    if not args.run_all_related_jobs:
        # Strip all the configuration for the related services
        # those information will not be used by the rest of the flow
        related_jobs = {
            (service, instance): related_jobs[(service, instance)],
        }

    complete_job_configs = {}
    for (srv, inst) in related_jobs:
        try:
            complete_job_configs.update(
                {
                    (srv, inst): chronos_tools.create_complete_config(
                        service=srv,
                        job_name=inst,
                        soa_dir=args.soa_dir,
                    ),
                },
            )
        # Use bare `raise` in the handlers below: it re-raises the active
        # exception with its original traceback (idiomatic, unlike `raise e`).
        except (NoDeploymentsAvailable, NoDockerImageError):
            error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
                chronos_tools.compose_job_id(srv, inst),
                cluster,
            )
            paasta_print(error_msg)
            raise
        except NoConfigurationForServiceError as e:
            error_msg = (
                "Could not read chronos configuration file for %s in cluster %s\nError was: %s"
                % (
                    chronos_tools.compose_job_id(srv, inst),
                    cluster,
                    str(e),
                ))
            paasta_print(error_msg)
            raise
        except chronos_tools.InvalidParentError:
            raise

    if not args.run_all_related_jobs:
        sorted_jobs = [(service, instance)]
    else:
        sorted_jobs = chronos_tools.topological_sort_related_jobs(
            cluster, service, instance, soa_dir=args.soa_dir)

    timestamp = datetime.datetime.utcnow().isoformat()

    chronos_to_add = []
    for (service, instance) in sorted_jobs:
        # complete_job_config is a formatted version of the job,
        # so the command is formatted in the context of 'now'
        # replace it with the 'original' cmd so it can be re rendered
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=args.soa_dir,
        )
        original_command = chronos_job_config.get_cmd()
        complete_job_config = complete_job_configs[(service, instance)]
        complete_job_config['command'] = original_command

        clone = clone_job(
            chronos_job=complete_job_config,
            date=datetime.datetime.strptime(args.execution_date,
                                            "%Y-%m-%dT%H:%M:%S"),
            timestamp=timestamp,
            force_disabled=args.force_disabled,
        )
        if not args.run_all_related_jobs and chronos_tools.get_job_type(
                clone) == chronos_tools.JobType.Dependent:
            # If the job is a dependent job and we want to re-run only the specific instance
            # remove the parents and update the schedule to start the job as soon as possible
            clone = set_default_schedule(remove_parents(clone))
        chronos_to_add.append(clone)

    for job_to_add in chronos_to_add:
        client.add(job_to_add)
def my_init(self, filewatcher): self.filewatcher = filewatcher self.public_config = load_system_paasta_config() self.marathon_client = get_marathon_client_from_config()
raise ValueError( f"{metric} hasn't been configured as a guage or counter") print(f"Sent {path}: {value} to meteorite") def report_all_metrics_to_meteorite(csv, paasta_cluster): for row in csv: if row['svname'] == 'BACKEND': for metric in GUAGES + COUNTERS: report_metric_to_meteorite( backend=row['# pxname'], metric=metric, value=row[metric], paasta_cluster=paasta_cluster, ) if __name__ == '__main__': system_paasta_config = utils.load_system_paasta_config() csv = retrieve_haproxy_csv( synapse_host=system_paasta_config.get_default_synapse_host(), synapse_port=system_paasta_config.get_synapse_port(), synapse_haproxy_url_format=system_paasta_config. get_synapse_haproxy_url_format(), ) report_all_metrics_to_meteorite( csv=csv, paasta_cluster=system_paasta_config.get_local_run_config().get( 'default_cluster'), )
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    """Print a full paasta metastatus report and raise FatalError(2) if unhealthy.

    Runs Mesos/Marathon checks (when Mesos is configured) and Kubernetes
    checks (when Kubernetes is configured), prints summaries, and — at higher
    verbosity — prints per-grouping and per-host utilization tables.

    :param argv: CLI arguments forwarded to parse_args (None = sys.argv)
    :raises FatalError: exit code 2 when the Mesos master is unreachable or
        any consulted health/utilization check reports unhealthy
    """
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()
    args = parse_args(argv)
    system_paasta_config = load_system_paasta_config()
    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True
        master = get_mesos_master(
            mesos_config_path=get_mesos_config_path(system_paasta_config),
            **master_kwargs,
        )
        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))
        try:
            # master.state is awaitable; block here to fetch it synchronously.
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(
                mesos_master=master, mesos_state=mesos_state
            )
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            print(PaastaColors.red("CRITICAL: %s" % "\n".join(e.args)))
            raise FatalError(2)
        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        # Not configured for Mesos: substitute healthy placeholder results so
        # the summary/printing code below can run unconditionally.
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here", healthy=True
            )
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True
            )
        ]
    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here", healthy=True
            )
        ]
    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))
    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok
    )
    kube_summary = metastatus_lib.generate_summary_for_check("Kubernetes", kube_ok)
    # NOTE(review): kube_ok is deliberately(?) excluded here; overall health
    # only considers Mesos + Marathon unless a verbose utilization table
    # below overwrites healthy_exit. Confirm this is intended.
    healthy_exit = True if all([mesos_ok, marathon_ok]) else False
    print(f"Master paasta_tools version: {__version__}")
    # NOTE(review): called unconditionally — presumably fails or prints
    # something unhelpful when Mesos is not configured; verify.
    print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(
        mesos_summary, mesos_ok, all_mesos_results, args.verbose
    )
    if args.verbose > 1 and mesos_available:
        print_with_indent("Resources Grouped by %s" % ", ".join(args.groupings), 2)
        # healthy_exit is intentionally overwritten by the grouped table's
        # threshold verdict at this verbosity level.
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings, threshold=args.threshold, mesos_state=mesos_state
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)
        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster
            )
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2
                )
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(
        marathon_summary, marathon_ok, marathon_results, args.verbose
    )
    metastatus_lib.print_results_for_healthchecks(
        kube_summary, kube_ok, kube_results, args.verbose
    )
    if args.verbose > 1 and kube_available:
        print_with_indent("Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings, threshold=args.threshold, kube_client=kube_client
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)
        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster
            )
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2
                )
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    if not healthy_exit:
        raise FatalError(2)
def main():
    """Gracefully kill a running marathon app by draining and bouncing it down.

    Looks up the app named on the command line, builds the service's drain
    method from its nerve namespace config, then repeatedly applies the
    'down' bounce method (draining tasks first) until the app id is no
    longer running.

    Exits with status 1 if no app with the given id is running.
    """
    args = parse_args()
    # Marathon app ids are absolute paths; normalize away the leading '/'.
    full_appid = args.appname.lstrip('/')
    soa_dir = args.soa_dir
    marathon_config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        url=marathon_config.get_url(),
        user=marathon_config.get_username(),
        passwd=marathon_config.get_password(),
    )

    if not marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        paasta_print("Couldn't find an app named {}".format(full_appid))
        sys.exit(1)

    # App ids encode '_' as '--'; undo that for each decomposed component.
    service, instance, _, __ = (s.replace('--', '_')
                                for s in decompose_job_id(full_appid))
    cluster = load_system_paasta_config().get_cluster()
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_config = service_instance_config.format_marathon_app_dict()
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(
        service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(
            service_namespace_config),
    )

    # The 'down' bounce brings the app to zero tasks (no new app is started).
    bounce_func = bounce_lib.get_bounce_method_func('down')

    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        (
            old_app_live_happy_tasks,
            old_app_live_unhappy_tasks,
            old_app_draining_tasks,
            old_app_at_risk_tasks,  # not used by the 'down' bounce below
        ) = get_tasks_by_state(
            other_apps=[app_to_kill],
            drain_method=drain_method,
            service=service,
            nerve_ns=nerve_ns,
            bounce_health_params=service_instance_config.
            get_bounce_health_params(service_namespace_config),
        )
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            serviceinstance="{}.{}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )
        paasta_print("Sleeping for 10 seconds to give the tasks time to drain")
        time.sleep(10)

    # Fix: message previously misspelled "Sucessfully".
    paasta_print("Successfully killed {}".format(full_appid))
def paasta_start_or_stop(args, desired_state):
    """Requests a change of state to start or stop given branches of a service.

    Filters the requested (cluster, service, instance) triples from args,
    asks for interactive confirmation when the request is broad, then either
    tags the git remote (via issue_state_change_for_service) for regular
    instances or calls the paasta API set-state endpoint for Flink instances.

    :param args: parsed CLI args (soa_dir, clusters, instances, filters)
    :param desired_state: target state, passed through to the state-change
        helpers (e.g. start/stop)
    :returns: 0 on success, 1 on user abort / git error / missing deploy
        tags; for Flink API failures, the HTTP status code of the failure
    """
    soa_dir = args.soa_dir
    pargs = apply_args_filters(args)
    # Nothing matched the filters: nothing to do.
    if len(pargs) == 0:
        return 1
    affected_services = {
        s for service_list in pargs.values() for s in service_list.keys()
    }
    if len(affected_services) > 1:
        # Broad blast radius: show everything affected and require an
        # interactive yes (non-tty input auto-declines).
        paasta_print(
            PaastaColors.red(
                "Warning: trying to start/stop/restart multiple services:"))
        for cluster, services_instances in pargs.items():
            paasta_print("Cluster %s:" % cluster)
            for service, instances in services_instances.items():
                paasta_print(" Service %s:" % service)
                paasta_print(" Instances %s" % ",".join(instances.keys()))
        if sys.stdin.isatty():
            confirm = choice.Binary("Are you sure you want to continue?",
                                    False).ask()
        else:
            confirm = False
        if not confirm:
            paasta_print()
            paasta_print("exiting")
            return 1
    invalid_deploy_groups = []
    marathon_message_printed = False
    affected_flinks = []
    # A second confirmation when clusters or instances were left unspecified
    # (i.e. the request is implicitly broad).
    if args.clusters is None or args.instances is None:
        if confirm_to_continue(pargs.items(), desired_state) is False:
            paasta_print()
            paasta_print("exiting")
            return 1
    for cluster, services_instances in pargs.items():
        for service, instances in services_instances.items():
            for instance in instances.keys():
                service_config = get_instance_config(
                    service=service,
                    cluster=cluster,
                    instance=instance,
                    soa_dir=soa_dir,
                    load_deployments=False,
                )
                # Flink instances are handled via the paasta API below,
                # not via git tags.
                if isinstance(service_config, FlinkDeploymentConfig):
                    affected_flinks.append(service_config)
                    continue
                try:
                    remote_refs = get_remote_refs(service, soa_dir)
                except remote_git.LSRemoteException as e:
                    msg = (
                        "Error talking to the git server: %s\n"
                        "This PaaSTA command requires access to the git server to operate.\n"
                        "The git server may be down or not reachable from here.\n"
                        "Try again from somewhere where the git server can be reached, "
                        "like your developer environment.") % str(e)
                    paasta_print(msg)
                    return 1
                deploy_group = service_config.get_deploy_group()
                (deploy_tag,
                 _) = get_latest_deployment_tag(remote_refs, deploy_group)
                if deploy_tag not in remote_refs:
                    # Never deployed to this deploy group; collect for the
                    # warning at the end instead of failing immediately.
                    invalid_deploy_groups.append(deploy_group)
                else:
                    # force_bounce timestamp makes the state change take
                    # effect even if the desired state is unchanged.
                    force_bounce = utils.format_timestamp(
                        datetime.datetime.utcnow())
                    if (isinstance(service_config, MarathonServiceConfig)
                            and not marathon_message_printed):
                        print_marathon_message(desired_state)
                        marathon_message_printed = True
                    issue_state_change_for_service(
                        service_config=service_config,
                        force_bounce=force_bounce,
                        desired_state=desired_state,
                    )
    return_val = 0
    # TODO: Refactor to discover if set_state is available for given
    # instance_type in API
    if affected_flinks:
        print_flink_message(desired_state)
        # csi: cluster -> service -> [instances], for batching API calls
        # by cluster.
        csi = defaultdict(lambda: defaultdict(list))
        for service_config in affected_flinks:
            csi[service_config.cluster][service_config.service].append(
                service_config.instance)
        system_paasta_config = load_system_paasta_config()
        for cluster, services_instances in csi.items():
            client = get_paasta_api_client(cluster, system_paasta_config)
            if not client:
                paasta_print("Cannot get a paasta-api client")
                exit(1)
            for service, instances in services_instances.items():
                for instance in instances:
                    try:
                        client.service.instance_set_state(
                            service=service,
                            instance=instance,
                            desired_state=desired_state,
                        ).result()
                    except HTTPError as exc:
                        # NOTE(review): returns on the first API failure,
                        # skipping any remaining Flink instances — confirm
                        # that is intended.
                        paasta_print(exc.response.text)
                        return exc.status_code
    # NOTE(review): redundant reassignment — return_val is already 0 here.
    return_val = 0
    if invalid_deploy_groups:
        paasta_print(
            f"No deploy tags found for {', '.join(invalid_deploy_groups)}.")
        # NOTE(review): `service` here leaks from the loops above — it names
        # only the last service iterated, and would be unbound if pargs
        # contained no instances. Verify this message is accurate enough.
        paasta_print(f"Has {service} been deployed there yet?")
        return_val = 1
    return return_val