def get_service_instances_needing_update(
    marathon_clients: MarathonClients,
    instances: Collection[Tuple[str, str]],
    cluster: str,
) -> List[Tuple[str, str, MarathonServiceConfig, str]]:
    """Find the service instances whose running Marathon app is out of date.

    An instance needs an update when its formatted app id is not present in
    any of the Marathon clients, or when the running app's instance count
    differs from the configured one.

    :param marathon_clients: object exposing ``get_all_clients()``
    :param instances: ``(service, instance)`` pairs to inspect
    :param cluster: cluster name passed to the config loader
    :returns: list of ``(service, instance, config, app_id)`` tuples
    """
    # Collect the apps of every client into one lookup table keyed by app id.
    running_apps = {}
    for client in marathon_clients.get_all_clients():
        for app in get_all_marathon_apps(client):
            running_apps[app.id] = app

    needing_update = []
    for service, instance in instances:
        try:
            config = load_marathon_service_config_no_cache(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=DEFAULT_SOA_DIR,
            )
            config_app = config.format_marathon_app_dict()
            app_id = "/{}".format(config_app["id"])
        # Not ideal but we rely on a lot of user input to create the app dict
        # and we really can't afford to bail if just one app definition is
        # malformed
        except Exception as e:
            print("ERROR: Skipping {}.{} because: '{}'".format(
                service, instance, str(e)))
            continue

        is_missing = app_id not in running_apps
        if is_missing or running_apps[app_id].instances != config_app["instances"]:
            needing_update.append((service, instance, config, app_id))
    return needing_update
def get_service_instances_needing_update(marathon_client, instances, cluster):
    """Return the ``(service, instance)`` pairs that need to be (re)deployed.

    A pair is returned when its formatted app id is not known to Marathon or
    when the running app's instance count differs from the configured one.
    Pairs whose configuration raises ``NoDockerImageError``,
    ``InvalidJobNameError`` or ``NoDeploymentsAvailable`` are skipped with a
    printed message.

    :param marathon_client: client handed to ``get_all_marathon_apps``
    :param instances: iterable of ``(service, instance)`` pairs
    :param cluster: cluster name passed to the config loader
    :returns: list of ``(service, instance)`` tuples
    """
    running_apps = {}
    for app in get_all_marathon_apps(marathon_client):
        running_apps[app.id] = app

    skippable_errors = (NoDockerImageError, InvalidJobNameError, NoDeploymentsAvailable)
    needing_update = []
    for service, instance in instances:
        try:
            config = load_marathon_service_config_no_cache(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=DEFAULT_SOA_DIR,
            )
            config_app = config.format_marathon_app_dict()
            app_id = '/{}'.format(config_app['id'])
        except skippable_errors as e:
            print("DEBUG: Skipping %s.%s because: '%s'" % (service, instance, str(e)))
            continue

        # Both "app missing" and "wrong instance count" lead to the same action.
        if (
            app_id not in running_apps
            or running_apps[app_id].instances != config_app['instances']
        ):
            needing_update.append((service, instance))
    return needing_update
def get_at_risk_service_instances(self, draining_hosts):
    """Build one ServiceInstance per service.instance with a task on a draining host.

    :param draining_hosts: container of host names; a task is "at risk" when
        its ``host`` is in this container
    :returns: list of ``ServiceInstance`` objects, one per distinct
        ``(service, instance)``, in the order they were first encountered
    """
    marathon_apps = get_all_marathon_apps(self.marathon_client, embed_tasks=True)
    at_risk_tasks = [
        task
        for app in marathon_apps
        for task in app.tasks
        if task.host in draining_hosts
    ]
    self.log.info("At risk tasks: {}".format(at_risk_tasks))

    service_instances = []
    # Track what has already been queued in a set so the duplicate check is
    # O(1) per task instead of rescanning the result list every time.
    already_added = set()
    for task in at_risk_tasks:
        app_id = task.app_id.strip('/')
        service, instance, _, __ = deformat_job_id(app_id)
        key = (service, instance)
        # no need to add the same instance to the bounce queue more than once
        if key in already_added:
            continue
        already_added.add(key)
        service_instances.append(
            ServiceInstance(
                service=service,
                instance=instance,
                cluster=self.config.get_cluster(),
                bounce_by=int(time.time()),
                watcher=type(self).__name__,
                bounce_timers=None,
                failures=0,
            ))
    return service_instances
def get_service_instances_needing_update(
    marathon_clients: MarathonClients,
    instances: Collection[Tuple[str, str]],
    cluster: str,
) -> List[Tuple[str, str]]:
    """Return the ``(service, instance)`` pairs that need to be (re)deployed.

    Apps from every client in ``marathon_clients`` are merged into one table.
    A pair is returned when its formatted app id is absent from that table or
    when the running instance count differs from the configured one. Pairs
    whose configuration raises one of the known "not deployable" errors are
    skipped with a printed message.

    :param marathon_clients: object exposing ``get_all_clients()``
    :param instances: ``(service, instance)`` pairs to inspect
    :param cluster: cluster name passed to the config loader
    :returns: list of ``(service, instance)`` tuples
    """
    running_apps = {}
    for client in marathon_clients.get_all_clients():
        for app in get_all_marathon_apps(client):
            running_apps[app.id] = app

    skippable_errors = (
        NoDockerImageError,
        InvalidJobNameError,
        NoDeploymentsAvailable,
        NoSlavesAvailableError,
    )
    needing_update = []
    for service, instance in instances:
        try:
            config = load_marathon_service_config_no_cache(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=DEFAULT_SOA_DIR,
            )
            config_app = config.format_marathon_app_dict()
            app_id = '/{}'.format(config_app['id'])
        except skippable_errors as e:
            print("DEBUG: Skipping {}.{} because: '{}'".format(
                service, instance, str(e)))
            continue

        # Both "app missing" and "wrong instance count" lead to the same action.
        if (
            app_id not in running_apps
            or running_apps[app_id].instances != config_app['instances']
        ):
            needing_update.append((service, instance))
    return needing_update
def run(self):
    """Consume the bounce queue forever, deploying each service instance.

    For every item taken from ``self.bounce_q`` the Marathon service is
    deployed. A non-zero return code (or an exception, recorded as ``-2``)
    increments the failure counter and replaces the retry delay with an
    exponential back-off. Items that must be bounced again are re-queued on
    ``self.inbox_q``; otherwise the bounce-length timer is stopped.
    """
    self.log.info("{} starting up".format(self.name))
    while True:
        queued = self.bounce_q.get()
        failure_count = queued.failures
        timers = self.setup_timers(queued)
        service = queued.service
        instance = queued.instance
        self.log.info("{} processing {}.{}".format(self.name, service, instance))

        apps = marathon_tools.get_all_marathon_apps(
            self.marathon_client, embed_failures=True)
        timers.setup_marathon.start()
        try:
            return_code, bounce_again_in_seconds = deploy_marathon_service(
                service=service,
                instance=instance,
                client=self.marathon_client,
                soa_dir=marathon_tools.DEFAULT_SOA_DIR,
                marathon_config=self.marathon_config,
                marathon_apps=apps)
        except Exception as e:
            self.log.warning(
                "deploy_marathon_service caused exception: {}".format(e))
            return_code = -2

        if return_code != 0:
            # Any failure overrides the requested delay with a back-off that
            # grows with the number of failures seen so far.
            failure_count += 1
            bounce_again_in_seconds = exponential_back_off(
                failures=failure_count,
                factor=self.config.get_deployd_worker_failure_backoff_factor(),
                base=2,
                max_time=6000)
        timers.setup_marathon.stop()
        self.log.info(
            "setup marathon completed with exit code {} for {}.{}".format(
                return_code, service, instance))

        if not bounce_again_in_seconds:
            timers.bounce_length.stop()
            self.log.info("{}.{} in steady state".format(service, instance))
        else:
            timers.processed_by_worker.start()
            self.log.info(
                "{}.{} not in steady state so bouncing again in {} seconds".format(
                    service, instance, bounce_again_in_seconds))
            requeued = ServiceInstance(
                service=service,
                instance=instance,
                bounce_by=int(time.time()) + bounce_again_in_seconds,
                watcher=self.name,
                bounce_timers=timers,
                failures=failure_count)
            self.inbox_q.put(requeued)
        time.sleep(0.1)
def assert_marathon_apps(
    clients: Sequence[MarathonClient],
) -> HealthCheckResult:
    """Report whether any Marathon app is running across the given clients.

    :param clients: Marathon clients to query
    :returns: a healthy result carrying the total app count, or an unhealthy
        "CRITICAL" result when the total is below one
    """
    total_apps = sum(len(get_all_marathon_apps(client)) for client in clients)
    if total_apps >= 1:
        return HealthCheckResult(
            message="marathon apps: %10d" % total_apps,
            healthy=True,
        )
    return HealthCheckResult(
        message="CRITICAL: No marathon apps running",
        healthy=False,
    )
def when_setup_service_initiated(context):
    """Run ``setup_marathon_job.setup_service`` for the scenario's service.

    Bounce locks, sleeping, logging, config hashing and docker lookups are
    patched out. The call is retried for up to 120 attempts with 0.5s pauses
    whenever Marathon answers with ``MarathonHttpError``.

    :param context: test context providing ``service``, ``instance``,
        ``cluster``, ``max_tasks``, ``marathon_client``,
        ``system_paasta_config`` and ``new_marathon_service_config``
    :raises Exception: when every attempt failed with ``MarathonHttpError``
    """
    # A single multi-manager ``with`` replaces ``contextlib.nested`` (removed
    # in Python 3); the patches are entered in the same order as before.
    with mock.patch(
        'paasta_tools.bounce_lib.get_happy_tasks',
        autospec=True,
        # Wrap function call so we can select a subset of tasks or test
        # intermediate steps, like when an app is not completely up
        side_effect=lambda app, _, __, ___, **kwargs: get_happy_tasks(
            app, context.service, "fake_nerve_ns", context.system_paasta_config)[:context.max_tasks],
    ), mock.patch(
        'paasta_tools.bounce_lib.bounce_lock_zookeeper', autospec=True,
    ), mock.patch(
        'paasta_tools.bounce_lib.create_app_lock', autospec=True,
    ), mock.patch(
        'paasta_tools.bounce_lib.time.sleep', autospec=True,
    ), mock.patch(
        'paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True,
    ) as mock_load_system_paasta_config, mock.patch(
        'paasta_tools.setup_marathon_job._log', autospec=True,
    ), mock.patch(
        'paasta_tools.marathon_tools.get_config_hash', autospec=True, return_value='confighash',
    ), mock.patch(
        'paasta_tools.marathon_tools.get_code_sha_from_dockerurl', autospec=True, return_value='newapp',
    ), mock.patch(
        'paasta_tools.marathon_tools.get_docker_url', autospec=True, return_value='busybox',
    ), mock.patch(
        'paasta_tools.mesos_maintenance.get_principal', autospec=True,
    ) as mock_get_principal, mock.patch(
        'paasta_tools.mesos_maintenance.get_secret', autospec=True,
    ) as mock_get_secret:
        credentials = mesos_maintenance.load_credentials(mesos_secrets='/etc/mesos-slave-secret')
        mock_get_principal.return_value = credentials.principal
        mock_get_secret.return_value = credentials.secret
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        # 120 * 0.5 = 60 seconds
        # ``range`` works on both Python 2 and 3; ``xrange`` is Python 2 only.
        for _ in range(120):
            try:
                marathon_apps = marathon_tools.get_all_marathon_apps(context.marathon_client, embed_failures=True)
                (code, message) = setup_marathon_job.setup_service(
                    service=context.service,
                    instance=context.instance,
                    client=context.marathon_client,
                    marathon_apps=marathon_apps,
                    service_marathon_config=context.new_marathon_service_config,
                    soa_dir='/nail/etc/services',
                )
                assert code == 0, message
                return
            except MarathonHttpError:
                time.sleep(0.5)
        raise Exception("Unable to acquire app lock for setup_marathon_job.setup_service")
def main():
    """Attempt to set up a list of marathon service instances given.

    Exits 1 if any service.instance deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Do the following for each service.instance:
        - Load the service instance's configuration
        - Create the complete marathon job configuration
        - Deploy/bounce the service
        - Emit an event about the deployment to sensu
    """
    args = parse_args()
    soa_dir = args.soa_dir
    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("setup_marathon_jobs", backend="memory")

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    marathon_apps = marathon_tools.get_all_marathon_apps(client, embed_tasks=True)

    failed_count = 0
    for job_id in args.service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(job_id)
        except InvalidJobNameError:
            log.error(
                "Invalid service instance specified. Format is service%sinstance."
                % SPACER)
            failed_count += 1
            continue

        # First element of the result is the return code; non-zero is failure.
        return_code = deploy_marathon_service(
            service, instance, client, soa_dir, marathon_config, marathon_apps,
        )[0]
        if return_code:
            failed_count += 1

    requests_cache.uninstall_cache()

    log.debug("%d out of %d service.instances failed to deploy." %
              (failed_count, len(args.service_instance_list)))

    sys.exit(1 if failed_count else 0)
def when_setup_service_initiated(context):
    """Run ``setup_marathon_job.setup_service`` for the scenario's service.

    Bounce locks, sleeping, logging, config hashing and docker lookups are
    patched out. The call is retried for up to 120 attempts with 0.5s pauses
    whenever Marathon answers with ``MarathonHttpError``.

    :param context: test context providing ``service``, ``instance``,
        ``cluster``, ``max_tasks``, ``marathon_client``,
        ``system_paasta_config`` and ``new_marathon_service_config``
    :raises Exception: when every attempt failed with ``MarathonHttpError``
    """
    def limited_happy_tasks(app, _, __, ___, **kwargs):
        # Wrap function call so we can select a subset of tasks or test
        # intermediate steps, like when an app is not completely up
        happy = get_happy_tasks(
            app, context.service, "fake_nerve_ns", context.system_paasta_config,
        )
        return happy[:context.max_tasks]

    with mock.patch(
        'paasta_tools.bounce_lib.get_happy_tasks', autospec=True,
        side_effect=limited_happy_tasks,
    ), mock.patch(
        'paasta_tools.bounce_lib.bounce_lock_zookeeper', autospec=True,
    ), mock.patch(
        'paasta_tools.bounce_lib.time.sleep', autospec=True,
    ), mock.patch(
        'paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True,
    ) as mock_load_system_paasta_config, mock.patch(
        'paasta_tools.setup_marathon_job._log', autospec=True,
    ), mock.patch(
        'paasta_tools.marathon_tools.get_config_hash', autospec=True, return_value='confighash',
    ), mock.patch(
        'paasta_tools.marathon_tools.get_code_sha_from_dockerurl', autospec=True, return_value='newapp',
    ), mock.patch(
        'paasta_tools.utils.InstanceConfig.get_docker_url', autospec=True, return_value='busybox',
    ), mock.patch(
        'paasta_tools.mesos_maintenance.get_principal', autospec=True,
    ) as mock_get_principal, mock.patch(
        'paasta_tools.mesos_maintenance.get_secret', autospec=True,
    ) as mock_get_secret:
        creds = mesos_maintenance.load_credentials(mesos_secrets='/etc/mesos-slave-secret')
        mock_get_principal.return_value = creds.principal
        mock_get_secret.return_value = creds.secret
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)

        # 120 * 0.5 = 60 seconds
        for _attempt in range(120):
            try:
                apps = marathon_tools.get_all_marathon_apps(context.marathon_client, embed_tasks=True)
                status, output, _bounce_again = setup_marathon_job.setup_service(
                    service=context.service,
                    instance=context.instance,
                    client=context.marathon_client,
                    marathon_apps=apps,
                    service_marathon_config=context.new_marathon_service_config,
                    soa_dir='/nail/etc/services',
                )
                assert status == 0, output
                return
            except MarathonHttpError:
                time.sleep(0.5)
        raise Exception("Unable to acquire app lock for setup_marathon_job.setup_service")
def main():
    """Attempt to set up a list of marathon service instances given.

    Exits 1 if any service.instance deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Do the following for each service.instance:
        - Load the service instance's configuration
        - Create the complete marathon job configuration
        - Deploy/bounce the service
        - Emit an event about the deployment to sensu
    """
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("setup_marathon_jobs", backend="memory")

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    marathon_apps = marathon_tools.get_all_marathon_apps(client, embed_failures=True)

    num_failed_deployments = 0
    for service_instance in args.service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(service_instance)
        except InvalidJobNameError:
            log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
            num_failed_deployments += 1
            continue

        result = deploy_marathon_service(
            service, instance, client, soa_dir, marathon_config, marathon_apps,
        )
        # NOTE(review): other callers unpack this result as
        # (return_code, bounce_again_in_seconds). A non-empty tuple is always
        # truthy, so testing the raw result would count every deployment as
        # failed. Accept either a tuple or a bare return code -- confirm the
        # actual return type of deploy_marathon_service.
        return_code = result[0] if isinstance(result, tuple) else result
        if return_code:
            num_failed_deployments += 1

    requests_cache.uninstall_cache()

    log.debug("%d out of %d service.instances failed to deploy." %
              (num_failed_deployments, len(args.service_instance_list)))

    sys.exit(1 if num_failed_deployments else 0)
def run(self):
    """Consume the bounce queue forever, deploying each service instance.

    Every item taken from ``self.bounce_q`` is deployed to Marathon. When the
    deployment asks to be bounced again, a new ``ServiceInstance`` with an
    updated ``bounce_by`` is put on ``self.inbox_q``; otherwise the
    bounce-length timer is stopped.
    """
    self.log.info("{} starting up".format(self.name))
    while True:
        queued = self.bounce_q.get()
        timers = self.setup_timers(queued)
        service = queued.service
        instance = queued.instance
        self.log.info("{} processing {}.{}".format(self.name, service, instance))

        apps = marathon_tools.get_all_marathon_apps(
            self.marathon_client, embed_failures=True)
        timers.setup_marathon.start()
        return_code, bounce_again_in_seconds = deploy_marathon_service(
            service=service,
            instance=instance,
            client=self.marathon_client,
            soa_dir=marathon_tools.DEFAULT_SOA_DIR,
            marathon_config=self.marathon_config,
            marathon_apps=apps)
        timers.setup_marathon.stop()
        self.log.info(
            "setup marathon completed with exit code {} for {}.{}".format(
                return_code, service, instance))

        if not bounce_again_in_seconds:
            timers.bounce_length.stop()
            self.log.info("{}.{} in steady state".format(service, instance))
        else:
            timers.processed_by_worker.start()
            self.log.info(
                "{}.{} not in steady state so bouncing again in {} seconds".format(
                    service, instance, bounce_again_in_seconds))
            requeued = ServiceInstance(
                service=service,
                instance=instance,
                bounce_by=int(time.time()) + bounce_again_in_seconds,
                watcher=self.name,
                bounce_timers=timers)
            self.inbox_q.put(requeued)
        time.sleep(0.1)
def process_service_instance(self, service_instance):
    """Deploy one service instance to Marathon and report the outcome.

    :param service_instance: object with ``service`` and ``instance``
        attributes, also handed to ``self.setup_timers``
    :returns: ``BounceResults(bounce_again_in_seconds, return_code, timers)``
    """
    timers = self.setup_timers(service_instance)
    service = service_instance.service
    instance = service_instance.instance
    self.log.info("{} processing {}.{}".format(self.name, service, instance))

    apps = marathon_tools.get_all_marathon_apps(
        self.marathon_client, embed_tasks=True)
    timers.setup_marathon.start()
    return_code, bounce_again_in_seconds = deploy_marathon_service(
        service=service,
        instance=instance,
        client=self.marathon_client,
        soa_dir=marathon_tools.DEFAULT_SOA_DIR,
        marathon_config=self.marathon_config,
        marathon_apps=apps,
    )
    timers.setup_marathon.stop()
    self.log.info(
        "setup marathon completed with exit code {} for {}.{}".format(
            return_code, service, instance,
        ))

    if not bounce_again_in_seconds:
        timers.bounce_length.stop()
        self.log.info("{}.{} in steady state".format(service, instance))
    else:
        timers.processed_by_worker.start()
        self.log.info(
            "{}.{} not in steady state so bouncing again in {} seconds".format(
                service, instance, bounce_again_in_seconds,
            ))
    return BounceResults(bounce_again_in_seconds, return_code, timers)
def get_service_instances_needing_update(marathon_client, instances, cluster):
    """Return the ``(service, instance)`` pairs that need to be (re)deployed.

    A pair is returned when its app dict cannot be built because of a
    ``NoDockerImageError``, when its formatted app id is not known to
    Marathon, or when the running instance count differs from the configured
    one.

    :param marathon_client: client handed to ``get_all_marathon_apps``
    :param instances: iterable of ``(service, instance)`` pairs
    :param cluster: cluster name passed to the config loader
    :returns: list of ``(service, instance)`` tuples
    """
    running_apps = {}
    for app in get_all_marathon_apps(marathon_client):
        running_apps[app.id] = app

    needing_update = []
    for service, instance in instances:
        config = load_marathon_service_config_no_cache(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=DEFAULT_SOA_DIR,
        )
        try:
            config_app = config.format_marathon_app_dict()
            app_id = '/{}'.format(config_app['id'])
        except NoDockerImageError:
            config_app = None

        # Short-circuiting keeps ``app_id`` from being read when the app dict
        # could not be built.
        if (
            not config_app
            or app_id not in running_apps
            or running_apps[app_id].instances != config_app['instances']
        ):
            needing_update.append((service, instance))
    return needing_update