def _format_disabled_status(job):
    """Return a colorized enabled/disabled label for a job.

    :param job: mapping describing the job; a truthy ``disabled`` entry marks
        the job as disabled, a missing entry counts as enabled.
    :returns: red "Disabled" or green "Enabled".
    """
    # The original pre-assigned a red "UNKNOWN" status that both branches
    # immediately overwrote; that dead store has been dropped.
    if job.get("disabled", False):
        return PaastaColors.red("Disabled")
    return PaastaColors.green("Enabled")
def paasta_rollback(args):
    """Call mark_for_deployment with rollback parameters

    :param args: contains all the arguments passed onto the script: service,
        deploy groups and sha. These arguments will be verified and passed onto
        mark_for_deployment.

    Exits the process with 1 when no valid deploy group was given, with a
    non-zero status reported by mark_for_deployment when any deploy group
    failed, and with 0 otherwise.
    """
    service = figure_out_service_name(args)
    git_url = get_git_url(service)
    commit = args.commit
    # Drop empty entries produced by stray commas ("a,,b" or a trailing ",").
    given_deploy_groups = [deploy_group for deploy_group in args.deploy_groups.split(",") if deploy_group]

    service_deploy_groups = set(
        config.get_deploy_group()
        for config in get_instance_config_for_service(
            soa_dir=DEFAULT_SOA_DIR,
            service=service,
        )
    )
    deploy_groups, invalid = validate_given_deploy_groups(service_deploy_groups, given_deploy_groups)

    if len(invalid) > 0:
        print(PaastaColors.yellow("These deploy groups are not valid and will be skipped: %s.\n" % (",").join(invalid)))

    returncode = 0
    if len(deploy_groups) == 0:
        print(PaastaColors.red("ERROR: No valid deploy groups specified for %s.\n" % (service)))
        returncode = 1

    for deploy_group in deploy_groups:
        result = mark_for_deployment(
            git_url=git_url,
            service=service,
            deploy_group=deploy_group,
            commit=commit,
        )
        # Remember failures: previously the status of the last deploy group
        # overwrote earlier ones, so a failed rollback could still exit 0.
        if result:
            returncode = result

    sys.exit(returncode)
def start_chronos_job(service, instance, job_id, client, cluster, job_config, complete_job_config, emergency=False):
    """Trigger an immediate run of a Chronos job.

    Uses the Chronos 'manual start' endpoint
    (https://mesos.github.io/chronos/docs/api.html#manually-starting-a-job),
    which runs the job now regardless of its 'schedule' and leaves the
    "schedule" itself untouched. Nothing is started when the job is disabled.
    """
    name = PaastaColors.cyan(job_id)

    # A job that is disabled (via the 'disabled' key in soa-configs, or because
    # it was previously stopped) must not be run: report that and bail out.
    if complete_job_config['disabled']:
        print(PaastaColors.red("You cannot emergency start a disabled job. Run `paasta start` first."))
        return

    if emergency:
        log_reason = PaastaColors.red("EmergencyStart")
    else:
        log_reason = "Brutal bounce"
    _log(
        service=service,
        line="%s: Starting manual run of %s in Chronos" % (log_reason, name),
        component="deploy",
        level="event",
        cluster=cluster,
        instance=instance
    )
    # Push the complete config first, then ask Chronos to run the job.
    client.update(complete_job_config)
    client.run(job_id)
def main():
    """Print health summaries for Mesos, Marathon and Chronos.

    Exits with status 0 when every check is healthy and with status 2 when any
    check fails or when the Mesos master, Marathon or Chronos cannot be
    contacted. A framework that is not configured on this host is reported as
    a passing result.
    """
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose, humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [('marathon is not configured to run here', True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_results = [('chronos is not configured to run here', True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            # This literal used to be split across two physical lines; it is
            # kept on one line so it matches the Marathon message above.
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if all([mesos_ok, marathon_ok, chronos_ok]) else 2)
def _format_schedule(job):
    """Build the "<schedule> Epsilon: <epsilon>" summary for a job.

    A job with a non-None ``parents`` entry is a dependent job and has no
    schedule of its own; missing ``schedule``/``epsilon`` values are shown as
    a red "UNKNOWN".
    """
    if job.get('parents') is None:
        schedule = job.get("schedule", PaastaColors.red("UNKNOWN"))
    else:
        schedule = PaastaColors.yellow("None (Dependent Job).")
    epsilon = job.get("epsilon", PaastaColors.red("UNKNOWN"))
    return "%s Epsilon: %s" % (schedule, epsilon)
def desired_state_human(desired_state, instances):
    """Translate a desired state and instance count into a colorized label.

    'start' yields a bold "Started" (non-zero instances) or bold "Stopped"
    (zero instances), 'stop' yields a red "Stopped", and anything else is
    reported as a red "Unknown".
    """
    if desired_state == 'stop':
        return PaastaColors.red('Stopped')
    if desired_state == 'start':
        if instances != 0:
            return PaastaColors.bold('Started')
        if instances == 0:
            return PaastaColors.bold('Stopped')
    return PaastaColors.red('Unknown (desired_state: %s)' % desired_state)
def bouncing_status_human(app_count, bounce_method):
    """Describe the bounce state implied by the number of matching apps.

    Zero apps reads as "Disabled", exactly one as "Configured", more than one
    as "Bouncing" (annotated with the bounce method); any other count is
    reported as "Unknown".
    """
    if app_count == 0:
        return PaastaColors.red("Disabled")
    if app_count == 1:
        return PaastaColors.green("Configured")
    if app_count > 1:
        return PaastaColors.yellow("Bouncing (%s)" % bounce_method)
    return PaastaColors.red("Unknown (count: %s)" % app_count)
def get_desired_state_human(self):
    """Return a colorized, human-readable label for the desired state.

    'start' maps to a bold "Started" or, when the instance count is zero, a
    bold "Stopped"; 'stop' maps to a red "Stopped"; anything else is reported
    as a red "Unknown".
    """
    state = self.get_desired_state()
    if state == 'stop':
        return PaastaColors.red('Stopped')
    if state == 'start':
        instance_count = self.get_instances()
        if instance_count != 0:
            return PaastaColors.bold('Started')
        if instance_count == 0:
            return PaastaColors.bold('Stopped')
    return PaastaColors.red('Unknown (desired_state: %s)' % state)
def get_desired_state_human(self):
    """Return a colorized label describing what state this instance should be in.

    A desired state of "start" gives a bold "Started" unless there are zero
    instances (bold "Stopped"); "stop" gives a red "Stopped"; every other
    value is surfaced as a red "Unknown" message.
    """
    wanted = self.get_desired_state()
    if wanted == "start":
        instances = self.get_instances()
        if instances != 0:
            return PaastaColors.bold("Started")
        if instances == 0:
            return PaastaColors.bold("Stopped")
    elif wanted == "stop":
        return PaastaColors.red("Stopped")
    return PaastaColors.red("Unknown (desired_state: %s)" % wanted)
def _format_mesos_status(job, running_tasks):
    """Return a colorized description of how many tasks are running.

    :param job: not used by this function; kept for interface compatibility.
    :param running_tasks: sized collection of the currently running tasks.
    :returns: grey "Not running" for no tasks, yellow "Running" for exactly
        one, and a red "Critical" message for more than one.
    """
    # The original pre-assigned a red "UNKNOWN" status that every branch
    # overwrote (a length is never negative); that dead store has been dropped.
    num_tasks = len(running_tasks)
    if num_tasks == 0:
        return PaastaColors.grey("Not running")
    if num_tasks == 1:
        return PaastaColors.yellow("Running")
    return PaastaColors.red("Critical - %d tasks running (expected 1)" % num_tasks)
def print_results_for_healthchecks(ok, results, verbose, indent=2):
    """Print healthcheck results, indented by ``indent``.

    With ``verbose >= 1`` every result is printed, unhealthy ones in red.
    Otherwise output is produced only when ``ok`` is falsy, in which case the
    results selected by critical_events_in_outputs are printed in red.
    """
    if verbose >= 1:
        for result in results:
            message = result.message if result.healthy else PaastaColors.red(result.message)
            print_with_indent(message, indent)
        return

    if ok:
        return

    for result in critical_events_in_outputs(results):
        print_with_indent(PaastaColors.red(result.message), indent)
def _format_schedule(job):
    """Build the "<schedule> (<time zone>) Epsilon: <epsilon>" summary for a job.

    A job with a non-None ``parents`` entry is a dependent job without its own
    schedule. Missing ``schedule``/``epsilon`` values are shown as a red
    "UNKNOWN"; a missing or "null" ``scheduleTimeZone`` is shown as "UTC".
    """
    if job.get('parents') is None:
        schedule = job.get("schedule", PaastaColors.red("UNKNOWN"))
    else:
        schedule = PaastaColors.yellow("None (Dependent Job).")
    epsilon = job.get("epsilon", PaastaColors.red("UNKNOWN"))

    time_zone = job.get("scheduleTimeZone", "null")
    if time_zone == "null":
        # "null" is what Chronos returns.
        time_zone = "UTC"

    return "%s (%s) Epsilon: %s" % (schedule, time_zone, epsilon)
def get_bouncing_status(service, instance, client, job_config):
    """Summarize the bounce state from the number of matching Marathon app ids.

    No matching app is "Stopped", exactly one is "Running", and more than one
    means a bounce is in progress (annotated with the configured bounce
    method). Any other count is reported as "Unknown".
    """
    matching_apps = marathon_tools.get_matching_appids(service, instance, client)
    bounce_method = job_config.get_bounce_method()
    app_count = len(matching_apps)

    if app_count == 0:
        return PaastaColors.red("Stopped")
    if app_count == 1:
        return PaastaColors.green("Running")
    if app_count > 1:
        return PaastaColors.yellow("Bouncing (%s)" % bounce_method)
    return PaastaColors.red("Unknown (count: %s)" % app_count)
def _cleanup_container(docker_client, container_id):
    """Stop and remove a container, reporting progress on stdout.

    When the container's inspected state has ``OOMKilled`` set, a red
    explanation is written to stderr first. A docker APIError during
    stop/remove is not raised; a yellow warning asking for manual cleanup is
    written instead.
    """
    container_state = docker_client.inspect_container(container_id)['State']
    if container_state.get('OOMKilled', False):
        sys.stderr.write(PaastaColors.red("Your service was killed by the OOM Killer!\n"))
        sys.stderr.write(PaastaColors.red(
            "You've exceeded the memory limit, try increasing the mem parameter in your soa_configs\n"))

    out = sys.stdout
    out.write("\nStopping and removing the old container %s...\n" % container_id)
    out.write("(Please wait or you may leave an orphaned container.)\n")
    # Make sure the notice is visible before the potentially slow stop call.
    out.flush()

    try:
        docker_client.stop(container_id)
        docker_client.remove_container(container_id)
        out.write("...done\n")
    except errors.APIError:
        out.write(PaastaColors.yellow(
            "Could not clean up container! You should stop and remove container '%s' manually.\n" % container_id))
def status_mesos_tasks(service, instance, normal_instance_count):
    """Report how many tasks are running compared to the expected count.

    The status is "Healthy" (green) when at least ``normal_instance_count``
    tasks are running, "Critical" (red) when none are, and "Warning" (yellow)
    in between.
    """
    job_id = marathon_tools.format_job_id(service, instance)
    running_count = len(get_running_tasks_from_active_frameworks(job_id))

    # Pick the label and its color once, then apply the color to both pieces.
    if running_count >= normal_instance_count:
        label, colorize = "Healthy", PaastaColors.green
    elif running_count == 0:
        label, colorize = "Critical", PaastaColors.red
    else:
        label, colorize = "Warning", PaastaColors.yellow

    status = colorize(label)
    count = colorize("(%d/%d)" % (running_count, normal_instance_count))
    running_string = PaastaColors.bold('TASK_RUNNING')
    return "Mesos: %s - %s tasks in the %s state." % (status, count, running_string)
def test_get_mesos_status(
    mock_get_mesos_stats,
    mock_get_num_masters,
    mock_get_configured_quorum_size,
    mock_getfqdn,
):
    """get_mesos_status reports every healthcheck line and flags duplicate frameworks."""
    mock_getfqdn.return_value = 'fakename'
    mock_get_mesos_stats.return_value = {
        'master/cpus_total': 10,
        'master/cpus_used': 8,
        'master/mem_total': 10240,
        'master/mem_used': 2048,
        'master/disk_total': 10240,
        'master/disk_used': 3072,
        'master/tasks_running': 3,
        'master/tasks_staging': 4,
        'master/tasks_starting': 0,
        'master/slaves_active': 4,
        'master/slaves_inactive': 0,
    }
    mesos_state = {
        'flags': {
            'zk': 'zk://1.1.1.1:2222/fake_cluster',
            'quorum': 2,
        },
        'frameworks': [
            {
                'name': 'test_framework1',
            },
            {
                'name': 'test_framework1',
            },
        ]
    }
    mock_get_num_masters.return_value = 5
    mock_get_configured_quorum_size.return_value = 3
    expected_cpus_output = "CPUs: 8.00 / 10 in use (%s)" % PaastaColors.green("80.00%")
    expected_mem_output = \
        "Memory: 2.00 / 10.00GB in use (%s)" % PaastaColors.green("20.00%")
    expected_disk_output = "Disk: 3.00 / 10.00GB in use (%s)" % PaastaColors.green("30.00%")
    expected_tasks_output = \
        "tasks: running: 3 staging: 4 starting: 0"
    expected_duplicate_frameworks_output = \
        "frameworks:\n%s" % \
        PaastaColors.red(" CRITICAL: Framework test_framework1 has 2 instances running--expected no more than 1.")
    expected_slaves_output = \
        "slaves: active: 4 inactive: 0"
    expected_masters_quorum_output = \
        "quorum: masters: 5 configured quorum: 3 "

    results = paasta_metastatus.get_mesos_status(mesos_state, verbosity=0)

    # `mock.called_once()` is not an assertion: on a Mock it is an
    # auto-created attribute whose call returns a truthy Mock, so the old
    # check could never fail. Compare the real call count instead.
    # NOTE(review): assumes get_mesos_status calls get_mesos_stats exactly
    # once, as the original assertion's name intended -- confirm.
    assert mock_get_mesos_stats.call_count == 1
    assert (expected_masters_quorum_output, True) in results
    assert (expected_cpus_output, True) in results
    assert (expected_mem_output, True) in results
    assert (expected_disk_output, True) in results
    assert (expected_tasks_output, True) in results
    assert (expected_duplicate_frameworks_output, False) in results
    assert (expected_slaves_output, True) in results
def report_status_for_cluster(service, cluster, deploy_pipeline, actual_deployments, verbose=False):
    """Print the status of each of the service's instances in the given cluster.

    Instances are reported in the order in which their cluster.instance
    namespaces appear in ``deploy_pipeline``.
    """
    print("")
    print("cluster: %s" % cluster)
    for namespace in deploy_pipeline:
        pipeline_cluster, instance_name = namespace.split('.')
        # Only namespaces belonging to the requested cluster are relevant here.
        if pipeline_cluster != cluster:
            continue

        if namespace in actual_deployments:
            # Case: service deployed to cluster.instance
            instance_label = PaastaColors.blue(instance_name)
            version = actual_deployments[namespace][:8]
            # TODO: Perform sanity checks once per cluster instead of for each namespace
            status = execute_paasta_serviceinit_on_remote_master(
                'status', cluster, service, instance_name, verbose=verbose)
        else:
            # Case: service NOT deployed to cluster.instance
            instance_label = PaastaColors.red(instance_name)
            version = 'None'
            status = None

        print(' instance: %s' % instance_label)
        print(' Git sha: %s' % version)
        if status is not None:
            for status_line in status.rstrip().split('\n'):
                print(' %s' % status_line)
def generate_summary_for_check(name, ok):
    """Format a one-line "<name> Status: <status>" summary for a check.

    The status is a green "OK" only when ``ok`` is exactly ``True``; any other
    value yields a red "CRITICAL".
    """
    if ok is True:
        status = PaastaColors.green("OK")
    else:
        status = PaastaColors.red("CRITICAL")
    return "%s Status: %s" % (name, status)
def test_format_chronos_job_name_does_not_exist():
    """An empty job dict is rendered with a red UNKNOWN placeholder."""
    empty_job = {}
    no_desired_state = ''
    no_running_tasks = []
    not_verbose = False
    formatted = chronos_serviceinit.format_chronos_job_status(
        empty_job,
        no_desired_state,
        no_running_tasks,
        not_verbose,
    )
    assert PaastaColors.red('UNKNOWN') in formatted
def assert_no_duplicate_frameworks(state):
    """Check that no two running frameworks share a name.

    Note the leading spaces in the output strings: they account for the extra
    indentation added under the "frameworks:" header.

    :param state: the state info from the Mesos master; its 'frameworks' entry
        is an iterable of mappings that each carry a 'name'.
    :returns: a tuple containing (output, ok): output is a newline-joined log
        of the frameworks, sorted by name; ok is True when every framework
        name occurs exactly once and False when any name is duplicated.
    """
    name_counts = Counter(fw['name'] for fw in state['frameworks'])

    output = ["frameworks:"]
    ok = True
    # Iterating the sorted items directly keeps the by-name ordering without
    # relying on the Python-2-only dict.iteritems().
    for framework, count in sorted(name_counts.items()):
        if count > 1:
            ok = False
            output.append(PaastaColors.red(
                " CRITICAL: Framework %s has %d instances running--expected no more than 1."
                % (framework, count)))
        else:
            output.append(" framework: %s count: %d" % (framework, count))
    return (("\n").join(output), ok)
def print_logs_by_time(self, service, start_time, end_time, levels, components, clusters, raw_mode):
    """Collect the scribe logs between two times and print them ordered by sort key.

    A warning is written to stderr when marathon or chronos logs are
    requested, because those components live on shared streams and the query
    can be slow.
    """
    collected = []

    if 'marathon' in components or 'chronos' in components:
        sys.stderr.write(PaastaColors.red("Warning, you have chosen to get marathon or chronos logs based "
                                          "on time. This command may take a dozen minutes or so to run "
                                          "because marathon and chronos are on shared streams.\n"))

    def callback(component, stream_info, scribe_env, cluster):
        # Per-cluster streams need the cluster to build their stream name.
        name_args = (service, cluster) if stream_info.per_cluster else (service,)
        stream_name = stream_info.stream_name_fn(*name_args)

        ctx = self.scribe_get_from_time(scribe_env, stream_name, start_time, end_time)
        self.filter_and_aggregate_scribe_logs(
            ctx, scribe_env, stream_name, levels, service, components, clusters, collected,
            filter_fn=stream_info.filter_fn,
            parser_fn=stream_info.parse_fn,
            start_time=start_time,
            end_time=end_time,
        )

    self.run_code_over_scribe_envs(clusters=clusters, components=components, callback=callback)

    collected.sort(key=lambda entry: entry['sort_key'])
    for entry in collected:
        print_log(entry['raw_line'], levels, raw_mode)
def report_status_for_cluster(service, cluster, deploy_pipeline, actual_deployments, instance_whitelist, verbose=0):
    """Print the status of the service's instances in the given cluster.

    Namespaces from other clusters and, when a whitelist is given, instances
    outside it are skipped. The result of report_invalid_whitelist_values is
    printed at the end, based on every instance seen in the pipeline.
    """
    print("")
    print("cluster: %s" % cluster)
    seen_instances = []
    for namespace in deploy_pipeline:
        pipeline_cluster, instance = namespace.split('.')
        # Record the instance before filtering so the whitelist check at the
        # end covers everything in the pipeline.
        seen_instances.append(instance)

        wrong_cluster = pipeline_cluster != cluster
        if wrong_cluster or (instance_whitelist and instance not in instance_whitelist):
            continue

        if namespace in actual_deployments:
            # Case: service deployed to cluster.instance
            instance_label = PaastaColors.blue(instance)
            version = actual_deployments[namespace][:8]
            # TODO: Perform sanity checks once per cluster instead of for each namespace
            status = execute_paasta_serviceinit_on_remote_master(
                'status', cluster, service, instance, verbose=verbose)
        else:
            # Case: service NOT deployed to cluster.instance
            instance_label = PaastaColors.red(instance)
            version = 'None'
            status = None

        print(' instance: %s' % instance_label)
        print(' Git sha: %s' % version)
        if status is not None:
            for status_line in status.rstrip().split('\n'):
                print(' %s' % status_line)

    print(report_invalid_whitelist_values(instance_whitelist, seen_instances, 'instance'))
def get_cpu_usage(task):
    """Return the task's used-over-allocated CPU time as a percentage string.

    Used time is the sum of the system and user cpu-seconds the task has
    consumed. Allocated time is how long the task has been running
    (https://github.com/mesosphere/mesos/blob/0b092b1b0/src/webui/master/static/js/controllers.js#L140)
    multiplied by the cpu "shares" the task has.

    Returns "Undef" when no time has been allocated, "None" on
    AttributeError/SlaveDoesNotExist, and "Timed Out" on TimeoutError. Values
    above 90 percent are colored red.
    """
    try:
        started_at = round(task['statuses'][0]['timestamp'])
        now = int(datetime.datetime.now().strftime('%s'))
        elapsed_seconds = now - started_at

        # The CPU shares include an additional .1 allocated for executor
        # overhead; subtract it to get the true number
        # (https://github.com/apache/mesos/blob/dc7c4b6d0bcf778cc0cad57bb108564be734143a/src/slave/constants.hpp#L100)
        shares = task.cpu_limit - .1
        allocated_seconds = elapsed_seconds * shares

        system_seconds = task.stats.get('cpus_system_time_secs', 0.0)
        user_seconds = task.stats.get('cpus_user_time_secs', 0.0)
        used_seconds = system_seconds + user_seconds

        if allocated_seconds == 0:
            return "Undef"

        percent = round(100 * (used_seconds / allocated_seconds), 1)
        label = "%s%%" % percent
        return PaastaColors.red(label) if percent > 90 else label
    except (AttributeError, SlaveDoesNotExist):
        return "None"
    except TimeoutError:
        return "Timed Out"
def get_cluster_dashboards(cluster):
    """Returns the direct dashboards for humans to use for a given cluster

    :param cluster: name of the cluster to look up in the configured
        dashboard links.
    :returns: a newline-joined string: either a "Dashboards:" header followed
        by one aligned "label: url" line per dashboard, or a red message when
        nothing is configured.
    """
    SPACER = ' '
    try:
        dashboards = load_system_paasta_config().get_dashboard_links()[cluster]
    except KeyError as e:
        # A KeyError for the cluster itself means only this cluster is missing;
        # any other KeyError is reported as no dashboards configured at all.
        if e.args[0] == cluster:
            output = [PaastaColors.red('No dashboards configured for %s!' % cluster)]
        else:
            output = [PaastaColors.red('No dashboards configured!')]
    else:
        if not dashboards:
            # An empty (or null) entry for the cluster used to crash in max()
            # below; report it like a cluster without dashboards instead.
            output = [PaastaColors.red('No dashboards configured for %s!' % cluster)]
        else:
            output = ['Dashboards:']
            # Pad every label to the longest one so the URLs line up.
            spacing = max((len(label) for label in dashboards.keys())) + 1
            for label, url in dashboards.items():
                output.append(' %s:%s%s' % (label, SPACER * (spacing - len(label)), PaastaColors.cyan(url)))
    return '\n'.join(output)
def format_haproxy_backend_row(backend, is_correct_instance):
    """Build the table row describing one haproxy backend.

    ``backend`` carries the fields of haproxy's CSV format
    (http://www.haproxy.org/download/1.5/doc/configuration.txt). The row is
    (name, last check, last change, status); when the backend does not belong
    to the instance being reported, every column is stripped of ANSI escapes
    and rendered grey.
    """
    svname = backend['svname']
    hostname = svname.split("_")[-1]
    port = svname.split("_")[0].split(":")[-1]
    pretty_backend_name = "%s:%s" % (hostname, port)

    state = backend['status']
    if state == "UP":
        colorize = PaastaColors.default
    elif state in ('DOWN', 'MAINT'):
        colorize = PaastaColors.red
    else:
        colorize = PaastaColors.yellow
    status = colorize(state)

    lastcheck = "%s/%s in %sms" % (backend['check_status'], backend['check_code'], backend['check_duration'])
    since_change = datetime.timedelta(seconds=int(backend['lastchg']))
    lastchange = humanize.naturaltime(since_change)

    row = (
        ' %s' % pretty_backend_name,
        lastcheck,
        lastchange,
        status,
    )
    if not is_correct_instance:
        return tuple(PaastaColors.grey(remove_ansi_escape_sequences(col)) for col in row)
    return row
def assert_cpu_health(metrics, threshold=10):
    """Check CPU usage against a threshold and return a (message, ok) tuple.

    The percentage in use is handed to check_threshold together with
    ``threshold``; a passing check gives a usage summary and True, a failing
    one a red CRITICAL message and False. A ZeroDivisionError while computing
    the percentage yields a red error message and False.
    """
    total, used, available = get_mesos_cpu_status(metrics)
    try:
        perc_used = percent_used(total, used)
    except ZeroDivisionError:
        return (PaastaColors.red("Error reading total available cpu from mesos!"), False)

    if not check_threshold(perc_used, threshold):
        message = "CRITICAL: Less than %d%% CPUs available. (Currently using %.2f%% of %d)" % (
            threshold, perc_used, total)
        return (PaastaColors.red(message), False)

    usage = PaastaColors.green("%.2f%%" % perc_used)
    return ("CPUs: %.2f / %d in use (%s)" % (used, total, usage), True)
def haproxy_backend_report(normal_instance_count, up_backends):
    """Return a human readable report of the up backends of a smartstack service.

    The report is "Critical" (red, with the replication ratio) when
    is_under_replicated says so for a critical threshold of 50, and "Healthy"
    (green) otherwise.
    """
    # TODO: Take into account a configurable threshold, PAASTA-1102
    crit_threshold = 50
    under_replicated, ratio = is_under_replicated(
        num_available=up_backends,
        expected_count=normal_instance_count,
        crit_threshold=crit_threshold,
    )

    if not under_replicated:
        status = PaastaColors.green("Healthy")
        count = PaastaColors.green("(%d/%d)" % (up_backends, normal_instance_count))
    else:
        status = PaastaColors.red("Critical")
        count = PaastaColors.red("(%d/%d, %d%%)" % (up_backends, normal_instance_count, ratio))

    up_string = PaastaColors.bold('UP')
    return "%s - in haproxy with %s total backends %s in this namespace." % (status, count, up_string)
def guess_instance(service, cluster, args):
    """Pick the instance to use for a service in a cluster.

    ``args.instance`` wins when it is set. Otherwise 'main' is chosen if the
    service has such an instance, and failing that the first available
    instance; the guess is announced on stderr. When the service has no
    configuration, an error is written to stderr and the process exits with
    status 2.
    """
    if args.instance:
        return args.instance

    try:
        instances = list_all_instances_for_service(
            service=service,
            clusters=[cluster],
            instance_type=None,
            soa_dir=args.yelpsoa_config_root
        )
        instance = "main" if "main" in instances else list(instances)[0]
    except NoConfigurationForServiceError:
        sys.stderr.write(
            PaastaColors.red(
                "Could not automatically detect instance to emulate. Please specify one with the --instance option.\n"
            )
        )
        sys.exit(2)

    guess_notice = "Guessing instance configuration for %s. To override, use the --instance option.\n" % instance
    sys.stderr.write(PaastaColors.yellow(guess_notice))
    return instance
def get_desired_state_human(self):
    """Return a colorized label for the desired state.

    'start' reads as a bold "Scheduled", 'stop' as a bold "Disabled", and any
    other value as a red "Unknown" message.
    """
    state = self.get_desired_state()
    for known_state, label in (('start', 'Scheduled'), ('stop', 'Disabled')):
        if state == known_state:
            return PaastaColors.bold(label)
    return PaastaColors.red('Unknown (desired_state: %s)' % state)
def assert_disk_health(metrics, threshold=10):
    """Check disk usage against a threshold and return a (message, ok) tuple.

    The 'master/disk_total' and 'master/disk_used' metrics are divided by
    1024 and reported with a GB suffix. A passing check_threshold gives a
    usage summary and True; a failing one a red CRITICAL message and False. A
    ZeroDivisionError while computing the percentage yields a red error
    message and False.
    """
    divisor = float(1024)
    total = metrics['master/disk_total'] / divisor
    used = metrics['master/disk_used'] / divisor

    try:
        perc_used = percent_used(total, used)
    except ZeroDivisionError:
        return (PaastaColors.red("Error reading total available disk from mesos!"), False)

    if not check_threshold(perc_used, threshold):
        message = "CRITICAL: Less than %d%% disk available. (Currently using %.2f%%)" % (threshold, perc_used)
        return (PaastaColors.red(message), False)

    usage = PaastaColors.green("%.2f%%" % perc_used)
    return ("Disk: %0.2f / %0.2fGB in use (%s)" % (used, total, usage), True)
async def test_get_cpu_usage_bad():
    """A task using 100% of its allocated CPU time is reported in red."""
    run_seconds = 100
    now = datetime.datetime.now()
    started_at = int(now.strftime("%s")) - run_seconds
    cpu_stats = {
        "cpus_system_time_secs": 50.0,
        "cpus_user_time_secs": 50.0
    }

    fake_task = mock.create_autospec(mesos.task.Task)
    fake_task.cpu_limit = asynctest.CoroutineMock(return_value=1.1)
    fake_task.stats = asynctest.CoroutineMock(return_value=cpu_stats)
    fake_task.__getitem__.return_value = [{
        "state": "TASK_RUNNING",
        "timestamp": started_at,
    }]

    # Freeze "now" so the computed duration is exactly run_seconds.
    with asynctest.patch("paasta_tools.mesos_tools.datetime.datetime", autospec=True) as mock_datetime:
        mock_datetime.now.return_value = now
        actual = await mesos_tools.get_cpu_usage(fake_task)

    assert PaastaColors.red("100.0%") in actual
def guess_instance(service, cluster, args):
    """Pick the instance to use for a service in a cluster.

    ``args.instance`` is returned when set. Otherwise 'main' is used if it is
    one of the service's instances, else the first instance available; the
    guess is announced on stdout. When the service has no configuration, an
    error is written to stdout and the process exits with status 2.
    """
    if args.instance:
        return args.instance

    try:
        candidates = list_all_instances_for_service(
            service=service, clusters=[cluster], instance_type=None, soa_dir=args.yelpsoa_config_root)
        instance = 'main' if 'main' in candidates else list(candidates)[0]
    except NoConfigurationForServiceError:
        sys.stdout.write(PaastaColors.red(
            'Could not automatically detect instance to emulate. Please specify one with the --instance option.\n'))
        sys.exit(2)

    notice = 'Guessing instance configuration for %s. To override, use the --instance option.\n' % instance
    sys.stdout.write(PaastaColors.yellow(notice))
    return instance
def load_aws_credentials_from_yaml(yaml_file_path):
    """Read the AWS access key id and secret access key from a yaml file.

    Any failure while reading or parsing the file prints only the exception
    type and the file path (never the exception text, to avoid leaking
    credentials) and exits the process with status 1.

    :returns: an (aws_access_key_id, aws_secret_access_key) tuple.
    """
    with open(yaml_file_path, 'r') as yaml_file:
        try:
            credentials_yaml = YAML().load(yaml_file.read())
        except Exception as e:
            message = (
                'Encountered %s when trying to parse AWS credentials yaml %s. '
                'Suppressing further output to avoid leaking credentials.'
            ) % (type(e), yaml_file_path)
            paasta_print(PaastaColors.red(message))
            sys.exit(1)

    access_key_id = credentials_yaml['aws_access_key_id']
    secret_access_key = credentials_yaml['aws_secret_access_key']
    return (access_key_id, secret_access_key)
async def test_get_cpu_usage_bad():
    """get_cpu_usage colors the result red when usage reaches 100%."""
    fake_duration = 100
    frozen_now = datetime.datetime.now()

    fake_task = mock.create_autospec(mesos.task.Task)
    fake_task.cpu_limit = asynctest.CoroutineMock(return_value=1.1)
    fake_task.stats = asynctest.CoroutineMock(
        return_value={
            'cpus_system_time_secs': 50.0,
            'cpus_user_time_secs': 50.0,
        },
    )
    task_status = {
        'state': 'TASK_RUNNING',
        'timestamp': int(frozen_now.strftime('%s')) - fake_duration,
    }
    fake_task.__getitem__.return_value = [task_status]

    # Pin datetime.now() so the task appears to have run for fake_duration seconds.
    with asynctest.patch('paasta_tools.mesos_tools.datetime.datetime', autospec=True) as mock_datetime:
        mock_datetime.now.return_value = frozen_now
        actual = await mesos_tools.get_cpu_usage(fake_task)

    assert PaastaColors.red('100.0%') in actual
def report_status_for_cluster(service, cluster, deploy_pipeline, actual_deployments, instance_whitelist, verbose=0):
    """Print, per instance, the deployed sha and status for one cluster.

    Pipeline entries for other clusters are ignored, as are instances missing
    from a non-empty ``instance_whitelist``. Every instance name in the
    pipeline is collected and passed to report_invalid_whitelist_values,
    whose result is printed last.
    """
    print("")
    print("cluster: %s" % cluster)

    seen_instances = []
    for namespace in deploy_pipeline:
        cluster_in_pipeline, instance = namespace.split('.')
        seen_instances.append(instance)

        if cluster_in_pipeline != cluster:
            continue
        if instance_whitelist and instance not in instance_whitelist:
            continue

        is_deployed = namespace in actual_deployments
        if is_deployed:
            # Case: service deployed to cluster.instance
            formatted_instance = PaastaColors.blue(instance)
            version = actual_deployments[namespace][:8]
            # TODO: Perform sanity checks once per cluster instead of for each namespace
            status = execute_paasta_serviceinit_on_remote_master(
                'status',
                cluster,
                service,
                instance,
                verbose=verbose,
            )
        else:
            # Case: service NOT deployed to cluster.instance
            formatted_instance = PaastaColors.red(instance)
            version = 'None'
            status = None

        print(' instance: %s' % formatted_instance)
        print(' Git sha: %s' % version)
        if status is not None:
            for line in status.rstrip().split('\n'):
                print(' %s' % line)

    print(report_invalid_whitelist_values(instance_whitelist, seen_instances, 'instance'))
def verify_instances(args_instances, service, clusters):
    """Check user-specified instances against the instances of a service.

    Instances the service does not have are reported in red, followed by
    close-match suggestions when there are any. Nothing is filtered out.

    :param args_instances: a comma separated string containing a list of instances.
    :param service: the service name
    :param clusters: a list of clusters
    :returns: the list of instances specified in args_instances, without any exclusions.
    """
    requested = args_instances.split(",")
    known = list_all_instances_for_service(service, clusters=clusters)

    unknown = [name for name in requested if name not in known]
    if not unknown:
        return requested

    suggestions = []
    for name in unknown:
        suggestions.extend(difflib.get_close_matches(name, known, n=5, cutoff=0.5))

    unknown_list = ', '.join(sorted(unknown))
    if clusters:
        message = "%s doesn't have any instances matching %s on %s." % (
            service, unknown_list, ', '.join(sorted(clusters)))
    else:
        message = "%s doesn't have any instances matching %s." % (service, unknown_list)
    paasta_print(PaastaColors.red(message))

    if suggestions:
        paasta_print("Did you mean any of these?")
        for suggestion in sorted(suggestions):
            paasta_print(" %s" % suggestion)

    return requested
def get_verbose_status_of_marathon_app(app):
    """Describe a marathon app and its tasks in detail.

    :param app: marathon app object; its id, version and tasks are reported.
    :returns: a tuple of (app.tasks, text), where text holds the app id, its
        creation time and a table with one row per task (task id, host,
        deploy time, health).
    """
    created_at = datetime_from_utc_to_local(isodate.parse_datetime(app.version))
    output = [
        " Marathon app ID: %s" % PaastaColors.bold(app.id),
        " App created: %s (%s)" % (str(created_at), humanize.naturaltime(created_at)),
        " Tasks:",
    ]

    rows = [("Mesos Task ID", "Host deployed to", "Deployed at what localtime", "Health")]
    for task in app.tasks:
        deployed_at = datetime_from_utc_to_local(task.staged_at)

        if task.host is None:
            hostname = "Unknown"
        else:
            # Short hostname (up to the first dot) plus the task's first port.
            hostname = "%s:%s" % (task.host.split(".")[0], task.ports[0])

        if not task.health_check_results:
            health_check_status = PaastaColors.grey("N/A")
        elif marathon_tools.is_task_healthy(task):
            health_check_status = PaastaColors.green("Healthy")
        else:
            health_check_status = PaastaColors.red("Unhealthy")

        deployed_label = '%s (%s)' % (
            deployed_at.strftime("%Y-%m-%dT%H:%M"),
            humanize.naturaltime(deployed_at),
        )
        rows.append((get_short_task_id(task.id), hostname, deployed_label, health_check_status))

    table_lines = [" %s" % line for line in format_table(rows)]
    output.append('\n'.join(table_lines))
    if len(app.tasks) == 0:
        output.append(" No tasks associated with this marathon app")
    return app.tasks, "\n".join(output)
def print_logs_by_time(self, service, start_time, end_time, levels, components, clusters, instances, raw_mode):
    """Fetch the scribe logs between two times, then print them ordered by sort key.

    Requesting marathon or chronos logs triggers a warning on stderr, since
    those components are on shared streams and the query can take a long
    time.
    """
    aggregated_logs = []

    wants_shared_streams = 'marathon' in components or 'chronos' in components
    if wants_shared_streams:
        sys.stderr.write(
            PaastaColors.red(
                "Warning, you have chosen to get marathon or chronos logs based "
                "on time. This command may take a dozen minutes or so to run "
                "because marathon and chronos are on shared streams.\n"))

    def callback(component, stream_info, scribe_env, cluster):
        # Streams that exist per cluster take the cluster in their name.
        if stream_info.per_cluster:
            stream_name = stream_info.stream_name_fn(service, cluster)
        else:
            stream_name = stream_info.stream_name_fn(service)

        ctx = self.scribe_get_from_time(scribe_env, stream_name, start_time, end_time)
        self.filter_and_aggregate_scribe_logs(
            ctx,
            scribe_env,
            stream_name,
            levels,
            service,
            components,
            clusters,
            instances,
            aggregated_logs,
            filter_fn=stream_info.filter_fn,
            parser_fn=stream_info.parse_fn,
            start_time=start_time,
            end_time=end_time,
        )

    self.run_code_over_scribe_envs(clusters=clusters, components=components, callback=callback)

    aggregated_logs.sort(key=lambda log_line: log_line['sort_key'])
    for log_line in aggregated_logs:
        print_log(log_line['raw_line'], levels, raw_mode)
def paasta_wait_for_deployment(args):
    """Validate the arguments and wrap wait_for_deployment.

    :returns: 0 once the deployment completed; 1 when validation fails, when
        waiting is interrupted or times out, or when no instances are found.
    """
    log.setLevel(level=logging.DEBUG if args.verbose else logging.INFO)

    service = args.service
    # Accept names carrying a 'services-' prefix by stripping it.
    if service and service.startswith('services-'):
        service = service.split('services-', 1)[1]

    if args.git_url is None:
        args.git_url = get_git_url(service=service, soa_dir=args.soa_dir)

    try:
        validate_service_name(service, soa_dir=args.soa_dir)
        validate_deploy_group(args.deploy_group, service, args.soa_dir)
        validate_git_sha(args.commit, args.git_url, args.deploy_group, service)
    except (GitShaError, DeployGroupError, NoSuchService) as e:
        paasta_print(PaastaColors.red('{}'.format(e)))
        return 1

    try:
        wait_for_deployment(
            service=service,
            deploy_group=args.deploy_group,
            git_sha=args.commit,
            soa_dir=args.soa_dir,
            timeout=args.timeout,
        )
        completion_line = "Deployment of {} for {} complete".format(args.commit, args.deploy_group)
        _log(service=service, component='deploy', line=completion_line, level='event')
    except (KeyboardInterrupt, TimeoutError):
        paasta_print("Waiting for deployment aborted.")
        return 1
    except NoInstancesFound:
        return 1

    return 0
def paasta_fsm(args):
    """Write configs from the fsm template and print a banner with next steps.

    The variables come from get_paasta_config, the template from the system
    paasta config, and the output goes to ``args.yelpsoa_config_root``.
    """
    variables = get_paasta_config(yelpsoa_config_root=args.yelpsoa_config_root)
    destination = args.yelpsoa_config_root
    template = load_system_paasta_config().get_fsm_template()
    write_paasta_config(variables=variables, template=template, destination=destination)

    banner = (
        PaastaColors.yellow(" _ _(o)_(o)_ _"),
        PaastaColors.red(r" ._\`:_ F S M _:' \_,"),
        PaastaColors.green(r" / (`---'\ `-."),
        PaastaColors.cyan(" ,-` _) (_,"),
        "With My Noodly Appendage I Have Written Configs!",
    )
    for banner_line in banner:
        paasta_print(banner_line)
    paasta_print()
    paasta_print("Customize Them If It Makes You Happy -- http://y/paasta For Details")
    paasta_print("Remember To Add, Commit, And Push When You're Done:")
    paasta_print()
def stop_chronos_job(service, instance, client, cluster, existing_jobs, emergency=False):
    """Disable every job in ``existing_jobs`` and delete its tasks.

    For each job an event line is logged, the job dict is marked
    ``disabled`` in place and sent to ``client.update``, and then
    ``client.delete_tasks`` is called with the job's name.

    :param emergency: when true the log line is tagged "EmergencyStop"
        instead of "Brutal bounce"
    """
    if emergency:
        reason = PaastaColors.red("EmergencyStop")
    else:
        reason = "Brutal bounce"

    for chronos_job in existing_jobs:
        colored_name = PaastaColors.cyan(chronos_job["name"])
        _log(
            service=service,
            line="{}: Killing all tasks for job {}".format(reason, colored_name),
            component="deploy",
            level="event",
            cluster=cluster,
            instance=instance,
        )
        # The caller's dict is updated in place before being pushed.
        chronos_job["disabled"] = True
        client.update(chronos_job)
        client.delete_tasks(chronos_job["name"])
def test_duplicate_frameworks():
    """Three frameworks sharing one name must be reported as critical."""
    framework_names = ['test_framework1'] * 3 + ['test_framework2']
    state = {
        'frameworks': [{'name': framework_name} for framework_name in framework_names],
    }

    output, ok = paasta_metastatus.assert_no_duplicate_frameworks(state)

    expected_line = PaastaColors.red(
        " CRITICAL: Framework test_framework1 has 3 instances running--expected no more than 1."
    )
    assert expected_line in output
    assert not ok
def _cleanup_container(docker_client, container_id):
    """Stop and remove a container, reporting an OOM kill if one is recorded.

    A Docker ``APIError`` during stop/remove is not raised; a warning asking
    the user to clean up manually is printed instead.
    """
    container_state = docker_client.inspect_container(container_id)["State"]
    if container_state.get("OOMKilled", False):
        oom_notice = PaastaColors.red(
            "Your service was killed by the OOM Killer!\n"
            "You've exceeded the memory limit, try increasing the mem parameter in your soa_configs"
        )
        print(oom_notice, file=sys.stderr)

    print("\nStopping and removing the old container %s..." % container_id)
    print("(Please wait or you may leave an orphaned container.)")
    try:
        docker_client.stop(container_id)
        docker_client.remove_container(container_id)
    except errors.APIError:
        manual_cleanup_hint = (
            "Could not clean up container! You should stop and remove container '%s' manually."
            % container_id
        )
        print(PaastaColors.yellow(manual_cleanup_hint))
    else:
        print("...done")
def extract_args(args):
    """Resolve the values remote-run needs from the parsed CLI arguments.

    :param args: parsed arguments; reads ``yelpsoa_config_root``, ``cluster``
        and ``instance``
    :returns: tuple ``(system_paasta_config, service, cluster, soa_dir,
        instance, instance_type)``

    When the system config cannot be loaded a warning is printed and a config
    with no volumes is used. When no cluster is given and none is configured,
    an error is printed and the process exits with status 1.
    """
    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        # Adjacent literals are concatenated with no separator, so each piece
        # carries its own trailing space.
        paasta_print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and remote-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep='\n',
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, '/etc/paasta')

    service = figure_out_service_name(args, soa_dir=args.yelpsoa_config_root)
    cluster = args.cluster or system_paasta_config.get_local_run_config().get('default_cluster', None)

    if not cluster:
        paasta_print(
            PaastaColors.red(
                "PaaSTA on this machine has not been configured with a default cluster. "
                "Please pass one using '-c'."
            ),
            sep='\n',
            file=sys.stderr,
        )
        # os._exit() does not flush stdio buffers, so make sure the message
        # above is actually written before the process goes away.
        sys.stderr.flush()
        os._exit(1)

    soa_dir = args.yelpsoa_config_root
    instance = args.instance
    if instance is None:
        # No instance requested: fall back to an adhoc instance named "remote".
        instance_type = 'adhoc'
        instance = 'remote'
    else:
        instance_type = validate_service_instance(service, instance, cluster, soa_dir)

    return (system_paasta_config, service, cluster, soa_dir, instance, instance_type)
def status_marathon_job_human(service, instance, deploy_status, app_id, running_instances, normal_instance_count):
    """Build the one-line, colorized Marathon status for a service instance.

    When ``deploy_status`` is ``'NotRunning'`` the line says the app is not
    configured in Marathon yet. Otherwise it reports running vs. expected
    instance counts, labelled Healthy, Critical (zero running) or Warning.
    """
    name = PaastaColors.cyan(compose_job_id(service, instance))

    if deploy_status == 'NotRunning':
        status = PaastaColors.yellow("Warning")
        return "Marathon: %s - %s (app %s) is not configured in Marathon yet (waiting for bounce)" % (
            status, name, app_id,
        )

    # Pick the label and the two colors (label color, count color).
    if running_instances >= normal_instance_count:
        label, label_color, count_color = "Healthy", PaastaColors.green, PaastaColors.green
    elif running_instances == 0:
        label, label_color, count_color = "Critical", PaastaColors.yellow, PaastaColors.red
    else:
        label, label_color, count_color = "Warning", PaastaColors.yellow, PaastaColors.yellow

    status = label_color(label)
    instance_count = count_color("(%d/%d)" % (running_instances, normal_instance_count))
    return "Marathon: %s - up with %s instances. Status: %s" % (status, instance_count, deploy_status)
def start_chronos_job(service, instance, job_id, client, cluster, job_config, emergency=False):
    """Send ``job_config`` to Chronos and run the job now unless it is disabled.

    An event line is logged first; the job is always pushed with
    ``client.update`` and ``client.run`` is only called when
    ``job_config["disabled"]`` is falsy.
    """
    name = PaastaColors.cyan(job_id)

    if emergency:
        log_reason = PaastaColors.red("EmergencyStart")
    else:
        log_reason = "Brutal bounce"

    if job_config["disabled"]:
        log_immediate_run = ""
    else:
        log_immediate_run = " and running it immediately"

    _log(
        service=service,
        line="%s: Sending job %s to Chronos%s" % (log_reason, name, log_immediate_run),
        component="deploy",
        level="event",
        cluster=cluster,
        instance=instance,
    )

    client.update(job_config)
    # TODO fail or give some output/feedback to user that the job won't run immediately if disabled (PAASTA-1244)
    if job_config["disabled"]:
        return
    client.run(job_id)
def format_kubernetes_replicaset_table(replicasets):
    """Render replicasets as a table of name, ready/desired count and creation time.

    The ready/desired cell is green when ready replicas meet or exceed the
    desired count and red otherwise.
    """
    def _row(replicaset):
        # NOTE(review): fromtimestamp() already yields local time before the
        # UTC-to-local helper is applied - confirm this is intended.
        created_local = datetime_from_utc_to_local(
            datetime.fromtimestamp(replicaset.create_timestamp)
        )
        ratio = f"{replicaset.ready_replicas}/{replicaset.replicas}"
        if replicaset.ready_replicas >= replicaset.replicas:
            colorize = PaastaColors.green
        else:
            colorize = PaastaColors.red
        created_cell = "{} ({})".format(
            created_local.strftime("%Y-%m-%dT%H:%M"),
            humanize.naturaltime(created_local),
        )
        return (replicaset.name, colorize(ratio), created_cell)

    header = ("ReplicaSet Name", "Ready / Desired", "Created at what localtime")
    table_rows = [header]
    table_rows.extend(_row(replicaset) for replicaset in replicasets)
    return format_table(table_rows)
def status_kubernetes_job_human(
    service: str,
    instance: str,
    deploy_status: str,
    desired_app_id: str,
    app_count: int,
    running_instances: int,
    normal_instance_count: int,
) -> str:
    """Build the one-line, colorized Kubernetes status for a service instance.

    With a negative ``app_count`` the line says the app is not configured in
    Kubernetes yet. Otherwise it reports running vs. expected instance
    counts, labelled Healthy, Critical (zero running) or Warning.
    """
    name = PaastaColors.cyan(compose_job_id(service, instance))

    if app_count < 0:
        status = PaastaColors.yellow("Warning")
        return "Kubernetes: {} - {} (app {}) is not configured in Kubernetes yet (waiting for bounce)".format(
            status, name, desired_app_id,
        )

    # Pick the label and the two colors (label color, count color).
    if running_instances >= normal_instance_count:
        label, label_color, count_color = "Healthy", PaastaColors.green, PaastaColors.green
    elif running_instances == 0:
        label, label_color, count_color = "Critical", PaastaColors.yellow, PaastaColors.red
    else:
        label, label_color, count_color = "Warning", PaastaColors.yellow, PaastaColors.yellow

    status = label_color(label)
    instance_count = count_color("(%d/%d)" % (running_instances, normal_instance_count))
    return "Kubernetes: {} - up with {} instances. Status: {}".format(
        status, instance_count, deploy_status,
    )
def validate_filtering_args(args, log_reader):
    """Check that the requested log filters are supported and compatible.

    :param args: parsed arguments; reads ``line_offset``, ``line_count``,
        ``tail``, ``time_from`` and ``time_to``
    :param log_reader: reader whose ``SUPPORTS_*`` flags are consulted
    :returns: True when the combination is usable; otherwise a red message is
        written to stderr and False is returned
    """
    reader_name = log_reader.__class__.__name__

    def _reject(message):
        sys.stderr.write(PaastaColors.red(message))
        return False

    # Capabilities of the reader, checked in a fixed order.
    if not log_reader.SUPPORTS_LINE_OFFSET and args.line_offset is not None:
        return _reject(reader_name + " does not support line based offsets")

    if not log_reader.SUPPORTS_LINE_COUNT and args.line_count is not None:
        return _reject(reader_name + " does not support line count based log retrieval")

    if not log_reader.SUPPORTS_TAILING and args.tail:
        return _reject(reader_name + " does not support tailing")

    if not log_reader.SUPPORTS_TIME:
        if args.time_from is not None or args.time_to is not None:
            return _reject(reader_name + " does not support time based offsets")

    # Tailing excludes every line/time based filter.
    if args.tail:
        has_range_filter = (
            args.line_count is not None
            or args.time_from is not None
            or args.time_to is not None
            or args.line_offset is not None
        )
        if has_range_filter:
            return _reject("You cannot specify line/time based filtering parameters when tailing")

    # Can't have both
    if args.line_count is not None and args.time_from is not None:
        return _reject("You cannot filter based on both line counts and time")

    return True
def _format_config_hash(job):
    """Return the job's ``name`` entry, or a red "UNKNOWN" when it is absent."""
    fallback = PaastaColors.red("UNKNOWN")
    return job.get("name", fallback)
def _format_command(job):
    """Return the job's ``command`` entry, or a red "UNKNOWN" when it is absent."""
    fallback = PaastaColors.red("UNKNOWN")
    return job.get("command", fallback)
def test_generate_summary_for_results_critical():
    """A failed check is summarized with a red CRITICAL status."""
    actual = metastatus_lib.generate_summary_for_check("Myservice", False)
    expected = "Myservice Status: %s" % PaastaColors.red("CRITICAL")
    assert actual == expected
def configure_and_run_docker_container(
    docker_client,
    docker_url,
    docker_sha,
    service,
    instance,
    cluster,
    system_paasta_config,
    args,
    pull_image=False,
    dry_run=False,
):
    """Resolve config, image, volumes and command, then run the container.

    Returns 1 (after printing the reason to stderr) when the instance cannot
    be determined or configured, or when no docker image can be found;
    otherwise returns whatever ``run_docker_container`` returns.
    """
    if instance is None:
        if args.healthcheck_only:
            paasta_print(
                "With --healthcheck-only, --instance MUST be provided!", file=sys.stderr
            )
            return 1
        if not sys.stdin.isatty():
            paasta_print(
                "--instance and --cluster must be specified when using paasta local-run without a tty!",
                file=sys.stderr,
            )
            return 1

    soa_dir = args.yelpsoa_config_root
    volumes = []
    load_deployments = (docker_url is None or pull_image) and not docker_sha
    interactive = args.interactive

    try:
        if instance is None:
            # No instance given: use the default interactive adhoc config.
            instance_type = "adhoc"
            instance = "interactive"
            instance_config = get_default_interactive_config(
                service=service,
                cluster=cluster,
                soa_dir=soa_dir,
                load_deployments=load_deployments,
            )
            interactive = True
        else:
            instance_type = validate_service_instance(service, instance, cluster, soa_dir)
            instance_config = get_instance_config(
                service=service,
                instance=instance,
                cluster=cluster,
                load_deployments=load_deployments,
                soa_dir=soa_dir,
            )
    except NoConfigurationForServiceError as config_error:
        paasta_print(str(config_error), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        missing_deployments = (
            "Error: No deployments.json found in %(soa_dir)s/%(service)s. "
            "You can generate this by running: "
            "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
            % {"soa_dir": soa_dir, "service": service}
        )
        paasta_print(PaastaColors.red(missing_deployments), sep="\n", file=sys.stderr)
        return 1

    if docker_sha is not None:
        # An explicit sha overrides whatever the deployment data says.
        instance_config.branch_dict = {
            "git_sha": docker_sha,
            "docker_image": build_docker_image_name(service=service, sha=docker_sha),
            "desired_state": "start",
            "force_bounce": None,
        }

    if docker_url is None:
        try:
            docker_url = instance_config.get_docker_url()
        except NoDockerImageError:
            if instance_config.get_deploy_group() is None:
                no_image_message = (
                    "Error: {}.{} has no 'deploy_group' set. Please set one so "
                    "the proper image can be used to run for this service."
                ).format(service, instance)
            else:
                no_image_message = (
                    "Error: No sha has been marked for deployment for the %s deploy group.\n"
                    "Please ensure this service has either run through a jenkins pipeline "
                    "or paasta mark-for-deployment has been run for %s\n"
                    % (instance_config.get_deploy_group(), service)
                )
            paasta_print(PaastaColors.red(no_image_message), sep="", file=sys.stderr)
            return 1

    if pull_image:
        docker_pull_image(docker_url)

    # Only bind host paths that actually exist on this machine.
    for binding in instance_config.get_volumes(system_paasta_config.get_volumes()):
        if not os.path.exists(binding["hostPath"]):
            skipped = (
                "Warning: Path %s does not exist on this host. Skipping this binding."
                % binding["hostPath"]
            )
            paasta_print(PaastaColors.yellow(skipped), file=sys.stderr)
            continue
        volumes.append(
            "{}:{}:{}".format(
                binding["hostPath"], binding["containerPath"], binding["mode"].lower()
            )
        )

    # Command precedence: interactive default, then --cmd, then the config.
    if interactive is True and args.cmd is None:
        command = "bash"
    elif args.cmd:
        command = args.cmd
    else:
        configured_cmd = instance_config.get_cmd()
        if configured_cmd:
            command = format_command_for_type(
                command=configured_cmd, instance_type=instance_type, date=args.date
            )
        else:
            command = instance_config.get_args()

    secret_provider_kwargs = dict(
        vault_cluster_config=system_paasta_config.get_vault_cluster_config(),
        vault_auth_method=args.vault_auth_method,
        vault_token_file=args.vault_token_file,
    )

    return run_docker_container(
        docker_client=docker_client,
        service=service,
        instance=instance,
        docker_url=docker_url,
        volumes=volumes,
        interactive=interactive,
        command=command,
        healthcheck=args.healthcheck,
        healthcheck_only=args.healthcheck_only,
        user_port=args.user_port,
        instance_config=instance_config,
        soa_dir=args.yelpsoa_config_root,
        dry_run=dry_run,
        json_dict=args.dry_run_json_dict,
        framework=instance_type,
        secret_provider_name=system_paasta_config.get_secret_provider_name(),
        secret_provider_kwargs=secret_provider_kwargs,
        skip_secrets=args.skip_secrets,
    )
def run_docker_container(
    docker_client,
    service,
    instance,
    docker_url,
    volumes,
    interactive,
    command,
    healthcheck,
    healthcheck_only,
    user_port,
    instance_config,
    secret_provider_name,
    soa_dir=DEFAULT_SOA_DIR,
    dry_run=False,
    json_dict=False,
    framework=None,
    secret_provider_kwargs=None,
    skip_secrets=False,
):
    """docker-py has issues running a container with a TTY attached, so for
    consistency we execute 'docker run' directly in both interactive and
    non-interactive modes.

    In non-interactive mode when the run is complete, stop the container and
    remove it (with docker-py).

    :param secret_provider_kwargs: extra keyword arguments for the secret
        provider; ``None`` (the default) is treated as an empty dict
    :returns: 0 for dry runs and for the exec path; otherwise the container's
        exit code, or 3 if interrupted before the container started. Several
        failure paths call ``sys.exit`` instead of returning.
    """
    # Avoid a shared mutable default argument.
    if secret_provider_kwargs is None:
        secret_provider_kwargs = {}

    if user_port:
        if check_if_port_free(user_port):
            chosen_port = user_port
        else:
            paasta_print(
                PaastaColors.red(
                    "The chosen port is already in use!\n"
                    "Try specifying another one, or omit (--port|-o) and paasta will find a free one for you"
                ),
                file=sys.stderr,
            )
            sys.exit(1)
    else:
        chosen_port = pick_random_port(service)

    environment = instance_config.get_env_dictionary()
    if not skip_secrets:
        secret_environment = decrypt_secret_environment_variables(
            secret_provider_name=secret_provider_name,
            environment=environment,
            soa_dir=soa_dir,
            service_name=service,
            cluster_name=instance_config.cluster,
            secret_provider_kwargs=secret_provider_kwargs,
        )
        environment.update(secret_environment)
    local_run_environment = get_local_run_environment_vars(
        instance_config=instance_config, port0=chosen_port, framework=framework
    )
    environment.update(local_run_environment)

    net = instance_config.get_net()
    memory = instance_config.get_mem()
    container_name = get_container_name()
    docker_params = instance_config.format_docker_parameters()

    healthcheck_mode, healthcheck_data = get_healthcheck_for_instance(
        service, instance, instance_config, chosen_port, soa_dir=soa_dir
    )
    if healthcheck_mode is None:
        # Nothing to healthcheck: no port is published and we run interactively.
        container_port = None
        interactive = True
    elif not user_port and not healthcheck and not healthcheck_only:
        container_port = None
    else:
        try:
            container_port = instance_config.get_container_port()
        except AttributeError:
            container_port = None

    simulate_healthcheck = (
        healthcheck_only or healthcheck
    ) and healthcheck_mode is not None

    docker_run_args = dict(
        memory=memory,
        chosen_port=chosen_port,
        container_port=container_port,
        container_name=container_name,
        volumes=volumes,
        env=environment,
        interactive=interactive,
        detach=simulate_healthcheck,
        docker_hash=docker_url,
        command=command,
        net=net,
        docker_params=docker_params,
    )
    docker_run_cmd = get_docker_run_cmd(**docker_run_args)
    joined_docker_run_cmd = " ".join(docker_run_cmd)

    if dry_run:
        if json_dict:
            paasta_print(json.dumps(docker_run_args))
        else:
            paasta_print(json.dumps(docker_run_cmd))
        return 0
    else:
        paasta_print(
            "Running docker command:\n%s" % PaastaColors.grey(joined_docker_run_cmd)
        )

    merged_env = {**os.environ, **environment}

    if interactive or not simulate_healthcheck:
        # NOTE: This immediately replaces us with the docker run cmd. Docker
        # run knows how to clean up the running container in this situation.
        wrapper_path = shutil.which("paasta_docker_wrapper")
        # To properly simulate mesos, we pop the PATH, which is not available to
        # The executor. Tolerate an environment that has no PATH at all.
        merged_env.pop("PATH", None)
        execlpe(wrapper_path, *docker_run_cmd, merged_env)
        # For testing, when execlpe is patched out and doesn't replace us, we
        # still want to bail out.
        return 0

    container_started = False
    container_id = None
    try:
        (returncode, output) = _run(docker_run_cmd, env=merged_env)
        if returncode != 0:
            # Pass each line as its own argument so sep="\n" actually puts
            # them on separate lines (adjacent literals would be glued
            # together into one run-on string).
            paasta_print(
                "Failure trying to start your container!",
                "Returncode: %d" % returncode,
                "Output:",
                "%s" % output,
                "",
                "Fix that problem and try again.",
                "http://y/paasta-troubleshooting",
                sep="\n",
            )
            # Container failed to start so no need to cleanup; just bail.
            sys.exit(1)
        container_started = True
        container_id = get_container_id(docker_client, container_name)
        paasta_print("Found our container running with CID %s" % container_id)

        if simulate_healthcheck:
            healthcheck_result = simulate_healthcheck_on_service(
                instance_config=instance_config,
                docker_client=docker_client,
                container_id=container_id,
                healthcheck_mode=healthcheck_mode,
                healthcheck_data=healthcheck_data,
                healthcheck_enabled=healthcheck,
            )

        def _output_exit_code():
            returncode = docker_client.inspect_container(container_id)["State"][
                "ExitCode"
            ]
            paasta_print(f"Container exited: {returncode}")

        if healthcheck_only:
            if container_started:
                _output_exit_code()
                _cleanup_container(docker_client, container_id)
            if healthcheck_mode is None:
                paasta_print(
                    "--healthcheck-only, but no healthcheck is defined for this instance!"
                )
                sys.exit(1)
            elif healthcheck_result is True:
                sys.exit(0)
            else:
                sys.exit(1)

        running = docker_client.inspect_container(container_id)["State"]["Running"]
        if running:
            paasta_print("Your service is now running! Tailing stdout and stderr:")
            for line in docker_client.attach(
                container_id, stderr=True, stream=True, logs=True
            ):
                paasta_print(line)
        else:
            _output_exit_code()
            returncode = 3

    except KeyboardInterrupt:
        returncode = 3

    # Cleanup if the container exits on its own or interrupted.
    if container_started:
        returncode = docker_client.inspect_container(container_id)["State"]["ExitCode"]
        _cleanup_container(docker_client, container_id)
    return returncode
def configure_and_run_docker_container(
        docker_client,
        docker_hash,
        service,
        instance,
        cluster,
        system_paasta_config,
        args,
        pull_image=False,
        dry_run=False
):
    """Run Docker container by image hash with args set in command line.

    Function prints the output of run command in stdout.

    Returns 1 (after printing the reason to stderr) when the instance cannot
    be determined or configured, or when no docker image is available;
    otherwise returns whatever ``run_docker_container`` returns.
    """
    if instance is None and args.healthcheck_only:
        paasta_print(
            "With --healthcheck-only, --instance MUST be provided!",
            file=sys.stderr,
        )
        return 1
    if instance is None and not sys.stdin.isatty():
        paasta_print(
            "--instance and --cluster must be specified when using paasta local-run without a tty!",
            file=sys.stderr,
        )
        return 1

    soa_dir = args.yelpsoa_config_root
    volumes = list()
    load_deployments = docker_hash is None or pull_image
    interactive = args.interactive

    try:
        if instance is None:
            # No instance given: use the default interactive adhoc config.
            instance_type = 'adhoc'
            instance = 'interactive'
            instance_config = get_default_interactive_config(
                service=service,
                cluster=cluster,
                soa_dir=soa_dir,
                load_deployments=load_deployments,
            )
            interactive = True
        else:
            instance_type = validate_service_instance(service, instance, cluster, soa_dir)
            instance_config = get_instance_config(
                service=service,
                instance=instance,
                cluster=cluster,
                load_deployments=load_deployments,
                soa_dir=soa_dir,
            )
    except NoConfigurationForServiceError as e:
        paasta_print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        # Adjacent literals are concatenated with no separator, so each piece
        # carries its own trailing space.
        paasta_print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s. "
                "You can generate this by running: "
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s" % {
                    'soa_dir': soa_dir,
                    'service': service,
                }
            ),
            sep='\n',
            file=sys.stderr,
        )
        return 1

    if docker_hash is None:
        try:
            docker_url = instance_config.get_docker_url()
        except NoDockerImageError:
            paasta_print(
                PaastaColors.red(
                    "Error: No sha has been marked for deployment for the %s deploy group.\n"
                    "Please ensure this service has either run through a jenkins pipeline "
                    "or paasta mark-for-deployment has been run for %s\n" % (instance_config.get_deploy_group(), service)
                ),
                sep='',
                file=sys.stderr,
            )
            return 1
        docker_hash = docker_url

    if pull_image:
        # docker_hash equals the looked-up URL whenever one was fetched, and
        # is also defined when the caller supplied the image themselves.
        docker_pull_image(docker_hash)

    # if only one volume specified, extra_volumes should be converted to a list
    extra_volumes = instance_config.get_extra_volumes()
    if isinstance(extra_volumes, dict):
        extra_volumes = [extra_volumes]

    for volume in system_paasta_config.get_volumes() + extra_volumes:
        volumes.append('%s:%s:%s' % (volume['hostPath'], volume['containerPath'], volume['mode'].lower()))

    # Command precedence: interactive default, then --cmd, then the config.
    if interactive is True and args.cmd is None:
        command = 'bash'
    elif args.cmd:
        command = args.cmd
    else:
        command_from_config = instance_config.get_cmd()
        if command_from_config:
            command_modifier = command_function_for_framework(instance_type)
            command = command_modifier(command_from_config)
        else:
            command = instance_config.get_args()

    return run_docker_container(
        docker_client=docker_client,
        service=service,
        instance=instance,
        docker_hash=docker_hash,
        volumes=volumes,
        interactive=interactive,
        command=command,
        healthcheck=args.healthcheck,
        healthcheck_only=args.healthcheck_only,
        user_port=args.user_port,
        instance_config=instance_config,
        soa_dir=args.yelpsoa_config_root,
        dry_run=dry_run,
        json_dict=args.dry_run_json_dict,
        framework=instance_type,
    )
def simulate_healthcheck_on_service(
    instance_config, docker_client, container_id, healthcheck_mode, healthcheck_data, healthcheck_enabled
):
    """Simulates Marathon-style healthcheck on given service if healthcheck is enabled

    :param instance_config: service manifest
    :param docker_client: Docker client object
    :param container_id: Docker container id
    :param healthcheck_mode: passed through to ``run_healthcheck_on_container``
    :param healthcheck_data: tuple url to healthcheck
    :param healthcheck_enabled: boolean
    :returns: healthcheck_passed: boolean
    """
    healthcheck_link = PaastaColors.cyan(healthcheck_data)

    if not healthcheck_enabled:
        paasta_print('\nPaaSTA would have healthchecked your service via\n%s' % healthcheck_link)
        return True

    grace_period = instance_config.get_healthcheck_grace_period_seconds()
    timeout = instance_config.get_healthcheck_timeout_seconds()
    interval = instance_config.get_healthcheck_interval_seconds()
    max_failures = instance_config.get_healthcheck_max_consecutive_failures()

    paasta_print(
        '\nStarting health check via %s (waiting %s seconds before '
        'considering failures due to grace period):' % (healthcheck_link, grace_period)
    )

    # Failures only count towards max_failures once this deadline has passed.
    grace_deadline = time.time() + grace_period
    counted_failures = 0

    while True:
        # A container that has already exited can never pass.
        state = docker_client.inspect_container(container_id)['State']
        if not state['Running']:
            paasta_print(
                PaastaColors.red('Container exited with code {}'.format(state['ExitCode']))
            )
            return False

        passed, check_output = run_healthcheck_on_container(
            docker_client, container_id, healthcheck_mode, healthcheck_data, timeout,
        )
        if passed:
            paasta_print("{}'{}' (via {})".format(
                PaastaColors.green("Healthcheck succeeded!: "),
                check_output,
                healthcheck_link,
            ))
            return passed

        # Report the failure; it is greyed out while still in the grace period.
        if time.time() < grace_deadline:
            colorize = PaastaColors.grey
            note = '(disregarded due to grace period)'
        else:
            counted_failures += 1
            colorize = PaastaColors.red
            note = '(Attempt {} of {})'.format(counted_failures, max_failures)
        details = ' (via: {}. Output: {})'.format(healthcheck_link, check_output)
        paasta_print('{}{}'.format(
            colorize('Healthcheck failed! {}'.format(note)),
            details,
        ))

        if counted_failures == max_failures:
            return passed

        time.sleep(interval)
def run_docker_container(
    docker_client,
    service,
    instance,
    docker_hash,
    volumes,
    interactive,
    command,
    healthcheck,
    healthcheck_only,
    user_port,
    instance_config,
    soa_dir=DEFAULT_SOA_DIR,
    dry_run=False,
    json_dict=False,
    framework=None,
):
    """docker-py has issues running a container with a TTY attached, so for
    consistency we execute 'docker run' directly in both interactive and
    non-interactive modes.

    In non-interactive mode when the run is complete, stop the container and
    remove it (with docker-py).

    :returns: 0 for dry runs and for the interactive exec path; otherwise the
        container's exit code, or 3 if interrupted before the container
        started. Several failure paths call ``sys.exit`` instead of returning.
    """
    if user_port:
        if check_if_port_free(user_port):
            chosen_port = user_port
        else:
            paasta_print(
                PaastaColors.red(
                    "The chosen port is already in use!\n"
                    "Try specifying another one, or omit (--port|-o) and paasta will find a free one for you"
                ),
                file=sys.stderr,
            )
            sys.exit(1)
    else:
        chosen_port = pick_random_port()

    environment = instance_config.get_env_dictionary()
    local_run_environment = get_local_run_environment_vars(
        instance_config=instance_config,
        port0=chosen_port,
        framework=framework,
    )
    environment.update(local_run_environment)

    net = instance_config.get_net()
    memory = instance_config.get_mem()
    container_name = get_container_name()
    docker_params = instance_config.format_docker_parameters()

    try:
        container_port = instance_config.get_container_port()
    except AttributeError:
        container_port = None

    docker_run_args = dict(
        memory=memory,
        chosen_port=chosen_port,
        container_port=container_port,
        container_name=container_name,
        volumes=volumes,
        env=environment,
        interactive=interactive,
        docker_hash=docker_hash,
        command=command,
        net=net,
        docker_params=docker_params,
    )
    docker_run_cmd = get_docker_run_cmd(**docker_run_args)
    joined_docker_run_cmd = ' '.join(docker_run_cmd)
    healthcheck_mode, healthcheck_data = get_healthcheck_for_instance(
        service, instance, instance_config, chosen_port, soa_dir=soa_dir)

    if dry_run:
        if json_dict:
            paasta_print(json.dumps(docker_run_args))
        else:
            paasta_print(json.dumps(docker_run_cmd))
        return 0
    else:
        paasta_print('Running docker command:\n%s' % PaastaColors.grey(joined_docker_run_cmd))

    if interactive:
        # NOTE: This immediately replaces us with the docker run cmd. Docker
        # run knows how to clean up the running container in this situation.
        execlp('paasta_docker_wrapper', *docker_run_cmd)
        # For testing, when execlp is patched out and doesn't replace us, we
        # still want to bail out.
        return 0

    container_started = False
    container_id = None
    try:
        (returncode, output) = _run(docker_run_cmd)
        if returncode != 0:
            # Pass each line as its own argument so sep='\n' actually puts
            # them on separate lines (adjacent literals would be glued
            # together into one run-on string).
            paasta_print(
                'Failure trying to start your container!',
                'Returncode: %d' % returncode,
                'Output:',
                '%s' % output,
                '',
                'Fix that problem and try again.',
                'http://y/paasta-troubleshooting',
                sep='\n',
            )
            # Container failed to start so no need to cleanup; just bail.
            sys.exit(1)
        container_started = True
        container_id = get_container_id(docker_client, container_name)
        paasta_print('Found our container running with CID %s' % container_id)

        # If the service has a healthcheck, simulate it
        if healthcheck_mode is not None:
            healthcheck_result = simulate_healthcheck_on_service(
                instance_config=instance_config,
                docker_client=docker_client,
                container_id=container_id,
                healthcheck_mode=healthcheck_mode,
                healthcheck_data=healthcheck_data,
                healthcheck_enabled=healthcheck,
            )

        def _output_stdout_and_exit_code():
            returncode = docker_client.inspect_container(container_id)['State']['ExitCode']
            paasta_print('Container exited: %d' % returncode)
            paasta_print('Here is the stdout and stderr:\n\n')
            paasta_print(
                docker_client.attach(container_id, stderr=True, stream=False, logs=True)
            )

        if healthcheck_only:
            if container_started:
                _output_stdout_and_exit_code()
                _cleanup_container(docker_client, container_id)
            # healthcheck_result is only bound when a healthcheck mode exists,
            # so the missing-healthcheck case must be tested first.
            if healthcheck_mode is None:
                paasta_print('--healthcheck-only, but no healthcheck is defined for this instance!')
                sys.exit(1)
            elif healthcheck_result is True:
                sys.exit(0)
            else:
                sys.exit(1)

        running = docker_client.inspect_container(container_id)['State']['Running']
        if running:
            paasta_print('Your service is now running! Tailing stdout and stderr:')
            for line in docker_client.attach(container_id, stderr=True, stream=True, logs=True):
                paasta_print(line)
        else:
            _output_stdout_and_exit_code()
            returncode = 3

    except KeyboardInterrupt:
        returncode = 3

    # Cleanup if the container exits on its own or interrupted.
    if container_started:
        returncode = docker_client.inspect_container(container_id)['State']['ExitCode']
        _cleanup_container(docker_client, container_id)
    return returncode
def paasta_local_run(args):
    """Entry point for ``paasta local-run``: build or pull an image and run it.

    :param args: parsed arguments; reads ``action``, ``cluster``, ``instance``,
        ``sha`` and ``yelpsoa_config_root``
    :returns: 1 on a usage, configuration or Docker API error, the
        ``paasta_cook_image`` status when the build fails, otherwise whatever
        ``configure_and_run_docker_container`` returns
    """
    if args.action == "pull" and os.geteuid() != 0 and not docker_config_available():
        # Re-run the same command line under sudo; this replaces the process.
        paasta_print("Re-executing paasta local-run --pull with sudo..")
        os.execvp("sudo", ["sudo", "-H"] + sys.argv)
    if args.action == "build" and not makefile_responds_to("cook-image"):
        paasta_print(
            "A local Makefile with a 'cook-image' target is required for --build",
            file=sys.stderr,
        )
        paasta_print(
            "If you meant to pull the docker image from the registry, explicitly pass --pull",
            file=sys.stderr,
        )
        return 1

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        # Adjacent literals are concatenated with no separator, so each piece
        # carries its own trailing space.
        paasta_print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and local-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    local_run_config = system_paasta_config.get_local_run_config()

    service = figure_out_service_name(args, soa_dir=args.yelpsoa_config_root)
    if args.cluster:
        cluster = args.cluster
    else:
        try:
            cluster = local_run_config["default_cluster"]
        except KeyError:
            paasta_print(
                PaastaColors.red(
                    "PaaSTA on this machine has not been configured with a default cluster. "
                    "Please pass one to local-run using '-c'."
                ),
                sep="\n",
                file=sys.stderr,
            )
            return 1
    instance = args.instance
    docker_client = get_docker_client()

    docker_sha = None
    docker_url = None

    if args.action == "build":
        # Build locally under DOCKER_TAG (defaulting to a per-user tag) and
        # run that image without pulling.
        default_tag = "paasta-local-run-{}-{}".format(service, get_username())
        docker_url = os.environ.get("DOCKER_TAG", default_tag)
        os.environ["DOCKER_TAG"] = docker_url
        pull_image = False
        cook_return = paasta_cook_image(
            args=None, service=service, soa_dir=args.yelpsoa_config_root
        )
        if cook_return != 0:
            return cook_return
    elif args.action == "dry_run":
        pull_image = False
        docker_url = None
        docker_sha = args.sha
    else:
        pull_image = True
        docker_url = None
        docker_sha = args.sha

    try:
        return configure_and_run_docker_container(
            docker_client=docker_client,
            docker_url=docker_url,
            docker_sha=docker_sha,
            service=service,
            instance=instance,
            cluster=cluster,
            args=args,
            pull_image=pull_image,
            system_paasta_config=system_paasta_config,
            dry_run=args.action == "dry_run",
        )
    except errors.APIError as e:
        paasta_print("Can't run Docker container. Error: %s" % str(e), file=sys.stderr)
        return 1
def paasta_local_run(args):
    """Entry point for ``paasta local-run``: build or pull an image and run it.

    :param args: parsed arguments; reads ``action``, ``cluster``, ``instance``
        and ``yelpsoa_config_root``
    :returns: 1 on a usage, configuration or Docker API error, the
        ``paasta_cook_image`` status when the build fails, otherwise whatever
        ``configure_and_run_docker_container`` returns
    """
    if args.action == 'build' and not makefile_responds_to('cook-image'):
        paasta_print("A local Makefile with a 'cook-image' target is required for --build", file=sys.stderr)
        paasta_print("If you meant to pull the docker image from the registry, explicitly pass --pull", file=sys.stderr)
        return 1

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        # Adjacent literals are concatenated with no separator, so each piece
        # carries its own trailing space.
        paasta_print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and local-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep='\n',
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, '/etc/paasta')

    local_run_config = system_paasta_config.get_local_run_config()

    service = figure_out_service_name(args, soa_dir=args.yelpsoa_config_root)
    if args.cluster:
        cluster = args.cluster
    else:
        try:
            cluster = local_run_config['default_cluster']
        except KeyError:
            paasta_print(
                PaastaColors.red(
                    "PaaSTA on this machine has not been configured with a default cluster. "
                    "Please pass one to local-run using '-c'."
                ),
                sep='\n',
                file=sys.stderr,
            )
            return 1
    instance = args.instance
    docker_client = get_docker_client()

    if args.action == 'build':
        # Build locally under DOCKER_TAG (defaulting to a per-user tag) and
        # run that image without pulling.
        default_tag = 'paasta-local-run-%s-%s' % (service, get_username())
        tag = os.environ.get('DOCKER_TAG', default_tag)
        os.environ['DOCKER_TAG'] = tag
        pull_image = False
        cook_return = paasta_cook_image(args=None, service=service, soa_dir=args.yelpsoa_config_root)
        if cook_return != 0:
            return cook_return
    elif args.action == 'dry_run':
        pull_image = False
        tag = None
    else:
        pull_image = True
        tag = None

    try:
        return configure_and_run_docker_container(
            docker_client=docker_client,
            docker_hash=tag,
            service=service,
            instance=instance,
            cluster=cluster,
            args=args,
            pull_image=pull_image,
            system_paasta_config=system_paasta_config,
            dry_run=args.action == 'dry_run',
        )
    except errors.APIError as e:
        paasta_print(
            'Can\'t run Docker container. Error: %s' % str(e),
            file=sys.stderr,
        )
        return 1
def simulate_healthcheck_on_service(
    instance_config,
    docker_client,
    container_id,
    healthcheck_mode,
    healthcheck_data,
    healthcheck_enabled,
):
    """Simulates Marathon-style healthcheck on given service if healthcheck is enabled

    While checks are running, the container's logs are streamed to the
    terminal from a background daemon thread.

    :param instance_config: service manifest; supplies the grace period,
        timeout, interval and max-consecutive-failures settings
    :param docker_client: Docker client object
    :param container_id: Docker container id
    :param healthcheck_mode: healthcheck mode, passed through unchanged to
        ``run_healthcheck_on_container``
    :param healthcheck_data: tuple url to healthcheck
    :param healthcheck_enabled: boolean
    :returns: healthcheck_passed: boolean
    """
    healthcheck_link = PaastaColors.cyan(healthcheck_data)

    if not healthcheck_enabled:
        paasta_print(
            "\nPaaSTA would have healthchecked your service via\n%s" % healthcheck_link
        )
        return True

    grace_period = instance_config.get_healthcheck_grace_period_seconds()
    timeout = instance_config.get_healthcheck_timeout_seconds()
    interval = instance_config.get_healthcheck_interval_seconds()
    max_failures = instance_config.get_healthcheck_max_consecutive_failures()

    paasta_print(
        "\nStarting health check via %s (waiting %s seconds before "
        "considering failures due to grace period):"
        % (healthcheck_link, grace_period)
    )

    # silently start performing health checks until grace period ends or first check succeeds
    graceperiod_end_time = time.time() + grace_period
    after_grace_period_attempts = 0
    # Shared with the log-streaming thread through the closure; flipping it
    # to False asks the thread to stop.
    healthchecking = True

    def _stream_docker_logs(container_id, generator):
        while healthchecking:
            try:
                # the generator will block until another log line is available;
                # undecodable bytes are replaced so arbitrary container output
                # cannot kill the streaming thread
                log_line = (
                    next(generator).decode("utf-8", errors="replace").rstrip("\n")
                )
                if healthchecking:
                    paasta_print(f"container [{container_id[:12]}]: {log_line}")
                else:
                    # stop streaming at first opportunity, since generator.close()
                    # cant be used until the container is dead
                    break
            except StopIteration:  # natural end of logs
                break

    docker_logs_generator = docker_client.logs(
        container_id, stderr=True, stream=True
    )
    threading.Thread(
        target=_stream_docker_logs,
        daemon=True,
        args=(container_id, docker_logs_generator),
    ).start()

    try:
        while True:
            # First inspect the container for early exits
            container_state = docker_client.inspect_container(container_id)
            if not container_state["State"]["Running"]:
                paasta_print(
                    PaastaColors.red(
                        "Container exited with code {}".format(
                            container_state["State"]["ExitCode"]
                        )
                    )
                )
                healthcheck_passed = False
                break

            healthcheck_passed, healthcheck_output = run_healthcheck_on_container(
                docker_client, container_id, healthcheck_mode, healthcheck_data, timeout
            )

            # Yay, we passed the healthcheck
            if healthcheck_passed:
                paasta_print(
                    "{}'{}' (via {})".format(
                        PaastaColors.green("Healthcheck succeeded!: "),
                        healthcheck_output,
                        healthcheck_link,
                    )
                )
                break

            # Otherwise, print why we failed
            if time.time() < graceperiod_end_time:
                color = PaastaColors.grey
                msg = "(disregarded due to grace period)"
            else:
                # If we've exceeded the grace period, we start incrementing attempts
                after_grace_period_attempts += 1
                color = PaastaColors.red
                msg = "(Attempt {} of {})".format(
                    after_grace_period_attempts, max_failures
                )
            extra_msg = f" (via: {healthcheck_link}. Output: {healthcheck_output})"

            paasta_print("{}{}".format(color(f"Healthcheck failed! {msg}"), extra_msg))

            if after_grace_period_attempts == max_failures:
                break

            time.sleep(interval)
    finally:
        # end docker logs stream, even if a Docker call above raised
        healthchecking = False

    return healthcheck_passed