def get_deployments_strings(service: str, soa_dir: str) -> List[str]:
    """Return one human-readable line per PaaSTA cluster running ``service``.

    :param service: name of the service to look up
    :param soa_dir: path to the yelpsoa-configs directory
    :returns: list of formatted strings (a single N/A line if undeployed)
    """
    output = []
    try:
        deployments = get_actual_deployments(service, soa_dir)
    except NoDeploymentsAvailable:
        deployments = {}
    # Idiomatic truthiness test instead of comparing against a literal {}.
    if not deployments:
        output.append(" - N/A: Not deployed to any PaaSTA Clusters")
    else:
        service_config = load_service_namespace_config(
            service=service, namespace="main", soa_dir=soa_dir
        )
        service_mode = service_config.get_mode()
        for cluster in deployments_to_clusters(deployments):
            if service_mode == "tcp":
                # NOTE(review): proxy_port may be absent (None) which would make
                # the %d conversion raise — TODO confirm config always sets it.
                service_port = service_config.get("proxy_port")
                link = PaastaColors.cyan(
                    "%s://paasta-%s.yelp:%d/" % (service_mode, cluster, service_port)
                )
            elif service_mode in ("http", "https"):
                # Membership test replaces the chained == comparisons.
                link = PaastaColors.cyan(
                    f"{service_mode}://{service}.paasta-{cluster}.yelp/"
                )
            else:
                link = "N/A"
            output.append(f" - {cluster} ({link})")
    return output
def get_service_info(service):
    """Assemble a multi-line, colorized information sheet for ``service``."""
    service_configuration = read_service_configuration(service)
    description = service_configuration.get('description', NO_DESCRIPTION_MESSAGE)
    external_link = service_configuration.get('external_link', NO_EXTERNAL_LINK_MESSAGE)
    pipeline_url = get_pipeline_url(service)
    smartstack_endpoints = get_smartstack_endpoints(service)
    git_url = get_git_url(service)
    lines = [
        'Service Name: %s' % service,
        'Description: %s' % description,
        'External Link: %s' % PaastaColors.cyan(external_link),
        'Monitored By: team %s' % get_team(service=service, overrides={}),
        'Runbook: %s' % PaastaColors.cyan(get_runbook(service=service, overrides={})),
        'Git Repo: %s' % git_url,
        'Jenkins Pipeline: %s' % pipeline_url,
        'Deployed to the following clusters:',
    ]
    lines.extend(get_deployments_strings(service))
    if smartstack_endpoints:
        lines.append('Smartstack endpoint(s):')
        lines.extend(' - %s' % endpoint for endpoint in smartstack_endpoints)
    lines.append('Dashboard(s):')
    lines.extend(get_dashboard_urls(service))
    return '\n'.join(lines)
def get_service_info(service, soa_dir):
    """Build a human-readable summary of ``service`` from soa-configs in ``soa_dir``."""
    config = read_service_configuration(service, soa_dir)
    description = config.get('description', NO_DESCRIPTION_MESSAGE)
    external_link = config.get('external_link', NO_EXTERNAL_LINK_MESSAGE)
    smartstack_endpoints = get_smartstack_endpoints(service, soa_dir)
    git_url = get_git_url(service, soa_dir)
    lines = [
        'Service Name: %s' % service,
        'Description: %s' % description,
        'External Link: %s' % PaastaColors.cyan(external_link),
        'Monitored By: team %s' % get_team(service=service, overrides={}),
        'Runbook: %s' % PaastaColors.cyan(get_runbook(service=service, overrides={})),
        'Git Repo: %s' % git_url,
        'Deployed to the following clusters:',
    ]
    lines.extend(get_deployments_strings(service, soa_dir))
    if smartstack_endpoints:
        lines.append('Smartstack endpoint(s):')
        lines.extend(' - %s' % endpoint for endpoint in smartstack_endpoints)
    lines.append('Dashboard(s):')
    lines.extend(get_dashboard_urls(service))
    return '\n'.join(lines)
def get_deployments_strings(service: str, soa_dir: str) -> List[str]:
    """Describe every PaaSTA cluster ``service`` is deployed to, one line each."""
    try:
        deployments = get_actual_deployments(service, soa_dir)
    except NoDeploymentsAvailable:
        deployments = {}
    if deployments == {}:
        # Not deployed anywhere: short-circuit with the N/A line.
        return [' - N/A: Not deployed to any PaaSTA Clusters']
    service_config = load_service_namespace_config(
        service=service, namespace='main', soa_dir=soa_dir,
    )
    mode = service_config.get_mode()
    lines = []
    for cluster in deployments_to_clusters(deployments):
        if mode == "tcp":
            port = service_config.get('proxy_port')
            link = PaastaColors.cyan('%s://paasta-%s.yelp:%d/' % (mode, cluster, port))
        elif mode == "http" or mode == "https":
            link = PaastaColors.cyan('%s://%s.paasta-%s.yelp/' % (mode, service, cluster))
        else:
            link = "N/A"
        lines.append(' - %s (%s)' % (cluster, link))
    return lines
def get_service_info(service, soa_dir):
    """Render a colorized, human-readable information sheet for ``service``."""
    config = read_service_configuration(service, soa_dir)
    description = config.get("description", NO_DESCRIPTION_MESSAGE)
    external_link = config.get("external_link", NO_EXTERNAL_LINK_MESSAGE)
    smartstack_endpoints = get_smartstack_endpoints(service, soa_dir)
    git_url = get_git_url(service, soa_dir)
    lines = [
        "Service Name: %s" % service,
        "Description: %s" % description,
        "External Link: %s" % PaastaColors.cyan(external_link),
        "Monitored By: team %s" % get_team(service=service, overrides={}, soa_dir=soa_dir),
        "Runbook: %s" % PaastaColors.cyan(
            get_runbook(service=service, overrides={}, soa_dir=soa_dir)),
        "Git Repo: %s" % git_url,
        "Deployed to the following clusters:",
    ]
    lines.extend(get_deployments_strings(service, soa_dir))
    if smartstack_endpoints:
        lines.append("Smartstack endpoint(s):")
        lines.extend(" - %s" % endpoint for endpoint in smartstack_endpoints)
    lines.append("Dashboard(s):")
    lines.extend(get_dashboard_urls(service))
    return "\n".join(lines)
class NoSuchService(Exception):
    """Exception to be raised in the event that the service name can not be
    guessed.
    """
    # Message shown when no service name was supplied and one could not be
    # inferred from the current working directory.
    GUESS_ERROR_MSG = ("Could not determine service name.\n"
                       "Please run this from the root of a copy "
                       "(git clone) of your service.\n"
                       "Alternatively, supply the %s name you wish to "
                       "inspect with the %s option." %
                       (PaastaColors.cyan("SERVICE"), PaastaColors.cyan("-s")))
    # Message appended when an explicit service name was supplied but does not
    # exist in soa-configs.
    CHECK_ERROR_MSG = (
        "not found. Please provide a valid service name.\n"
        "Ensure that a directory of the same name exists in %s." %
        PaastaColors.green("/nail/etc/services"))

    def __init__(self, service):
        # ``service`` may be falsy (None/empty) when the name was never
        # supplied; __str__ branches on that.
        self.service = service

    def __str__(self):
        if self.service:
            return "SERVICE: {} {}".format(PaastaColors.cyan(self.service),
                                           self.CHECK_ERROR_MSG)
        else:
            return self.GUESS_ERROR_MSG
def pick_default_log_mode(args, log_reader, service, levels, components,
                          clusters, instances):
    """Pick the richest log mode the reader supports and fetch logs with it."""
    if log_reader.SUPPORTS_LINE_COUNT:
        # Preferred: a fixed number of recent lines.
        paasta_print(
            PaastaColors.cyan(
                "Fetching 100 lines and applying filters. Try -n 1000 for more lines..."
            ),
            file=sys.stderr,
        )
        log_reader.print_last_n_logs(
            service=service, line_count=100, levels=levels,
            components=components, clusters=clusters, instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0
    if log_reader.SUPPORTS_TIME:
        start_time, end_time = generate_start_end_time()
        paasta_print(
            PaastaColors.cyan("Fetching a specific time period and applying filters..."),
            file=sys.stderr,
        )
        log_reader.print_logs_by_time(
            service=service, start_time=start_time, end_time=end_time,
            levels=levels, components=components, clusters=clusters,
            instances=instances, raw_mode=args.raw_mode,
        )
        return 0
    if log_reader.SUPPORTS_TAILING:
        paasta_print(
            PaastaColors.cyan("Tailing logs and applying filters..."),
            file=sys.stderr,
        )
        log_reader.tail_logs(
            service=service, levels=levels, components=components,
            clusters=clusters, instances=instances, raw_mode=args.raw_mode,
        )
        return 0
def status_marathon_job_human(
    service,
    instance,
    deploy_status,
    app_id,
    running_instances,
    normal_instance_count,
):
    """Format a one-line colorized Marathon health summary for a job."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    if deploy_status == 'NotRunning':
        # App not created in Marathon yet.
        status = PaastaColors.yellow("Warning")
        return "Marathon: %s - %s (app %s) is not configured in Marathon yet (waiting for bounce)" % (
            status,
            name,
            app_id,
        )
    counts = "(%d/%d)" % (running_instances, normal_instance_count)
    if running_instances >= normal_instance_count:
        status = PaastaColors.green("Healthy")
        instance_count = PaastaColors.green(counts)
    elif running_instances == 0:
        status = PaastaColors.yellow("Critical")
        instance_count = PaastaColors.red(counts)
    else:
        status = PaastaColors.yellow("Warning")
        instance_count = PaastaColors.yellow(counts)
    return "Marathon: %s - up with %s instances. Status: %s" % (
        status, instance_count, deploy_status)
def status_marathon_job_human(
    service: str,
    instance: str,
    deploy_status: str,
    desired_app_id: str,
    app_count: int,
    running_instances: int,
    normal_instance_count: int,
) -> str:
    """Render a single colorized status line for a Marathon job."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    if app_count < 0:
        # A negative app count signals the app has not been created yet.
        status = PaastaColors.yellow("Warning")
        return "Marathon: {} - {} (app {}) is not configured in Marathon yet (waiting for bounce)".format(
            status, name, desired_app_id
        )
    counts = "(%d/%d)" % (running_instances, normal_instance_count)
    if running_instances >= normal_instance_count:
        status = PaastaColors.green("Healthy")
        instance_count = PaastaColors.green(counts)
    elif running_instances == 0:
        status = PaastaColors.yellow("Critical")
        instance_count = PaastaColors.red(counts)
    else:
        status = PaastaColors.yellow("Warning")
        instance_count = PaastaColors.yellow(counts)
    return "Marathon: {} - up with {} instances. Status: {}".format(
        status, instance_count, deploy_status
    )
def git_repo_missing(git_url):
    """Build a failure message pointing at the Git repo that could not be found."""
    colored_url = PaastaColors.cyan(git_url)
    message = (
        "Could not find Git repo %s. "
        "Your service must be there.\n"
        " More info:" % colored_url
    )
    return failure(message, "http://y/yelpsoa-configs")
def start_chronos_job(service, instance, job_id, client, cluster, job_config,
                      complete_job_config, emergency=False):
    """
    Calls the 'manual start' Chronos endpoint
    (https://mesos.github.io/chronos/docs/api.html#manually-starting-a-job),
    running the job now regardless of its 'schedule'. The job's "schedule" is
    unmodified. If a job is disabled, this function does not do anything.
    """
    name = PaastaColors.cyan(job_id)
    # The job should be run immediately as long as the job is not disabled via
    # the 'disabled' key in soa-configs or has been previously stopped.
    if complete_job_config['disabled']:
        # Fixed: this was a Python 2 `print` statement, a SyntaxError on
        # Python 3; use paasta_print as the sibling implementation does.
        paasta_print(PaastaColors.red(
            "You cannot emergency start a disabled job. Run `paasta start` first."))
    else:
        log_reason = PaastaColors.red("EmergencyStart") if emergency else "Brutal bounce"
        _log(
            service=service,
            line="%s: Starting manual run of %s in Chronos" % (log_reason, name),
            component="deploy",
            level="event",
            cluster=cluster,
            instance=instance,
        )
        client.update(complete_job_config)
        client.run(job_id)
def status_marathon_job(service, instance, app_id, normal_instance_count, client):
    """Report the live Marathon status of one service.instance as a colorized line."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    if not marathon_tools.is_app_id_running(app_id, client):
        red_not = PaastaColors.red("NOT")
        status = PaastaColors.red("Critical")
        return "Marathon: %s - %s (app %s) is %s running in Marathon." % (
            status, name, app_id, red_not)
    app = client.get_app(app_id)
    running_instances = app.tasks_running
    # No in-flight deployments means the app is simply running.
    deploy_status = (PaastaColors.bold("Running") if len(app.deployments) == 0
                     else PaastaColors.yellow("Deploying"))
    counts = "(%d/%d)" % (running_instances, normal_instance_count)
    if running_instances >= normal_instance_count:
        status = PaastaColors.green("Healthy")
        instance_count = PaastaColors.green(counts)
    elif running_instances == 0:
        status = PaastaColors.yellow("Critical")
        instance_count = PaastaColors.red(counts)
    else:
        status = PaastaColors.yellow("Warning")
        instance_count = PaastaColors.yellow(counts)
    return "Marathon: %s - up with %s instances. Status: %s." % (
        status, instance_count, deploy_status)
def missing_deployments_message(service):
    """Explain that deployments.json has no entries yet for ``service``."""
    jenkins_url = PaastaColors.cyan(
        'https://jenkins.yelpcorp.com/view/services-%s' % service)
    return (
        "%s No deployments in deployments.json yet.\n "
        "Has Jenkins run?\n "
        "Check: %s" % (x_mark(), jenkins_url)
    )
def __str__(self):
    """Specific message when a service name was given; generic guess-failure otherwise."""
    if not self.service:
        return self.GUESS_ERROR_MSG
    return "SERVICE: {} {}".format(
        PaastaColors.cyan(self.service), self.CHECK_ERROR_MSG
    )
def start_chronos_job(service, instance, job_id, client, cluster, job_config,
                      complete_job_config, emergency=False):
    """
    Calls the 'manual start' Chronos endpoint
    (https://mesos.github.io/chronos/docs/api.html#manually-starting-a-job),
    running the job now regardless of its 'schedule'. The job's "schedule" is
    unmodified. If a job is disabled, this function does not do anything.
    """
    name = PaastaColors.cyan(job_id)
    # A disabled (or previously stopped) job must not be started manually.
    if complete_job_config['disabled']:
        paasta_print(PaastaColors.red(
            "You cannot emergency start a disabled job. Run `paasta start` first."
        ))
        return
    log_reason = PaastaColors.red("EmergencyStart") if emergency else "Brutal bounce"
    _log(
        service=service,
        line="%s: Starting manual run of %s in Chronos" % (log_reason, name),
        component="deploy",
        level="event",
        cluster=cluster,
        instance=instance,
    )
    client.update(complete_job_config)
    client.run(job_id)
def pick_default_log_mode(args, log_reader, service, levels, components,
                          clusters, instances):
    """With no explicit mode requested, use the best one the reader supports."""
    if log_reader.SUPPORTS_LINE_COUNT:
        # NOTE(review): this message goes to stdout while the other branches
        # use stderr — confirm that is intentional.
        paasta_print(
            PaastaColors.cyan("No filtering specified, grabbing last 100 lines"),
            file=sys.stdout,
        )
        log_reader.print_last_n_logs(
            service=service, line_count=100, levels=levels,
            components=components, clusters=clusters, instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0
    if log_reader.SUPPORTS_TIME:
        start_time, end_time = generate_start_end_time()
        paasta_print(
            PaastaColors.cyan("No filtering specified, grabbing last 30 minutes of logs"),
            file=sys.stderr,
        )
        log_reader.print_logs_by_time(
            service=service, start_time=start_time, end_time=end_time,
            levels=levels, components=components, clusters=clusters,
            instances=instances, raw_mode=args.raw_mode,
        )
        return 0
    if log_reader.SUPPORTS_TAILING:
        paasta_print(
            PaastaColors.cyan("No filtering specified, tailing logs"),
            file=sys.stderr,
        )
        log_reader.tail_logs(
            service=service, levels=levels, components=components,
            clusters=clusters, instances=instances, raw_mode=args.raw_mode,
        )
        return 0
def get_cluster_dashboards(cluster):
    """Returns the direct dashboards for humans to use for a given cluster"""
    cyan = PaastaColors.cyan
    lines = [
        "Warning: Dashboards in prod are not directly reachable. "
        "See http://y/paasta-troubleshooting for instructions. (search for 'prod dashboards')",
        "User Dashboards (Read Only):",
        " Mesos: %s" % cyan("http://mesos.paasta-%s.yelp/" % cluster),
        " Marathon: %s" % cyan("http://marathon.paasta-%s.yelp/" % cluster),
        " Chronos: %s" % cyan("http://chronos.paasta-%s.yelp/" % cluster),
        " Synapse: %s" % cyan("http://paasta-%s.yelp:%s/" % (cluster, DEFAULT_SYNAPSE_PORT)),
        "Admin Dashboards (Read/write, requires secrets):",
        " Mesos: %s" % cyan("http://paasta-%s.yelp:5050/" % cluster),
        " Marathon: %s" % cyan("http://paasta-%s.yelp:5052/" % cluster),
        " Chronos: %s" % cyan("http://paasta-%s.yelp:5053/" % cluster),
    ]
    return '\n'.join(lines)
def pick_default_log_mode(args, log_reader, service, levels, components, clusters):
    """Choose a default log mode based on what the reader supports."""
    def announce(message):
        # All status chatter goes to stderr so log output stays clean.
        sys.stderr.write(PaastaColors.cyan(message) + "\n")

    if log_reader.SUPPORTS_LINE_COUNT:
        announce("No filtering specified, grabbing last 100 lines")
        log_reader.print_last_n_logs(service, 100, levels, components, clusters,
                                     raw_mode=args.raw_mode)
        return 0
    if log_reader.SUPPORTS_TIME:
        start_time, end_time = generate_start_end_time()
        announce("No filtering specified, grabbing last 30 minutes of logs")
        log_reader.print_logs_by_time(service, start_time, end_time, levels,
                                      components, clusters, raw_mode=args.raw_mode)
        return 0
    if log_reader.SUPPORTS_TAILING:
        announce("No filtering specified, tailing logs")
        log_reader.tail_logs(service, levels, components, clusters,
                             raw_mode=args.raw_mode)
        return 0
def scale_marathon_job(service, instance, app_id, delta, client, cluster):
    """Log an EmergencyScale event and scale ``app_id`` by ``delta`` instances."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    direction = 'down' if delta < 0 else 'up'
    _log(
        service=service,
        line="EmergencyScale: Scaling %s %s by %d instances" % (
            name, direction, abs(int(delta))),
        component='deploy',
        level='event',
        cluster=cluster,
        instance=instance,
    )
    client.scale_app(app_id, delta=int(delta), force=True)
def pick_default_log_mode(args, log_reader, service, levels, components,
                          clusters, instances):
    """Fall back to the most capable log mode the reader offers."""
    def announce(message):
        sys.stderr.write(PaastaColors.cyan(message) + "\n")

    if log_reader.SUPPORTS_LINE_COUNT:
        announce("No filtering specified, grabbing last 100 lines")
        log_reader.print_last_n_logs(service, 100, levels, components,
                                     clusters, instances, raw_mode=args.raw_mode)
        return 0
    if log_reader.SUPPORTS_TIME:
        start_time, end_time = generate_start_end_time()
        announce("No filtering specified, grabbing last 30 minutes of logs")
        log_reader.print_logs_by_time(service, start_time, end_time, levels,
                                      components, clusters, instances,
                                      raw_mode=args.raw_mode)
        return 0
    if log_reader.SUPPORTS_TAILING:
        announce("No filtering specified, tailing logs")
        log_reader.tail_logs(service, levels, components, clusters, instances,
                             raw_mode=args.raw_mode)
        return 0
def start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster):
    """Log and force-scale the Marathon app up to its normal instance count."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    _log(
        service=service,
        line="EmergencyStart: scaling %s up to %d instances" % (
            name, normal_instance_count),
        component='deploy',
        level='event',
        cluster=cluster,
        instance=instance,
    )
    client.scale_app(app_id, instances=normal_instance_count, force=True)
def scale_marathon_job(service, instance, app_id, delta, client, cluster):
    """Emergency-scale a Marathon app by ``delta`` instances (negative scales down)."""
    job_name = PaastaColors.cyan(compose_job_id(service, instance))
    amount = int(delta)
    _log(service=service,
         line="EmergencyScale: Scaling %s %s by %d instances" % (
             job_name, 'down' if delta < 0 else 'up', abs(amount)),
         component='deploy', level='event', cluster=cluster, instance=instance)
    client.scale_app(app_id, delta=amount, force=True)
def start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster):
    """Emergency-start: force the Marathon app back to its normal instance count."""
    job = compose_job_id(service, instance)
    _log(service=service,
         line="EmergencyStart: scaling %s up to %d instances" % (
             PaastaColors.cyan(job), normal_instance_count),
         component='deploy', level='event', cluster=cluster, instance=instance)
    client.scale_app(app_id, instances=normal_instance_count, force=True)
def stop_marathon_job(service, instance, app_id, client, cluster):
    """Emergency-stop: force the Marathon app down to zero instances."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    _log(
        service=service,
        line="EmergencyStop: Scaling %s down to 0 instances" % (name),
        component='deploy',
        level='event',
        cluster=cluster,
        instance=instance,
    )
    # TODO do we want to capture the return val of any client calls?
    client.scale_app(app_id, instances=0, force=True)
def stop_marathon_job(service, instance, app_id, client, cluster):
    """Log an EmergencyStop event and scale the Marathon app to zero."""
    job = compose_job_id(service, instance)
    _log(service=service,
         line="EmergencyStop: Scaling %s down to 0 instances" % (PaastaColors.cyan(job)),
         component='deploy', level='event', cluster=cluster, instance=instance)
    # TODO do we want to capture the return val of any client calls?
    client.scale_app(app_id, instances=0, force=True)
def restart_marathon_job(service, instance, app_id, client, cluster):
    """Emergency-restart: scale to zero and let Marathon bring tasks back up."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    message = ("EmergencyRestart: Scaling %s down to 0 instances, "
               "then letting them scale back up" % (name))
    _log(service=service, line=message, component='deploy', level='event',
         cluster=cluster, instance=instance)
    client.scale_app(app_id, instances=0, force=True)
def pick_default_log_mode(args, log_reader, service, levels, components,
                          clusters, instances):
    """Run the default log action supported by ``log_reader``."""
    if log_reader.SUPPORTS_LINE_COUNT:
        # NOTE(review): this branch writes to stdout, the others to stderr —
        # confirm that is intentional.
        paasta_print(
            PaastaColors.cyan("No filtering specified, grabbing last 100 lines"),
            file=sys.stdout,
        )
        log_reader.print_last_n_logs(
            service=service,
            line_count=100,
            levels=levels,
            components=components,
            clusters=clusters,
            instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0
    elif log_reader.SUPPORTS_TIME:
        start_time, end_time = generate_start_end_time()
        paasta_print(
            PaastaColors.cyan("No filtering specified, grabbing last 30 minutes of logs"),
            file=sys.stderr,
        )
        log_reader.print_logs_by_time(
            service=service,
            start_time=start_time,
            end_time=end_time,
            levels=levels,
            components=components,
            clusters=clusters,
            instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0
    elif log_reader.SUPPORTS_TAILING:
        paasta_print(
            PaastaColors.cyan("No filtering specified, tailing logs"),
            file=sys.stderr,
        )
        log_reader.tail_logs(
            service=service,
            levels=levels,
            components=components,
            clusters=clusters,
            instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0
def paasta_fsm(args):
    """Validate args, generate PaaSTA configs for a new service, and print next steps."""
    validate_args(args)
    (srvname, service_stanza, smartstack_stanza, monitoring_stanza,
     deploy_stanza, marathon_stanza, cluster_stanza, team) = (
        get_paasta_config(
            args.yelpsoa_config_root,
            args.srvname,
            args.auto,
            args.port,
            args.team,
            args.description,
            args.external_link,
        )
    )
    srv = Service(srvname, args.yelpsoa_config_root)
    write_paasta_config(
        srv,
        service_stanza,
        smartstack_stanza,
        monitoring_stanza,
        deploy_stanza,
        marathon_stanza,
        cluster_stanza,
    )
    # Fixed: these were Python 2 `print` statements, which are syntax errors
    # under Python 3; converted to print() function calls.
    print(PaastaColors.yellow(" _ _(o)_(o)_ _"))
    print(PaastaColors.red(" ._\`:_ F S M _:' \_,"))
    print(PaastaColors.green(" / (`---'\ `-."))
    print(PaastaColors.cyan(" ,-` _) (_,"))
    print("With My Noodly Appendage I Have Written Configs For")
    print()
    print(PaastaColors.bold(" %s" % srvname))
    print()
    print("Customize Them If It Makes You Happy -- http://y/paasta For Details")
    print("Remember To Add, Commit, And Push When You're Done:")
    print()
    print("cd %s" % join(args.yelpsoa_config_root, srvname))
    print("# Review And/Or Customize Files")
    print("git add .")
    print("git commit -m'Initial Commit For %s'" % srvname)
    print("git push origin HEAD # Pushmaster Or Ops Deputy Privs Required")
    print()
def get_deployments_strings(service, soa_dir):
    """Return one descriptive line per PaaSTA cluster running ``service``.

    :param service: service name
    :param soa_dir: path to the yelpsoa-configs directory
    :returns: list of strings (a single N/A line if undeployed)
    """
    output = []
    try:
        deployments = get_actual_deployments(service, soa_dir)
    except NoDeploymentsAvailable:
        deployments = {}
    # Idiomatic truthiness test instead of comparing against a literal {}.
    if not deployments:
        output.append(' - N/A: Not deployed to any PaaSTA Clusters')
    else:
        service_config = load_service_namespace_config(service, 'main', soa_dir)
        service_mode = service_config.get_mode()
        for cluster in deployments_to_clusters(deployments):
            if service_mode == "tcp":
                service_port = service_config.get('proxy_port')
                link = PaastaColors.cyan('%s://paasta-%s.yelp:%d/' % (
                    service_mode, cluster, service_port))
            elif service_mode in ("http", "https"):
                # Fixed: also handle "https" mode, matching the sibling
                # implementations of this function; previously https services
                # fell through to the N/A branch.
                link = PaastaColors.cyan('%s://%s.paasta-%s.yelp/' % (
                    service_mode, service, cluster))
            else:
                link = "N/A"
            output.append(' - %s (%s)' % (cluster, link))
    return output
def get_cluster_dashboards(cluster):
    """Returns the direct dashboards for humans to use for a given cluster"""
    SPACER = ' '
    try:
        dashboards = load_system_paasta_config().get_dashboard_links()[cluster]
    except KeyError:
        # Joining a one-element list yields just the message itself.
        return PaastaColors.red('No dashboards configured for %s!' % cluster)
    lines = ['Dashboards:']
    # Pad each label so the URLs line up in a column.
    spacing = max(len(label) for label in dashboards.keys()) + 1
    for label, url in dashboards.items():
        padding = SPACER * (spacing - len(label))
        lines.append(' %s:%s%s' % (label, padding, PaastaColors.cyan(url)))
    return '\n'.join(lines)
def paasta_fsm(args):
    """Generate PaaSTA config files from the FSM template and print next steps."""
    variables = get_paasta_config(yelpsoa_config_root=args.yelpsoa_config_root)
    destination = args.yelpsoa_config_root
    paasta_config = load_system_paasta_config()
    template = paasta_config.get_fsm_template()
    write_paasta_config(
        variables=variables,
        template=template,
        destination=destination,
    )
    # Fixed: these were Python 2 `print` statements, which fail to parse on
    # Python 3; converted to print() function calls.
    print(PaastaColors.yellow(" _ _(o)_(o)_ _"))
    print(PaastaColors.red(" ._\`:_ F S M _:' \_,"))
    print(PaastaColors.green(" / (`---'\ `-."))
    print(PaastaColors.cyan(" ,-` _) (_,"))
    print("With My Noodly Appendage I Have Written Configs!")
    print()
    print("Customize Them If It Makes You Happy -- http://y/paasta For Details")
    print("Remember To Add, Commit, And Push When You're Done:")
    print()
def stop_chronos_job(service, instance, client, cluster, existing_jobs, emergency=False):
    """Disable every matching Chronos job and kill its running tasks."""
    log_reason = PaastaColors.red("EmergencyStop") if emergency else "Brutal bounce"
    for job in existing_jobs:
        job_name = job["name"]
        _log(
            service=service,
            line="%s: Killing all tasks for job %s" % (
                log_reason, PaastaColors.cyan(job_name)),
            component="deploy",
            level="event",
            cluster=cluster,
            instance=instance,
        )
        job["disabled"] = True
        client.update(job)
        client.delete_tasks(job_name)
def start_chronos_job(service, instance, job_id, client, cluster, job_config, emergency=False):
    """Send the job config to Chronos and run it immediately unless disabled."""
    name = PaastaColors.cyan(job_id)
    log_reason = PaastaColors.red("EmergencyStart") if emergency else "Brutal bounce"
    disabled = job_config["disabled"]
    log_immediate_run = "" if disabled else " and running it immediately"
    _log(
        service=service,
        line="%s: Sending job %s to Chronos%s" % (log_reason, name, log_immediate_run),
        component="deploy",
        level="event",
        cluster=cluster,
        instance=instance,
    )
    client.update(job_config)
    # TODO fail or give some output/feedback to user that the job won't run
    # immediately if disabled (PAASTA-1244)
    if not disabled:
        client.run(job_id)
def get_cluster_dashboards(cluster):
    """Returns the direct dashboards for humans to use for a given cluster"""
    SPACER = ' '
    try:
        dashboards = load_system_paasta_config().get_dashboard_links()[cluster]
    except KeyError as e:
        # Distinguish "this cluster has no dashboards" from "no dashboard
        # config at all" by inspecting which key was missing.
        if e.args[0] == cluster:
            return PaastaColors.red('No dashboards configured for %s!' % cluster)
        return PaastaColors.red('No dashboards configured!')
    lines = ['Dashboards:']
    spacing = max(len(label) for label in dashboards.keys()) + 1
    for label, url in dashboards.items():
        lines.append(' %s:%s%s' % (label, SPACER * (spacing - len(label)),
                                   PaastaColors.cyan(url)))
    return '\n'.join(lines)
def get_cluster_dashboards(cluster):
    """Returns the direct dashboards for humans to use for a given cluster"""
    lines = []
    lines.append("Warning: Dashboards in prod are not directly reachable. "
                 "See http://y/paasta-troubleshooting for instructions. (search for 'prod dashboards')")
    lines.append("User Dashboards (Read Only):")
    for label, url in (
        (" Mesos: %s", "http://mesos.paasta-%s.yelp/" % cluster),
        (" Marathon: %s", "http://marathon.paasta-%s.yelp/" % cluster),
        (" Chronos: %s", "http://chronos.paasta-%s.yelp/" % cluster),
        (" Synapse: %s", "http://paasta-%s.yelp:%s/" % (cluster, DEFAULT_SYNAPSE_PORT)),
    ):
        lines.append(label % PaastaColors.cyan(url))
    lines.append("Admin Dashboards (Read/write, requires secrets):")
    for label, url in (
        (" Mesos: %s", "http://paasta-%s.yelp:5050/" % cluster),
        (" Marathon: %s", "http://paasta-%s.yelp:5052/" % cluster),
        (" Chronos: %s", "http://paasta-%s.yelp:5053/" % cluster),
    ):
        lines.append(label % PaastaColors.cyan(url))
    return '\n'.join(lines)
def status_marathon_job_human(service, instance, deploy_status, app_id,
                              running_instances, normal_instance_count):
    """One-line colorized summary of a Marathon job's health."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    if deploy_status == 'NotRunning':
        red_not = PaastaColors.red("NOT")
        status = PaastaColors.red("Critical")
        return "Marathon: %s - %s (app %s) is %s running in Marathon." % (
            status, name, app_id, red_not)
    counts = "(%d/%d)" % (running_instances, normal_instance_count)
    if running_instances >= normal_instance_count:
        status, instance_count = PaastaColors.green("Healthy"), PaastaColors.green(counts)
    elif running_instances == 0:
        status, instance_count = PaastaColors.yellow("Critical"), PaastaColors.red(counts)
    else:
        status, instance_count = PaastaColors.yellow("Warning"), PaastaColors.yellow(counts)
    return "Marathon: %s - up with %s instances. Status: %s" % (
        status, instance_count, deploy_status)
def status_marathon_job_human(
    service,
    instance,
    deploy_status,
    app_id,
    running_instances,
    normal_instance_count,
    unused_offers_summary=None,
):
    """Return a colorized one-line status summary for a Marathon job.

    :param deploy_status: human-readable deploy state; 'NotRunning' means the
        app has not been created in Marathon yet
    :param unused_offers_summary: optional mapping of rejection reason ->
        count, used to explain why a deploy may be stalled
    """
    name = PaastaColors.cyan(compose_job_id(service, instance))
    # Build an optional "possibly stalled" suffix from the declined-offer
    # summary; empty string when there is nothing to report.
    if unused_offers_summary is not None and len(unused_offers_summary) > 0:
        stalled_str = "\n ".join([
            "%s: %s times" % (k, n) for k, n in unused_offers_summary.items()
        ])
        stall_reason = "\n Possibly stalled for:\n %s" % stalled_str
    else:
        stall_reason = ""
    if deploy_status != 'NotRunning':
        if running_instances >= normal_instance_count:
            # All expected tasks are up.
            status = PaastaColors.green("Healthy")
            instance_count = PaastaColors.green(
                "(%d/%d)" % (running_instances, normal_instance_count))
        elif running_instances == 0:
            # Nothing running at all.
            status = PaastaColors.yellow("Critical")
            instance_count = PaastaColors.red(
                "(%d/%d)" % (running_instances, normal_instance_count))
        else:
            # Partially up.
            status = PaastaColors.yellow("Warning")
            instance_count = PaastaColors.yellow(
                "(%d/%d)" % (running_instances, normal_instance_count))
        return "Marathon: %s - up with %s instances. Status: %s%s" % (
            status,
            instance_count,
            deploy_status,
            stall_reason,
        )
    else:
        # App has not been configured in Marathon yet.
        status = PaastaColors.yellow("Warning")
        return "Marathon: %s - %s (app %s) is not configured in Marathon yet (waiting for bounce)%s" % (
            status,
            name,
            app_id,
            stall_reason,
        )
def status_marathon_job(service, instance, app_id, normal_instance_count, client):
    """Query Marathon and return a colorized one-line status for the app.

    :param client: Marathon API client
    :returns: formatted status string; reports Critical when the app id is
        not running in Marathon at all
    """
    name = PaastaColors.cyan(compose_job_id(service, instance))
    if marathon_tools.is_app_id_running(app_id, client):
        app = client.get_app(app_id)
        running_instances = app.tasks_running
        if len(app.deployments) == 0:
            # No in-flight deployments: steady state.
            deploy_status = PaastaColors.bold("Running")
        elif app.instances == 0 and app.tasks_running == 0:
            # Intentionally scaled to zero.
            deploy_status = PaastaColors.grey("Stopped")
        else:
            # App is currently deploying so we should check the launch queue for more info
            is_overdue, backoff_seconds = marathon_tools.get_app_queue_status(
                client, app_id)
            if is_overdue:
                deploy_status = "%s (new tasks are not launching due to lack of capacity)" % PaastaColors.red(
                    "Waiting")
            elif backoff_seconds:
                deploy_status = "%s (next task won't launch for %s seconds due to previous failures)" % (
                    PaastaColors.red("Delayed"), backoff_seconds)
            else:
                deploy_status = PaastaColors.yellow("Deploying")
        if running_instances >= normal_instance_count:
            status = PaastaColors.green("Healthy")
            instance_count = PaastaColors.green(
                "(%d/%d)" % (running_instances, normal_instance_count))
        elif running_instances == 0:
            status = PaastaColors.yellow("Critical")
            instance_count = PaastaColors.red(
                "(%d/%d)" % (running_instances, normal_instance_count))
        else:
            status = PaastaColors.yellow("Warning")
            instance_count = PaastaColors.yellow(
                "(%d/%d)" % (running_instances, normal_instance_count))
        return "Marathon: %s - up with %s instances. Status: %s" % (
            status, instance_count, deploy_status)
    else:
        red_not = PaastaColors.red("NOT")
        status = PaastaColors.red("Critical")
        return "Marathon: %s - %s (app %s) is %s running in Marathon." % (
            status, name, app_id, red_not)
def status_marathon_job(service, instance, app_id, normal_instance_count, client):
    """Summarize a Marathon app's health in a single colorized line."""
    name = PaastaColors.cyan(compose_job_id(service, instance))
    if not marathon_tools.is_app_id_running(app_id, client):
        return "Marathon: %s - %s (app %s) is %s running in Marathon." % (
            PaastaColors.red("Critical"), name, app_id, PaastaColors.red("NOT"))
    app = client.get_app(app_id)
    running_instances = app.tasks_running
    deploy_status = marathon_tools.get_marathon_app_deploy_status_human(app, app_id, client)
    counts = "(%d/%d)" % (running_instances, normal_instance_count)
    if running_instances >= normal_instance_count:
        status = PaastaColors.green("Healthy")
        instance_count = PaastaColors.green(counts)
    elif running_instances == 0:
        status = PaastaColors.yellow("Critical")
        instance_count = PaastaColors.red(counts)
    else:
        status = PaastaColors.yellow("Warning")
        instance_count = PaastaColors.yellow(counts)
    return "Marathon: %s - up with %s instances. Status: %s" % (
        status, instance_count, deploy_status)
def get_cluster_dashboards(cluster: str, ) -> str:
    """Returns the direct dashboards for humans to use for a given cluster"""
    SPACER = " "
    try:
        dashboards = load_system_paasta_config().get_dashboard_links()[cluster]
    except KeyError as e:
        # A KeyError whose key is the cluster itself means this cluster has no
        # dashboards; any other missing key means there is no dashboard config
        # at all.
        if e.args[0] == cluster:
            output = [
                PaastaColors.red("No dashboards configured for %s!" % cluster)
            ]
        else:
            output = [PaastaColors.red("No dashboards configured!")]
    else:
        output = ["Dashboards:"]
        # Pad labels so all URLs line up in a single column.
        spacing = max((len(label) for label in dashboards.keys())) + 1
        for label, urls in dashboards.items():
            # A label may map to a single URL or to a list of URLs; lists are
            # rendered one per indented line.
            if isinstance(urls, list):
                urls = "\n %s" % "\n ".join(urls)
            output.append(" {}:{}{}".format(label, SPACER * (spacing - len(label)),
                                            PaastaColors.cyan(urls)))
    return "\n".join(output)
def status_marathon_job(service, instance, app_id, normal_instance_count, client):
    """Return a one-line, colorized status summary for a service instance's Marathon app.

    :param service: service name
    :param instance: instance name
    :param app_id: the Marathon app id to inspect
    :param normal_instance_count: expected number of running tasks
    :param client: a Marathon client object
    :returns: a human-readable status string
    """
    name = PaastaColors.cyan(compose_job_id(service, instance))
    if marathon_tools.is_app_id_running(app_id, client):
        app = client.get_app(app_id)
        running_instances = app.tasks_running
        # No active deployments means the app has settled into a steady state.
        if len(app.deployments) == 0:
            deploy_status = PaastaColors.bold("Running")
        elif app.instances == 0 and app.tasks_running == 0:
            # Scaled to zero on purpose (e.g. paused/stopped service).
            deploy_status = PaastaColors.grey("Stopped")
        else:
            # App is currently deploying so we should check the launch queue for more info
            is_overdue, backoff_seconds = marathon_tools.get_app_queue_status(client, app_id)
            if is_overdue:
                deploy_status = "%s (new tasks are not launching due to lack of capacity)" % PaastaColors.red("Waiting")
            elif backoff_seconds:
                deploy_status = "%s (next task won't launch for %s seconds due to previous failures)" % (
                    PaastaColors.red("Delayed"), backoff_seconds)
            else:
                deploy_status = PaastaColors.yellow("Deploying")
        # Color the headline and the (running/expected) count by severity.
        if running_instances >= normal_instance_count:
            status = PaastaColors.green("Healthy")
            instance_count = PaastaColors.green("(%d/%d)" % (running_instances, normal_instance_count))
        elif running_instances == 0:
            status = PaastaColors.yellow("Critical")
            instance_count = PaastaColors.red("(%d/%d)" % (running_instances, normal_instance_count))
        else:
            status = PaastaColors.yellow("Warning")
            instance_count = PaastaColors.yellow("(%d/%d)" % (running_instances, normal_instance_count))
        return "Marathon: %s - up with %s instances. Status: %s" % (status, instance_count, deploy_status)
    else:
        # The app id is not known to Marathon at all.
        red_not = PaastaColors.red("NOT")
        status = PaastaColors.red("Critical")
        return "Marathon: %s - %s (app %s) is %s running in Marathon." % (status, name, app_id, red_not)
def simulate_healthcheck_on_service(
    instance_config, docker_client, container_id, healthcheck_mode, healthcheck_data, healthcheck_enabled
):
    """Simulates Marathon-style healthcheck on given service if healthcheck is enabled

    :param instance_config: service manifest
    :param docker_client: Docker client object
    :param container_id: Docker container id
    :param healthcheck_mode: healthcheck mode (passed through to run_healthcheck_on_container)
    :param healthcheck_data: tuple url to healthcheck
    :param healthcheck_enabled: boolean
    :returns: healthcheck_passed: boolean
    """
    healthcheck_link = PaastaColors.cyan(healthcheck_data)
    if healthcheck_enabled:
        grace_period = instance_config.get_healthcheck_grace_period_seconds()
        timeout = instance_config.get_healthcheck_timeout_seconds()
        interval = instance_config.get_healthcheck_interval_seconds()
        max_failures = instance_config.get_healthcheck_max_consecutive_failures()
        paasta_print('\nStarting health check via %s (waiting %s seconds before '
                     'considering failures due to grace period):' % (healthcheck_link, grace_period))
        # silently start performing health checks until grace period ends or first check succeeds
        graceperiod_end_time = time.time() + grace_period
        after_grace_period_attempts = 0
        while True:
            # First inspect the container for early exits
            container_state = docker_client.inspect_container(container_id)
            if not container_state['State']['Running']:
                paasta_print(
                    PaastaColors.red('Container exited with code {}'.format(
                        container_state['State']['ExitCode'],
                    ))
                )
                # A dead container can never pass a healthcheck.
                healthcheck_passed = False
                break
            healthcheck_passed, healthcheck_output = run_healthcheck_on_container(
                docker_client, container_id, healthcheck_mode, healthcheck_data, timeout,
            )
            # Yay, we passed the healthcheck
            if healthcheck_passed:
                paasta_print("{}'{}' (via {})".format(
                    PaastaColors.green("Healthcheck succeeded!: "),
                    healthcheck_output,
                    healthcheck_link,
                ))
                break
            # Otherwise, print why we failed
            if time.time() < graceperiod_end_time:
                # Still inside the grace period: report in grey and don't count the failure.
                color = PaastaColors.grey
                msg = '(disregarded due to grace period)'
                extra_msg = ' (via: {}. Output: {})'.format(healthcheck_link, healthcheck_output)
            else:
                # If we've exceeded the grace period, we start incrementing attempts
                after_grace_period_attempts += 1
                color = PaastaColors.red
                msg = '(Attempt {} of {})'.format(
                    after_grace_period_attempts, max_failures,
                )
                extra_msg = ' (via: {}. Output: {})'.format(healthcheck_link, healthcheck_output)
            paasta_print('{}{}'.format(
                color('Healthcheck failed! {}'.format(msg)),
                extra_msg,
            ))
            # Give up once we've burned through all post-grace-period attempts.
            if after_grace_period_attempts == max_failures:
                break
            time.sleep(interval)
    else:
        # Healthchecks disabled: just show what would have been checked and report success.
        paasta_print('\nPaaSTA would have healthchecked your service via\n%s' % healthcheck_link)
        healthcheck_passed = True
    return healthcheck_passed
def simulate_healthcheck_on_service(
    instance_config, docker_client, container_id, healthcheck_mode, healthcheck_data, healthcheck_enabled
):
    """Simulates Marathon-style healthcheck on given service if healthcheck is enabled

    :param instance_config: service manifest
    :param docker_client: Docker client object
    :param container_id: Docker container id
    :param healthcheck_mode: healthcheck mode (passed through to run_healthcheck_on_container)
    :param healthcheck_data: tuple url to healthcheck
    :param healthcheck_enabled: boolean
    :returns: a 2-tuple of (healthcheck_passed_bool, healthcheck_output_string)
    """
    healthcheck_link = PaastaColors.cyan(healthcheck_data)
    if healthcheck_enabled:
        grace_period = instance_config.get_healthcheck_grace_period_seconds()
        timeout = instance_config.get_healthcheck_timeout_seconds()
        interval = instance_config.get_healthcheck_interval_seconds()
        max_failures = instance_config.get_healthcheck_max_consecutive_failures()
        sys.stdout.write('\nStarting health check via %s (waiting %s seconds before '
                         'considering failures due to grace period):\n' % (healthcheck_link, grace_period))
        # silently start performing health checks until grace period ends or first check succeeds
        graceperiod_end_time = time.time() + grace_period
        after_grace_period_attempts = 0
        while True:
            # First inspect the container for early exits
            container_state = docker_client.inspect_container(container_id)
            if not container_state['State']['Running']:
                sys.stdout.write(
                    PaastaColors.red('Container exited with code {}'.format(
                        container_state['State']['ExitCode'],
                    )) + '\n'
                )
                # A dead container can never pass; a common cause is the user killing it.
                healthcheck_result = (False, "Aborted by the user")
                break
            healthcheck_result = run_healthcheck_on_container(
                docker_client, container_id, healthcheck_mode, healthcheck_data, timeout,
            )
            # Yay, we passed the healthcheck
            if healthcheck_result[0]:
                sys.stdout.write("{}'{}' (via {})\n".format(
                    PaastaColors.green("Healthcheck succeeded!: "),
                    healthcheck_result[1],
                    healthcheck_link,
                ))
                break
            # Otherwise, print why we failed
            if time.time() < graceperiod_end_time:
                # Still inside the grace period: report in grey and don't count the failure.
                color = PaastaColors.grey
                msg = '(disregarded due to grace period)'
                extra_msg = ''
            else:
                # If we've exceeded the grace period, we start incrementing attempts
                after_grace_period_attempts += 1
                color = PaastaColors.red
                msg = '(Attempt {} of {})'.format(
                    after_grace_period_attempts, max_failures,
                )
                extra_msg = ' (via: {})'.format(healthcheck_link)
            sys.stdout.write('{}{}\n'.format(
                color('Healthcheck failed! {}'.format(msg)),
                extra_msg,
            ))
            # Give up once we've burned through all post-grace-period attempts.
            if after_grace_period_attempts == max_failures:
                break
            time.sleep(interval)
    else:
        # Healthchecks disabled: just show what would have been checked and report success.
        sys.stdout.write('\nMesos would have healthchecked your service via\n%s\n' % healthcheck_link)
        healthcheck_result = (True, "No healthcheck enabled")
    return healthcheck_result
def simulate_healthcheck_on_service(
    instance_config,
    docker_client,
    container_id,
    healthcheck_mode,
    healthcheck_data,
    healthcheck_enabled,
):
    """Simulates Marathon-style healthcheck on given service if healthcheck is enabled

    While checking, the container's stdout/stderr logs are streamed to the console
    from a background daemon thread so the user can see what the service is doing.

    :param instance_config: service manifest
    :param docker_client: Docker client object
    :param container_id: Docker container id
    :param healthcheck_mode: healthcheck mode (passed through to run_healthcheck_on_container)
    :param healthcheck_data: tuple url to healthcheck
    :param healthcheck_enabled: boolean
    :returns: healthcheck_passed: boolean
    """
    healthcheck_link = PaastaColors.cyan(healthcheck_data)
    if healthcheck_enabled:
        grace_period = instance_config.get_healthcheck_grace_period_seconds()
        timeout = instance_config.get_healthcheck_timeout_seconds()
        interval = instance_config.get_healthcheck_interval_seconds()
        max_failures = instance_config.get_healthcheck_max_consecutive_failures()

        paasta_print(
            "\nStarting health check via %s (waiting %s seconds before "
            "considering failures due to grace period):" % (healthcheck_link, grace_period)
        )

        # silently start performing health checks until grace period ends or first check succeeds
        graceperiod_end_time = time.time() + grace_period
        after_grace_period_attempts = 0
        # Flag shared (via closure) with the log-streaming thread; flipped to
        # False below to tell the thread to stop echoing log lines.
        healthchecking = True

        def _stream_docker_logs(container_id, generator):
            # Echo container log lines until healthchecking is turned off.
            while healthchecking:
                try:
                    # the generator will block until another log line is available
                    log_line = next(generator).decode("utf-8").rstrip("\n")
                    if healthchecking:
                        paasta_print(f"container [{container_id[:12]}]: {log_line}")
                    else:
                        # stop streaming at first opportunity, since generator.close()
                        # cant be used until the container is dead
                        break
                except StopIteration:
                    # natural end of logs
                    break

        docker_logs_generator = docker_client.logs(container_id, stderr=True, stream=True)
        # daemon=True so this thread never blocks process exit.
        threading.Thread(
            target=_stream_docker_logs,
            daemon=True,
            args=(container_id, docker_logs_generator),
        ).start()

        while True:
            # First inspect the container for early exits
            container_state = docker_client.inspect_container(container_id)
            if not container_state["State"]["Running"]:
                paasta_print(
                    PaastaColors.red(
                        "Container exited with code {}".format(
                            container_state["State"]["ExitCode"]
                        )
                    )
                )
                # A dead container can never pass a healthcheck.
                healthcheck_passed = False
                break

            healthcheck_passed, healthcheck_output = run_healthcheck_on_container(
                docker_client, container_id, healthcheck_mode, healthcheck_data, timeout
            )

            # Yay, we passed the healthcheck
            if healthcheck_passed:
                paasta_print(
                    "{}'{}' (via {})".format(
                        PaastaColors.green("Healthcheck succeeded!: "),
                        healthcheck_output,
                        healthcheck_link,
                    )
                )
                break

            # Otherwise, print why we failed
            if time.time() < graceperiod_end_time:
                # Still inside the grace period: report in grey, don't count the failure.
                color = PaastaColors.grey
                msg = "(disregarded due to grace period)"
                extra_msg = f" (via: {healthcheck_link}. Output: {healthcheck_output})"
            else:
                # If we've exceeded the grace period, we start incrementing attempts
                after_grace_period_attempts += 1
                color = PaastaColors.red
                msg = "(Attempt {} of {})".format(
                    after_grace_period_attempts, max_failures
                )
                extra_msg = f" (via: {healthcheck_link}. Output: {healthcheck_output})"

            paasta_print("{}{}".format(color(f"Healthcheck failed! {msg}"), extra_msg))

            # Give up once we've burned through all post-grace-period attempts.
            if after_grace_period_attempts == max_failures:
                break

            time.sleep(interval)

        healthchecking = False  # end docker logs stream
    else:
        # Healthchecks disabled: show what would have been checked and report success.
        paasta_print(
            "\nPaaSTA would have healthchecked your service via\n%s" % healthcheck_link
        )
        healthcheck_passed = True
    return healthcheck_passed
def paasta_logs(args):
    """Print the logs for a Paasta service.

    :param args: argparse.Namespace obj created from sys.args by cli
    :returns: 0 on success, 1 on invalid filtering args or an invalid time range
    """
    soa_dir = args.soa_dir
    service = figure_out_service_name(args, soa_dir)

    if args.clusters is None:
        clusters = list_clusters(service, soa_dir=soa_dir)
    else:
        clusters = args.clusters.split(",")

    if args.instances is None:
        instances = None
    else:
        instances = args.instances.split(",")

    if args.components is not None:
        components = args.components.split(",")
    else:
        components = DEFAULT_COMPONENTS
    components = set(components)

    # "app_output" is shorthand for both stdout and stderr.
    if "app_output" in components:
        components.remove("app_output")
        components.add("stdout")
        components.add("stderr")

    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)

    levels = [DEFAULT_LOGLEVEL, "debug"]

    log.info("Going to get logs for %s on clusters %s" % (service, clusters))

    log_reader = get_log_reader()

    if not validate_filtering_args(args, log_reader):
        return 1

    # They haven't specified what kind of filtering they want, decide for them
    if args.line_count is None and args.time_from is None and not args.tail:
        return pick_default_log_mode(args, log_reader, service, levels, components, clusters, instances)

    if args.tail:
        paasta_print(PaastaColors.cyan("Tailing logs"), file=sys.stderr)
        log_reader.tail_logs(
            service=service,
            levels=levels,
            components=components,
            clusters=clusters,
            instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0

    # If the logger doesn't support offsetting the number of lines by a particular line number
    # there is no point in distinguishing between a positive/negative number of lines since it
    # can only get the last N lines
    if not log_reader.SUPPORTS_LINE_OFFSET and args.line_count is not None:
        args.line_count = abs(args.line_count)

    # Handle line based filtering
    if args.line_count is not None and args.line_offset is None:
        log_reader.print_last_n_logs(
            service=service,
            line_count=args.line_count,
            levels=levels,
            components=components,
            clusters=clusters,
            instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0
    elif args.line_count is not None and args.line_offset is not None:
        log_reader.print_logs_by_offset(
            service=service,
            line_count=args.line_count,
            line_offset=args.line_offset,
            levels=levels,
            components=components,
            # Bug fix: this keyword was misspelled "cluters", so the cluster
            # filter was never applied (or the call raised TypeError).
            clusters=clusters,
            instances=instances,
            raw_mode=args.raw_mode,
        )
        return 0

    # Handle time based filtering
    try:
        start_time, end_time = generate_start_end_time(args.time_from, args.time_to)
    except ValueError as e:
        # Bug fix: Python 3 exceptions have no ".message" attribute; use str(e).
        paasta_print(PaastaColors.red(str(e)), file=sys.stderr)
        return 1

    log_reader.print_logs_by_time(
        service=service,
        start_time=start_time,
        end_time=end_time,
        levels=levels,
        components=components,
        clusters=clusters,
        instances=instances,
        raw_mode=args.raw_mode,
    )
    # Explicit success exit code, consistent with the other filtering branches.
    return 0
def service_dir_found(service, soa_dir):
    """Return a success message noting where the service's yelpsoa-configs directory was found."""
    colored_service = PaastaColors.cyan(service)
    message = "yelpsoa-config directory for %s found in %s" % (colored_service, soa_dir)
    return success(message)
def simulate_healthcheck_on_service(
    instance_config, docker_client, container_id, healthcheck_mode, healthcheck_data, healthcheck_enabled
):
    """Simulates Marathon-style healthcheck on given service if healthcheck is enabled

    :param instance_config: service manifest
    :param docker_client: Docker client object
    :param container_id: Docker container id
    :param healthcheck_mode: healthcheck mode (passed through to run_healthcheck_on_container)
    :param healthcheck_data: tuple url to healthcheck
    :param healthcheck_enabled: boolean
    :returns: if healthcheck_enabled is true, then returns output of healthcheck,
              otherwise simply returns true
    """
    healthcheck_link = PaastaColors.cyan(healthcheck_data)

    # Healthchecks disabled: just show what would have been checked and report success.
    if not healthcheck_enabled:
        sys.stdout.write('\nMesos would have healthchecked your service via\n%s\n' % healthcheck_link)
        return True

    grace_period = instance_config.get_healthcheck_grace_period_seconds()
    timeout = instance_config.get_healthcheck_timeout_seconds()
    interval = instance_config.get_healthcheck_interval_seconds()
    max_failures = instance_config.get_healthcheck_max_consecutive_failures()

    sys.stdout.write('\nStarting health check via %s (waiting %s seconds before '
                     'considering failures due to grace period):\n' % (healthcheck_link, grace_period))

    # Phase 1: poll quietly during the grace period; failures here don't count.
    deadline = time.time() + grace_period
    while True:
        passed = run_healthcheck_on_container(
            docker_client, container_id, healthcheck_mode, healthcheck_data, timeout)
        if passed or time.time() > deadline:
            break
        sys.stdout.write("%s\n" % PaastaColors.grey("Healthcheck failed (disregarded due to grace period)"))
        time.sleep(interval)

    # Phase 2: failures now count; give up after max_failures consecutive misses.
    failure = False
    for attempt in range(1, max_failures + 1):
        passed = run_healthcheck_on_container(
            docker_client, container_id, healthcheck_mode, healthcheck_data, timeout)
        failure = not passed
        if passed:
            sys.stdout.write("%s (via: %s)\n" % (PaastaColors.green("Healthcheck succeeded!"), healthcheck_link))
            break
        sys.stdout.write("%s (via: %s)\n" % (
            PaastaColors.red("Healthcheck failed! (Attempt %d of %d)" % (attempt, max_failures)),
            healthcheck_link,
        ))
        time.sleep(interval)

    # Success iff the last counted attempt passed.
    return not failure
def get_dashboard_urls(service):
    """Return the dashboard links for a service (currently just the Sensu alerts page)."""
    sensu_url = PaastaColors.cyan('https://uchiwa.yelpcorp.com/#/events?q=%s' % service)
    return [' - %s (Sensu Alerts)' % sensu_url]
def get_pipeline_url(service):
    """Return the colorized Jenkins pipeline URL for a service."""
    url = 'https://jenkins.yelpcorp.com/view/services-%s' % service
    return PaastaColors.cyan(url)
def __str__(self):
    """Render the error message, naming the service when one is known."""
    if not self.service:
        # No service could be determined; fall back to the generic hint.
        return self.GUESS_ERROR_MSG
    return "SERVICE: %s %s" % (PaastaColors.cyan(self.service), self.CHECK_ERROR_MSG)
def service_dir_found(service):
    """Return a success message confirming the service's yelpsoa-configs directory exists."""
    colored_service = PaastaColors.cyan(service)
    return success("yelpsoa-config directory for %s found in /nail/etc/services" % colored_service)