Example #1
def get_kubernetes_task_allocation_info() -> Iterable[TaskAllocationInfo]:
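    """Collect per-container resource-allocation info for every running pod in the cluster."""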
    client = KubeClient()
    pods = get_all_running_kubernetes_pods(client)
    info_list = []
    for pod in pods:
        service, instance, pool = get_kubernetes_metadata(pod)
        name_to_info: MutableMapping[str, Any] = {}
        for container in pod.spec.containers:
            name_to_info[container.name] = {
                "resources":
                get_kubernetes_resource_request(container.resources),
                "container_type": get_container_type(container.name, instance),
            }
        container_statuses = pod.status.container_statuses or []
        for container in container_statuses:
            if not container.state.running:
                continue
            # container_id looks like "docker://<hash>"; keep only the trailing hash
            docker_id = (container.container_id.split("/")[-1]
                         if container.container_id else None)
            update = {
                "docker_id": docker_id,
                "start_time": container.state.running.started_at.timestamp(),
            }
            name_to_info[container.name].update(update)
        for info in name_to_info.values():
            info_list.append(
                TaskAllocationInfo(
                    paasta_service=service,
                    paasta_instance=instance,
                    container_type=info.get("container_type"),
                    paasta_pool=pool,
                    resources=info.get("resources"),
                    start_time=info.get("start_time"),
                    docker_id=info.get("docker_id"),
                    mesos_container_id=None,
                ))

    return info_list
Example #2
def main() -> None:
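    """Entry point: create or update the Kubernetes deployments for the given service instances."""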
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        # filter out unwanted zookeeper messages in the log
        logging.getLogger("kazoo").setLevel(logging.WARN)
        logging.basicConfig(level=logging.INFO)

    kube_client = KubeClient()

    ensure_namespace(kube_client, namespace="paasta")
    setup_kube_succeeded = setup_kube_deployments(
        kube_client=kube_client,
        service_instances=args.service_instance_list,
        soa_dir=soa_dir,
        cluster=args.cluster or load_system_paasta_config().get_cluster(),
        rate_limit=args.rate_limit,
    )
    sys.exit(0 if setup_kube_succeeded else 1)
Example #3
def main() -> None:
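    """Entry point: create or update the PaaSTA custom resource definitions for the given services."""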
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if args.cluster:
        cluster = args.cluster
    else:
        system_paasta_config = load_system_paasta_config()
        cluster = system_paasta_config.get_cluster()

    kube_client = KubeClient()

    success = setup_kube_crd(
        kube_client=kube_client,
        cluster=cluster,
        services=args.service_list,
        soa_dir=soa_dir,
    )
    sys.exit(0 if success else 1)
Example #4
def main() -> None:
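    """Entry point: sync service secrets from the configured secret provider into Kubernetes."""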
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    system_paasta_config = load_system_paasta_config()
    if args.cluster:
        cluster = args.cluster
    else:
        cluster = system_paasta_config.get_cluster()
    secret_provider_name = system_paasta_config.get_secret_provider_name()
    vault_cluster_config = system_paasta_config.get_vault_cluster_config()
    kube_client = KubeClient()
    success = sync_all_secrets(
        kube_client=kube_client,
        cluster=cluster,
        service_list=args.service_list,
        secret_provider_name=secret_provider_name,
        vault_cluster_config=vault_cluster_config,
        soa_dir=args.soa_dir,
    )
    sys.exit(0 if success else 1)
Example #5
def main() -> None:
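    """Entry point: apply all PaaSTA custom resources to the cluster; with --dry-run, writes are routed through StdoutKubeClient instead."""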
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    kube_client: Any = KubeClient()
    if args.dry_run:
        kube_client = StdoutKubeClient(kube_client)

    system_paasta_config = load_system_paasta_config()
    cluster = args.cluster or system_paasta_config.get_cluster()
    custom_resources = load_custom_resources(system_paasta_config)
    setup_kube_succeeded = setup_all_custom_resources(
        kube_client=kube_client,
        soa_dir=soa_dir,
        cluster=cluster,
        custom_resources=custom_resources,
        service=args.service,
        instance=args.instance,
    )
    sys.exit(0 if setup_kube_succeeded else 1)
Example #6
def check_all_kubernetes_based_services_replication(
    soa_dir: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    namespace: str,
) -> None:
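    """Run check_service_replication for every deployed instance of the given type, optionally filtered by service_instances."""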
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client=kube_client, namespace=namespace)
    all_nodes = get_all_nodes(kube_client)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=all_nodes, system_paasta_config=system_paasta_config)
    service_instances_set = set(service_instances)

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service,
                                                   soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
                cluster=cluster, instance_type_class=instance_type_class):
            if (service_instances_set
                    and f"{service}{SPACER}{instance_config.instance}"
                    not in service_instances_set):
                continue
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=all_pods,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring." %
                    instance_config.job_id)
Example #7
def instance_set_state(request) -> None:
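    """API handler: set the desired state of a service instance whose instance type supports set_state."""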
    service = request.swagger_data.get("service")
    instance = request.swagger_data.get("instance")
    desired_state = request.swagger_data.get("desired_state")

    try:
        instance_type = validate_service_instance(service, instance,
                                                  settings.cluster,
                                                  settings.soa_dir)
    except NoConfigurationForServiceError:
        error_message = "deployment key %s not found" % ".".join(
            [settings.cluster, instance])
        raise ApiFailure(error_message, 404)
    except Exception:
        error_message = traceback.format_exc()
        raise ApiFailure(error_message, 500)

    if instance_type in INSTANCE_TYPES_WITH_SET_STATE:
        try:
            cr_id_fn = cr_id_fn_for_instance_type(instance_type)
            kube_client = KubeClient()
            kubernetes_tools.set_cr_desired_state(
                kube_client=kube_client,
                cr_id=cr_id_fn(service=service, instance=instance),
                desired_state=desired_state,
            )
        except ApiException as e:
            error_message = (f"Error while setting state {desired_state} of "
                             f"{service}.{instance}: {e}")
            raise ApiFailure(error_message, 500)
    else:
        error_message = (
            f"instance_type {instance_type} of {service}.{instance} doesn't "
            f"support set_state, must be in INSTANCE_TYPES_WITH_SET_STATE, "
            f"currently: {INSTANCE_TYPES_WITH_SET_STATE}")
        raise ApiFailure(error_message, 404)
Example #8
def print_output(argv: Optional[Sequence[str]] = None) -> None:
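    """Print health-check results for Mesos, Marathon, and Kubernetes; raise FatalError(2) if anything critical is unhealthy."""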
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # avoid passing False here so we don't override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(
            get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(mesos_master=master,
                                                  mesos_state=mesos_state)
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL:  %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here", healthy=True)
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True)
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here",
                healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check(
        "Kubernetes", kube_ok)

    healthy_exit = all([mesos_ok, marathon_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [
                [str(x) for x in asi]
                for asi in get_autoscaling_info_for_all_resources(mesos_state)
            ]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok,
                                                  kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if not healthy_exit:
        raise FatalError(2)
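Example #9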
def main() -> int:
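    """Entry point: generate the Prometheus adapter config from soaconfigs and sync it to the cluster."""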
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    log.info("Generating adapter config from soaconfigs.")
    config = create_prometheus_adapter_config(paasta_cluster=args.cluster,
                                              soa_dir=args.soa_dir)
    log.info("Generated adapter config from soaconfigs.")
    if args.dry_run:
        log.info(
            "Generated the following config:\n%s",
            yaml.dump(config,
                      default_flow_style=False,
                      explicit_start=True,
                      width=sys.maxsize),
        )
        return 0  # everything after this point requires creds/updates state
    else:
        log.debug(
            "Generated the following config:\n%s",
            yaml.dump(config,
                      default_flow_style=False,
                      explicit_start=True,
                      width=sys.maxsize),
        )

    if not config["rules"]:
        log.error("Got empty rule configuration - refusing to continue.")
        return 0

    kube_client = KubeClient()
    if not args.dry_run:
        ensure_namespace(kube_client, namespace="paasta")
        ensure_namespace(kube_client, namespace="custom-metrics")

    existing_config = get_prometheus_adapter_configmap(kube_client=kube_client)
    if existing_config and existing_config != config:
        log.info("Existing config differs from soaconfigs - updating.")
        log.debug("Existing data: %s", existing_config)
        log.debug("Desired data: %s", config)
        update_prometheus_adapter_configmap(kube_client=kube_client,
                                            config=config)
        log.info("Updated adapter config.")
    elif existing_config:
        log.info("Existing config matches soaconfigs - exiting.")
        return 0
    else:
        log.info("No existing config - creating.")
        create_prometheus_adapter_configmap(kube_client=kube_client,
                                            config=config)
        log.info("Created adapter config.")

    # the prometheus adapter doesn't currently have a good way to reload on config changes
    # so we do the next best thing: restart the pod so that it picks up the new config.
    # see: https://github.com/DirectXMan12/k8s-prometheus-adapter/issues/104
    restart_prometheus_adapter(kube_client=kube_client)

    return 0
Example #10
def paasta_start_or_stop(args, desired_state):
    """Requests a change of state to start or stop given branches of a service."""
    soa_dir = args.soa_dir

    pargs = apply_args_filters(args)
    if len(pargs) == 0:
        return 1

    affected_services = {
        s
        for service_list in pargs.values() for s in service_list.keys()
    }
    if len(affected_services) > 1:
        paasta_print(
            PaastaColors.red(
                "Warning: trying to start/stop/restart multiple services:"))

        for cluster, services_instances in pargs.items():
            paasta_print("Cluster %s:" % cluster)
            for service, instances in services_instances.items():
                paasta_print("    Service %s:" % service)
                paasta_print("        Instances %s" %
                             ",".join(instances.keys()))

        if sys.stdin.isatty():
            confirm = choice.Binary('Are you sure you want to continue?',
                                    False).ask()
        else:
            confirm = False
        if not confirm:
            paasta_print()
            paasta_print("exiting")
            return 1

    invalid_deploy_groups = []
    marathon_message_printed = False
    chronos_message_printed = False
    affected_flinkclusters = []

    if args.clusters is None or args.instances is None:
        if confirm_to_continue(pargs.items(), desired_state) is False:
            paasta_print()
            paasta_print("exiting")
            return 1

    for cluster, services_instances in pargs.items():
        for service, instances in services_instances.items():
            for instance in instances.keys():
                service_config = get_instance_config(
                    service=service,
                    cluster=cluster,
                    instance=instance,
                    soa_dir=soa_dir,
                    load_deployments=False,
                )
                if isinstance(service_config, FlinkClusterConfig):
                    affected_flinkclusters.append(service_config)
                    continue

                try:
                    remote_refs = get_remote_refs(service, soa_dir)
                except remote_git.LSRemoteException as e:
                    msg = (
                        "Error talking to the git server: %s\n"
                        "This PaaSTA command requires access to the git server to operate.\n"
                        "The git server may be down or not reachable from here.\n"
                        "Try again from somewhere where the git server can be reached, "
                        "like your developer environment.") % str(e)
                    paasta_print(msg)
                    return 1

                deploy_group = service_config.get_deploy_group()
                deploy_tag, _ = get_latest_deployment_tag(remote_refs, deploy_group)

                if deploy_tag not in remote_refs:
                    invalid_deploy_groups.append(deploy_group)
                else:
                    force_bounce = utils.format_timestamp(
                        datetime.datetime.utcnow())
                    if (isinstance(service_config, MarathonServiceConfig)
                            and not marathon_message_printed):
                        print_marathon_message(desired_state)
                        marathon_message_printed = True
                    elif (isinstance(service_config, ChronosJobConfig)
                          and not chronos_message_printed):
                        print_chronos_message(desired_state)
                        chronos_message_printed = True

                    issue_state_change_for_service(
                        service_config=service_config,
                        force_bounce=force_bounce,
                        desired_state=desired_state,
                    )

    return_val = 0

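    # FlinkCluster state is stored in a Kubernetes custom resource, which we only
    # touch from the PaaSTA master; off the master, re-issue the command there
    # via run_on_master.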
    if affected_flinkclusters:
        if os.environ.get('ON_PAASTA_MASTER'):
            print_flinkcluster_message(desired_state)
            kube_client = KubeClient()
            for service_config in affected_flinkclusters:
                set_flinkcluster_desired_state(
                    kube_client=kube_client,
                    service=service_config.service,
                    instance=service_config.instance,
                    desired_state={'start': 'running', 'stop': 'stopped'}[desired_state],
                )
        else:
            csi = defaultdict(lambda: defaultdict(list))
            for service_config in affected_flinkclusters:
                csi[service_config.cluster][service_config.service].append(
                    service_config.instance)

            system_paasta_config = load_system_paasta_config()
            for cluster, services_instances in csi.items():
                for service, instances in services_instances.items():
                    cmd_parts = [
                        'ON_PAASTA_MASTER=1',
                        'paasta',
                        desired_state,
                        '-c',
                        cluster,
                        '-s',
                        service,
                        '-i',
                        ','.join(instances),
                    ]
                    return_val, _ = run_on_master(
                        cluster=cluster,
                        system_paasta_config=system_paasta_config,
                        cmd_parts=cmd_parts,
                        graceful_exit=True,
                    )

    if invalid_deploy_groups:
        paasta_print("No branches found for %s in %s." %
                     (", ".join(invalid_deploy_groups), remote_refs))
        paasta_print("Has %s been deployed there yet?" % service)
        return_val = 1

    return return_val
Example #11
def main():
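    """Terminate completed pods (and, optionally, errored pods) that have exceeded their age thresholds."""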
    args = parse_args()
    setup_logging(args.verbose)

    kube_client = KubeClient()
    pods = get_all_pods(kube_client, args.namespace)

    allowed_uptime_minutes = args.minutes
    allowed_error_minutes = args.error_minutes

    completed_pods = []
    errored_pods = []

    for pod in pods:
        if is_pod_completed(pod) and _completed_longer_than_threshold(
                pod, allowed_uptime_minutes):
            completed_pods.append(pod)
        elif (
                # this is currently optional
                allowed_error_minutes is not None
                # there's no direct way to get what type of "bad" state these Pods ended up in
                # (kubectl looks at phase and then container statuses to give something descriptive)
                # but, in the end, we really just care that a Pod is in a Failed phase
                and pod.status.phase == "Failed"):
            try:
                # and that said Pod has been around for a while (generally longer than we'd leave
                # Pods that exited successfully)
                # NOTE: we do this in a try-except since we're intermittently seeing pods in an error
                # state without a PodScheduled condition (even though that should be impossible)
                # this is not ideal, but it's fine to skip these since this isn't a critical process
                if _scheduled_longer_than_threshold(pod,
                                                    allowed_error_minutes):
                    errored_pods.append(pod)
            except AttributeError:
                log.exception(
                    f"Unable to check {pod.metadata.name}'s schedule time. Pod status: {pod.status}.'"
                )

    if not (completed_pods or errored_pods):
        log.debug("No pods to terminate.")
        sys.exit(0)

    if args.dry_run:
        log.debug(
            "Dry run would have terminated the following completed pods:\n " +
            "\n ".join([pod.metadata.name for pod in completed_pods]))
        log.debug(
            "Dry run would have terminated the following errored pods:\n " +
            "\n ".join([pod.metadata.name for pod in errored_pods]))
        sys.exit(0)

    completed_successes, completed_errors = terminate_pods(
        completed_pods, kube_client)
    errored_successes, errored_errors = terminate_pods(errored_pods,
                                                       kube_client)

    successes = {
        "completed": completed_successes,
        "errored": errored_successes,
    }
    errors = {
        "completed": completed_errors,
        "errored": errored_errors,
    }

    for typ, pod_names in successes.items():
        if pod_names:
            log.debug(f"Successfully terminated the following {typ} pods:\n" +
                      "\n ".join(pod_names))

    # we've only really seen this fail recently due to the k8s API being flaky and returning
    # 404s for Pods that it's returning to us when we get all Pods, so we just print the error
    # here for now and don't exit with a non-zero exit code since, again, this isn't a critical
    # process
    for typ, pod_names_and_errors in errors.items():
        if pod_names_and_errors:
            log.error(f"Failed to terminate the following {typ} pods:\n" +
                      "\n  ".join(f"{pod_name}: {error}"
                                  for pod_name, error in pod_names_and_errors))