Exemplo n.º 1
0
def wait_till_specific_host_is_in_stage(
    client,
    cluster_id: str,
    host_name: str,
    stages: List[str],
    nodes_count: int = 1,
    timeout: int = consts.CLUSTER_INSTALLATION_TIMEOUT / 2,
    interval: int = 5,
):
    """Block until the named host's progress is in one of ``stages``.

    The host is re-fetched with ``client.get_host_by_name(cluster_id,
    host_name)`` on every poll and handed, as a one-element list, to
    ``utils.are_host_progress_in_stage`` together with ``stages`` and
    ``nodes_count``.

    Args:
        client: API client exposing ``get_host_by_name``.
        cluster_id: Cluster the host belongs to.
        host_name: Name of the host to watch.
        stages: Stages that end the wait.
        nodes_count: Forwarded to ``utils.are_host_progress_in_stage``.
        timeout: Passed to ``waiting.wait`` as ``timeout_seconds``.
            NOTE(review): the default is computed with ``/`` and is
            therefore not an ``int`` for numeric constants - confirm the
            annotation.
        interval: Passed to ``waiting.wait`` as ``sleep_seconds``.

    Raises:
        Whatever ends the wait (raised by ``waiting.wait`` or by the polled
        calls) is re-raised unchanged after the host's current stage has
        been logged.
    """
    log.info(f"Wait till {host_name} host is in stage {stages}")
    try:
        waiting.wait(
            lambda: utils.are_host_progress_in_stage(
                [client.get_host_by_name(cluster_id, host_name)],
                stages,
                nodes_count,
            ),
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for=f"Node to be in of the stage {stages}",
        )
    except BaseException:
        # The diagnostics below are best-effort: they perform another API call
        # and index into the response, either of which can fail. Such a
        # failure must not replace the exception that actually ended the wait.
        try:
            hosts = [client.get_host_by_name(cluster_id, host_name)]
            log.error(
                f"All nodes stages: "
                f"{[host['progress']['current_stage'] for host in hosts]} "
                f"when waited for {stages}"
            )
        except Exception as diagnostics_error:
            log.error(
                f"Failed to collect host stage for diagnostics: "
                f"{diagnostics_error!r}"
            )
        # Re-raises the original exception caught by the outer handler.
        raise
Exemplo n.º 2
0
def wait_till_at_least_one_host_is_in_stage(
    client,
    cluster_id,
    stages,
    nodes_count=1,
    timeout=consts.CLUSTER_INSTALLATION_TIMEOUT / 2,
    interval=consts.DEFAULT_CHECK_STATUSES_INTERVAL,
):
    """Block until cluster hosts have reached one of ``stages``.

    The cluster's hosts are re-fetched with
    ``client.get_cluster_hosts(cluster_id)`` on every poll and handed to
    ``utils.are_host_progress_in_stage`` together with ``stages`` and
    ``nodes_count``.

    Args:
        client: API client exposing ``get_cluster_hosts``.
        cluster_id: Cluster whose hosts are watched.
        stages: Stages that end the wait.
        nodes_count: Forwarded to ``utils.are_host_progress_in_stage``
            (presumably the number of hosts required to be in one of the
            stages - confirm against that helper).
        timeout: Passed to ``waiting.wait`` as ``timeout_seconds``.
        interval: Passed to ``waiting.wait`` as ``sleep_seconds``.

    Raises:
        Whatever ends the wait (raised by ``waiting.wait`` or by the polled
        calls) is re-raised unchanged after the hosts' current stages have
        been logged.
    """
    log.info(f"Wait till {nodes_count} node is in stage {stages}")
    try:
        waiting.wait(
            lambda: utils.are_host_progress_in_stage(
                client.get_cluster_hosts(cluster_id),
                stages,
                nodes_count,
            ),
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for=f"Node to be in of the stage {stages}",
        )
    except BaseException:
        # The diagnostics below are best-effort: they perform another API call
        # and index into the response, either of which can fail. Such a
        # failure must not replace the exception that actually ended the wait.
        try:
            hosts = client.get_cluster_hosts(cluster_id)
            log.error(
                f"All nodes stages: "
                f"{[host['progress']['current_stage'] for host in hosts]} "
                f"when waited for {stages}"
            )
        except Exception as diagnostics_error:
            log.error(
                f"Failed to collect host stages for diagnostics: "
                f"{diagnostics_error!r}"
            )
        # Re-raises the original exception caught by the outer handler.
        raise
Exemplo n.º 3
0
def download_logs(
    client: InventoryClient,
    cluster: dict,
    dest: str,
    must_gather: bool,
    update_by_events: bool = False,
    retry_interval: int = RETRY_INTERVAL,
):
    """Download the artifacts of one cluster into its logs output folder.

    The output folder comes from ``get_logs_output_folder(dest, cluster)``.
    If ``is_update_needed`` reports that nothing changed, the function returns
    without touching it. Otherwise the folder is recreated and filled with:
    metadata, metrics, cluster files (ignitions and install-config),
    manifests, per-host ignitions, infra-env and cluster events, the cluster
    logs tarball, the no-ingress kubeconfig and, when requested, must-gather
    output.

    Most download steps are wrapped in ``SuppressAndLog`` so that a failure in
    one step does not prevent the following steps from running.

    Args:
        client: Inventory client used for every download.
        cluster: Cluster description; the keys ``id``, ``status``, ``name``
            and ``base_dns_domain`` are read. If ``hosts`` is missing or
            empty it is fetched from the service and stored back into this
            dict (the argument is mutated).
        dest: Base destination passed to ``get_logs_output_folder``.
        must_gather: Whether to run ``download_must_gather`` after the
            kubeconfig was downloaded.
        update_by_events: Forwarded to ``is_update_needed``.
        retry_interval: Seconds to sleep between attempts to download and
            verify the cluster logs tarball.
    """
    if not cluster.get("hosts"):
        cluster["hosts"] = client.get_cluster_hosts(cluster_id=cluster["id"])

    output_folder = get_logs_output_folder(dest, cluster)
    if not is_update_needed(output_folder, update_by_events, client, cluster):
        log.info(f"Skipping, no need to update {output_folder}.")
        return

    cluster_files_folder = os.path.join(output_folder, "cluster_files")
    recreate_folder(output_folder)
    recreate_folder(cluster_files_folder)

    try:
        write_metadata_file(client, cluster,
                            os.path.join(output_folder, "metadata.json"))

        with SuppressAndLog(requests.exceptions.RequestException,
                            ConnectionError, KeyboardInterrupt):
            client.download_metrics(os.path.join(output_folder, "metrics.txt"))

        for cluster_file in (
                "bootstrap.ign",
                "master.ign",
                "worker.ign",
                "install-config.yaml",
        ):
            with SuppressAndLog(assisted_service_client.rest.ApiException,
                                KeyboardInterrupt):
                client.download_and_save_file(
                    cluster["id"], cluster_file,
                    os.path.join(cluster_files_folder, cluster_file))

        with SuppressAndLog(assisted_service_client.rest.ApiException,
                            KeyboardInterrupt):
            download_manifests(client, cluster["id"], output_folder)

        # Several hosts can share an infra-env; download its events only once.
        seen_infra_envs = set()
        for host in cluster["hosts"]:
            host_id, infra_env_id = host["id"], host["infra_env_id"]
            with SuppressAndLog(assisted_service_client.rest.ApiException,
                                KeyboardInterrupt):
                client.download_host_ignition(
                    infra_env_id, host_id, cluster_files_folder)
            if infra_env_id not in seen_infra_envs:
                seen_infra_envs.add(infra_env_id)
                with SuppressAndLog(assisted_service_client.rest.ApiException,
                                    KeyboardInterrupt):
                    client.download_infraenv_events(
                        infra_env_id,
                        get_infraenv_events_path(infra_env_id, output_folder))

        with SuppressAndLog(assisted_service_client.rest.ApiException,
                            KeyboardInterrupt):
            client.download_cluster_events(
                cluster["id"], get_cluster_events_path(cluster, output_folder))
            # Ship the events viewer that lives next to this module.
            shutil.copy2(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "events.html"), output_folder)

        with SuppressAndLog(assisted_service_client.rest.ApiException,
                            KeyboardInterrupt):
            are_masters_in_configuring_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.CONFIGURING], 2)
            are_masters_in_join_or_done_state = are_host_progress_in_stage(
                cluster["hosts"],
                [HostsProgressStages.JOINED, HostsProgressStages.DONE], 2)
            max_retries = (MUST_GATHER_MAX_RETRIES
                           if are_masters_in_join_or_done_state
                           else MAX_RETRIES)
            is_installed = cluster["status"] == ClusterStatus.INSTALLED
            is_controller_expected = (is_installed
                                      or are_masters_in_configuring_state)
            min_number_of_logs = min_number_of_log_files(
                cluster, is_controller_expected)

            cluster_logs_tar = os.path.join(
                output_folder, f"cluster_{cluster['id']}_logs.tar")

            for attempt in range(max_retries):
                # Start every attempt from a clean slate.
                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster["id"], cluster_logs_tar)
                try:
                    verify_logs_uploaded(
                        cluster_logs_tar,
                        min_number_of_logs,
                        installation_success=is_installed,
                        check_oc=are_masters_in_join_or_done_state,
                    )
                    break
                except AssertionError as ex:
                    log.warning("Cluster logs verification failed: %s", ex)

                    # Skip sleeping on last retry. The bound must be the
                    # retry budget actually used by this loop, which differs
                    # from MAX_RETRIES when the must-gather budget is chosen.
                    if attempt < max_retries - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with SuppressAndLog(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster["id"],
                                                  kubeconfig_path)

            # must-gather needs the kubeconfig, so it only runs when the
            # download above succeeded.
            if must_gather:
                config_etc_hosts(
                    cluster["name"],
                    cluster["base_dns_domain"],
                    client.get_api_vip(cluster, cluster["id"]),
                )
                download_must_gather(kubeconfig_path, output_folder)

    finally:
        # Always leave whatever was collected readable/traversable by everyone.
        run_command(f"chmod -R ugo+rx '{output_folder}'")