Пример #1
0
    def approve_csrs(kubeconfig_path: str, done: threading.Event):
        """Approve pending CSRs in a loop until ``done`` is set.

        Meant to run as a background worker: each cycle lists the unapproved
        CSR names with ``get_unapproved_csr_names`` and approves every one of
        them with ``approve_csr``. Failures are logged and retried on the next
        cycle; nothing is raised to the caller.

        Args:
            kubeconfig_path: Passed through to the CSR list/approve helpers.
            done: Event that stops the worker once it is set. Setting it also
                cuts the pause between cycles short.
        """
        log.info(
            "Started background worker to approve CSRs when they appear...")
        while not done.is_set():
            unapproved_csrs = []
            try:
                unapproved_csrs = get_unapproved_csr_names(kubeconfig_path)
            except subprocess.SubprocessError:
                log.debug(
                    "Failed to list csrs. This is usually due to API downtime. Retrying"
                )
            except Exception:
                # We're in a thread so it's a bit awkward to stop everything else...
                # Just continue after logging the unexpected exception
                log.exception("Unknown exception while listing csrs")

            for csr_name in unapproved_csrs:
                log.info("Found unapproved CSR %s, approving...", csr_name)

                try:
                    approve_csr(kubeconfig_path, csr_name)
                except subprocess.SubprocessError:
                    log.warning(
                        "Failed attempt to approve CSR, this may be due to API downtime. Will retry later"
                    )
                except Exception:
                    # We're in a thread so it's a bit awkward to stop everything else...
                    # Just continue after logging the unexpected exception
                    log.exception(
                        "Unknown exception while approving the %s CSR",
                        csr_name)

            # Pause between cycles on the event itself rather than with a
            # plain sleep, so the worker exits as soon as ``done`` is set
            # instead of up to 10 seconds later.
            done.wait(timeout=10)
Пример #2
0
    def waiting_for_installation_completion(
            self,
            controller: NodeController,
            cluster_configuration: ClusterConfig,
            skip_logs=False):
        """Wait until all operators are available, collecting logs afterwards.

        Configures ``/etc/hosts`` with the first master IP as the API VIP and
        then polls ``self.all_operators_available`` every 20 seconds for up to
        one hour.

        Args:
            controller: Node controller; ``master_ips[0][0]`` is used as the
                master IP.
            cluster_configuration: Supplies the cluster name and base DNS
                domain used for the ``/etc/hosts`` entry.
            skip_logs: When true, logs are not collected after a successful
                installation. Logs are always collected on failure.

        Raises:
            Exception: Whatever error interrupted the wait is re-raised after
                logs were collected.
        """
        master_ip = controller.master_ips[0][0]

        try:
            log.info("Configuring /etc/hosts...")
            utils.config_etc_hosts(
                cluster_name=cluster_configuration.cluster_name.get(),
                base_dns_domain=cluster_configuration.base_dns_domain,
                api_vip=master_ip,
            )

            log.info("Waiting for installation to complete...")
            waiting.wait(
                self.all_operators_available,
                sleep_seconds=20,
                timeout_seconds=60 * 60,
                waiting_for="all operators to get up",
            )
            log.info("Installation completed successfully!")
        except Exception:
            log.exception(
                "An unexpected error has occurred while waiting for installation to complete"
            )
            # In case of error, always collect logs. Collection is best effort
            # here: if it fails too, log that failure and still surface the
            # original installation error instead of replacing it.
            try:
                self.log_collection(master_ip)
            except Exception:
                log.exception(
                    "Failed to collect logs after the installation failure")
            raise
        else:
            # If successful, collect logs only if caller asked not to skip
            if not skip_logs:
                self.log_collection(master_ip)
    def _get_domain_ips_and_macs(
            domain: libvirt.virDomain) -> Tuple[List[str], List[str]]:
        """Return the addresses reported for ``domain`` as ``(ips, macs)``.

        Interface data is queried from two libvirt sources (DHCP leases and
        ARP) and merged per interface, a later source replacing an earlier
        entry with the same key. A libvirt error on one source is logged and
        the other source is still used.

        The two lists are index-aligned: ``macs[i]`` is the hardware address
        of the interface that owns ``ips[i]``, so a MAC repeats once per
        address on that interface.
        """
        address_sources = (
            # DHCP leases
            libvirt.VIR_DOMAIN_INTERFACE_ADDRESSES_SRC_LEASE,
            # ARP, which also covers static IPs
            libvirt.VIR_DOMAIN_INTERFACE_ADDRESSES_SRC_ARP,
        )

        interfaces = {}
        for source in address_sources:
            try:
                reported = domain.interfaceAddresses(source)
                interfaces.update(**reported)
            except libvirt.libvirtError:
                log.exception(
                    "Got an error while updating domain's network addresses")

        ips = []
        macs = []
        log.debug(f"Host {domain.name()} interfaces are {interfaces}")
        for iface in interfaces.values():
            # "addrs" may be empty/None for an interface without addresses
            for entry in iface["addrs"] or ():
                ips.append(entry["addr"])
                macs.append(iface["hwaddr"])

        if ips:
            log.info("Host %s ips are %s", domain.name(), ips)
        if macs:
            log.info("Host %s macs are %s", domain.name(), macs)
        return ips, macs
Пример #4
0
 def wrapped(*args, **kwargs):
     """Call ``fn`` and apply the configured handling when it raises ``errors``.

     On a matching exception: log ``message`` (if set) with the traceback,
     pass the exception to ``callback`` (if set), then either return ``None``
     when ``silent`` is truthy or re-raise the exception.
     """
     try:
         result = fn(*args, **kwargs)
     except errors as exc:
         if message:
             log.exception(message)
         if callback:
             callback(exc)
         if not silent:
             raise
         return None
     return result
Пример #5
0
def is_cluster_in_status(client, cluster_id, statuses):
    """Report whether the cluster's current status is one of ``statuses``.

    Args:
        client: Object exposing ``cluster_get(cluster_id)``; the returned
            value must have a ``status`` attribute.
        cluster_id: Identifier passed through to ``client.cluster_get``.
        statuses: Container of acceptable status values.

    Returns:
        ``True`` when the current status is in ``statuses``. ``False`` when it
        is not, or when fetching the status failed (the failure is logged).
    """
    log.info("Is cluster %s in status %s", cluster_id, statuses)
    try:
        cluster_status = client.cluster_get(cluster_id).status
        if cluster_status in statuses:
            return True

        log.info(
            "Cluster not yet in its required status. Current status: %s",
            cluster_status)
        return False
    except Exception:
        # Only ordinary errors are treated as "not in status";
        # KeyboardInterrupt/SystemExit must keep propagating so a polling
        # loop around this check can actually be interrupted.
        log.exception("Failed to get cluster %s info", cluster_id)
        return False
Пример #6
0
def gather_sosreport_from_node(node: Node, destination_dir: str):
    """Run the sosreport script on ``node`` and download the resulting archive.

    The script is uploaded to ``/tmp/man_sosreport.sh``, made executable and
    run with sudo; ``/tmp/sosreport.tar.bz2`` is then downloaded into
    ``destination_dir`` as ``sosreport-<node name>.tar.bz2``.

    Timeout, runtime, SSH and SCP errors are logged and swallowed, so a node
    that cannot be reached does not abort the caller.
    """
    try:
        node.upload_file(SOSREPORT_SCRIPT, "/tmp/man_sosreport.sh")
        for command in ("chmod a+x /tmp/man_sosreport.sh",
                        "sudo /tmp/man_sosreport.sh"):
            node.run_command(command)

        local_archive = os.path.join(
            destination_dir, f"sosreport-{node.name}.tar.bz2")
        node.download_file("/tmp/sosreport.tar.bz2", local_archive)
    except (TimeoutError, RuntimeError, SSHException, SCPException):
        log.exception(
            "Failed accessing node %s for sosreport data gathering", node)
Пример #7
0
def wait_and_verify_oc_logs_uploaded(cluster, cluster_tar_path):
    """Wait for the cluster logs, download them and verify the archive.

    Waits for log collection to complete (cluster and host logs), downloads
    the installation logs to ``cluster_tar_path``, asserts the file exists and
    hands it to ``_verify_oc_logs_uploaded``.

    Any failure, including an assertion failure, is logged and re-raised.
    """
    try:
        cluster.wait_for_logs_complete(
            timeout=OC_DOWNLOAD_LOGS_TIMEOUT,
            interval=OC_DOWNLOAD_LOGS_INTERVAL,
            check_host_logs_only=False,
        )
        cluster.download_installation_logs(cluster_tar_path)

        tar_exists = os.path.exists(cluster_tar_path)
        assert tar_exists, f"{cluster_tar_path} doesn't exist"

        _verify_oc_logs_uploaded(cluster_tar_path)
    except BaseException:
        log.exception("oc logs were not uploaded")
        raise
Пример #8
0
 def get_nodes(self, ready: bool = False) -> V1NodeList:
     """List the cluster's nodes, optionally filtered by ready status.

     The ``CoreV1Api`` client is created from ``self.kubeconfig_path`` on
     first use and cached on ``self.hypershift_cluster_client``.

     Args:
         ready: When true, the listing is passed through
             ``filter_node_by_ready_status`` before being returned.

     Returns:
         The node list, or an empty ``V1NodeList`` if listing failed (the
         failure is logged).
     """
     if self.hypershift_cluster_client is None:
         kube_api_client = create_kube_api_client(self.kubeconfig_path)
         self.hypershift_cluster_client = CoreV1Api(kube_api_client)

     try:
         nodes = self.hypershift_cluster_client.list_node()
     except Exception:
         log.exception("Failed listing nodes")
         return V1NodeList()

     return filter_node_by_ready_status(nodes) if ready else nodes
Пример #9
0
def wait_for_controller_logs(client, cluster_id, timeout, interval=60):
    """Wait until the cluster's ``logs_info`` is populated.

    Args:
        client: Object exposing ``cluster_get(cluster_id)``; the returned
            value must have a ``logs_info`` attribute.
        cluster_id: Identifier passed through to ``client.cluster_get``.
        timeout: Maximum time to wait, in seconds.
        interval: Seconds to sleep between polls.

    Returns:
        ``True`` once ``logs_info`` has content, ``False`` if waiting failed
        or timed out (the failure is logged).
    """
    try:
        # if logs_info has any content, the controller is alive and healthy
        waiting.wait(
            lambda: client.cluster_get(cluster_id).logs_info,
            timeout_seconds=timeout,
            sleep_seconds=interval,
            waiting_for="controller logs_info to be filled",
        )
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately not caught, so a long
        # wait can still be interrupted.
        log.exception(
            "Failed to wait on start of controller logs on cluster %s",
            cluster_id)
        return False

    # Explicit success value: callers can now tell success from failure
    # (an implicit None is falsy, just like the failure result).
    return True
Пример #10
0
def collect_debug_info_from_cluster(cluster_deployment,
                                    agent_cluster_install,
                                    output_folder=None):
    """Save the events and logs referenced by the install's debug info.

    Reads ``status.debugInfo`` from the agent cluster install and fetches its
    ``eventsURL`` into ``events.json`` and its ``logsURL`` into ``logs.tar``.

    Args:
        cluster_deployment: Its ``ref.name`` is used as the cluster name.
        agent_cluster_install: Its ``get()`` result supplies the debug info.
        output_folder: Destination directory. When not given,
            ``build/<cluster name>`` is recreated and used.

    Failures while fetching are logged and swallowed.
    """
    cluster_name = cluster_deployment.ref.name

    if output_folder:
        target_dir = output_folder
    else:
        target_dir = f"build/{cluster_name}"
        recreate_folder(target_dir)

    debug_info = agent_cluster_install.get()["status"]["debugInfo"]

    try:
        log.info(
            "Collecting debugInfo events from cluster to %s, debug info %s",
            target_dir,
            debug_info,
        )
        fetch_url_and_write_to_file(
            "eventsURL", "events.json", debug_info, target_dir)

        log.info("Collecting debugInfo logs from cluster")
        fetch_url_and_write_to_file(
            "logsURL", "logs.tar", debug_info, target_dir)
    except Exception as err:
        log.exception(
            f"Failed to collect debug info for cluster {cluster_name} ({err})")
Пример #11
0
def _are_logs_in_status(client,
                        cluster_id,
                        statuses,
                        check_host_logs_only=False):
    try:
        cluster = client.cluster_get(cluster_id)
        hosts = client.get_cluster_hosts(cluster_id)
        cluster_logs_status = cluster.logs_info
        host_logs_statuses = [host.get("logs_info", "") for host in hosts]
        if all(s in statuses for s in host_logs_statuses) and (
                check_host_logs_only or (cluster_logs_status in statuses)):
            log.info("found expected state. cluster logs: %s, host logs: %s",
                     cluster_logs_status, host_logs_statuses)
            return True

        log.info(
            "Cluster logs not yet in their required state. %s, host logs: %s",
            cluster_logs_status, host_logs_statuses)
        return False
    except BaseException:
        log.exception("Failed to get cluster %s log info", cluster_id)
        return False
Пример #12
0
def kube_api_test(
    kube_api_context,
    nodes: Nodes,
    cluster_config: ClusterConfig,
    proxy_server=None,
    *,
    is_ipv4=True,
    is_disconnected=False,
):
    """Drive a kube-API based cluster installation end to end.

    Creates the Secret, ClusterDeployment, AgentClusterInstall and InfraEnv
    objects in ``global_variables.spoke_namespace``, fetches the ISO from the
    infra-env, starts the nodes, approves the discovered agents and waits for
    the installation to finish.

    Args:
        kube_api_context: Supplies the ``api_client`` used for every resource
            and is passed to ``deploy_image_set``.
        nodes: Nodes to start; ``nodes.controller`` supplies the machine CIDR
            and the master/worker counts.
        cluster_config: Cluster name, pull secret, networks, SSH key,
            hyperthreading setting and ISO download path.
        proxy_server: Forwarded to ``setup_proxy``.
        is_ipv4: Forwarded to ``set_agent_hostname`` and
            ``set_single_node_ip``.
        is_disconnected: When true, a CA bundle is fetched from the hub, used
            to patch the install config and to build the ignition config
            override passed to the infra-env.
    """
    cluster_name = cluster_config.cluster_name.get()

    # TODO resolve it from the service if the node controller doesn't have this information
    #  (please see cluster.get_primary_machine_cidr())
    machine_cidr = nodes.controller.get_primary_machine_cidr()

    # Only the wrapper object is built here; it is created further below,
    # after the ClusterDeployment that references it.
    agent_cluster_install = AgentClusterInstall(
        kube_api_client=kube_api_context.api_client,
        name=f"{cluster_name}-agent-cluster-install",
        namespace=global_variables.spoke_namespace,
    )

    secret = Secret(
        kube_api_client=kube_api_context.api_client,
        name=f"{cluster_name}-secret",
        namespace=global_variables.spoke_namespace,
    )
    secret.create(pull_secret=cluster_config.pull_secret)

    cluster_deployment = ClusterDeployment(
        kube_api_client=kube_api_context.api_client,
        name=cluster_name,
        namespace=global_variables.spoke_namespace,
    )
    cluster_deployment.create(
        agent_cluster_install_ref=agent_cluster_install.ref,
        secret=secret,
    )

    # Only the first cluster network and the first service network from the
    # configuration are used.
    agent_cluster_install.create(
        cluster_deployment_ref=cluster_deployment.ref,
        image_set_ref=deploy_image_set(cluster_name, kube_api_context),
        cluster_cidr=cluster_config.cluster_networks[0].cidr,
        host_prefix=cluster_config.cluster_networks[0].host_prefix,
        service_network=cluster_config.service_networks[0].cidr,
        ssh_pub_key=cluster_config.ssh_public_key,
        hyperthreading=cluster_config.hyperthreading,
        control_plane_agents=nodes.controller.params.master_count,
        worker_agents=nodes.controller.params.worker_count,
        machine_cidr=machine_cidr,
    )
    # NOTE(review): presumably waits for the "ready" condition to be False
    # (no agents registered yet) - confirm against wait_to_be_ready.
    agent_cluster_install.wait_to_be_ready(False)

    if is_disconnected:
        log.info("getting igntion and install config override for disconected install")
        ca_bundle = get_ca_bundle_from_hub()
        patch_install_config_with_ca_bundle(cluster_deployment, ca_bundle)
        ignition_config_override = get_ignition_config_override(ca_bundle)
    else:
        ignition_config_override = None

    proxy = setup_proxy(cluster_config, machine_cidr, cluster_name, proxy_server)

    infra_env = InfraEnv(
        kube_api_client=kube_api_context.api_client,
        name=f"{cluster_name}-infra-env",
        namespace=global_variables.spoke_namespace,
    )
    infra_env.create(
        cluster_deployment=cluster_deployment,
        ignition_config_override=ignition_config_override,
        secret=secret,
        proxy=proxy,
        ssh_pub_key=cluster_config.ssh_public_key,
    )
    # Return value is discarded; NOTE(review): presumably called so the
    # infra-env status is available before the ISO download - confirm.
    infra_env.status()
    download_iso_from_infra_env(infra_env, cluster_config.iso_download_path)

    log.info("iso downloaded, starting nodes")
    nodes.start_all()

    log.info("waiting for host agent")
    # One agent is expected per node.
    agents = cluster_deployment.wait_for_agents(len(nodes))
    for agent in agents:
        agent.approve()
        set_agent_hostname(nodes[0], agent, is_ipv4)  # Currently only supports single node

    if len(nodes) == 1:
        set_single_node_ip(cluster_deployment, nodes, is_ipv4)

    log.info("Waiting for agent status verification")
    Agent.wait_for_agents_to_install(agents)

    agent_cluster_install.wait_to_be_ready(True)

    log.info("waiting for agent-cluster-install to be in installing state")
    agent_cluster_install.wait_to_be_installing()

    try:
        log.info("installation started, waiting for completion")
        agent_cluster_install.wait_to_be_installed()
        log.info("installation completed successfully")
    except Exception:
        # NOTE(review): as written here the failure is logged and debug info
        # is collected, but the exception is not re-raised, so a failed
        # installation does not fail this function - confirm this is intended.
        log.exception("Failure during kube-api installation flow:")
        collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install)