def log_collection(self, master_ip: Optional[str]):
    """
    Gather the available diagnostic bundles for the installation that just ran.

    Three steps run in order: sosreport, installer-gather and must-gather.
    Each step is wrapped in its own ``SuppressAndLog(Exception)`` context
    (which, going by its name, logs and swallows the error), so one failing
    step is not meant to prevent the following ones from running.

    @param master_ip The IP address of the master node, handed to
                     ``self.installer_gather`` as the node to connect to.
                     When ``None``, the installer-gather step is skipped.
    """
    # An exception currently being handled means we were invoked on the
    # failure path of the installation.
    pending_exc_type = sys.exc_info()[0]
    outcome = "successful" if pending_exc_type is None else "failed"
    log.info(f"Collecting logs after a {outcome} installation")

    with SuppressAndLog(Exception):
        log.info("Gathering sosreport data from host...")
        gather_sosreport_data(output_dir=IBIP_DIR)

    if master_ip is not None:
        with SuppressAndLog(Exception):
            log.info("Gathering information via installer-gather...")
            utils.recreate_folder(INSTALLER_GATHER_DIR, force_recreate=True)
            self.installer_gather(
                ip=master_ip,
                ssh_key=consts.DEFAULT_SSH_PRIVATE_KEY_PATH,
                out_dir=INSTALLER_GATHER_DIR,
            )

    with SuppressAndLog(Exception):
        log.info("Gathering information via must-gather...")
        download_must_gather(KUBE_CONFIG, IBIP_DIR)
def log_collection(vm_ip):
    """
    Collect sosreport, installer-gather and must-gather logs after an installation.

    Each of the three gathering steps sits in its own ``try``/``except Exception``
    block: a failure is reported through ``logging.exception`` and the next
    step still runs.

    :param vm_ip: IP address forwarded to ``installer_gather`` as ``ip``.
    """
    # With no exception in flight, ``etype is None`` is True and indexes
    # 'successful' in the tuple below; otherwise it selects 'failed'.
    etype, _value, _tb = sys.exc_info()

    logging.info(f"Collecting logs after a {('failed', 'successful')[etype is None]} installation")

    try:
        logging.info("Gathering sosreport data from host...")
        gather_sosreport_data(output_dir=IBIP_DIR)
    except Exception:
        logging.exception("sosreport gathering failed!")

    # NOTE(review): the return value of this call is discarded, so it looks like
    # a leftover decorator-factory invocation -- confirm whether it has any
    # effect and whether it can be dropped.
    utils.retry()
    try:
        logging.info("Gathering information via installer-gather...")
        # Start from a clean output directory for installer-gather.
        utils.recreate_folder(INSTALLER_GATHER_DIR, force_recreate=True)
        installer_gather(ip=vm_ip, ssh_key=consts.DEFAULT_SSH_PRIVATE_KEY_PATH, out_dir=INSTALLER_GATHER_DIR)
    except Exception:
        logging.exception("installer-gather failed!")

    try:
        logging.info("Gathering information via must-gather...")
        utils.recreate_folder(MUST_GATHER_DIR)
        download_must_gather(KUBE_CONFIG, MUST_GATHER_DIR)
    except Exception:
        logging.exception("must-gather failed!")
def setup_files_and_folders(self, net_asset: LibvirtNetworkAssets, cluster_name: str):
    """
    Prepare the working directories and the install-config file.

    Recreates ``consts.BASE_IMAGE_FOLDER`` (``force_recreate=False``) and
    ``IBIP_DIR`` (``force_recreate=True``), copies the install-config file
    from ``RESOURCES_DIR`` into ``IBIP_DIR`` and then calls
    ``self.fill_install_config``.

    The pull secret and SSH public key are read from the ``PULL_SECRET`` and
    ``SSH_PUB_KEY`` environment variables; a missing variable raises ``KeyError``.
    """
    log.info("Creating needed files and folders")

    utils.recreate_folder(consts.BASE_IMAGE_FOLDER, force_recreate=False)
    utils.recreate_folder(IBIP_DIR, with_chmod=False, force_recreate=True)

    install_config_source = os.path.join(RESOURCES_DIR, INSTALL_CONFIG_FILE_NAME)
    shutil.copy(install_config_source, IBIP_DIR)

    # TODO: fetch pull_secret and ssh_key in a different way
    pull_secret = os.environ["PULL_SECRET"]
    ssh_pub_key = os.environ["SSH_PUB_KEY"]
    self.fill_install_config(pull_secret, ssh_pub_key, net_asset, cluster_name)
def wait_till_installed(client, cluster, timeout=60 * 60 * 2):
    """
    Block until hosts, operators and the cluster itself report completion.

    The waits run in sequence: all hosts reaching ``INSTALLED``, all monitored
    operators reaching ``AVAILABLE`` or ``FAILED``, then the cluster reaching
    ``INSTALLED`` (``ERROR`` is passed as a break status). Whatever the
    outcome, host logs are downloaded into ``build/<cluster.id>`` afterwards.

    :param client: API client forwarded to every wait helper.
    :param cluster: cluster object; ``id``, ``hosts``, ``monitored_operators``
        and ``high_availability_mode`` are read from it.
    :param timeout: timeout passed to the host wait only; the operator and
        cluster waits use ``consts.CLUSTER_INSTALLATION_TIMEOUT``.
    """
    # TODO: Change host validation for only previous known hosts
    try:
        wait_till_all_hosts_are_in_status(
            client=client,
            cluster_id=cluster.id,
            nodes_count=len(cluster.hosts),
            statuses=[consts.NodesStatus.INSTALLED],
            timeout=timeout,
            interval=60,
        )

        operators_utils.wait_till_all_operators_are_in_status(
            client=client,
            cluster_id=cluster.id,
            operators_count=len(cluster.monitored_operators),
            operator_types=[OperatorType.BUILTIN, OperatorType.OLM],
            statuses=[consts.OperatorStatus.AVAILABLE, consts.OperatorStatus.FAILED],
            timeout=consts.CLUSTER_INSTALLATION_TIMEOUT,
            fall_on_error_status=False,
        )

        # Clusters whose high-availability mode is not "Full" get twice the timeout.
        if cluster.high_availability_mode == "Full":
            cluster_wait_timeout = consts.CLUSTER_INSTALLATION_TIMEOUT
        else:
            cluster_wait_timeout = consts.CLUSTER_INSTALLATION_TIMEOUT * 2

        utils.wait_till_cluster_is_in_status(
            client=client,
            cluster_id=cluster.id,
            statuses=[consts.ClusterStatus.INSTALLED],
            timeout=cluster_wait_timeout,
            break_statuses=[consts.ClusterStatus.ERROR],
        )
    finally:
        # Always collect host logs, on success and on failure alike.
        logs_dir = f'build/{cluster.id}'
        utils.recreate_folder(logs_dir)
        download_logs_from_all_hosts(client=client, cluster_id=cluster.id, output_folder=logs_dir)
def setup_files_and_folders(args, net_asset, cluster_name):
    """
    Prepare the working directories and the install-config file.

    Recreates ``consts.BASE_IMAGE_FOLDER`` (``force_recreate=False``) and
    ``IBIP_DIR`` (``force_recreate=True``), copies the install-config file
    from ``RESOURCES_DIR`` into ``IBIP_DIR`` and then calls
    ``fill_install_config`` with ``args.pull_secret`` and ``args.ssh_key``.
    """
    logging.info("Creating needed files and folders")

    utils.recreate_folder(consts.BASE_IMAGE_FOLDER, force_recreate=False)
    utils.recreate_folder(IBIP_DIR, with_chmod=False, force_recreate=True)

    install_config_source = os.path.join(RESOURCES_DIR, INSTALL_CONFIG_FILE_NAME)
    shutil.copy(install_config_source, IBIP_DIR)

    fill_install_config(args.pull_secret, args.ssh_key, net_asset, cluster_name)
def prepare_nodes(self):
    """
    Destroy existing nodes and create fresh ones.

    If the path in ``self._entity_config.iso_download_path`` does not exist,
    its parent folder is created and a placeholder file is touched at that
    path before the nodes are created.
    """
    log.info("Preparing nodes")
    self.destroy_all_nodes()

    iso_path = self._entity_config.iso_download_path
    if not os.path.exists(iso_path):
        iso_parent_dir = os.path.dirname(iso_path)
        utils.recreate_folder(iso_parent_dir, force_recreate=False)
        # if file not exist lets create dummy
        utils.touch(iso_path)

    self.params.running = False
    self._create_nodes()
def download_image(self, iso_download_path: str = None) -> Path:
    """
    Download the image referenced by ``self.get_details().download_url``.

    :param iso_download_path: destination path; when falsy,
        ``self._config.iso_download_path`` is used instead.
    :return: whatever ``utils.download_file`` returns for the download.
    """
    source_url = self.get_details().download_url
    target_path = iso_download_path or self._config.iso_download_path

    # ensure file path exists before downloading
    if not os.path.exists(target_path):
        target_dir = os.path.dirname(target_path)
        utils.recreate_folder(target_dir, force_recreate=False)

    log.info(f"Downloading image {source_url} to {target_path}")
    verify_ssl = self._config.verify_download_iso_ssl
    return utils.download_file(source_url, target_path, verify_ssl)
def gather_sosreport_data(output_dir: str):
    """
    Run ``gather_sosreport_from_node`` for every node of a libvirt controller.

    Results go to ``<output_dir>/sosreport``, which is recreated first. The
    per-node jobs are handed to ``run_concurrently`` with a timeout of
    ``60 * 20`` (presumably seconds -- confirm against ``run_concurrently``).

    :param output_dir: parent directory for the ``sosreport`` folder.
    """
    sosreport_dir = os.path.join(output_dir, "sosreport")
    recreate_folder(sosreport_dir)

    controller = LibvirtController(config=TerraformConfig(), entity_config=ClusterConfig())

    # One (callable, *args) tuple per node.
    jobs = []
    for node in controller.list_nodes():
        jobs.append((gather_sosreport_from_node, node, sosreport_dir))

    run_concurrently(jobs=jobs, timeout=60 * 20)
def prepare_for_installation(self):
    """
    Create the day2 cluster and infra-env and download the installer image.

    Reads the day1 cluster identified by ``self.config.day1_cluster_id``,
    creates a ``<day1 name>-day2`` cluster from its version and API VIP DNS
    name, configures pull secret, proxy, /etc/hosts and terraform, creates an
    infra-env and downloads its image into ``consts.IMAGE_FOLDER``.

    Updates ``self.config`` along the way: ``day1_cluster_name``,
    ``cluster_id``, ``tf_folder`` and ``infra_env_id``.
    """
    utils.recreate_folder(consts.IMAGE_FOLDER, force_recreate=False)

    # Everything needed from the day1 cluster is read before the day2 cluster exists.
    day1_cluster = self.api_client.cluster_get(cluster_id=self.config.day1_cluster_id)
    self.config.day1_cluster_name = day1_cluster.name
    openshift_version = day1_cluster.openshift_version
    api_vip_dnsname = "api." + self.config.day1_cluster_name + "." + day1_cluster.base_dns_domain
    api_vip_ip = day1_cluster.api_vip

    openshift_cluster_id = str(uuid.uuid4())
    day2_cluster = self.api_client.create_day2_cluster(
        self.config.day1_cluster_name + "-day2",
        openshift_cluster_id,
        openshift_version=openshift_version,
        api_vip_dnsname=api_vip_dnsname,
    )

    self.config.cluster_id = day2_cluster.id
    self.api_client.set_pull_secret(day2_cluster.id, self.config.pull_secret)
    self.set_cluster_proxy(day2_cluster.id)
    self.config_etc_hosts(api_vip_ip, api_vip_dnsname)

    day1_tf_root = utils.TerraformControllerUtil.get_folder(self.config.day1_cluster_name)
    self.config.tf_folder = os.path.join(day1_tf_root, consts.Platforms.BARE_METAL)
    self.configure_terraform(self.config.tf_folder, self.config.day2_workers_count, api_vip_ip)

    if self.config.is_static_ip:
        static_network_config = static_network.generate_day2_static_network_data_from_tf(
            self.config.tf_folder, self.config.day2_workers_count
        )
    else:
        static_network_config = None

    # Generate image
    infra_env = self.api_client.create_infra_env(
        cluster_id=day2_cluster.id,
        name=self.config.day1_cluster_name + "_infra-env",
        ssh_public_key=self.config.ssh_public_key,
        static_network_config=static_network_config,
        pull_secret=self.config.pull_secret,
        openshift_version=openshift_version,
    )
    self.config.infra_env_id = infra_env.id

    # Download image
    iso_download_url = infra_env.download_url
    image_file_name = f"{self.config.day1_cluster_name}-installer-image.iso"
    image_path = os.path.join(consts.IMAGE_FOLDER, image_file_name)
    log.info(f"Downloading image {iso_download_url} to {image_path}")
    utils.download_file(iso_download_url, image_path, False)
def execute_day2_flow(cluster_id, args, day2_type_flag, has_ipv6):
    """
    Prepare a day2 cluster, generate its installer image and run the day2 nodes flow.

    :param cluster_id: id of the existing cluster to read name, version, base
        DNS domain and API VIP from. Rebound to a fresh UUID when
        ``day2_type_flag`` is not ``"ocp"``.
    :param args: parsed arguments; ``api_client``, ``pull_secret``, ``ssh_key``,
        ``num_day2_workers``, ``with_static_network_config`` and
        ``install_cluster`` are read here.
    :param day2_type_flag: ``"ocp"`` reuses the fetched cluster; any other
        value creates a new ``<cluster_name>-day2`` cluster.
    :param has_ipv6: forwarded unchanged to ``day2_nodes_flow``.
    """
    utils.recreate_folder(consts.IMAGE_FOLDER, force_recreate=False)

    client = args.api_client
    cluster = client.cluster_get(cluster_id=cluster_id)
    cluster_name = cluster.name
    openshift_version = cluster.openshift_version
    api_vip_dnsname = "api." + cluster_name + "." + cluster.base_dns_domain
    api_vip_ip = cluster.api_vip
    terraform_cluster_dir_prefix = cluster_name
    if day2_type_flag == "ocp":
        terraform_cluster_dir_prefix = f"{consts.CLUSTER_PREFIX}-{consts.DEFAULT_NAMESPACE}"
    else:
        # From here on ``cluster_id`` and ``cluster`` refer to the new day2
        # cluster, not the one fetched above.
        # NOTE(review): pull secret and proxy are set using the generated
        # ``cluster_id`` while the image below is generated for ``cluster.id``
        # -- confirm these are the same identifier.
        cluster_id = str(uuid.uuid4())
        cluster = client.create_day2_cluster(
            cluster_name + "-day2", cluster_id, **_day2_cluster_create_params(openshift_version, api_vip_dnsname)
        )
        set_cluster_pull_secret(client, cluster_id, args.pull_secret)
        set_cluster_proxy(client, cluster_id, args)

    config_etc_hosts(api_vip_ip, api_vip_dnsname)
    image_path = os.path.join(consts.IMAGE_FOLDER, f'{cluster_name}-installer-image.iso')

    tf_folder = os.path.join(utils.get_tf_folder(terraform_cluster_dir_prefix), consts.Platforms.BARE_METAL)
    set_day2_tf_configuration(tf_folder, args.num_day2_workers, api_vip_ip, api_vip_dnsname)

    # Static network data is only generated when requested on the command line.
    static_network_config = None
    if args.with_static_network_config:
        static_network_config = static_network.generate_day2_static_network_data_from_tf(
            tf_folder, args.num_day2_workers
        )

    client.generate_and_download_image(
        cluster_id=cluster.id,
        image_path=image_path,
        ssh_key=args.ssh_key,
        static_network_config=static_network_config,
    )

    day2_nodes_flow(
        client,
        terraform_cluster_dir_prefix,
        tf_folder,
        cluster,
        has_ipv6,
        args.num_day2_workers,
        args.install_cluster,
        day2_type_flag,
        args.with_static_network_config,
        cluster_name,
    )
def _create_tf_folder(self, name: str, platform: str):
    """
    Recreate the terraform folder for ``name`` and return its platform subfolder.

    :param name: cluster name passed to ``TerraformControllerUtil.get_folder``.
    :param platform: compared against ``consts.Platforms.NONE`` to pick the
        subfolder.
    :return: ``<tf_folder>/<consts.Platforms.NONE>`` for the none platform,
        ``<tf_folder>/baremetal_infra_env`` when ``self._entity_config`` is a
        ``BaseInfraEnvConfig``, otherwise ``<tf_folder>/<consts.Platforms.BARE_METAL>``.
    """
    tf_folder = TerraformControllerUtil.get_folder(cluster_name=name)
    log.info("Creating %s as terraform folder", tf_folder)
    utils.recreate_folder(tf_folder)
    utils.copy_template_tree(tf_folder)

    # The platform check takes precedence over the entity-config type.
    if platform == consts.Platforms.NONE:
        subfolder = consts.Platforms.NONE
    elif isinstance(self._entity_config, BaseInfraEnvConfig):
        subfolder = "baremetal_infra_env"
    else:
        subfolder = consts.Platforms.BARE_METAL

    return os.path.join(tf_folder, subfolder)
def execute_day1_flow():
    """
    Run the day1 flow driven by the module-level ``args``.

    Resolves the cluster name and base DNS domain, writes the terraform
    configuration and, unless a ready-made image was supplied via
    ``args.image``, makes sure a client and cluster exist and downloads the
    installer image. Unless ``args.iso_only`` is set, the nodes flow is run
    afterwards.

    :return: ``cluster.id`` when a cluster object is available, else ``None``.
    """
    client, cluster = try_get_cluster()
    # Default name; replaced by the real name when a cluster already exists.
    cluster_name = f'{args.cluster_name or consts.CLUSTER_PREFIX}-{args.namespace}'

    if cluster:
        args.base_dns_domain = cluster.base_dns_domain
        cluster_name = cluster.name

    elif args.managed_dns_domains:
        # Only the part before the first ':' is used as the base domain.
        args.base_dns_domain = args.managed_dns_domains.split(":")[0]

    log.info('Cluster name: %s', cluster_name)

    machine_net = MachineNetwork(args.ipv4, args.ipv6, args.vm_network_cidr, args.vm_network_cidr6, args.ns_index)
    image_path = args.image or os.path.join(
        consts.IMAGE_FOLDER,
        f'{args.namespace}-installer-image.iso'
    )
    set_tf_config(cluster_name)

    # With a user-supplied image nothing is created or downloaded here.
    if not args.image:
        utils.recreate_folder(consts.IMAGE_FOLDER, force_recreate=False)
        if not client:
            client = ClientFactory.create_client(url=utils.get_assisted_service_url_by_args(args=args),
                                                 offline_token=utils.get_env("OFFLINE_TOKEN"))
        if args.cluster_id:
            cluster = client.cluster_get(cluster_id=args.cluster_id)
        else:
            cluster = client.create_cluster(cluster_name, ssh_public_key=args.ssh_key, **_cluster_create_params(client))

        static_network_config = apply_static_network_config(
            cluster_name=cluster_name,
            kube_client=None,
        )

        client.generate_and_download_image(
            cluster_id=cluster.id,
            image_path=image_path,
            image_type=args.iso_image_type,
            ssh_key=args.ssh_key,
            static_network_config=static_network_config,
        )

    # Iso only, cluster will be up and iso downloaded but vm will not be created
    if not args.iso_only:
        run_nodes_flow(client, cluster_name, cluster, machine_net, image_path)

    return cluster.id if cluster else None
def collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install, output_folder=None):
    """
    Download the events and logs referenced by an AgentClusterInstall's debugInfo.

    :param cluster_deployment: object whose ``ref.name`` gives the cluster name.
    :param agent_cluster_install: object whose ``get()`` result carries
        ``["status"]["debugInfo"]``.
    :param output_folder: destination directory. When omitted (or empty), the
        folder ``build/<cluster_name>`` is recreated and used. When supplied,
        it is used as-is and not recreated, so a folder prepared by the caller
        is left untouched.

    Download failures are logged as a warning and not re-raised. A missing
    ``status``/``debugInfo`` key still raises ``KeyError``.
    """
    cluster_name = cluster_deployment.ref.name

    if not output_folder:
        output_folder = f"build/{cluster_name}"
        recreate_folder(output_folder)

    aci = agent_cluster_install.get()
    debug_info = aci["status"]["debugInfo"]

    try:
        log.info("Collecting debugInfo (events/logs) from cluster")
        fetch_url_and_write_to_file("eventsURL", "events.json", debug_info, output_folder)
        fetch_url_and_write_to_file("logsURL", "logs.tar", debug_info, output_folder)
    except Exception as err:
        # Best effort: debug info is supplementary, so a failed download only warns.
        log.warning(f"Failed to collect debug info for cluster {cluster_name} ({err})")
def _collect_journalctl(nodes: Nodes, log_dir_name):
    """
    Fetch the journalctl output of every node into ``<log_dir_name>/nodes_journalctl``.

    For each node, ``sudo journalctl`` is appended (``>>``) to
    ``/tmp/<node.name>-journalctl`` on the node and that file is downloaded to
    a local file named after the node. ``RuntimeError``, ``TimeoutError`` and
    ``SSHException`` for a node are logged and the loop moves on.

    :param nodes: iterable of node objects providing ``name``,
        ``run_command`` and ``download_file``.
    :param log_dir_name: base directory for the collected logs.
    """
    log.info("Collecting journalctl\n")

    utils.recreate_folder(log_dir_name, with_chmod=False, force_recreate=False)
    journal_dir = Path(log_dir_name) / "nodes_journalctl"
    utils.recreate_folder(journal_dir, with_chmod=False)

    for node in nodes:
        try:
            node.run_command(f"sudo journalctl >> /tmp/{node.name}-journalctl")
            local_target = str(journal_dir / node.name)
            node.download_file(f"/tmp/{node.name}-journalctl", local_target)
        except (RuntimeError, TimeoutError, SSHException):
            log.info(f"Could not collect journalctl for {node.name}")
def set_tf_config(cluster_name):
    """
    Recreate the terraform folder for ``cluster_name`` and fill its tfvars.

    The folder comes from ``utils.get_tf_folder(cluster_name, args.namespace)``;
    the template tree is copied into it and ``fill_tfvars`` is called for the
    ``consts.Platforms.BARE_METAL`` subfolder. The image path is ``args.image``
    when set, otherwise ``<consts.IMAGE_FOLDER>/<namespace>-installer-image.iso``.
    """
    nodes_details = _create_node_details(cluster_name)

    tf_folder = utils.get_tf_folder(cluster_name, args.namespace)
    utils.recreate_folder(tf_folder)
    utils.copy_template_tree(tf_folder)

    baremetal_tf_folder = os.path.join(tf_folder, consts.Platforms.BARE_METAL)
    machine_net = MachineNetwork(
        args.ipv4,
        args.ipv6,
        args.vm_network_cidr,
        args.vm_network_cidr6,
        args.ns_index,
    )

    fallback_image_path = os.path.join(consts.IMAGE_FOLDER, f'{args.namespace}-installer-image.iso')
    image_path = args.image or fallback_image_path

    fill_tfvars(
        image_path=image_path,
        storage_path=args.storage_path,
        master_count=args.master_count,
        nodes_details=nodes_details,
        tf_folder=baremetal_tf_folder,
        machine_net=machine_net,
    )
def download_logs_kube_api(api_client: ApiClient, cluster_name: str, namespace: str, dest: str, must_gather: bool,
                           management_kubeconfig: str):
    """
    Collect debug info (and optionally must-gather data) for a kube-api cluster.

    Output goes to ``<dest>/<cluster_name>``, which is recreated first. After
    collection, whether it succeeded or not, ``chmod -R ugo+rx`` is run on
    that folder.

    :param api_client: kube API client used to build the ClusterDeployment and
        AgentClusterInstall wrappers.
    :param cluster_name: name of the ClusterDeployment.
    :param namespace: namespace of the resources; a value starting with
        ``"clusters"`` selects the hypershift dump instead of must-gather.
    :param dest: parent directory of the output folder.
    :param must_gather: when true, the must-gather / hypershift step runs.
    :param management_kubeconfig: passed to ``HyperShift.dump``.
    """
    cluster_deployment = ClusterDeployment(
        kube_api_client=api_client,
        name=cluster_name,
        namespace=namespace,
    )
    agent_cluster_install = AgentClusterInstall(
        kube_api_client=api_client,
        name=cluster_deployment.get()["spec"]["clusterInstallRef"]["name"],
        namespace=namespace,
    )

    output_folder = os.path.join(dest, f"{cluster_name}")
    recreate_folder(output_folder)

    try:
        with SuppressAndLog(requests.exceptions.RequestException, ConnectionError):
            collect_debug_info_from_cluster(cluster_deployment, agent_cluster_install, output_folder)

        if must_gather:
            must_gather_dir = os.path.join(output_folder, "must-gather")
            recreate_folder(must_gather_dir)

            with SuppressAndLog(Exception):
                # in case of hypershift
                is_hypershift = namespace.startswith("clusters")
                if not is_hypershift:
                    _must_gather_kube_api(cluster_name, cluster_deployment, agent_cluster_install, output_folder)
                else:
                    log.info("Dumping hypershift files")
                    hypershift = HyperShift(name=cluster_name)
                    dump_dir = os.path.join(output_folder, "dump")
                    hypershift.dump(dump_dir, management_kubeconfig)
    finally:
        run_command(f"chmod -R ugo+rx '{output_folder}'")
def _download_ipxe_script(self, infra_env_id: str, cluster_name: str):
    """
    Save the infra-env's ``ipxe-script`` file under the iPXE scripts folder.

    :param infra_env_id: infra-env whose script is downloaded.
    :param cluster_name: used as the file name inside
        ``self._ipxe_scripts_folder``.
    """
    scripts_dir = self._ipxe_scripts_folder
    log.info(f"Downloading iPXE script to {scripts_dir}")

    utils.recreate_folder(scripts_dir, force_recreate=False)

    script_path = f"{scripts_dir}/{cluster_name}"
    self._api_client.download_and_save_infra_env_file(
        infra_env_id=infra_env_id,
        file_name="ipxe-script",
        file_path=script_path,
    )
def execute_kube_api_flow():
    """
    Run the kube-api installation flow driven by the module-level ``args``.

    Applies, in order, a pull-secret ``Secret``, a ``ClusterImageSet``, a
    ``ClusterDeployment``, an ``AgentClusterInstall`` and an ``InfraEnv``,
    downloads the InfraEnv ISO into ``consts.IMAGE_FOLDER`` and then runs
    ``nodes_flow_kube_api``.

    The downloaded ISO is deleted afterwards unless ``args.keep_iso`` is set.
    Any exception raised by ``nodes_flow_kube_api`` propagates to the caller
    in both cases.
    """
    log.info("Executing kube-api flow")
    cluster_name = f'{args.cluster_name or consts.CLUSTER_PREFIX}-{args.namespace}'
    utils.recreate_folder(consts.IMAGE_FOLDER, force_recreate=False)
    machine_net = MachineNetwork(args.ipv4, args.ipv6, args.vm_network_cidr, args.vm_network_cidr6, args.ns_index)
    kube_client = create_kube_api_client()
    cluster_deployment = ClusterDeployment(
        kube_api_client=kube_client,
        name=cluster_name,
        namespace=args.namespace,
    )
    set_tf_config(cluster_name)

    secret = Secret(
        kube_api_client=kube_client,
        name=cluster_name,
        namespace=args.namespace,
    )
    secret.apply(pull_secret=args.pull_secret)

    image_set = ClusterImageSet(
        kube_api_client=kube_client,
        name=f"{cluster_name}-image-set",
        namespace=args.namespace,
    )
    release_image = utils.get_openshift_release_image()
    image_set.apply(releaseImage=release_image)

    ipv4 = args.ipv4 and args.ipv4.lower() in MachineNetwork.YES_VALUES
    ipv6 = args.ipv6 and args.ipv6.lower() in MachineNetwork.YES_VALUES

    # VIPs are only looked up when there is more than one master.
    api_vip, ingress_vip = "", ""
    if args.master_count > 1:
        api_vip, ingress_vip = _get_vips_ips(machine_net)

    agent_cluster_install = AgentClusterInstall(
        kube_api_client=kube_client,
        name=f'{cluster_name}-agent-cluster-install',
        namespace=args.namespace,
    )

    image_set_ref = ClusterImageSetReference(name=f'{cluster_name}-image-set')
    cluster_deployment.apply(
        secret=secret,
        base_domain=args.base_dns_domain,
        agent_cluster_install_ref=agent_cluster_install.ref,
    )
    # The IPv4 network settings win whenever IPv4 is enabled.
    agent_cluster_install.apply(
        cluster_deployment_ref=cluster_deployment.ref,
        api_vip=api_vip,
        ingress_vip=ingress_vip,
        image_set_ref=image_set_ref,
        cluster_cidr=args.cluster_network if ipv4 else args.cluster_network6,
        host_prefix=args.host_prefix if ipv4 else args.host_prefix6,
        service_network=args.service_network if ipv4 else args.service_network6,
        ssh_pub_key=args.ssh_key,
        control_plane_agents=args.master_count,
        worker_agents=args.number_of_workers,
        machine_cidr=get_machine_cidr_from_machine_net(machine_net),
        hyperthreading=args.hyperthreading,
    )
    agent_cluster_install.wait_to_be_ready(False)

    apply_static_network_config(
        cluster_name=cluster_name,
        kube_client=kube_client,
    )

    image_path = os.path.join(
        consts.IMAGE_FOLDER,
        f'{args.namespace}-installer-image.iso'
    )

    log.info("Creating infraEnv")
    http_proxy, https_proxy, no_proxy = _get_http_proxy_params(ipv4=ipv4, ipv6=ipv6)
    infra_env = InfraEnv(
        kube_api_client=kube_client,
        name=f"{cluster_name}-infra-env",
        namespace=args.namespace,
    )
    infra_env.apply(
        cluster_deployment=cluster_deployment,
        secret=secret,
        proxy=Proxy(
            http_proxy=http_proxy,
            https_proxy=https_proxy,
            no_proxy=no_proxy,
        ),
        ssh_pub_key=args.ssh_key,
        nmstate_label=cluster_name,
    )
    infra_env.status()
    image_url = infra_env.get_iso_download_url()
    utils.download_iso(image_url, image_path)

    try:
        nodes_flow_kube_api(cluster_name, machine_net, cluster_deployment, agent_cluster_install)
    finally:
        # Deliberately no ``return`` here: returning from a ``finally`` clause
        # discards an in-flight exception, which would hide a failure of
        # nodes_flow_kube_api whenever the ISO is kept.
        if image_path and not args.keep_iso:
            log.info('deleting iso: %s', image_path)
            os.unlink(image_path)
def download_logs(
    client: InventoryClient,
    cluster: dict,
    dest: str,
    must_gather: bool,
    update_by_events: bool = False,
    retry_interval: int = RETRY_INTERVAL,
):
    """
    Download every available log artifact of ``cluster`` into its output folder.

    Collected items: metadata, metrics, ignition/install-config cluster files,
    manifests, per-host ignitions, infra-env and cluster events, the cluster
    logs tar (re-downloaded until it verifies or retries run out), the
    no-ingress kubeconfig and, on request, must-gather data. Most steps are
    wrapped in ``SuppressAndLog`` so that (going by its name) a failure is
    logged and the remaining steps still run. ``chmod -R ugo+rx`` is always
    run on the output folder at the end.

    :param client: inventory client used for all downloads.
    :param cluster: cluster description; ``cluster["hosts"]`` is filled in
        from the service when it is missing or empty.
    :param dest: base directory passed to ``get_logs_output_folder``.
    :param must_gather: when true, must-gather runs after the kubeconfig download.
    :param update_by_events: forwarded to ``is_update_needed``.
    :param retry_interval: seconds to sleep between cluster-logs retries.
    """
    # Treat a missing, None or empty host list the same way: fetch it.
    if not cluster.get("hosts"):
        cluster["hosts"] = client.get_cluster_hosts(cluster_id=cluster["id"])

    output_folder = get_logs_output_folder(dest, cluster)
    if not is_update_needed(output_folder, update_by_events, client, cluster):
        log.info(f"Skipping, no need to update {output_folder}.")
        return

    recreate_folder(output_folder)
    recreate_folder(os.path.join(output_folder, "cluster_files"))

    try:
        write_metadata_file(client, cluster, os.path.join(output_folder, "metadata.json"))

        with SuppressAndLog(requests.exceptions.RequestException, ConnectionError, KeyboardInterrupt):
            client.download_metrics(os.path.join(output_folder, "metrics.txt"))

        for cluster_file in (
            "bootstrap.ign",
            "master.ign",
            "worker.ign",
            "install-config.yaml",
        ):
            with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                client.download_and_save_file(
                    cluster["id"], cluster_file, os.path.join(output_folder, "cluster_files", cluster_file)
                )

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            download_manifests(client, cluster["id"], output_folder)

        # Events are downloaded once per infra-env, even when several hosts share it.
        seen_infra_envs = set()
        for host in cluster["hosts"]:
            host_id, infra_env_id = host["id"], host["infra_env_id"]
            with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                client.download_host_ignition(infra_env_id, host_id, os.path.join(output_folder, "cluster_files"))

            if infra_env_id not in seen_infra_envs:
                seen_infra_envs.add(infra_env_id)
                with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
                    client.download_infraenv_events(infra_env_id, get_infraenv_events_path(infra_env_id, output_folder))

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            client.download_cluster_events(cluster["id"], get_cluster_events_path(cluster, output_folder))
            shutil.copy2(os.path.join(os.path.dirname(os.path.realpath(__file__)), "events.html"), output_folder)

        with SuppressAndLog(assisted_service_client.rest.ApiException, KeyboardInterrupt):
            are_masters_in_configuring_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.CONFIGURING], 2
            )
            are_masters_in_join_or_done_state = are_host_progress_in_stage(
                cluster["hosts"], [HostsProgressStages.JOINED, HostsProgressStages.DONE], 2
            )
            max_retries = MUST_GATHER_MAX_RETRIES if are_masters_in_join_or_done_state else MAX_RETRIES
            is_controller_expected = cluster["status"] == ClusterStatus.INSTALLED or are_masters_in_configuring_state
            min_number_of_logs = min_number_of_log_files(cluster, is_controller_expected)

            cluster_logs_tar = os.path.join(output_folder, f"cluster_{cluster['id']}_logs.tar")
            for attempt in range(max_retries):
                # Drop any tar left over from a previous attempt.
                with suppress(FileNotFoundError):
                    os.remove(cluster_logs_tar)

                client.download_cluster_logs(cluster["id"], cluster_logs_tar)
                try:
                    verify_logs_uploaded(
                        cluster_logs_tar,
                        min_number_of_logs,
                        installation_success=(cluster["status"] == ClusterStatus.INSTALLED),
                        check_oc=are_masters_in_join_or_done_state,
                    )
                    break
                except AssertionError as ex:
                    log.warning("Cluster logs verification failed: %s", ex)

                    # Skip sleeping on last retry. Compare against the bound
                    # actually used by this loop, which may differ from MAX_RETRIES.
                    if attempt < max_retries - 1:
                        log.info(f"Going to retry in {retry_interval} seconds")
                        time.sleep(retry_interval)

        kubeconfig_path = os.path.join(output_folder, "kubeconfig-noingress")

        with SuppressAndLog(assisted_service_client.rest.ApiException):
            client.download_kubeconfig_no_ingress(cluster["id"], kubeconfig_path)

            # must-gather uses the kubeconfig downloaded just above.
            if must_gather:
                config_etc_hosts(
                    cluster["name"],
                    cluster["base_dns_domain"],
                    client.get_api_vip(cluster, cluster["id"]),
                )
                download_must_gather(kubeconfig_path, output_folder)

    finally:
        run_command(f"chmod -R ugo+rx '{output_folder}'")
def download_manifests(client: InventoryClient, cluster_id: str, output_folder: str) -> None:
    """
    Download the cluster's manifests into ``<output_folder>/cluster_files/manifests``.

    The target directory is recreated before ``client.download_manifests`` is called.
    """
    manifests_dir = os.path.join(output_folder, "cluster_files", "manifests")
    recreate_folder(manifests_dir)
    client.download_manifests(cluster_id, manifests_dir)