def get_ceph_tools_pod(): """ Get the Ceph tools pod Returns: Pod object: The Ceph tools pod object """ ocp_pod_obj = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']) ct_pod_items = ocp_pod_obj.get(selector='app=rook-ceph-tools')['items'] if not ct_pod_items: # setup ceph_toolbox pod if the cluster has been setup by some other CI setup_ceph_toolbox() ct_pod_items = ocp_pod_obj.get(selector='app=rook-ceph-tools')['items'] assert ct_pod_items, "No Ceph tools pod found" # In the case of node failure, the CT pod will be recreated with the old # one in status Terminated. Therefore, need to filter out the Terminated pod running_ct_pods = list() for pod in ct_pod_items: if ocp_pod_obj.get_resource_status( pod.get('metadata').get('name')) == constants.STATUS_RUNNING: running_ct_pods.append(pod) assert running_ct_pods, "No running Ceph tools pod found" ceph_pod = Pod(**running_ct_pods[0]) return ceph_pod
def deploy_with_external_mode(self): """ This function handles the deployment of OCS on external/indpendent RHCS cluster """ live_deployment = config.DEPLOYMENT.get("live_deployment") logger.info("Deploying OCS with external mode RHCS") ui_deployment = config.DEPLOYMENT.get("ui_deployment") if not ui_deployment: logger.info("Creating namespace and operator group.") run_cmd(f"oc create -f {constants.OLM_YAML}") if not live_deployment: self.create_ocs_operator_source() self.subscribe_ocs() operator_selector = get_selector_for_ocs_operator() subscription_plan_approval = config.DEPLOYMENT.get( "subscription_plan_approval") package_manifest = PackageManifest( resource_name=defaults.OCS_OPERATOR_NAME, selector=operator_selector, subscription_plan_approval=subscription_plan_approval, ) package_manifest.wait_for_resource(timeout=300) channel = config.DEPLOYMENT.get("ocs_csv_channel") csv_name = package_manifest.get_current_csv(channel=channel) csv = CSV(resource_name=csv_name, namespace=self.namespace) csv.wait_for_phase("Succeeded", timeout=720) # Create secret for external cluster secret_data = templating.load_yaml( constants.EXTERNAL_CLUSTER_SECRET_YAML) external_cluster_details = config.EXTERNAL_MODE.get( "external_cluster_details", "") if not external_cluster_details: raise ExternalClusterDetailsException( "No external cluster data found") secret_data["data"][ "external_cluster_details"] = external_cluster_details secret_data_yaml = tempfile.NamedTemporaryFile( mode="w+", prefix="external_cluster_secret", delete=False) templating.dump_data_to_temp_yaml(secret_data, secret_data_yaml.name) logger.info("Creating external cluster secret") run_cmd(f"oc create -f {secret_data_yaml.name}") cluster_data = templating.load_yaml( constants.EXTERNAL_STORAGE_CLUSTER_YAML) cluster_data["metadata"]["name"] = config.ENV_DATA[ "storage_cluster_name"] cluster_data_yaml = tempfile.NamedTemporaryFile( mode="w+", prefix="external_cluster_storage", delete=False) templating.dump_data_to_temp_yaml(cluster_data, cluster_data_yaml.name) run_cmd(f"oc create -f {cluster_data_yaml.name}", timeout=2400) self.external_post_deploy_validation() setup_ceph_toolbox()
def deploy_with_external_mode(self): """ This function handles the deployment of OCS on external/indpendent RHCS cluster """ live_deployment = config.DEPLOYMENT.get("live_deployment") logger.info("Deploying OCS with external mode RHCS") ui_deployment = config.DEPLOYMENT.get("ui_deployment") if not ui_deployment: logger.info("Creating namespace and operator group.") run_cmd(f"oc create -f {constants.OLM_YAML}") if not live_deployment: create_catalog_source() self.subscribe_ocs() operator_selector = get_selector_for_ocs_operator() subscription_plan_approval = config.DEPLOYMENT.get("subscription_plan_approval") ocs_version = version.get_semantic_ocs_version_from_config() if ocs_version >= version.VERSION_4_9: ocs_operator_names = [ defaults.ODF_OPERATOR_NAME, defaults.OCS_OPERATOR_NAME, ] else: ocs_operator_names = [defaults.OCS_OPERATOR_NAME] channel = config.DEPLOYMENT.get("ocs_csv_channel") for ocs_operator_name in ocs_operator_names: package_manifest = PackageManifest( resource_name=ocs_operator_name, selector=operator_selector, subscription_plan_approval=subscription_plan_approval, ) package_manifest.wait_for_resource(timeout=300) csv_name = package_manifest.get_current_csv(channel=channel) csv = CSV(resource_name=csv_name, namespace=self.namespace) csv.wait_for_phase("Succeeded", timeout=720) # Set rook log level self.set_rook_log_level() # Create secret for external cluster create_external_secret() cluster_data = templating.load_yaml(constants.EXTERNAL_STORAGE_CLUSTER_YAML) cluster_data["metadata"]["name"] = config.ENV_DATA["storage_cluster_name"] cluster_data_yaml = tempfile.NamedTemporaryFile( mode="w+", prefix="external_cluster_storage", delete=False ) templating.dump_data_to_temp_yaml(cluster_data, cluster_data_yaml.name) run_cmd(f"oc create -f {cluster_data_yaml.name}", timeout=2400) self.external_post_deploy_validation() setup_ceph_toolbox()
def get_images_post_upgrade(self, channel, pre_upgrade_images, upgrade_version): """ Checks if all images of OCS cluster upgraded, and return list of all images if upgrade success Args: channel: (str): OCS subscription channel pre_upgrade_images: (dict): Contains all OCS cluster images upgrade_version: (str): version to be upgraded Returns: set: Contains full path of OCS cluster old images """ operator_selector = get_selector_for_ocs_operator() package_manifest = PackageManifest( resource_name=OCS_OPERATOR_NAME, selector=operator_selector, subscription_plan_approval=self.subscription_plan_approval, ) csv_name_post_upgrade = package_manifest.get_current_csv(channel) csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade, namespace=self.namespace) log.info( f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state" ) # Workaround for patching missing ceph-rook-tools pod after upgrade if self.version_before_upgrade == "4.2" and upgrade_version == "4.3": log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3") setup_ceph_toolbox(force_setup=True) # End of workaround if config.DEPLOYMENT.get("external_mode") or config.ENV_DATA.get( "mcg_only_deployment"): timeout = 200 else: timeout = 200 * get_osd_count() csv_post_upgrade.wait_for_phase("Succeeded", timeout=timeout) post_upgrade_images = get_images(csv_post_upgrade.get()) old_images, _, _ = get_upgrade_image_info(pre_upgrade_images, post_upgrade_images) return old_images
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") self.deploy_ocs_via_operator() pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # validate PDB creation of MON, MDS, OSD pods validate_pdb_creation() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info("MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error("MDS deployment Failed! Please check logs!") # Change monitoring backend to OCS if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( 'persistent-monitoring'): sc = helpers.default_storage_class( interface_type=constants.CEPHBLOCKPOOL) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager']) # Create configmap cluster-monitoring-config and reconfigure # storage class and telemeter server (if the url is specified in a # config file) create_configmap_cluster_monitoring_pod( sc_name=sc.name, telemeter_server_url=config.ENV_DATA.get( "telemeter_server_url")) # Take some time to respin the pod waiting_time = 45 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(validate_pods_are_respinned_and_running_state)( pods_list) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods retry((CommandFailed, AssertionError), tries=3, delay=15)( validate_pvc_are_mounted_on_monitoring_pods)(pods_list) elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( "telemeter_server_url"): # Create configmap cluster-monitoring-config to reconfigure # telemeter server url when 'persistent-monitoring' is False create_configmap_cluster_monitoring_pod( telemeter_server_url=config.ENV_DATA["telemeter_server_url"]) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") try: ceph_health_check(namespace=self.namespace, tries=30, delay=10) except CephHealthException as ex: err = str(ex) logger.warning(f"Ceph health check failed with {err}") if "clock skew detected" in err: logger.info(f"Changing NTP on compute nodes to" f" {constants.RH_NTP_CLOCK}") update_ntp_compute_nodes() assert ceph_health_check(namespace=self.namespace, tries=60, delay=10) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default() if check_nodes_specs(min_cpu=constants.MIN_NODE_CPU, min_memory=constants.MIN_NODE_MEMORY): logger.info("The cluster specs meet the minimum requirements and " "therefore, NooBaa auto scale will be enabled") min_nb_eps = config.DEPLOYMENT.get('min_noobaa_endpoints') max_nb_eps = config.DEPLOYMENT.get('max_noobaa_endpoints') change_noobaa_endpoints_count(min_nb_eps=min_nb_eps, max_nb_eps=max_nb_eps) else: logger.warning( "The cluster specs do not meet the minimum requirements" " and therefore, NooBaa auto scale will remain disabled") change_noobaa_endpoints_count(min_nb_eps=1, max_nb_eps=1)
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ _templating = templating.Templating() ceph_cluster = ocp.OCP( kind='CephCluster', namespace=self.namespace ) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") if not self.ocs_operator_deployment: create_oc_resource( 'common.yaml', self.cluster_path, _templating, config.ENV_DATA ) run_cmd( f'oc label namespace {config.ENV_DATA["cluster_namespace"]} ' f'"openshift.io/cluster-monitoring=true"' ) run_cmd( f"oc policy add-role-to-user view " f"system:serviceaccount:openshift-monitoring:prometheus-k8s " f"-n {self.namespace}" ) # HACK: If you would like to drop this hack, make sure that you # also updated docs and write appropriate unit/integration tests # for config processing. if config.ENV_DATA.get('monitoring_enabled') in ( "true", "True", True ): # RBAC rules for monitoring, based on documentation change in # rook: # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8 # HACK: This should be dropped when OCS is managed by OLM apply_oc_resource( 'rbac.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="monitoring" ) # Increased to 15 seconds as 10 is not enough # TODO: do the sampler function and check if resource exist wait_time = 15 logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) create_oc_resource( 'operator-openshift.yaml', self.cluster_path, _templating, config.ENV_DATA ) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) run_cmd( f"oc wait --for condition=ready pod " f"-l app=rook-ceph-operator " f"-n {self.namespace} " f"--timeout=120s" ) run_cmd( f"oc wait --for condition=ready pod " f"-l app=rook-discover " f"-n {self.namespace} " f"--timeout=120s" ) create_oc_resource( 'cluster.yaml', self.cluster_path, _templating, config.ENV_DATA ) else: self.deploy_ocs_via_operator() pod = ocp.OCP( kind=constants.POD, namespace=self.namespace ) cfs = ocp.OCP( kind=constants.CEPHFILESYSTEM, namespace=self.namespace ) # Check for Ceph pods assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mgr', timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600 ) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600 ) if not self.ocs_operator_deployment: logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # HACK: This should be dropped (including service-monitor.yaml and # prometheus-rules.yaml files) when OCS is managed by OLM if config.ENV_DATA.get('monitoring_enabled') not in ( "true", "True", True ): # HACK: skip creation of rook-ceph-mgr service monitor when # monitoring is enabled (if this were not skipped, the step # would fail because rook would create the service monitor at # this point already) create_oc_resource( "service-monitor.yaml", self.cluster_path, _templating, config.ENV_DATA ) # HACK: skip creation of prometheus-rules, rook-ceph is # concerned with it's setup now, based on clarification from # Umanga Chapagain create_oc_resource( "prometheus-rules.yaml", self.cluster_path, _templating, config.ENV_DATA ) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # Create MDS pods for CephFileSystem fs_data = templating.load_yaml(constants.CEPHFILESYSTEM_YAML) fs_data['metadata']['namespace'] = self.namespace ceph_obj = OCS(**fs_data) ceph_obj.create() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds', resource_count=2, timeout=600 ) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info(f"MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error( f"MDS deployment Failed! Please check logs!" ) if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get('persistent-monitoring'): # Create a pool, secrets and sc secret_obj = helpers.create_secret(interface_type=constants.CEPHBLOCKPOOL) cbj_obj = helpers.create_ceph_block_pool() sc_obj = helpers.create_storage_class( interface_type=constants.CEPHBLOCKPOOL, interface_name=cbj_obj.name, secret_name=secret_obj.name ) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager'] ) # Create configmap cluster-monitoring-config create_configmap_cluster_monitoring_pod(sc_obj.name) # Take some time to respin the pod waiting_time = 30 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state validate_pods_are_respinned_and_running_state( pods_list ) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods validate_pvc_are_mounted_on_monitoring_pods(pods_list) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check( namespace=self.namespace ) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") self.deploy_ocs_via_operator() pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # validate PDB creation of MON, MDS, OSD pods validate_pdb_creation() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600) # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1847098 if config.DEPLOYMENT.get('local_storage'): tools_pod = run_cmd( f"oc -n {self.namespace} get pod -l 'app=rook-ceph-tools' " f"-o jsonpath='{{.items[0].metadata.name}}'") pgs_to_autoscale = [ 'ocs-storagecluster-cephblockpool', 'ocs-storagecluster-cephfilesystem-data0' ] for pg in pgs_to_autoscale: run_cmd(f"oc -n {self.namespace} exec {tools_pod} -- " f"ceph osd pool set {pg} pg_autoscale_mode on") # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info("MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error("MDS deployment Failed! Please check logs!") # Change monitoring backend to OCS if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( 'persistent-monitoring'): sc = helpers.default_storage_class( interface_type=constants.CEPHBLOCKPOOL) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager']) # Create configmap cluster-monitoring-config and reconfigure # storage class and telemeter server (if the url is specified in a # config file) create_configmap_cluster_monitoring_pod( sc_name=sc.name, telemeter_server_url=config.ENV_DATA.get( "telemeter_server_url")) # Take some time to respin the pod waiting_time = 45 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(validate_pods_are_respinned_and_running_state)( pods_list) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods retry((CommandFailed, AssertionError), tries=3, delay=15)( validate_pvc_are_mounted_on_monitoring_pods)(pods_list) elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( "telemeter_server_url"): # Create configmap cluster-monitoring-config to reconfigure # telemeter server url when 'persistent-monitoring' is False create_configmap_cluster_monitoring_pod( telemeter_server_url=config.ENV_DATA["telemeter_server_url"]) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check(namespace=self.namespace) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace) try: ceph_cluster.get().get("items")[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") if config.DEPLOYMENT["external_mode"]: logger.info("Deploying OCS on external mode RHCS") return self.deploy_with_external_mode() self.deploy_ocs_via_operator() pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods assert pod.wait_for_resource( condition="Running", selector="app=rook-ceph-mon", resource_count=3, timeout=600, ) assert pod.wait_for_resource(condition="Running", selector="app=rook-ceph-mgr", timeout=600) assert pod.wait_for_resource( condition="Running", selector="app=rook-ceph-osd", resource_count=3, timeout=600, ) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # validate PDB creation of MON, MDS, OSD pods validate_pdb_creation() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector="app=rook-ceph-tools", resource_count=1, timeout=600, ) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data["items"][0]["metadata"]["name"] if helpers.validate_cephfilesystem(cfs_name): logger.info("MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error("MDS deployment Failed! Please check logs!") # Change monitoring backend to OCS if config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get( "persistent-monitoring"): sc = helpers.default_storage_class( interface_type=constants.CEPHBLOCKPOOL) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus", "alertmanager"], ) # Create configmap cluster-monitoring-config and reconfigure # storage class and telemeter server (if the url is specified in a # config file) create_configmap_cluster_monitoring_pod( sc_name=sc.name, telemeter_server_url=config.ENV_DATA.get( "telemeter_server_url"), ) # Take some time to respin the pod waiting_time = 45 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(validate_pods_are_respinned_and_running_state)( pods_list) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods retry((CommandFailed, AssertionError), tries=3, delay=15)( validate_pvc_are_mounted_on_monitoring_pods)(pods_list) elif config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get( "telemeter_server_url"): # Create configmap cluster-monitoring-config to reconfigure # telemeter server url when 'persistent-monitoring' is False create_configmap_cluster_monitoring_pod( telemeter_server_url=config.ENV_DATA["telemeter_server_url"]) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") try: ceph_health_check(namespace=self.namespace, tries=30, delay=10) except CephHealthException as ex: err = str(ex) logger.warning(f"Ceph health check failed with {err}") if "clock skew detected" in err: logger.info(f"Changing NTP on compute nodes to" f" {constants.RH_NTP_CLOCK}") if self.platform == constants.VSPHERE_PLATFORM: update_ntp_compute_nodes() assert ceph_health_check(namespace=self.namespace, tries=60, delay=10) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ set_registry_to_managed_state() image = None ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace) try: ceph_cluster.get().get("items")[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") # disconnected installation? load_cluster_info() if config.DEPLOYMENT.get("disconnected"): image = prepare_disconnected_ocs_deployment() if config.DEPLOYMENT["external_mode"]: self.deploy_with_external_mode() else: self.deploy_ocs_via_operator(image) pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods mon_pod_timeout = (900 if self.platform == constants.IBMCLOUD_PLATFORM else 600) assert pod.wait_for_resource( condition="Running", selector="app=rook-ceph-mon", resource_count=3, timeout=mon_pod_timeout, ) assert pod.wait_for_resource(condition="Running", selector="app=rook-ceph-mgr", timeout=600) assert pod.wait_for_resource( condition="Running", selector="app=rook-ceph-osd", resource_count=3, timeout=600, ) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # validate PDB creation of MON, MDS, OSD pods validate_pdb_creation() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector="app=rook-ceph-tools", resource_count=1, timeout=600, ) if not config.COMPONENTS["disable_cephfs"]: # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data["items"][0]["metadata"]["name"] if helpers.validate_cephfilesystem(cfs_name): logger.info("MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error("MDS deployment Failed! Please check logs!") # Change monitoring backend to OCS if config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get( "persistent-monitoring"): setup_persistent_monitoring() elif config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get( "telemeter_server_url"): # Create configmap cluster-monitoring-config to reconfigure # telemeter server url when 'persistent-monitoring' is False create_configmap_cluster_monitoring_pod( telemeter_server_url=config.ENV_DATA["telemeter_server_url"]) if not config.COMPONENTS["disable_cephfs"]: # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster logger.info("Done creating rook resources, waiting for HEALTH_OK") try: ceph_health_check(namespace=self.namespace, tries=30, delay=10) except CephHealthException as ex: err = str(ex) logger.warning(f"Ceph health check failed with {err}") if "clock skew detected" in err: logger.info(f"Changing NTP on compute nodes to" f" {constants.RH_NTP_CLOCK}") if self.platform == constants.VSPHERE_PLATFORM: update_ntp_compute_nodes() assert ceph_health_check(namespace=self.namespace, tries=60, delay=10) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def test_upgrade(): ceph_cluster = CephCluster() with CephHealthMonitor(ceph_cluster): namespace = config.ENV_DATA['cluster_namespace'] version_before_upgrade = config.ENV_DATA.get("ocs_version") upgrade_version = config.UPGRADE.get("upgrade_ocs_version", version_before_upgrade) ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image') if ocs_registry_image: upgrade_version = get_ocs_version_from_image(ocs_registry_image) parsed_version_before_upgrade = parse_version(version_before_upgrade) parsed_upgrade_version = parse_version(upgrade_version) assert parsed_upgrade_version >= parsed_version_before_upgrade, ( f"Version you would like to upgrade to: {upgrade_version} " f"is not higher or equal to the version you currently running: " f"{version_before_upgrade}") operator_selector = get_selector_for_ocs_operator() package_manifest = PackageManifest( resource_name=OCS_OPERATOR_NAME, selector=operator_selector, ) channel = config.DEPLOYMENT.get('ocs_csv_channel') csv_name_pre_upgrade = package_manifest.get_current_csv(channel) log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}") csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade, namespace=namespace) pre_upgrade_images = get_images(csv_pre_upgrade.get()) version_change = parsed_upgrade_version > parsed_version_before_upgrade if version_change: version_config_file = os.path.join(constants.CONF_DIR, 'ocs_version', f'ocs-{upgrade_version}.yaml') load_config_file(version_config_file) ocs_catalog = CatalogSource( resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME, namespace=constants.MARKETPLACE_NAMESPACE, ) upgrade_in_current_source = config.UPGRADE.get( 'upgrade_in_current_source', False) if not upgrade_in_current_source: if not ocs_catalog.is_exist() and not upgrade_in_current_source: log.info("OCS catalog source doesn't exist. Creating new one.") create_catalog_source(ocs_registry_image, ignore_upgrade=True) image_url = ocs_catalog.get_image_url() image_tag = ocs_catalog.get_image_name() log.info(f"Current image is: {image_url}, tag: {image_tag}") if ocs_registry_image: image_url, new_image_tag = ocs_registry_image.split(':') elif (config.UPGRADE.get('upgrade_to_latest', True) or version_change): new_image_tag = get_latest_ds_olm_tag() else: new_image_tag = get_next_version_available_for_upgrade( image_tag) cs_data = deepcopy(ocs_catalog.data) image_for_upgrade = ':'.join([image_url, new_image_tag]) log.info(f"Image: {image_for_upgrade} will be used for upgrade.") cs_data['spec']['image'] = image_for_upgrade with NamedTemporaryFile() as cs_yaml: dump_data_to_temp_yaml(cs_data, cs_yaml.name) ocs_catalog.apply(cs_yaml.name) # Wait for the new package manifest for upgrade. operator_selector = get_selector_for_ocs_operator() package_manifest = PackageManifest( resource_name=OCS_OPERATOR_NAME, selector=operator_selector, ) package_manifest.wait_for_resource() channel = config.DEPLOYMENT.get('ocs_csv_channel') if not channel: channel = package_manifest.get_default_channel() # update subscription subscription = OCP( resource_name=constants.OCS_SUBSCRIPTION, kind='subscription', namespace=config.ENV_DATA['cluster_namespace'], ) current_ocs_source = subscription.data['spec']['source'] log.info(f"Current OCS subscription source: {current_ocs_source}") ocs_source = current_ocs_source if upgrade_in_current_source else ( constants.OPERATOR_CATALOG_SOURCE_NAME) patch_subscription_cmd = ( f'oc patch subscription {constants.OCS_SUBSCRIPTION} ' f'-n {namespace} --type merge -p \'{{"spec":{{"channel": ' f'"{channel}", "source": "{ocs_source}"}}}}\'') run_cmd(patch_subscription_cmd) subscription_plan_approval = config.DEPLOYMENT.get( 'subscription_plan_approval') if subscription_plan_approval == 'Manual': wait_for_install_plan_and_approve(namespace) attempts = 145 for attempt in range(1, attempts + 1): log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.") csv_name_post_upgrade = package_manifest.get_current_csv(channel) if csv_name_post_upgrade == csv_name_pre_upgrade: log.info(f"CSV is still: {csv_name_post_upgrade}") sleep(5) else: log.info(f"CSV now upgraded to: {csv_name_post_upgrade}") break if attempts == attempt: raise TimeoutException("No new CSV found after upgrade!") csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade, namespace=namespace) log.info( f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state" ) if version_before_upgrade == '4.2' and upgrade_version == '4.3': log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3") setup_ceph_toolbox(force_setup=True) osd_count = get_osd_count() csv_post_upgrade.wait_for_phase("Succeeded", timeout=200 * osd_count) post_upgrade_images = get_images(csv_post_upgrade.get()) old_images, _, _ = get_upgrade_image_info(pre_upgrade_images, post_upgrade_images) verify_image_versions(old_images, parsed_upgrade_version) ocs_install_verification( timeout=600, skip_osd_distribution_check=True, ocs_registry_image=ocs_registry_image, post_upgrade_verification=True, )
def test_upgrade(): ceph_cluster = CephCluster() with CephHealthMonitor(ceph_cluster): namespace = config.ENV_DATA['cluster_namespace'] ocs_catalog = CatalogSource( resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME, namespace=constants.MARKETPLACE_NAMESPACE, ) version_before_upgrade = config.ENV_DATA.get("ocs_version") upgrade_version = config.UPGRADE.get("upgrade_ocs_version", version_before_upgrade) parsed_version_before_upgrade = parse_version(version_before_upgrade) parsed_upgrade_version = parse_version(upgrade_version) assert parsed_upgrade_version >= parsed_version_before_upgrade, ( f"Version you would like to upgrade to: {upgrade_version} " f"is not higher or equal to the version you currently running: " f"{version_before_upgrade}") version_change = parsed_upgrade_version > parsed_version_before_upgrade if version_change: version_config_file = os.path.join(constants.CONF_DIR, 'ocs_version', f'ocs-{upgrade_version}.yaml') assert os.path.exists(version_config_file), ( f"OCS version config file {version_config_file} doesn't exist!" ) with open(os.path.abspath( os.path.expanduser(version_config_file))) as file_stream: custom_config_data = yaml.safe_load(file_stream) config.update(custom_config_data) image_url = ocs_catalog.get_image_url() image_tag = ocs_catalog.get_image_name() log.info(f"Current image is: {image_url}, tag: {image_tag}") ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image') if ocs_registry_image: image_url, new_image_tag = ocs_registry_image.split(':') elif config.UPGRADE.get('upgrade_to_latest', True) or version_change: new_image_tag = get_latest_ds_olm_tag() else: new_image_tag = get_next_version_available_for_upgrade(image_tag) cs_data = deepcopy(ocs_catalog.data) image_for_upgrade = ':'.join([image_url, new_image_tag]) log.info(f"Image: {image_for_upgrade} will be used for upgrade.") cs_data['spec']['image'] = image_for_upgrade operator_selector = get_selector_for_ocs_operator() package_manifest = PackageManifest( resource_name=OCS_OPERATOR_NAME, selector=operator_selector, ) csv_name_pre_upgrade = package_manifest.get_current_csv() log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}") csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade, namespace=namespace) pre_upgrade_images = get_images(csv_pre_upgrade.get()) with NamedTemporaryFile() as cs_yaml: dump_data_to_temp_yaml(cs_data, cs_yaml.name) ocs_catalog.apply(cs_yaml.name) # Wait for package manifest is ready package_manifest.wait_for_resource() subscription_plan_approval = config.DEPLOYMENT.get( 'subscription_plan_approval') if subscription_plan_approval == 'Manual': wait_for_install_plan_and_approve(namespace) attempts = 145 for attempt in range(1, attempts): if attempts == attempt: raise TimeoutException("No new CSV found after upgrade!") log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.") package_manifest.reload_data() csv_name_post_upgrade = package_manifest.get_current_csv() if csv_name_post_upgrade == csv_name_pre_upgrade: log.info(f"CSV is still: {csv_name_post_upgrade}") sleep(5) else: log.info(f"CSV now upgraded to: {csv_name_post_upgrade}") break csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade, namespace=namespace) log.info( f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state" ) if version_before_upgrade == '4.2' and upgrade_version == '4.3': log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3") setup_ceph_toolbox(force_setup=True) csv_post_upgrade.wait_for_phase("Succeeded", timeout=600) post_upgrade_images = get_images(csv_post_upgrade.get()) old_images, _, _ = get_upgrade_image_info(pre_upgrade_images, post_upgrade_images) verify_image_versions(old_images, parsed_upgrade_version) ocs_install_verification(timeout=600, skip_osd_distribution_check=True)
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ ceph_cluster = ocp.OCP( kind='CephCluster', namespace=self.namespace ) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") self.deploy_ocs_via_operator() pod = ocp.OCP( kind=constants.POD, namespace=self.namespace ) cfs = ocp.OCP( kind=constants.CEPHFILESYSTEM, namespace=self.namespace ) # Check for Ceph pods assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mgr', timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600 ) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600 ) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info(f"MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error( f"MDS deployment Failed! Please check logs!" ) # Change monitoring backend to OCS if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get('persistent-monitoring'): sc = helpers.default_storage_class(interface_type=constants.CEPHBLOCKPOOL) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager'] ) # Create configmap cluster-monitoring-config and reconfigure # storage class and telemeter server (if the url is specified in a # config file) create_configmap_cluster_monitoring_pod( sc_name=sc.name, telemeter_server_url=config.ENV_DATA.get("telemeter_server_url")) # Take some time to respin the pod waiting_time = 45 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state validate_pods_are_respinned_and_running_state( pods_list ) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods validate_pvc_are_mounted_on_monitoring_pods(pods_list) elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get("telemeter_server_url"): # Create configmap cluster-monitoring-config to reconfigure # telemeter server url when 'persistent-monitoring' is False create_configmap_cluster_monitoring_pod( telemeter_server_url=config.ENV_DATA["telemeter_server_url"]) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check( namespace=self.namespace ) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()