def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )
    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400
    ), "Data re-balance failed to complete"
def wait_for_wl_to_finish(self, fio_client_pod):
    """
    Waiting until the workload is finished

    Args:
        fio_client_pod (obj): the FIO client pod object

    Raises:
        IOError: in case FIO failed to finish

    Returns:
        str: the end time of the workload

    """
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind="pod")
    pod_obj.wait_for_resource(
        condition="Completed",
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    # Getting the end time of the test
    end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

    output = run_cmd(f"oc logs {fio_client_pod}")
    log.info(f"The test log is: {output}")
    # The original try/except IOError around a membership test could never
    # trigger; raise the IOError promised by the docstring instead.
    if "Fio failed to execute" in output:
        raise IOError("FIO failed to complete")
    log.info("FIO has completed successfully")
    return end_time
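# Illustrative usage sketch, not part of the original suite: driving the wait
# helper above after launching the FIO client. `workload` stands for the
# object that owns wait_for_wl_to_finish, and `start_fio_client` is a
# hypothetical launcher that returns the client pod name.
def run_fio_and_wait(workload):
    fio_client_pod = workload.start_fio_client()  # hypothetical launcher
    # Blocks for up to 5 hours; raises IOError if the FIO logs report failure
    end_time = workload.wait_for_wl_to_finish(fio_client_pod)
    log.info(f"FIO workload finished at {end_time}")
    return end_time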
def verify_provider_resources():
    """
    Verify resources specific to managed OCS provider:
    1. Ocs-provider-server pod is Running
    2. cephcluster is Ready and its hostNetworking is set to True
    3. Security groups are set up correctly
    """
    # Verify ocs-provider-server pod is Running
    pod_obj = OCP(
        kind="pod",
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    pod_obj.wait_for_resource(
        condition="Running", selector="app=ocsProviderApiServer", resource_count=1
    )

    # Verify that cephcluster is Ready and hostNetworking is True
    cephcluster = OCP(kind="CephCluster", namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    cephcluster_yaml = cephcluster.get().get("items")[0]
    log.info("Verifying that cephcluster is Ready and hostNetworking is True")
    assert (
        cephcluster_yaml["status"]["phase"] == "Ready"
    ), f"Status of cephcluster ocs-storagecluster-cephcluster is {cephcluster_yaml['status']['phase']}"
    assert cephcluster_yaml["spec"]["network"][
        "hostNetwork"
    ], f"hostNetwork is {cephcluster_yaml['spec']['network']['hostNetwork']}"

    assert verify_worker_nodes_security_groups()
def check_scale_pods_and_pvcs_created_on_consumers(self):
    for consumer_i, fio_scale in self.consumer_i_per_fio_scale.items():
        config.switch_ctx(consumer_i)
        c_name = config.ENV_DATA.get("cluster_name")
        ocp_pvc = OCP(kind=constants.PVC, namespace=fio_scale.namespace)
        ocp_pvc.wait_for_resource(
            timeout=30,
            condition=constants.STATUS_BOUND,
            resource_count=self.scale_count,
        )
        log.info(f"All the PVCs were created successfully on the consumer {c_name}")

        ocp_pod = OCP(kind=constants.POD, namespace=fio_scale.namespace)
        ocp_pod.wait_for_resource(
            timeout=30,
            condition=constants.STATUS_COMPLETED,
            resource_count=self.expected_pod_num,
        )
        log.info(f"All the pods were created successfully on the consumer {c_name}")

    log.info("All the pods and PVCs were created successfully on the consumers")
def test_scale_osds_reboot_nodes(self, interface, project_factory, multi_pvc_factory, dc_pod_factory):
    """
    Check storage utilization; if it is below the threshold, run IO,
    scale OSDs from 3 to 6, check for rebalance and reboot workers
    """
    current_osd_count = count_cluster_osd()
    proj_obj = project_factory()
    if current_osd_count == 3:
        while not validate_osd_utilization(osd_used=50):
            # Create pvc
            pvc_objs = multi_pvc_factory(
                project=proj_obj,
                interface=interface,
                size=self.pvc_size,
                num_of_pvc=self.num_of_pvcs,
            )
            dc_pod_objs = list()
            for pvc_obj in pvc_objs:
                dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))
            wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs)
            for pod_obj in dc_pod_objs:
                pod_obj.run_io(
                    storage_type='fs', size='3G', runtime='60',
                    fio_filename=f'{pod_obj.name}_io'
                )

    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=count * 3,
    )
    assert ceph_health_check(), "New OSDs failed to reach running state"

    cluster = CephCluster()

    # Get rebalance status
    rebalance_status = cluster.get_rebalance_status()
    logger.info(rebalance_status)
    if rebalance_status:
        time_taken = cluster.time_taken_to_complete_rebalance()
        logger.info(f"The time taken to complete rebalance {time_taken}")

    # Rolling reboot on worker nodes
    worker_nodes = get_typed_nodes(node_type='worker')

    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()

    for node in worker_nodes:
        nodes.restart_nodes(nodes=[node])
        wait_for_nodes_status()

    assert ceph_health_check(delay=180), "Failed, Ceph health bad after nodes reboot"
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node hosting RGW and noobaa-db-0 and
    verify that a new pod spins up on a healthy node
    """
    # Get rgw pods
    rgw_pod_obj = get_rgw_pods()

    # Get nooba pods
    noobaa_pod_obj = get_noobaa_pods()

    # Get the node where noobaa-db is hosted
    for noobaa_pod in noobaa_pod_obj:
        if noobaa_pod.name == "noobaa-db-0":
            noobaa_pod_node = get_pod_node(noobaa_pod)

    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where"
                f" rgw pod {rgw_pod.name} and noobaa-db-0 are hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate that the old rgw pod reached Terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate that a new rgw pod was spun up
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Create OBC, then read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

            # Start the node
            nodes.start_nodes(node_obj)

            # Create OBC, then read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()

    # Verify all storage pods are running
    wait_for_storage_pods()
def test_fio_workload_simple(self, ripsaw, interface, io_pattern):
    """
    This is a basic fio perf test
    """
    # Deployment ripsaw
    log.info("Deploying ripsaw operator")
    ripsaw.apply_crd(
        'resources/crds/'
        'ripsaw_v1alpha1_ripsaw_crd.yaml'
    )
    sc = 'ocs-storagecluster-ceph-rbd' if interface == 'CephBlockPool' else 'ocs-storagecluster-cephfs'

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Todo: have pvc_size set to 'get_osd_pods_memory_sum * 5'
    # once pr-2037 is merged
    fio_cr['spec']['clustername'] = config.ENV_DATA['platform'] + get_build() + get_ocs_version()
    fio_cr['spec']['test_user'] = get_ocs_version() + interface + io_pattern
    fio_cr['spec']['workload']['args']['storageclass'] = sc
    if io_pattern == 'sequential':
        fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
    log.info(f'fio_cr: {fio_cr}')
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for fio client pod to be created
    for fio_pod in TimeoutSampler(
        300, 20, get_pod_name_by_pattern, 'fio-client', 'my-ripsaw'
    ):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Wait for the fio pod to initialize and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind='pod')
    pod_obj.wait_for_resource(
        condition='Completed',
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )
    output = run_cmd(f'oc logs {fio_client_pod}')
    # The original try/except IOError around a membership test could never
    # trigger; check the log output directly instead.
    if 'Fio failed to execute' in output:
        log.info("FIO failed to complete")
    else:
        log.info("FIO has completed successfully")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()
    analyze_regression(io_pattern, sc, es_username=fio_cr['spec']['test_user'])
def test_add_capacity_node_restart(
    self,
    nodes,
    multi_pvc_factory,
    pod_factory,
    workload_storageutilization_rbd,
    num_of_nodes,
):
    """
    test add capacity when one of the worker nodes gets restarted
    in the middle of the process
    """
    logging.info(
        "Condition 1 to start the test is met: storageutilization is completed"
    )
    # Please notice: when the branch 'wip-add-capacity-e_e' is merged into master,
    # the test will include much more data both before and after calling the
    # 'add_capacity' function.
    node_list = get_ocs_nodes(num_of_nodes=num_of_nodes)
    assert node_list, "Condition 2 to start test failed: No node to restart"

    max_osds = 15
    osd_pods_before = pod_helpers.get_osd_pods()
    assert (
        len(osd_pods_before) < max_osds
    ), "Condition 3 to start test failed: We have the maximum number of OSDs in the cluster"
    logging.info("All start conditions are met!")

    osd_size = storage_cluster.get_osd_size()
    logging.info("Calling add_capacity function...")
    result = storage_cluster.add_capacity(osd_size)
    if result:
        logging.info("add capacity finished successfully")
    else:
        logging.info("add capacity failed")

    # Restart nodes while additional storage is being added
    logging.info("Restart nodes:")
    logging.info([n.name for n in node_list])
    nodes.restart_nodes(nodes=node_list, wait=True)
    logging.info("Finished restarting the node list")

    # The exit criteria verification conditions here are not complete. When the
    # branch 'wip-add-capacity-e_e' is merged into master, the functions from
    # that branch will be used.
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=600,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    logging.info("Finished verifying add capacity osd storage with node restart")
    logging.info("Waiting for ceph health check to finish...")
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=180)
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    existing_osd_pods = get_osd_pods()
    existing_osd_pod_names = [pod.name for pod in existing_osd_pods]
    if ui_add_capacity_conditions():
        try:
            result = ui_add_capacity(osd_size)
        except Exception as e:
            logging.error(
                f"Add capacity via UI is not applicable and the CLI method will be used. The error is {e}"
            )
            result = storage_cluster.add_capacity(osd_size)
    else:
        result = storage_cluster.add_capacity(osd_size)
    osd_pods_post_expansion = get_osd_pods()
    osd_pod_names_post_expansion = [pod.name for pod in osd_pods_post_expansion]
    restarted_osds = list()
    logger.info(
        "Checking if existing OSD pods were restarted (deleted) post add capacity (bug 1931601)"
    )
    for pod in existing_osd_pod_names:
        if pod not in osd_pod_names_post_expansion:
            restarted_osds.append(pod)
    assert (
        len(restarted_osds) == 0
    ), f"The following OSD pods were restarted (deleted) post add capacity: {restarted_osds}"

    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * replica_count,
    )
    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    check_ceph_health_after_add_capacity(ceph_rebalance_timeout=3600)
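# Hedged helper sketch (an assumption, not part of the original test): the
# expected post-expansion OSD pod count is the device-set count returned by
# add_capacity times the replica count, which is 1 when flexible scaling is
# enabled and 3 otherwise.
def expected_osd_count(device_set_count):
    replica_count = 1 if is_flexible_scaling_enabled() else 3
    return device_set_count * replica_count

# Example check: assert len(get_osd_pods()) == expected_osd_count(result)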
def test_delete_rook_ceph_osd_deployment(self):
    osd_deployments = get_osd_deployments()
    deployment_obj = OCP(
        kind=constants.DEPLOYMENT, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    pod_obj = OCP(kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    for osd_deployment in osd_deployments:
        # Get rook-ceph-osd pod name associated with the deployment
        osd_deployment_name = osd_deployment.name
        old_osd_pod = get_pod_name_by_pattern(
            pattern=osd_deployment_name,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )[0]

        logger.info(f"Deleting OSD deployment: {osd_deployment_name}")
        try:
            deployment_obj.delete(resource_name=osd_deployment_name)
            deployment_obj.wait_for_resource(
                condition="0/1", resource_name=osd_deployment_name, column="READY"
            )
        except CommandFailed as err:
            if "NotFound" not in str(err):
                raise

        # Wait for new OSD deployment to be Ready
        deployment_obj.wait_for_resource(
            condition="1/1", resource_name=osd_deployment_name, column="READY"
        )

        # Check if a new OSD pod is created
        new_osd_pod = get_pod_name_by_pattern(
            pattern=osd_deployment_name,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )[0]
        assert old_osd_pod != new_osd_pod, "New OSD pod not created"

        # Check if the new OSD pod is up and running
        logger.info("Waiting for a new OSD pod to get created and reach Running state")
        assert pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=new_osd_pod,
            column="STATUS",
        ), f"New OSD pod {new_osd_pod} is not in {constants.STATUS_RUNNING} state"

    # If clusterwide encryption is enabled, verify that the new OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    assert ceph_health_check(delay=120, tries=50), "Ceph health check failed"
def test_add_capacity(self):
    """
    Test to add variable capacity to the OSD cluster while IOs running
    """
    self.ceph_cluster = CephCluster()
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=result * 3,
    )
    self.ceph_cluster.cluster_health_check(timeout=1200)
def test_sql_workload_simple(self, ripsaw):
    """
    This is a basic pgsql workload
    """
    # Deployment postgres
    log.info("Deploying postgres database")
    ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml')
    ripsaw.setup_postgresql()

    # Create pgbench benchmark
    log.info("Create resource file for pgbench workload")
    pg_data = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
    pg_obj = OCS(**pg_data)
    pg_obj.create()

    # Wait for pgbench pod to be created
    for pgbench_pod in TimeoutSampler(
        300, 3, get_pod_name_by_pattern, 'pgbench-1-dbs-client', 'my-ripsaw'
    ):
        try:
            if pgbench_pod[0] is not None:
                pgbench_client_pod = pgbench_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Wait for the pgbench pod to initialize and complete
    log.info("Waiting for pgbench_client to complete")
    pod_obj = OCP(kind='pod')
    pod_obj.wait_for_resource(
        condition='Completed',
        resource_name=pgbench_client_pod,
        timeout=800,
        sleep=10,
    )

    # Running pgbench and parsing logs
    output = run_cmd(f'oc logs {pgbench_client_pod}')
    pg_output = utils.parse_pgsql_logs(output)
    log.info("*******PGBench output log*********\n" f"{pg_output}")
    for data in pg_output:
        latency_avg = data['latency_avg']
        if not latency_avg:
            raise UnexpectedBehaviour(
                "PGBench failed to run, no data found on latency_avg"
            )
    log.info("PGBench has completed successfully")

    # Clean up pgbench benchmark
    log.info("Deleting PG bench benchmark")
    pg_obj.delete()
def _deploy_es(self):
    log.info('Deploy the PVC for the ElasticSearch cluster')
    self.ocp.apply(self.pvc)

    log.info('Deploy the ElasticSearch cluster')
    self.ocp.apply(self.crd)

    for es_pod in TimeoutSampler(
        300, 20, get_pod_name_by_pattern, 'quickstart-es-default', self.namespace
    ):
        try:
            if es_pod[0] is not None:
                self.espod = es_pod[0]
                log.info(f'The ElasticSearch pod {self.espod} Started')
                break
        except IndexError:
            log.info('elasticsearch pod not ready yet')

    es_pod = OCP(kind='pod', namespace=self.namespace)
    log.info('Waiting for ElasticSearch to Run')
    assert es_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=self.espod,
        sleep=30,
        timeout=600,
    )
    log.info('Elastic Search is ready !!!')
def test_rgw_pod_existence(self):
    if (
        config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
        or storagecluster_independent_check()
    ):
        if (
            not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM
            and not config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
            and (version.get_semantic_ocs_version_from_config() > version.VERSION_4_5)
        ):
            logger.info("Checking whether RGW pod is not present")
            assert (
                not pod.get_rgw_pods()
            ), "RGW pods should not exist in the current platform/cluster"
    elif (
        config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS
        and not config.ENV_DATA["mcg_only_deployment"]
    ):
        rgw_count = get_rgw_count(
            config.ENV_DATA["ocs_version"], check_if_cluster_was_upgraded(), None
        )
        logger.info(
            f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
        )
        rgw_pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
        assert rgw_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.RGW_APP_LABEL,
            resource_count=rgw_count,
            timeout=60,
        )
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    existing_osd_pods = get_osd_pods()
    existing_osd_pod_names = [pod.name for pod in existing_osd_pods]
    result = storage_cluster.add_capacity(osd_size)
    osd_pods_post_expansion = get_osd_pods()
    osd_pod_names_post_expansion = [pod.name for pod in osd_pods_post_expansion]
    restarted_osds = list()
    logger.info(
        "Checking if existing OSD pods were restarted (deleted) post add capacity (bug 1931601)"
    )
    for pod in existing_osd_pod_names:
        if pod not in osd_pod_names_post_expansion:
            restarted_osds.append(pod)
    assert (
        len(restarted_osds) == 0
    ), f"The following OSD pods were restarted (deleted) post add capacity: {restarted_osds}"

    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )
    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500

    # Verify OSDs are encrypted.
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400
    ), "Data re-balance failed to complete"
def validate_monitoring_pods_are_respinned_and_running_state(pods_list):
    """
    Validate monitoring pods are respinned and in Running state

    Args:
        pods_list (list): List of the pods where pvc are mounted

    """
    ocp = OCP(api_version='v1', kind='Pod', namespace='openshift-monitoring')
    assert ocp.wait_for_resource(
        condition=constants.STATUS_PENDING, resource_name=pods_list[0]
    ), (
        f"failed to reach pod {pods_list[0]} "
        f"desired status {constants.STATUS_PENDING}"
    )
    for pod in pods_list:
        assert ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod
        ), (
            f"failed to reach pod {pod} "
            f"desired status {constants.STATUS_RUNNING}"
        )
def wait_for_osd_pods_to_be_running(self, storagedeviceset_count):
    """
    The function gets the number of storage device sets in the cluster,
    and waits for the osd pods to be in status running.

    Args:
        storagedeviceset_count (int): the number of storage device sets in the cluster

    """
    logging.info("starting function 'wait_for_osd_pods_to_be_running'")
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=420,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=storagedeviceset_count * 3,
    )
    self.new_pods_in_status_running = True
def test_add_capacity_osd_pod_delete(self, workload_storageutilization_rbd):
    """
    Test add capacity when one of the osd pods gets deleted
    in the middle of the process.
    """
    used_percentage = get_percent_used_capacity()
    logging.info(f"storageutilization is completed. used capacity = {used_percentage}")

    max_osds = 15
    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    if number_of_osd_pods_before >= max_osds:
        pytest.skip("We have the maximum number of OSDs in the cluster")

    d = Disruptions()
    d.set_resource('osd')

    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # The OSD number goes down by one, then gradually goes up by 1,
    # and finally the OSD number will be storagedeviceset_count * 3
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info("Delete an osd pod while storage capacity is getting increased")
    d.delete_resource(1)

    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=420,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=storagedeviceset_count * 3,
    )
    logging.info("Finished verifying add capacity when one of the osd pods gets deleted")
    logging.info("Waiting for ceph health check to finish...")
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=80)
def wait_for_wl_to_finish(self, fio_client_pod):
    """
    Waiting until the workload is finished

    Args:
        fio_client_pod (obj): the FIO client pod object

    Returns:
        str: the end time of the workload

    """
    if dev_mode:
        timeout = 3600
        sleeptime = 30
    else:
        timeout = 18000
        sleeptime = 300

    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind="pod")
    pod_obj.wait_for_resource(
        condition="Completed",
        resource_name=fio_client_pod,
        timeout=timeout,
        sleep=sleeptime,
    )

    # Getting the end time of the test
    end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

    output = run_cmd(f"oc logs {fio_client_pod}")
    log_file_name = f"{self.full_log_path}/test-pod.log"
    with open(log_file_name, "w") as f:
        f.write(output)
    log.info(f"The test log can be found at: {log_file_name}")
    # The original try/except IOError around a membership test could never
    # trigger; check the log output directly instead.
    if "Fio failed to execute" in output:
        log.info("FIO failed to complete")
    else:
        log.info("FIO has completed successfully")
    return end_time
def _deploy_es(self):
    """
    Deploying the Elasticsearch server

    """
    # Creating PVC for the elasticsearch server and waiting until it is bound
    log.info("Creating 10 GiB PVC for the ElasticSearch cluster")
    try:
        self.pvc_obj = create_pvc(
            sc_name=self.args.get("sc") or constants.CEPHBLOCKPOOL_SC,
            namespace=self.namespace,
            pvc_name="elasticsearch-data-quickstart-es-default-0",
            access_mode=constants.ACCESS_MODE_RWO,
            size="10Gi",
        )

        # Make sure the PVC is bound, or delete it and raise an exception
        wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
    except ResourceWrongStatusException:
        log.error("The PVC couldn't be created")
        return False

    self.pvc_obj.reload()

    log.info("Deploy the ElasticSearch cluster")
    self.ocp.apply(self.crd)

    sample = TimeoutSampler(
        timeout=300,
        sleep=10,
        func=self._pod_is_found,
        pattern="quickstart-es-default",
    )
    if not sample.wait_for_func_status(True):
        log.error("The ElasticSearch pod deployment failed")
        return False

    self.espod = get_pod_name_by_pattern("quickstart-es-default", self.namespace)[0]
    log.info(f"The ElasticSearch pod {self.espod} started")

    es_pod = OCP(kind="pod", namespace=self.namespace)
    log.info("Waiting for ElasticSearch to run")
    if not es_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=self.espod,
        sleep=30,
        timeout=600,
    ):
        log.error("The ElasticSearch pod is not running!")
        return False
    else:
        log.info("Elastic Search is ready !!!")
        return True
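# Since this variant reports failure through its return value instead of
# raising, callers are expected to guard on the result. An illustrative
# caller (an assumption; `es` stands for an instance of the deploying class):
def deploy_es_or_fail(es):
    if not es._deploy_es():
        raise RuntimeError("ElasticSearch deployment failed")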
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=result * 3,
    )
    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=80)
def __init__(self, mcg, obc):
    """
    Initializer function

    Args:
        mcg (obj): Multi cloud gateway object
        obc (str): Name of the Object Bucket Claim

    """
    self.obc_name = obc
    self.namespace = config.ENV_DATA['cluster_namespace']
    obc_obj = OCP(namespace=self.namespace, kind='ObjectBucketClaim')
    assert obc_obj.wait_for_resource(
        condition=constants.STATUS_BOUND,
        resource_name=self.obc_name,
        column='PHASE',
        resource_count=1,
        timeout=60,
    ), "OBC did not reach BOUND Phase, cannot initialize OBC credentials"
    obc_resource = OCP(
        namespace=self.namespace,
        kind='ObjectBucketClaim',
        resource_name=self.obc_name,
    )
    obc_results = obc_resource.get()
    self.ob_name = obc_results.get('spec').get('ObjectBucketName')
    self.bucket_name = obc_results.get('spec').get('bucketName')
    ob_obj = OCP(
        namespace=self.namespace, kind='ObjectBucket', resource_name=self.ob_name
    ).get()
    self.obc_account = ob_obj.get('spec').get('additionalState').get('account')
    secret_obc_obj = OCP(
        kind='secret', namespace=self.namespace, resource_name=self.obc_name
    ).get()

    self.access_key_id = base64.b64decode(
        secret_obc_obj.get('data').get('AWS_ACCESS_KEY_ID')
    ).decode('utf-8')
    self.access_key = base64.b64decode(
        secret_obc_obj.get('data').get('AWS_SECRET_ACCESS_KEY')
    ).decode('utf-8')

    self.s3_endpoint = mcg.s3_endpoint

    self.s3_resource = boto3.resource(
        's3',
        verify=False,
        endpoint_url=self.s3_endpoint,
        aws_access_key_id=self.access_key_id,
        aws_secret_access_key=self.access_key,
    )
    self.s3_client = boto3.client(
        's3',
        verify=False,
        endpoint_url=self.s3_endpoint,
        aws_access_key_id=self.access_key_id,
        aws_secret_access_key=self.access_key,
    )
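# Illustrative smoke test (an assumption, not part of the original class):
# once the wrapper is initialized, basic object I/O against the
# OBC-provisioned bucket can go through the boto3 client it builds. `obc`
# stands for an instance of the class above.
def put_and_get_object(obc, key="demo-key", body=b"hello world"):
    # Write an object to the claim's bucket
    obc.s3_client.put_object(Bucket=obc.bucket_name, Key=key, Body=body)
    # Read it back and return the payload
    response = obc.s3_client.get_object(Bucket=obc.bucket_name, Key=key)
    return response["Body"].read()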
def test_add_capacity(self):
    """
    Test to add variable capacity to the OSD cluster while IOs running
    """
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=result * 3,
    )
    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_COMPLETED,
        selector=constants.OSD_PREPARE_APP_LABEL,
        resource_count=result * 3,
    )
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=80)
def run(self):
    """
    Run the benchmark and wait until it is completed
    """
    # Create the benchmark object
    self.sf_obj = OCS(**self.crd_data)
    self.sf_obj.create()

    # Wait for benchmark pods to get created - takes a while
    for bench_pod in TimeoutSampler(
        240,
        10,
        get_pod_name_by_pattern,
        "smallfile-client",
        benchmark_operator.BMO_NAME,
    ):
        try:
            if bench_pod[0] is not None:
                small_file_client_pod = bench_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    bench_pod = OCP(kind="pod", namespace=benchmark_operator.BMO_NAME)
    log.info("Waiting for SmallFile benchmark to Run")
    assert bench_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=small_file_client_pod,
        sleep=30,
        timeout=600,
    )
    log.info("The SmallFiles benchmark is running, wait for completion")
    bench_pod.wait_for_resource(
        condition=constants.STATUS_COMPLETED,
        resource_name=small_file_client_pod,
        timeout=18000,
        sleep=60,
    )
    log.info("The SmallFiles benchmark is completed")
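# The pod-discovery loop above is a recurring idiom in this suite: poll
# get_pod_name_by_pattern through TimeoutSampler until a match appears. A
# minimal generic form might look like this (a sketch; TimeoutSampler is
# expected to raise its timeout exception if the pattern never shows up):
def wait_for_pod_by_pattern(pattern, namespace, timeout=240, sleep=10):
    for pods in TimeoutSampler(timeout, sleep, get_pod_name_by_pattern, pattern, namespace):
        if pods:
            return pods[0]
        log.info(f"Pod matching '{pattern}' not found yet")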
def wait_for_osd_pods_to_be_running(self, storagedeviceset_count):
    """
    The function gets the number of storage device sets in the cluster,
    and waits for the osd pods to be in status running.

    Args:
        storagedeviceset_count (int): the number of storage device sets in the cluster

    """
    logging.info("starting function 'wait_for_osd_pods_to_be_running'")
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    pod.wait_for_resource(
        timeout=420,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=storagedeviceset_count * replica_count,
    )
    self.new_pods_in_status_running = True
def create_kafkadrop(self, wait=True):
    """
    Create kafkadrop pod, service and routes

    Args:
        wait (bool): If true, waits till the kafkadrop pod is running

    Returns:
        tuple: Contains objects of kafkadrop pod, service and route

    """
    # Create kafkadrop pod
    try:
        kafkadrop = list(
            templating.load_yaml(constants.KAFKADROP_YAML, multi_document=True)
        )
        self.kafkadrop_pod = OCS(**kafkadrop[0])
        self.kafkadrop_svc = OCS(**kafkadrop[1])
        self.kafkadrop_route = OCS(**kafkadrop[2])
        self.kafkadrop_pod.create()
        self.kafkadrop_svc.create()
        self.kafkadrop_route.create()
    except (CommandFailed, CalledProcessError) as cf:
        log.error("Failed during creation of kafkadrop, the Kafka UI")
        raise cf

    # Validate that the kafkadrop pod is running
    if wait:
        ocp_obj = OCP(kind=constants.POD, namespace=constants.AMQ_NAMESPACE)
        ocp_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector="app=kafdrop",
            timeout=120,
            sleep=5,
        )

    return self.kafkadrop_pod, self.kafkadrop_svc, self.kafkadrop_route
def finalizer():
    op_obj = OCP(
        kind=constants.DEPLOYMENT,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    pod_obj = OCP(
        kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
    )
    operator_obj = op_obj.get(resource_name=constants.ROOK_CEPH_OPERATOR)
    if operator_obj.get("spec").get("replicas") != 1:
        # The original dangled the failure message in a bare tuple; the
        # missing assert is restored here.
        assert modify_deployment_replica_count(
            deployment_name=constants.ROOK_CEPH_OPERATOR, replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"

    log.info("Validate all mons are up and running")
    try:
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=60,
            sleep=5,
        )
    except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
        log.warning(ex)
        op_obj.delete(resource_name=constants.ROOK_CEPH_OPERATOR)
        for pod in get_mon_pods():
            pod.delete()
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=360,
            sleep=5,
        )
    log.info("All mons are up and running")
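# For context, a teardown like the one above is normally registered through
# pytest's `request` fixture so it runs even when the test body fails. A
# hedged sketch of the wiring (fixture name illustrative):
import pytest

@pytest.fixture()
def restore_rook_ceph_operator(request):
    request.addfinalizer(finalizer)  # `finalizer` as defined above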
def _deploy_es(self):
    """
    Deploying the Elasticsearch server

    """
    # Creating PVC for the elasticsearch server and waiting until it is bound
    log.info("Creating 10 GiB PVC for the ElasticSearch cluster")
    self.pvc_obj = create_pvc(
        sc_name=constants.CEPHBLOCKPOOL_SC,
        namespace=self.namespace,
        pvc_name="elasticsearch-data-quickstart-es-default-0",
        access_mode=constants.ACCESS_MODE_RWO,
        size="10Gi",
    )
    wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
    self.pvc_obj.reload()

    log.info("Deploy the ElasticSearch cluster")
    self.ocp.apply(self.crd)

    sample = TimeoutSampler(
        timeout=300,
        sleep=10,
        func=self._pod_is_found,
        pattern="quickstart-es-default",
    )
    if not sample.wait_for_func_status(True):
        self.cleanup()
        raise Exception("The ElasticSearch pod deployment Failed")

    self.espod = get_pod_name_by_pattern("quickstart-es-default", self.namespace)[0]
    log.info(f"The ElasticSearch pod {self.espod} Started")

    es_pod = OCP(kind="pod", namespace=self.namespace)
    log.info("Waiting for ElasticSearch to Run")
    assert es_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=self.espod,
        sleep=30,
        timeout=600,
    )
    log.info("Elastic Search is ready !!!")
def test_smallfile_workload(self, ripsaw):
    """
    Run SmallFile Workload
    """
    log.info("Apply Operator CRD")
    ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')
    log.info("Running SmallFile bench")
    sf_data = templating.load_yaml_to_dict(constants.SMALLFILE_BENCHMARK_YAML)
    sf_obj = OCS(**sf_data)
    sf_obj.create()

    # wait for benchmark pods to get created - takes a while
    for bench_pod in TimeoutSampler(
        40, 3, get_pod_name_by_pattern, 'smallfile-client', 'my-ripsaw'
    ):
        try:
            if bench_pod[0] is not None:
                small_file_client_pod = bench_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    bench_pod = OCP(kind='pod', namespace='my-ripsaw')
    log.info("Waiting for SmallFile benchmark to Run")
    assert bench_pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_name=small_file_client_pod,
        sleep=30,
        timeout=600,
    )
    start_time = time.time()
    timeout = 900
    while True:
        logs = bench_pod.exec_oc_cmd(
            f'logs {small_file_client_pod}', out_yaml_format=False
        )
        if "RUN STATUS DONE" in logs:
            log.info("SmallFile Benchmark Completed Successfully")
            break

        if timeout < (time.time() - start_time):
            raise TimeoutError("Timed out waiting for benchmark to complete")
        time.sleep(30)
def setup_ceph_toolbox(force_setup=False):
    """
    Setup ceph-toolbox - also checks if toolbox exists, if it exists it
    behaves as noop.

    Args:
        force_setup (bool): force setup toolbox pod

    """
    ocs_version = version.get_semantic_ocs_version_from_config()
    if ocsci_config.ENV_DATA["mcg_only_deployment"]:
        log.info("Skipping Ceph toolbox setup due to running in MCG only mode")
        return
    namespace = ocsci_config.ENV_DATA["cluster_namespace"]
    ceph_toolbox = get_pod_name_by_pattern("rook-ceph-tools", namespace)
    # setup toolbox for external mode
    # Refer bz: 1856982 - invalid admin secret
    if len(ceph_toolbox) == 1:
        log.info("Ceph toolbox already exists, skipping")
        if force_setup:
            log.info("Running force setup for Ceph toolbox!")
        else:
            return
    external_mode = ocsci_config.DEPLOYMENT.get("external_mode")

    if ocs_version == version.VERSION_4_2:
        tool_box_data = templating.load_yaml(constants.TOOL_POD_YAML)
        tool_box_data["spec"]["template"]["spec"]["containers"][0][
            "image"
        ] = get_rook_version()
        rook_toolbox = OCS(**tool_box_data)
        rook_toolbox.create()
    else:
        if external_mode:
            toolbox = templating.load_yaml(constants.TOOL_POD_YAML)
            toolbox["spec"]["template"]["spec"]["containers"][0][
                "image"
            ] = get_rook_version()
            toolbox["metadata"]["name"] += "-external"
            keyring_dict = ocsci_config.EXTERNAL_MODE.get("admin_keyring")
            if ocs_version >= version.VERSION_4_10:
                toolbox["spec"]["template"]["spec"]["containers"][0]["command"] = [
                    "/bin/bash"
                ]
                toolbox["spec"]["template"]["spec"]["containers"][0]["args"][0] = "-m"
                toolbox["spec"]["template"]["spec"]["containers"][0]["args"][1] = "-c"
                toolbox["spec"]["template"]["spec"]["containers"][0]["tty"] = True
            env = toolbox["spec"]["template"]["spec"]["containers"][0]["env"]
            # replace secret
            env = [item for item in env if not (item["name"] == "ROOK_CEPH_SECRET")]
            env.append({"name": "ROOK_CEPH_SECRET", "value": keyring_dict["key"]})
            toolbox["spec"]["template"]["spec"]["containers"][0]["env"] = env
            # add ceph volumeMounts
            ceph_volume_mount_path = {"mountPath": "/etc/ceph", "name": "ceph-config"}
            ceph_volume = {"name": "ceph-config", "emptyDir": {}}
            toolbox["spec"]["template"]["spec"]["containers"][0][
                "volumeMounts"
            ].append(ceph_volume_mount_path)
            toolbox["spec"]["template"]["spec"]["volumes"].append(ceph_volume)
            rook_toolbox = OCS(**toolbox)
            rook_toolbox.create()
            return

        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1982721
        # TODO: Remove workaround when bug 1982721 is fixed
        # https://github.com/red-hat-storage/ocs-ci/issues/4585
        if ocsci_config.ENV_DATA.get("is_multus_enabled"):
            toolbox = templating.load_yaml(constants.TOOL_POD_YAML)
            toolbox["spec"]["template"]["spec"]["containers"][0][
                "image"
            ] = get_rook_version()
            toolbox["metadata"]["name"] += "-multus"
            toolbox["spec"]["template"]["metadata"]["annotations"] = {
                "k8s.v1.cni.cncf.io/networks": "openshift-storage/ocs-public"
            }
            toolbox["spec"]["template"]["spec"]["hostNetwork"] = False
            rook_toolbox = OCS(**toolbox)
            rook_toolbox.create()
            return

        # for OCS >= 4.3 there is new toolbox pod deployment done here:
        # https://github.com/openshift/ocs-operator/pull/207/
        log.info("starting ceph toolbox pod")
        run_cmd(
            "oc patch ocsinitialization ocsinit -n openshift-storage --type "
            'json --patch \'[{ "op": "replace", "path": '
            '"/spec/enableCephTools", "value": true }]\''
        )
        toolbox_pod = OCP(kind=constants.POD, namespace=namespace)
        toolbox_pod.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-tools",
            resource_count=1,
            timeout=120,
        )
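# Once the toolbox pod is Running, Ceph commands are typically executed inside
# it. A hedged sketch using helpers already present in this suite (`oc rsh`
# into the first pod matching the toolbox pattern):
def run_ceph_status(namespace):
    toolbox_pod_name = get_pod_name_by_pattern("rook-ceph-tools", namespace)[0]
    return run_cmd(f"oc -n {namespace} rsh {toolbox_pod_name} ceph status")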