def label_nodes(request):
    """
    Fixture to label the node(s) that will run the application pod.
    That will be all workers node that do not run the OCS cluster.

    Args:
        request (fixture): pytest request object, used to register the
            teardown finalizer.
    """
    def teardown():
        # Fix: message typo ("form" -> "from")
        log.info('Clear label from worker (Application) nodes')
        # Getting all Application nodes
        app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
        helpers.remove_label_from_worker_node(app_nodes,
                                              constants.APP_NODE_LABEL)

    request.addfinalizer(teardown)

    # Getting all OCS nodes (to verify app pod will not run on)
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    # Add label to the worker nodes
    worker_nodes = helpers.get_worker_nodes()
    # Getting list of free nodes
    free_nodes = list(set(worker_nodes) - set(ocs_nodes))

    log.info('Adding the app-node label to Non-OCS workers')
    log.debug(f'The Workers nodes are : {worker_nodes}')
    log.debug(f'The OCS nodes are : {ocs_nodes}')
    log.debug(f'The free nodes are : {free_nodes}')

    # Fix: grammar in the assertion message
    assert free_nodes, \
        'Did not find any worker to run on, please deploy another worker'

    helpers.label_worker_node(free_nodes, constants.APP_NODE_LABEL,
                              constants.VDBENCH_NODE_LABEL)
def cleanup(self):
    """
    Tear down everything created during the scale run: pods, PVCs and
    their namespaces, the scale label on worker nodes, and any custom
    machinesets (which removes their nodes too on aws-ipi).
    """
    # Wipe out every tracked namespace, together with its pods and PVCs
    for ns in self.namespace_list:
        delete_objs_parallel(
            obj_list=pod.get_all_pods(namespace=ns.namespace),
            namespace=ns.namespace,
            kind=self.kind,
        )
        delete_objs_parallel(
            obj_list=pvc.get_all_pvc_objs(namespace=ns.namespace),
            namespace=ns.namespace,
            kind=constants.PVC,
        )
        ns_ocp = OCP(kind=constants.NAMESPACE)
        ns_ocp.delete(resource_name=ns.namespace)

    # Strip the scale label from all workers that still carry it
    labeled_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    helpers.remove_label_from_worker_node(
        node_list=labeled_workers, label_key="scale-label"
    )

    # Deleting a machineset also deletes its nodes on the aws-ipi platform
    if self.ms_name:
        for machineset in self.ms_name:
            machine.delete_custom_machineset(machineset)
def uninstall_lso(lso_sc):
    """
    Function uninstalls local-volume objects from OCS cluster
    """
    ocp_obj = ocp.OCP()
    lso_namespace = config.ENV_DATA['local_storage_namespace']

    sc_obj = ocp.OCP(
        kind=constants.STORAGECLASS,
        resource_name=lso_sc,
        namespace=lso_namespace,
    )
    # The owning LocalVolume name is recorded as a label on the storage class
    lv_name = sc_obj.get().get('metadata').get('labels').get(
        'local.storage.openshift.io/owner-name')
    lv_obj = ocp.OCP(
        kind=constants.LOCAL_VOLUME,
        resource_name=lv_name,
        namespace=lso_namespace,
    )
    log.info(
        f"Local storage was found. using storage class: {lso_sc}, local volume:{lv_name}"
    )

    device_list = lv_obj.get().get('spec').get('storageClassDevices')[0].get(
        'devicePaths')
    storage_node_list = get_labeled_nodes(constants.OPERATOR_NODE_LABEL)

    pv_obj_list = ocp.OCP(
        kind=constants.PV,
        selector=f'storage.openshift.com/local-volume-owner-name={lv_name}',
        namespace=lso_namespace,
    )

    log.info("Deleting local volume PVs")
    for pv in pv_obj_list.get().get('items'):
        log.info(f"deleting pv {pv.get('metadata').get('name')}")
        pv_obj_list.delete(resource_name=pv.get('metadata').get('name'))

    log.info("Removing local volume from storage nodes")
    for node in storage_node_list:
        log.info(f"Removing from node {node}")
        ocp_obj.exec_oc_debug_cmd(
            node=node, cmd_list=[f"rm -rfv /mnt/local-storage/{lso_sc}"])

    # Build DISKS=" /dev/a /dev/b" — each device is prefixed with a single
    # space, matching the original string shape exactly
    disk_list_str = "".join(f" {device}" for device in device_list)
    disk_list_str = f"DISKS=\"{disk_list_str}\""
    log.info(f"The disk list is {disk_list_str}")

    sgd_command = "for disk in $DISKS; do sgdisk --zap-all $disk;done"
    log.info("Wiping disks on storage nodes ")
    for node in storage_node_list:
        log.info(f"Wiping on node {node}")
        ocp_obj.exec_oc_debug_cmd(node=node,
                                  cmd_list=[disk_list_str, sgd_command])

    log.info(f"Deleting storage class {lso_sc}")
    sc_obj.delete(resource_name=lso_sc)
    log.info(f"Deleting local volume {lv_name}")
    lv_obj.delete(resource_name=lv_name)
def delete_worker_node():
    """
    Remove the scale label from the worker nodes and delete the
    machinesets created for the scale run.

    NOTE(review): ``ms_name`` is not defined in this function — it appears
    to be a module-level global populated by ``add_worker_node`` (which
    declares ``global ms_name``). If this is called before that setup,
    ``if ms_name`` raises NameError — confirm call ordering.
    """
    # Remove scale label from worker nodes
    scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if scale_workers:
        helpers.remove_label_from_worker_node(node_list=scale_workers,
                                              label_key="scale-label")
    # Delete machineset
    if ms_name:
        for name in ms_name:
            machine.delete_custom_machineset(name)
def is_node_labeled(node_name, label=constants.OPERATOR_NODE_LABEL):
    """
    Check if the node is labeled with a specified label.

    Args:
        node_name (str): The node name to check if it has the specific label
        label (str): The name of the label. Default value is the OCS label.

    Returns:
        bool: True if the node is labeled with the specified label. False otherwise

    """
    # Membership test against the current set of nodes carrying the label
    return node_name in machine.get_labeled_nodes(label=label)
def test_rolling_reboot_node(self, node_type):
    """
    Test to rolling reboot of nodes.

    Reboots the nodes of the given type one at a time, then validates
    cluster services, storage pods, Ceph health, and the scaled PVCs/PODs
    recorded in SCALE_DATA_FILE.

    Args:
        node_type (str): Type of nodes to reboot (e.g. worker/master)

    Raises:
        FileNotFoundError: If SCALE_DATA_FILE does not exist.
    """
    # Get info from SCALE_DATA_FILE for validation
    if os.path.exists(SCALE_DATA_FILE):
        file_data = templating.load_yaml(SCALE_DATA_FILE)
        namespace = file_data.get("NAMESPACE")
        pod_scale_list = file_data.get("POD_SCALE_LIST")
        pvc_scale_list = file_data.get("PVC_SCALE_LIST")
    else:
        # Fix: raise with a message instead of a bare exception class
        raise FileNotFoundError(
            f"Scale data file {SCALE_DATA_FILE} does not exist"
        )

    # Rolling reboot nodes
    if node_type == constants.WORKER_MACHINE:
        # For workers, only reboot those that are part of the OCS cluster
        ocs_node_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        node_list = [
            tmp for tmp in get_nodes(node_type=node_type)
            if tmp.name in ocs_node_list
        ]
    else:
        node_list = get_nodes(node_type=node_type)

    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()

    # Reboot one node at a time and wait for services to recover in between
    for node in node_list:
        nodes.restart_nodes(nodes=[node])
        scale_lib.validate_node_and_oc_services_are_up_after_reboot()

    # Validate storage pods are running
    wait_for_storage_pods()

    # Validate cluster health ok and all pods are running
    assert utils.ceph_health_check(
        delay=180
    ), "Ceph health in bad state after node reboots"

    # Validate all PVCs from namespace are in Bound state
    assert scale_lib.validate_all_pvcs_and_check_state(
        namespace=namespace, pvc_scale_list=pvc_scale_list
    )

    # Validate all PODs from namespace are up and running
    assert scale_lib.validate_all_pods_and_check_state(
        namespace=namespace, pod_scale_list=pod_scale_list
    )
def teardown():
    """
    Clean up after the test: either delete the machineset that was created
    for application pods, or (if an existing worker was labeled instead)
    strip the vdbench label from the application nodes.

    NOTE(review): ``with_ocs`` and ``m_set`` are free variables captured
    from an enclosing fixture scope (not visible here) — presumably the
    ``with_ocs`` flag and the name of a machineset created earlier; confirm
    against the enclosing fixture.
    """
    # Nothing to undo when the test ran on the OCS nodes themselves
    if with_ocs:
        return
    if m_set != '':
        # A dedicated machineset was created for the app pods — destroy it
        log.info(f'Destroy {m_set}')
        machine.delete_custom_machineset(m_set)
    else:
        log.info('Clear label form worker (Application) nodes')
        # Getting all Application nodes
        app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
        log.debug(f'The application nodes are : {app_nodes}')
        helpers.remove_label_from_worker_node(app_nodes,
                                              constants.VDBENCH_NODE_LABEL)
def get_ocs_nodes(num_of_nodes=None):
    """
    Gets the ocs nodes

    Args:
        num_of_nodes (int): The number of ocs nodes to return. If not
            specified, it returns all the ocs nodes.

    Returns:
        list: List of ocs nodes

    """
    labeled_names = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    node_objs = get_node_objs(labeled_names)
    # A falsy num_of_nodes (None/0) means "return everything"
    limit = num_of_nodes if num_of_nodes else len(node_objs)
    return node_objs[:limit]
def uninstall_lso(lso_sc):
    """
    Function uninstalls local-volume objects from OCS cluster

    Args:
        lso_sc (str): Name of the local-storage-backed storage class
            to remove.
    """
    ocp_obj = ocp.OCP()
    sc_obj = ocp.OCP(
        kind=constants.STORAGECLASS,
        resource_name=lso_sc,
        namespace=config.ENV_DATA["local_storage_namespace"],
    )

    log.info("Deleting local volume set")
    lvs_obj = ocp.OCP(
        kind=constants.LOCAL_VOLUME_SET,
        namespace=config.ENV_DATA["local_storage_namespace"],
    )
    # NOTE(review): the yaml file constant is passed positionally here
    # (as yaml_file), while the discovery delete below passes it by
    # keyword — presumably both delete from the same template; confirm
    # OCP.delete's first positional parameter is yaml_file.
    lvs_obj.delete(constants.LOCAL_VOLUME_SET_YAML)

    pv_obj_list = ocp.OCP(
        kind=constants.PV,
        namespace=config.ENV_DATA["local_storage_namespace"],
    )
    log.info("Deleting local volume PVs")
    for pv in pv_obj_list.get().get("items"):
        log.info(f"deleting pv {pv.get('metadata').get('name')}")
        pv_obj_list.delete(resource_name=pv.get("metadata").get("name"))

    log.info(f"Deleting storage class {lso_sc}")
    sc_obj.delete(resource_name=lso_sc)

    log.info("deleting local volume discovery")
    lvd_obj = ocp.OCP(
        kind=constants.LOCAL_VOLUME_DISCOVERY,
        namespace=config.ENV_DATA["local_storage_namespace"],
    )
    lvd_obj.delete(yaml_file=constants.LOCAL_VOLUME_DISCOVERY_YAML)

    # Wipe the leftover mount directories on each OCS-labeled node
    log.info("Removing local volume from storage nodes")
    storage_node_list = get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    for node in storage_node_list:
        log.info(f"Removing from node {node}")
        ocp_obj.exec_oc_debug_cmd(
            node=node, cmd_list=[f"rm -rfv /mnt/local-storage/{lso_sc}"]
        )
def check_and_add_enough_worker(worker_count):
    """
    Function to check if there is enough workers available to scale pods.
    IF there is no enough worker then worker will be added based on supported platforms
    Function also adds scale label to the respective worker nodes.

    Args:
        worker_count (int): Expected worker count to be present in the setup

    Returns:
        bool: True if there is enough worker count, else an exception is raised.

    Raises:
        UnsupportedPlatformError: For upi vsphere/baremetal/azure deployments.
        UnavailableResourceException: When workers cannot be added at all.
    """
    # Check either to use OCS workers for scaling app pods
    # Further continue to label the worker with scale label else not
    worker_list = node.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get("use_ocs_worker_for_scale"):
        # OCS workers may host app pods too: label every worker
        if not scale_worker:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key="scale-label",
                                      label_value="app-scale")
    else:
        # Keep OCS workers out: label only the non-OCS workers
        if not scale_worker:
            for node_item in ocs_worker_list:
                worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    logging.info(f"Print existing scale worker {scale_worker_list}")

    # Check if there is enough nodes to continue scaling of app pods
    if len(scale_worker_list) >= worker_count:
        logging.info(f"Setup has expected worker count {worker_count} "
                     "to continue scale of pods")
        return True
    else:
        logging.info(
            "There is no enough worker in the setup, will add enough worker "
            "for the automation supported platforms")
        # Add enough worker for AWS
        if (config.ENV_DATA["deployment_type"] == "ipi"
                and config.ENV_DATA["platform"].lower() == "aws"):
            # Create machineset for app worker nodes on each aws zone
            # Each zone will have one app worker node
            ms_name = list()
            labels = [("node-role.kubernetes.io/app", "app-scale")]
            # Reuse machinesets that already exist for app nodes
            for obj in machine.get_machineset_objs():
                if "app" in obj.name:
                    ms_name.append(obj.name)
            if not ms_name:
                # 3 machinesets implies a 3-AZ cluster: one app machineset
                # per zone; otherwise a single machineset in zone "a"
                if len(machine.get_machineset_objs()) == 3:
                    for zone in ["a", "b", "c"]:
                        ms_name.append(
                            machine.create_custom_machineset(
                                instance_type="m5.4xlarge",
                                labels=labels,
                                zone=zone,
                            ))
                else:
                    ms_name.append(
                        machine.create_custom_machineset(
                            instance_type="m5.4xlarge",
                            labels=labels,
                            zone="a",
                        ))
                for ms in ms_name:
                    machine.wait_for_new_node_to_be_ready(ms)
            # Spread the requested worker count across the machinesets
            if len(ms_name) == 3:
                exp_count = int(worker_count / 3)
            else:
                exp_count = worker_count
            for name in ms_name:
                machine.add_node(machine_set=name, count=exp_count)
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)
            # Re-read the node lists and label the freshly added,
            # still-unlabeled non-OCS workers
            worker_list = node.get_worker_nodes()
            ocs_worker_list = machine.get_labeled_nodes(
                constants.OPERATOR_NODE_LABEL)
            scale_label_worker = machine.get_labeled_nodes(
                constants.SCALE_LABEL)
            ocs_worker_list.extend(scale_label_worker)
            # dict.fromkeys de-duplicates while preserving order
            final_list = list(dict.fromkeys(ocs_worker_list))
            for node_item in final_list:
                if node_item in worker_list:
                    worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
            return True
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "vsphere"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "baremetal"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "azure"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        else:
            raise UnavailableResourceException(
                "There is no enough worker nodes to continue app pod scaling")
def uninstall_ocs():
    """
    The function uninstalls the OCS operator from a openshift cluster and
    removes all its settings and dependencies

    """
    ocp_obj = ocp.OCP()
    provisioners = constants.OCS_PROVISIONERS

    # List the storage classes backed by OCS provisioners.
    # Fix: the original removed items from sc_list while iterating it,
    # which skips elements; filter into new lists instead.
    sc_list = [
        storage_class for storage_class in get_all_storageclass()
        if storage_class.get('provisioner') in provisioners
    ]
    sc_name_list = [
        storage_class.get('metadata').get('name') for storage_class in sc_list
    ]

    # Query for PVCs and OBCs that are using the storage class provisioners listed in the previous step.
    pvc_to_delete = []
    for sc in sc_name_list:
        pvc_to_delete.extend(get_all_pvcs_in_storageclass(sc))

    # ignoring all noobaa pvcs & make name list
    # Fix: same remove-while-iterating bug as above — rebuild the list.
    pvc_to_delete = [pvc for pvc in pvc_to_delete if "noobaa" not in pvc.name]
    pvc_name_list = [pvc.name for pvc in pvc_to_delete]

    # Collect pods (storage, registry, monitoring namespaces) that mount
    # one of the PVCs scheduled for deletion
    pods_to_delete = []
    all_pods = get_all_pods()  # default openshift-storage namespace
    all_pods.extend(get_all_pods(namespace=constants.OPENSHIFT_IMAGE_REGISTRY_NAMESPACE))
    all_pods.extend(get_all_pods(namespace=constants.OPENSHIFT_MONITORING_NAMESPACE))
    for pod_obj in all_pods:
        try:
            pvc_name = get_pvc_name(pod_obj)
        except UnavailableResourceException:
            # Pod has no PVC — not our concern
            continue
        if pvc_name in pvc_name_list:
            pods_to_delete.append(pod_obj)

    log.info("Removing monitoring stack from OpenShift Container Storage")
    remove_monitoring_stack_from_ocs()

    log.info("Removing OpenShift Container Platform registry from OpenShift Container Storage")
    remove_ocp_registry_from_ocs(config.ENV_DATA['platform'])

    log.info("Removing the cluster logging operator from OpenShift Container Storage")
    csv = ocp.OCP(
        kind=constants.CLUSTER_SERVICE_VERSION,
        namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    logging_csv = csv.get().get('items')
    if logging_csv:
        clusterlogging_obj = ocp.OCP(
            kind=constants.CLUSTER_LOGGING,
            namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
        )
        clusterlogging_obj.delete(resource_name='instance')

    log.info("deleting pvcs")
    for pvc in pvc_to_delete:
        log.info(f"deleting pvc: {pvc.name}")
        pvc.delete()

    log.info("deleting pods")
    for pod in pods_to_delete:
        log.info(f"deleting pod {pod.name}")
        pod.delete()

    log.info("removing rook directory from nodes")
    nodes_list = get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    for node in nodes_list:
        log.info(f"removing rook from {node}")
        ocp_obj.exec_oc_debug_cmd(node=node,
                                  cmd_list=["rm -rf /var/lib/rook"])

    log.info("Delete the storage classes with an openshift-storage provisioner list")
    for storage_class in sc_list:
        log.info(f"deleting storage class {storage_class.get('metadata').get('name')}")
        sc_obj = ocp.OCP(kind=constants.STORAGECLASS)
        sc_obj.delete(resource_name=storage_class.get('metadata').get('name'))

    # Trailing '-' on a label removes it
    log.info("unlabeling storage nodes")
    nodes_list = get_all_nodes()
    for node in nodes_list:
        node_obj = ocp.OCP(kind=constants.NODE, resource_name=node)
        node_obj.add_label(resource_name=node,
                           label=constants.OPERATOR_NODE_LABEL[:-3] + '-')
        node_obj.add_label(resource_name=node,
                           label=constants.TOPOLOGY_ROOK_LABEL + '-')

    log.info("deleting storageCluster object")
    storage_cluster = ocp.OCP(kind=constants.STORAGECLUSTER,
                              resource_name=constants.DEFAULT_CLUSTERNAME)
    storage_cluster.delete(resource_name=constants.DEFAULT_CLUSTERNAME)

    log.info("removing CRDs")
    crd_list = ['backingstores.noobaa.io', 'bucketclasses.noobaa.io',
                'cephblockpools.ceph.rook.io', 'cephfilesystems.ceph.rook.io',
                'cephnfses.ceph.rook.io', 'cephobjectstores.ceph.rook.io',
                'cephobjectstoreusers.ceph.rook.io', 'noobaas.noobaa.io',
                'ocsinitializations.ocs.openshift.io',
                'storageclusterinitializations.ocs.openshift.io',
                'storageclusters.ocs.openshift.io',
                'cephclusters.ceph.rook.io']
    for crd in crd_list:
        ocp_obj.exec_oc_cmd(f"delete crd {crd} --timeout=300m")

    log.info("deleting openshift-storage namespace")
    ocp_obj.delete_project('openshift-storage')
    ocp_obj.wait_for_delete('openshift-storage')
def label_nodes(request, with_ocs):
    """
    Fixture to label the node(s) that will run the application pod.
    That will be all workers node that do not run the OCS cluster.

    Args:
        request (fixture): pytest request object for finalizer registration.
        with_ocs (bool): When True, app pods run on the OCS nodes and no
            labeling (or teardown) is needed.
    """
    m_set = ''  # this will hold machine_set name that added

    def teardown():
        ceph_health_check()
        if with_ocs:
            return
        if m_set != '':
            log.info(f'Destroy {m_set}')
            machine.delete_custom_machineset(m_set)
        else:
            # Fix: message typo ("form" -> "from")
            log.info('Clear label from worker (Application) nodes')
            # Getting all Application nodes
            app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
            log.debug(f'The application nodes are : {app_nodes}')
            helpers.remove_label_from_worker_node(app_nodes,
                                                  constants.VDBENCH_NODE_LABEL)

    request.addfinalizer(teardown)
    if with_ocs:
        return

    # Add label to the worker nodes
    # Getting all OCS nodes (to verify app pod will not run on)
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    worker_nodes = helpers.get_worker_nodes()
    # Getting list of free nodes
    free_nodes = list(set(worker_nodes) - set(ocs_nodes))

    if not free_nodes:
        # No free nodes - Creating new machineset for application pods
        log.info('Adding new machineset, with worker for application pod')
        m_set = machine.create_custom_machineset(
            label=constants.APP_NODE_LABEL)
        machine.wait_for_new_node_to_be_ready(m_set)

        free_nodes = machine.get_labeled_nodes(
            f'node-role.kubernetes.io/app={constants.APP_NODE_LABEL}')

        # TODO: implement this for VMWare as well.

    log.info('Adding the app-node label to Non-OCS workers')
    log.debug(f'The Workers nodes are : {worker_nodes}')
    log.debug(f'The OCS nodes are : {ocs_nodes}')
    log.debug(f'The free nodes are : {free_nodes}')

    # Fix: grammar in the assertion message
    assert free_nodes, \
        'Did not find any worker to run on, please deploy another worker'

    helpers.label_worker_node(free_nodes, constants.APP_NODE_LABEL,
                              constants.VDBENCH_NODE_LABEL)
def test_vdbench_workload(self, template, with_ocs, load, label_nodes,
                          ripsaw, servers, threads, blocksize, fileio,
                          samples, width, depth, files, file_size, runtime,
                          pause):
    """
    Run VDBench Workload

    Args :
        template (str) : Name of yaml file that will used as a template
        with_ocs (bool) : This parameter will indicate if the test will
                          run on the same nodes as the OCS
        load (int) : load to run on the storage in percentage of the
                     capacity.
        label_nodes (fixture) : This fixture is labeling the worker(s)
                                that will used for App. pod(s)
        ripsaw (fixture) : Fixture to deploy the ripsaw benchmarking operator
        servers (int) : Number of servers (pods) that will run the IO
        threads (int) : Number of threads that will run on each server
        blocksize (list - str): List of BlockSize - must add the 'K' to it
        fileio (str) : How to select file for the IO : random / sequential
        samples (int) : Number of time(s) to run each test
        width (int) : Width of directory tree to create
        depth (int) : Depth of directory tree to create
        files (int) : Number of files to create in each directory
        file_size (int) : File size (in MB) to create
        runtime (int) : Time (in Sec.) for each test iteration
        pause (int) : Time (in Min.) to pause between each test iteration.
    """
    log.info(f'going to use {template} as template')
    log.info("Apply Operator CRD")

    crd = 'resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml'
    ripsaw.apply_crd(crd)

    log.info('Running vdbench benchmark')
    if template:
        template = os.path.join(constants.TEMPLATE_VDBENCH_DIR, template)
    else:
        template = constants.VDBENCH_BENCHMARK_YAML
    sf_data = templating.load_yaml(template)

    # Results file name is built up from the template plus each
    # overridden parameter appended below
    target_results = template + 'Results'

    log.info('Calculating Storage size....')
    ceph_cluster = CephCluster()
    total_capacity = ceph_cluster.get_ceph_capacity()
    assert total_capacity > constants.VDBENCH_MIN_CAPACITY, (
        "Storage capacity is too low for performance testing")
    log.info(f'The Total usable capacity is {total_capacity}')

    if load:
        # Load mode: derive tree shape and pod count from the requested
        # percentage of cluster capacity, overriding the explicit args
        width = constants.VDBENCH_WIDTH
        depth = constants.VDBENCH_DEPTH
        file_size = constants.VDBENCH_FILE_SIZE
        capacity_per_pod = constants.VDBENCH_CAP_PER_POD
        total_dirs = width ** depth
        log.info(f'The total dirs in the tree {total_dirs}')
        log.info(f'Going to run with {load} % of the capacity load.')
        tested_capacity = round(total_capacity * 1024 * load / 100)
        log.info(f'Tested capacity is {tested_capacity} MB')
        servers = round(tested_capacity / capacity_per_pod)
        """
        To spread the application pods evenly on all workers or application
        nodes and at least 2 app pods per node.
        """
        nodes = len(
            node.get_typed_nodes(node_type=constants.WORKER_MACHINE))
        if not with_ocs:
            # Only count the dedicated application nodes
            nodes = len(
                machine.get_labeled_nodes(
                    f'node-role.kubernetes.io/app={constants.APP_NODE_LABEL}'
                ))
        log.info(f'Going to use {nodes} nodes for the test !')
        servers = round(servers / nodes) * nodes
        if servers < (nodes * 2):
            servers = nodes * 2

        files = round(tested_capacity / servers / total_dirs)
        total_files = round(files * servers * total_dirs)
        log.info(f'number of pods is {servers}')
        log.info(f'Going to create {total_files} files !')
        log.info(f'number of files in dir is {files}')

    """
    Setting up the parameters for this test
    """
    if servers:
        sf_data['spec']['workload']['args']['servers'] = servers
        target_results = target_results + '-' + str(servers)
    if threads:
        sf_data['spec']['workload']['args']['threads'] = threads
        target_results = target_results + '-' + str(threads)
    if fileio:
        sf_data['spec']['workload']['args']['fileio'] = fileio
        target_results = target_results + '-' + str(fileio)
    if samples:
        sf_data['spec']['workload']['args']['samples'] = samples
        target_results = target_results + '-' + str(samples)
    if width:
        sf_data['spec']['workload']['args']['width'] = width
        target_results = target_results + '-' + str(width)
    if depth:
        sf_data['spec']['workload']['args']['depth'] = depth
        target_results = target_results + '-' + str(depth)
    if files:
        sf_data['spec']['workload']['args']['files'] = files
        target_results = target_results + '-' + str(files)
    if file_size:
        sf_data['spec']['workload']['args']['file_size'] = file_size
        target_results = target_results + '-' + str(file_size)
    if runtime:
        sf_data['spec']['workload']['args']['runtime'] = runtime
        target_results = target_results + '-' + str(runtime)
    if pause:
        sf_data['spec']['workload']['args']['pause'] = pause
        target_results = target_results + '-' + str(pause)
    if len(blocksize) > 0:
        sf_data['spec']['workload']['args']['bs'] = blocksize
        target_results = target_results + '-' + '_'.join(blocksize)
    if with_ocs:
        # Running on OCS nodes: drop any node pinning from the template
        if sf_data['spec']['workload']['args']['pin_server']:
            del sf_data['spec']['workload']['args']['pin_server']

    """
    Calculating the size of the volume that need to be test, it should
    be at least twice in the size then the size of the files, and at
    least 100Gi.

    since the file_size is in Kb and the vol_size need to be in Gb, more
    calculation is needed.
    """
    vol_size = int((files * total_dirs) * file_size * 1.3)
    log.info('number of files to create : {}'.format(
        int(files * (width ** depth))))
    log.info(f'The size of all files is : {vol_size}MB')
    vol_size = int(vol_size / 1024)
    if vol_size < 100:
        vol_size = 100
    sf_data['spec']['workload']['args']['storagesize'] = f'{vol_size}Gi'

    log.debug(f'output of configuration file is {sf_data}')

    timeout = 86400  # 3600 (1H) * 24 (1D) = one day

    sf_obj = OCS(**sf_data)
    sf_obj.create()

    # wait for benchmark pods to get created - takes a while
    for bench_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                    'vdbench-client', 'my-ripsaw'):
        try:
            if bench_pod[0] is not None:
                vdbench_client_pod = bench_pod[0]
                break
        except IndexError:
            log.info('Benchmark client pod not ready yet')

    bench_pod = OCP(kind='pod', namespace='my-ripsaw')
    log.info('Waiting for VDBench benchmark to Run')
    assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                       resource_name=vdbench_client_pod,
                                       sleep=30,
                                       timeout=600)

    # Poll the client pod logs until the benchmark reports completion
    # or the overall timeout elapses
    start_time = time.time()
    while True:
        logs = bench_pod.exec_oc_cmd(f'logs {vdbench_client_pod}',
                                     out_yaml_format=False)
        if 'Test Run Finished' in logs:
            log.info('VdBench Benchmark Completed Successfully')
            break

        if timeout < (time.time() - start_time):
            raise TimeoutError(
                'Timed out waiting for benchmark to complete')
        time.sleep(30)

    # Getting the results file from the benchmark pod and put it with the
    # test logs.
    # TODO: find the place of the actual test log and not in the parent
    #       logs path
    target_results = '{}/{}.tgz'.format(ocsci_log_path(), target_results)
    pod_results = constants.VDBENCH_RESULTS_FILE
    retrive_files_from_pod(vdbench_client_pod, target_results, pod_results)
def identify_and_add_nodes(self, scenario, num_of_nodes):
    """
    Fetches info about the worker nodes and add nodes (if required)

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated
            nodes (eg., 'colocated', 'dedicated')
        num_of_nodes (int): number of nodes required for running test

    Returns:
        tuple: tuple containing:
            list: list of OCS nodes name
            list: list of non-OCS nodes name

    """
    nodes_to_add = 0
    initial_worker_nodes = node.get_worker_nodes()
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    non_ocs_nodes = list(set(initial_worker_nodes) - set(ocs_nodes))

    # NOTE(review): the colocated shortfall is computed against the total
    # worker count even though the condition tests the OCS node count —
    # presumably intentional (both variants of this method do it), but
    # worth confirming.
    if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(initial_worker_nodes)

    if "dedicated" in scenario and len(non_ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(non_ocs_nodes)

    if nodes_to_add > 0:
        logger.info(f"{nodes_to_add} extra workers nodes needed")

        if config.ENV_DATA["deployment_type"] == "ipi":
            # ipi: scale an existing worker machineset (picked at random)
            machine_name = random.choice(
                machine.get_machines(
                    machine_type=constants.WORKER_MACHINE)).name
            machineset_name = machine.get_machineset_from_machine_name(
                machine_name)
            node.add_new_node_and_label_it(
                machineset_name=machineset_name,
                num_nodes=nodes_to_add,
                mark_for_ocs_label=False,
            )
        else:
            # upi: add RHEL or RHCOS nodes depending on configuration
            is_rhel = config.ENV_DATA.get(
                "rhel_workers") or config.ENV_DATA.get("rhel_user")
            node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
            node.add_new_node_and_label_upi(
                node_type=node_type,
                num_nodes=nodes_to_add,
                mark_for_ocs_label=False,
            )

        new_worker_nodes = node.get_worker_nodes()
        new_nodes_added = list(
            set(new_worker_nodes) - set(initial_worker_nodes))
        assert (len(new_nodes_added) == nodes_to_add
                ), "Extra nodes not added in the cluster"
        non_ocs_nodes += new_nodes_added

    if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
        # Promote enough non-OCS workers to OCS by applying the storage label
        logger.info("Adding OCS storage label to Non-OCS workers")
        node_obj = ocp.OCP(kind=constants.NODE)
        nodes_to_label = non_ocs_nodes[0:(num_of_nodes - len(ocs_nodes))]
        for node_name in nodes_to_label:
            node_obj.add_label(resource_name=node_name,
                               label=constants.OPERATOR_NODE_LABEL)
            ocs_nodes.append(node_name)
        non_ocs_nodes = list(set(non_ocs_nodes) - set(ocs_nodes))

    logger.info(f"The OCS nodes are : {ocs_nodes}")
    logger.info(f"The Non-OCS nodes are: {non_ocs_nodes}")
    return ocs_nodes, non_ocs_nodes
def identify_and_add_nodes(self, scenario, num_of_nodes):
    """
    Fetches info about the worker nodes and add nodes (if required)

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated
            nodes (eg., 'colocated', 'dedicated')
        num_of_nodes (int): number of nodes required for running test

    Returns:
        tuple: tuple containing:
            list: list of OCS nodes name
            list: list of non-OCS nodes name

    """
    nodes_to_add = 0
    initial_worker_nodes = node.get_worker_nodes()
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    non_ocs_nodes = list(set(initial_worker_nodes) - set(ocs_nodes))

    # NOTE(review): the colocated shortfall is computed against the total
    # worker count even though the condition tests the OCS node count —
    # presumably intentional (both variants of this method do it), but
    # worth confirming.
    if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(initial_worker_nodes)

    if "dedicated" in scenario and len(non_ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(non_ocs_nodes)

    if nodes_to_add > 0:
        logger.info(f"{nodes_to_add} extra workers nodes needed")

        if config.ENV_DATA["deployment_type"] == "ipi":
            # ipi: bump the replica count of the machineset that owns a
            # randomly picked existing worker
            machine_name = machine.get_machine_from_node_name(
                random.choice(initial_worker_nodes))
            machineset_name = machine.get_machineset_from_machine_name(
                machine_name)
            machineset_replica_count = machine.get_replica_count(
                machineset_name)
            machine.add_node(machineset_name,
                             count=machineset_replica_count + nodes_to_add)
            logger.info("Waiting for the new node(s) to be in ready state")
            machine.wait_for_new_node_to_be_ready(machineset_name)
        else:
            if (config.ENV_DATA.get("platform").lower() ==
                    constants.VSPHERE_PLATFORM):
                pytest.skip(
                    "Skipping add node in VSPHERE due to https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                )
            # upi: add RHEL or RHCOS nodes depending on configuration
            is_rhel = config.ENV_DATA.get(
                "rhel_workers") or config.ENV_DATA.get("rhel_user")
            node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
            node.add_new_node_and_label_upi(
                node_type=node_type,
                num_nodes=nodes_to_add,
                mark_for_ocs_label=False,
            )

        new_worker_nodes = node.get_worker_nodes()
        new_nodes_added = list(
            set(new_worker_nodes) - set(initial_worker_nodes))
        assert (len(new_nodes_added) == nodes_to_add
                ), "Extra nodes not added in the cluster"
        non_ocs_nodes += new_nodes_added

    if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
        # Promote enough non-OCS workers to OCS by applying the storage label
        logger.info("Adding OCS storage label to Non-OCS workers")
        node_obj = ocp.OCP(kind=constants.NODE)
        nodes_to_label = non_ocs_nodes[0:(num_of_nodes - len(ocs_nodes))]
        for node_name in nodes_to_label:
            node_obj.add_label(resource_name=node_name,
                               label=constants.OPERATOR_NODE_LABEL)
            ocs_nodes.append(node_name)
        non_ocs_nodes = list(set(non_ocs_nodes) - set(ocs_nodes))

    logger.info(f"The OCS nodes are : {ocs_nodes}")
    logger.info(f"The Non-OCS nodes are: {non_ocs_nodes}")
    return ocs_nodes, non_ocs_nodes
def uninstall_ocs():
    """
    The function uninstalls the OCS operator from a openshift cluster and
    removes all its settings and dependencies

    """
    ocp_obj = ocp.OCP()
    provisioners = constants.OCS_PROVISIONERS

    # List the storage classes
    sc_list = [
        sc for sc in get_all_storageclass()
        if sc.get('provisioner') in provisioners
    ]

    # Query for PVCs and OBCs that are using the storage class provisioners listed in the previous step.
    pvc_to_delete = []
    for sc in sc_list:
        pvc_to_delete.extend(pvc for pvc in get_all_pvcs_in_storageclass(
            sc.get('metadata').get('name')) if 'noobaa' not in pvc.name)

    log.info("Removing monitoring stack from OpenShift Container Storage")
    remove_monitoring_stack_from_ocs()

    log.info(
        "Removing OpenShift Container Platform registry from OpenShift Container Storage"
    )
    remove_ocp_registry_from_ocs(config.ENV_DATA['platform'])

    log.info(
        "Removing the cluster logging operator from OpenShift Container Storage"
    )
    try:
        remove_cluster_logging_operator_from_ocs()
    except CommandFailed:
        # Best-effort: logging operator may simply not be installed
        log.info("No cluster logging found")

    log.info("Deleting pvcs")
    for pvc in pvc_to_delete:
        log.info(f"Deleting pvc: {pvc.name}")
        pvc.delete()

    storage_cluster = ocp.OCP(kind=constants.STORAGECLUSTER,
                              resource_name=constants.DEFAULT_CLUSTERNAME,
                              namespace='openshift-storage')

    log.info("Checking for local storage")
    lso_sc = None
    if check_local_volume():
        # Fix: this message was a bare string expression (a no-op
        # statement) in the original; emit it through the logger.
        log.info("Local volume was found. Will be removed later")
        lso_sc = storage_cluster.get().get('spec').get('storageDeviceSets')[
            0].get('dataPVCTemplate').get('spec').get('storageClassName')

    log.info("Deleting storageCluster object")
    storage_cluster.delete(resource_name=constants.DEFAULT_CLUSTERNAME)

    log.info("Removing CRDs")
    crd_list = [
        'backingstores.noobaa.io', 'bucketclasses.noobaa.io',
        'cephblockpools.ceph.rook.io', 'cephfilesystems.ceph.rook.io',
        'cephnfses.ceph.rook.io', 'cephobjectstores.ceph.rook.io',
        'cephobjectstoreusers.ceph.rook.io', 'noobaas.noobaa.io',
        'ocsinitializations.ocs.openshift.io',
        'storageclusterinitializations.ocs.openshift.io',
        'storageclusters.ocs.openshift.io', 'cephclusters.ceph.rook.io'
    ]
    for crd in crd_list:
        ocp_obj.exec_oc_cmd(f"delete crd {crd} --timeout=300m")

    log.info("Deleting openshift-storage namespace")
    ocp_obj.delete_project('openshift-storage')
    ocp_obj.wait_for_delete('openshift-storage')
    switch_to_project("default")

    log.info("Removing rook directory from nodes")
    nodes_list = get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    for node in nodes_list:
        log.info(f"Removing rook from {node}")
        ocp_obj.exec_oc_debug_cmd(node=node,
                                  cmd_list=["rm -rf /var/lib/rook"])

    log.info("Removing LSO ")
    if lso_sc is not None:
        uninstall_lso(lso_sc)

    log.info(
        "Delete the storage classes with an openshift-storage provisioner list"
    )
    for storage_class in sc_list:
        log.info(
            f"Deleting storage class {storage_class.get('metadata').get('name')}"
        )
        sc_obj = ocp.OCP(kind=constants.STORAGECLASS)
        sc_obj.delete(resource_name=storage_class.get('metadata').get('name'))

    # Trailing '-' on a label removes it
    log.info("Unlabeling storage nodes")
    nodes_list = get_all_nodes()
    for node in nodes_list:
        node_obj = ocp.OCP(kind=constants.NODE, resource_name=node)
        node_obj.add_label(resource_name=node,
                           label=constants.OPERATOR_NODE_LABEL[:-3] + '-')
        node_obj.add_label(resource_name=node,
                           label=constants.TOPOLOGY_ROOK_LABEL + '-')

    log.info("OCS was removed successfully from cluster ")
def add_worker_node(instance_type=None):
    """
    Label free worker nodes for the scale test and, on AWS IPI clusters,
    create dedicated 'app' worker machinesets so scale pods do not land on
    OCS nodes.

    Args:
        instance_type (str): AWS instance type for the new app workers.
            Defaults to "m5.4xlarge". Only used on aws-ipi deployments.

    Returns:
        bool: True when new AWS workers were created and labeled
            (aws-ipi path only; other platforms return None)

    Raises:
        UnsupportedPlatformError: For azure UPI deployments

    """
    # ms_name is shared with cleanup code that deletes these machinesets later
    global ms_name
    ms_name = list()
    worker_list = node.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get("use_ocs_worker_for_scale"):
        # OCS workers may host scale pods: label every worker
        if not scale_worker:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key="scale-label",
                                      label_value="app-scale")
    else:
        # Keep scale pods off OCS nodes: label only the non-OCS workers
        if not scale_worker:
            for node_item in ocs_worker_list:
                worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    logging.info(f"Print existing scale worker {scale_worker_list}")

    if (config.ENV_DATA["deployment_type"] == "ipi"
            and config.ENV_DATA["platform"].lower() == "aws"):
        log.info("Adding worker nodes on the current cluster")
        labels = [("node-role.kubernetes.io/app", "app-scale")]
        # Reuse existing app machinesets if any were created previously
        for obj in machine.get_machineset_objs():
            if "app" in obj.name:
                ms_name.append(obj.name)
        # FIX: replaced the redundant `instance_type = instance_type`
        # self-assignment branch with a plain default
        if instance_type is None:
            instance_type = "m5.4xlarge"
        if not ms_name:
            # 3 machinesets implies a multi-zone cluster: create one app
            # machineset per zone, otherwise a single one in zone 'a'
            if len(machine.get_machineset_objs()) == 3:
                for zone in ["a", "b", "c"]:
                    ms_name.append(
                        machine.create_custom_machineset(
                            instance_type=instance_type,
                            labels=labels,
                            zone=zone,
                        ))
            else:
                ms_name.append(
                    machine.create_custom_machineset(
                        instance_type=instance_type,
                        labels=labels,
                        zone="a",
                    ))
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)
        worker_list = node.get_worker_nodes()
        ocs_worker_list = machine.get_labeled_nodes(
            constants.OPERATOR_NODE_LABEL)
        scale_label_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
        ocs_worker_list.extend(scale_label_worker)
        # De-duplicate while preserving order, then label only the workers
        # that are neither OCS nodes nor already scale-labeled
        final_list = list(dict.fromkeys(ocs_worker_list))
        for node_item in final_list:
            if node_item in worker_list:
                worker_list.remove(node_item)
        if worker_list:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key="scale-label",
                                      label_value="app-scale")
        return True
    elif (config.ENV_DATA["deployment_type"] == "upi"
          and config.ENV_DATA["platform"].lower() == "vsphere"):
        log.info("Running scale test on existing worker nodes.")
    elif (config.ENV_DATA["deployment_type"] == "upi"
          and config.ENV_DATA["platform"].lower() == "baremetal"):
        log.info("Running scale test on existing worker nodes.")
    elif (config.ENV_DATA["deployment_type"] == "upi"
          and config.ENV_DATA["platform"].lower() == "azure"):
        raise UnsupportedPlatformError("Unsupported Platform")
def add_worker_node(instance_type=None):
    """
    Label free worker nodes for the workload and, on AWS IPI clusters,
    create extra worker machinesets so workload pods do not land on OCS
    nodes.

    Args:
        instance_type (str): AWS instance type for the new workers.
            Defaults to 'm5.4xlarge'. Only used on aws-ipi deployments.

    Returns:
        bool: True when new AWS workers were created and labeled
            (aws-ipi path only; other platforms return None)

    Raises:
        UnsupportedPlatformError: For azure UPI deployments

    """
    # ms_name is shared with cleanup code that deletes these machinesets later
    global ms_name
    ms_name = list()
    worker_list = helpers.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get('use_ocs_worker_for_scale'):
        # OCS workers may host workload pods: label every worker
        if not scale_worker:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key='scale-label',
                                      label_value='app-scale')
    else:
        # Keep workload pods off OCS nodes: label only the non-OCS workers
        if not scale_worker:
            for node_item in ocs_worker_list:
                worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(node_list=worker_list,
                                          label_key='scale-label',
                                          label_value='app-scale')
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    logging.info(f"Print existing scale worker {scale_worker_list}")

    if config.ENV_DATA['deployment_type'] == 'ipi' and config.ENV_DATA[
            'platform'].lower() == 'aws':
        log.info("Adding worker nodes on the current cluster")
        # Reuse existing app machinesets if any were created previously
        for obj in machine.get_machineset_objs():
            if 'app' in obj.name:
                ms_name.append(obj.name)
        # FIX: replaced the redundant `instance_type = instance_type`
        # self-assignment branch with a plain default
        if instance_type is None:
            instance_type = 'm5.4xlarge'
        if not ms_name:
            # 3 machinesets implies a multi-zone cluster: create one
            # machineset per zone, otherwise a single one in zone 'a'
            if len(machine.get_machineset_objs()) == 3:
                for zone in ['a', 'b', 'c']:
                    ms_name.append(
                        machine.create_custom_machineset(
                            instance_type=instance_type, zone=zone))
            else:
                ms_name.append(
                    machine.create_custom_machineset(
                        instance_type=instance_type, zone='a'))
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)
        worker_list = helpers.get_worker_nodes()
        ocs_worker_list = machine.get_labeled_nodes(
            constants.OPERATOR_NODE_LABEL)
        scale_label_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
        ocs_worker_list.extend(scale_label_worker)
        # De-duplicate while preserving order, then label only the workers
        # that are neither OCS nodes nor already scale-labeled
        final_list = list(dict.fromkeys(ocs_worker_list))
        for node_item in final_list:
            if node_item in worker_list:
                worker_list.remove(node_item)
        if worker_list:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key='scale-label',
                                      label_value='app-scale')
        return True
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() == 'vsphere':
        log.info('Running pgsql on existing worker nodes')
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() == 'baremetal':
        log.info('Running pgsql on existing worker nodes')
    elif config.ENV_DATA['deployment_type'] == 'upi' and config.ENV_DATA[
            'platform'].lower() == 'azure':
        raise UnsupportedPlatformError("Unsupported Platform")
def setup(
    self,
    request,
    scenario,
    nodes,
    multi_pvc_factory,
    service_account_factory,
    dc_pod_factory,
):
    """
    Identify the nodes and start multiple dc pods for the test

    Args:
        request: Pytest fixture request object, used to register the finalizer
        scenario (str): Scenario of app pods running on OCS or dedicated nodes
            (eg., 'colocated', 'dedicated')
        nodes: A fixture to get instance of the relevant platform nodes class
        multi_pvc_factory: A fixture create a set of new PVCs
        service_account_factory: A fixture to create a service account
        dc_pod_factory: A fixture to create dc pod

    Returns:
        list: dc pod objs
    """
    worker_nodes = node.get_worker_nodes()
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    # Workers that do not carry the OCS operator label
    non_ocs_nodes = list(set(worker_nodes) - set(ocs_nodes))

    def finalizer():
        # Undo the 'nodetype' labeling done below
        helpers.remove_label_from_worker_node(node_list=worker_nodes,
                                              label_key="nodetype")

        # Check ceph health
        ceph_health_check(tries=80)

    request.addfinalizer(finalizer)

    # The 'dedicated' scenario needs at least one non-OCS worker;
    # add a new unlabeled worker when none exists
    if (scenario == "dedicated") and len(non_ocs_nodes) == 0:
        if config.ENV_DATA.get("deployment_type").lower() == "ipi":
            machines = machine.get_machinesets()
            node.add_new_node_and_label_it(machines[0],
                                           num_nodes=1,
                                           mark_for_ocs_label=False)
        else:
            if (config.ENV_DATA.get("platform").lower() ==
                    constants.VSPHERE_PLATFORM):
                pytest.skip(
                    "Skipping add node in VSPHERE due to https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                )
            # RHEL vs RHCOS node type is inferred from the env config
            is_rhel = config.ENV_DATA.get(
                "rhel_workers") or config.ENV_DATA.get("rhel_user")
            node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
            node.add_new_node_and_label_upi(node_type=node_type,
                                            num_nodes=1,
                                            mark_for_ocs_label=False)
        # Refresh the non-OCS list after the node addition
        non_ocs_nodes = list(set(node.get_worker_nodes()) - set(ocs_nodes))

    # 'colocated' runs app pods on OCS nodes, otherwise on dedicated workers
    app_pod_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes

    # Label nodes to be able to run app pods
    helpers.label_worker_node(node_list=app_pod_nodes,
                              label_key="nodetype",
                              label_value="app-pod")
    # RWX on RBD requires block volume mode, hence the '-Block' suffix
    access_modes_rbd = [
        constants.ACCESS_MODE_RWO,
        f"{constants.ACCESS_MODE_RWX}-Block",
    ]

    access_modes_cephfs = [
        constants.ACCESS_MODE_RWO, constants.ACCESS_MODE_RWX
    ]

    pvcs_rbd = multi_pvc_factory(
        interface=constants.CEPHBLOCKPOOL,
        size=self.pvc_size,
        access_modes=access_modes_rbd,
        status=constants.STATUS_BOUND,
        num_of_pvc=len(access_modes_rbd),
    )

    # Reuse the project created for the RBD PVCs for the cephfs ones
    project = pvcs_rbd[0].project
    pvcs_cephfs = multi_pvc_factory(
        interface=constants.CEPHFILESYSTEM,
        project=project,
        size=self.pvc_size,
        access_modes=access_modes_cephfs,
        status=constants.STATUS_BOUND,
        num_of_pvc=len(access_modes_cephfs),
    )

    pvcs = pvcs_cephfs + pvcs_rbd
    # Set volume mode on PVC objects
    for pvc_obj in pvcs:
        pvc_info = pvc_obj.get()
        setattr(pvc_obj, "volume_mode", pvc_info["spec"]["volumeMode"])

    sa_obj = service_account_factory(project=project)
    pods = []

    # Create pods
    for pvc_obj in pvcs:
        if constants.CEPHFS_INTERFACE in pvc_obj.storageclass.name:
            interface = constants.CEPHFILESYSTEM
        else:
            interface = constants.CEPHBLOCKPOOL

        # Two pods per RWX PVC to exercise shared access, one otherwise
        num_pods = 2 if pvc_obj.access_mode == constants.ACCESS_MODE_RWX else 1
        logger.info("Creating app pods")
        for _ in range(num_pods):
            pods.append(
                dc_pod_factory(
                    interface=interface,
                    pvc=pvc_obj,
                    node_selector={"nodetype": "app-pod"},
                    raw_block_pv=pvc_obj.volume_mode == "Block",
                    sa_obj=sa_obj,
                ))

    logger.info(
        f"Created {len(pods)} pods using {len(pvcs_cephfs)} cephfs, {len(pvcs_rbd)} rbd PVCs."
    )

    return pods
def teardown():
    """Remove the application label from every worker node that carries it."""
    log.info('Clear label form worker (Application) nodes')
    labeled_workers = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
    helpers.remove_label_from_worker_node(
        labeled_workers, constants.APP_NODE_LABEL
    )
def identify_and_add_nodes(self, scenario, num_of_nodes):
    """
    Fetches info about the worker nodes and add nodes (if required)

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated nodes
            (eg., 'colocated', 'dedicated')
        num_of_nodes (int): number of nodes required for running test

    Returns:
        tuple: tuple containing:
            list: list of OCS nodes name
            list: list of non-OCS nodes name

    """
    nodes_to_add = 0
    initial_worker_nodes = helpers.get_worker_nodes()
    ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    non_ocs_nodes = list(set(initial_worker_nodes) - set(ocs_nodes))

    # 'colocated' measures the deficit against ALL workers because any
    # existing non-OCS worker can be converted by labeling it (done below);
    # 'dedicated' must measure against non-OCS workers only
    if 'colocated' in scenario and len(ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(initial_worker_nodes)

    if 'dedicated' in scenario and len(non_ocs_nodes) < num_of_nodes:
        nodes_to_add = num_of_nodes - len(non_ocs_nodes)

    if nodes_to_add > 0:
        logger.info(f"{nodes_to_add} extra workers nodes needed")

        if config.ENV_DATA['deployment_type'] == 'ipi':
            # Scale up an existing machineset picked from a random worker
            machine_name = machine.get_machine_from_node_name(
                random.choice(initial_worker_nodes)
            )
            machineset_name = machine.get_machineset_from_machine_name(
                machine_name
            )
            machineset_replica_count = machine.get_replica_count(
                machineset_name
            )
            machine.add_node(
                machineset_name,
                count=machineset_replica_count + nodes_to_add
            )
            logger.info("Waiting for the new node(s) to be in ready state")
            machine.wait_for_new_node_to_be_ready(machineset_name)
        else:
            # TODO: Add required num of nodes instead of skipping
            # https://github.com/red-hat-storage/ocs-ci/issues/1291
            pytest.skip("Add node not implemented for UPI, github issue #1291")

        new_worker_nodes = helpers.get_worker_nodes()
        new_nodes_added = list(set(new_worker_nodes) - set(initial_worker_nodes))
        assert len(new_nodes_added) > 0, 'Extra nodes not added in the cluster'
        # Newly added workers are unlabeled, hence non-OCS
        non_ocs_nodes += new_nodes_added

    if 'colocated' in scenario and len(ocs_nodes) < num_of_nodes:
        logger.info('Adding OCS storage label to Non-OCS workers')
        node_obj = ocp.OCP(kind=constants.NODE)
        # Convert just enough non-OCS workers to cover the OCS deficit
        nodes_to_label = non_ocs_nodes[0:(num_of_nodes - len(ocs_nodes))]
        for node_name in nodes_to_label:
            node_obj.add_label(
                resource_name=node_name, label=constants.OPERATOR_NODE_LABEL
            )
            ocs_nodes.append(node_name)
        non_ocs_nodes = list(set(non_ocs_nodes) - set(ocs_nodes))

    logger.info(f"The OCS nodes are : {ocs_nodes}")
    logger.info(f"The Non-OCS nodes are: {non_ocs_nodes}")
    return ocs_nodes, non_ocs_nodes