# Standard-library imports used throughout these test methods.
import math
import random
import time
# The remaining names (LOGGER, ThreadHelper, Waiter, ThreadingHandler, SSHClient,
# remote, CIConstants, VDiskHelper, VDiskSetup, VDiskRemover, DataWriter, VMHandler,
# SystemHelper, FwkHandler, ServiceFactory, HypervisorFactory, HypervisorCredentials,
# StorageRouterList, EdgeTester, TimeOutError) come from the OpenVStorage framework
# and this test suite. Assumed module paths, based on the docstrings below, would be e.g.:
#     from ci.api_lib.helpers.thread import ThreadHelper, Waiter
#     from ovs.extensions.generic.sshclient import SSHClient
#     from ovs.extensions.generic.remote import remote


@classmethod
def start_snapshotting_threads(cls, volume_bundle, args=(), kwargs=None, logger=LOGGER):
    """
    Start the snapshotting threads
    :param volume_bundle: bundle of volumes {vdiskname: vdisk object}
    :type volume_bundle: dict
    :param args: extra positional arguments passed to every snapshotting thread
    :type args: tuple
    :param kwargs: extra keyword arguments passed to every snapshotting thread
    :type kwargs: dict
    :param logger: logging instance
    :return: list of (thread, event) pairs
    :rtype: list
    """
    if kwargs is None:
        kwargs = {}
    threads = []
    current_thread_bundle = {'index': 1, 'vdisks': []}
    logger.info('Starting threads.')
    try:
        for index, (vdisk_name, vdisk_object) in enumerate(volume_bundle.iteritems(), 1):
            vdisks = current_thread_bundle['vdisks']
            vdisks.append(vdisk_object)
            if index % cls.VDISK_THREAD_LIMIT == 0 or index == len(volume_bundle.keys()):
                threads.append(ThreadHelper.start_thread_with_event(target=cls._start_snapshots,
                                                                    name='iops_{0}'.format(current_thread_bundle['index']),
                                                                    args=(vdisks,) + args,
                                                                    kwargs=kwargs))
                current_thread_bundle['index'] = index + 1
                current_thread_bundle['vdisks'] = []
    except Exception:
        for thread_pair in threads:  # Attempt to clean up the current in-flight threads
            if thread_pair[0].isAlive():
                thread_pair[1].set()
        for thread_pair in threads:  # Wait for the threads to die
            thread_pair[0].join()
        raise
    return threads
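# Hedged usage sketch, grounded in the scrubbing test further down (which passes
# kwargs={'interval': 15}); the (thread, event) pair layout is whatever
# ThreadHelper.start_thread_with_event returns, and 'vdisk_info' is assumed to be
# a {vdisk_name: vdisk} dict as built by the callers in this file:
#
#     snapshot_thread_pairs = ThreadingHandler.start_snapshotting_threads(volume_bundle=vdisk_info,
#                                                                         kwargs={'interval': 15})
#     ...  # let the workload run
#     ThreadHelper.stop_evented_threads(snapshot_thread_pairs, None)  # Snapshotting uses no r_semaphore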
@classmethod
def test_ha_fio(cls, fio_bin_path, cluster_info, is_ee, disk_amount=1, timeout=CIConstants.HA_TIMEOUT, logger=LOGGER):
    """
    Uses a modified fio to work with the openvstorage protocol
    :param fio_bin_path: path of the fio binary
    :type fio_bin_path: str
    :param cluster_info: information about the cluster, contains all dal objects
    :type cluster_info: dict
    :param is_ee: is it an ee version or not
    :type is_ee: bool
    :param disk_amount: amount of disks to test failover with
    :type disk_amount: int
    :param timeout: timeout in seconds
    :type timeout: int
    :param logger: logging instance
    :return: None
    :rtype: NoneType
    """
    destination_storagedriver = cluster_info['storagedrivers']['destination']
    source_storagedriver = cluster_info['storagedrivers']['source']
    vpool = destination_storagedriver.vpool
    compute_client = SSHClient(cluster_info['storagerouters']['compute'], username='******')
    vm_to_stop = cls.HYPERVISOR_INFO['vms'][source_storagedriver.storage_ip]['name']
    parent_hypervisor = HypervisorFactory().get()
    values_to_check = {'source_std': source_storagedriver.serialize(),
                       'target_std': destination_storagedriver.serialize(),
                       'vdisks': []}
    # Create vdisks
    protocol = source_storagedriver.cluster_node_config['network_server_uri'].split(':')[0]
    edge_configuration = {'fio_bin_location': fio_bin_path,
                          'hostname': source_storagedriver.storage_ip,
                          'port': source_storagedriver.ports['edge'],
                          'protocol': protocol,
                          'volumenames': []}
    if is_ee is True:
        edge_configuration.update(cls.get_shell_user())
    vdisk_info = {}
    failed_configurations = []
    for index in xrange(0, disk_amount):
        try:
            vdisk_name = '{0}_vdisk{1}'.format(cls.TEST_NAME, str(index).zfill(3))
            data_vdisk = VDiskHelper.get_vdisk_by_guid(VDiskSetup.create_vdisk(vdisk_name, vpool.name, cls.AMOUNT_TO_WRITE, source_storagedriver.storage_ip))
            vdisk_info[vdisk_name] = data_vdisk
            edge_configuration['volumenames'].append(data_vdisk.devicename.rsplit('.', 1)[0].split('/', 1)[1])
            values_to_check['vdisks'].append(data_vdisk.serialize())
        except TimeOutError:
            logger.error('Creating the vdisk has timed out.')
            raise
        except RuntimeError as ex:
            logger.error('Could not create the vdisk. Got {0}'.format(str(ex)))
            raise
    configuration = random.choice(cls.DATA_TEST_CASES)
    threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
    vm_downed = False
    screen_names = []
    try:
        logger.info('Starting threads.')  # Separate log entry: creating the vdisks takes a while, creating the threads does not
        io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
        threads['evented']['io']['pairs'] = io_thread_pairs
        threads['evented']['io']['r_semaphore'] = io_r_semaphore
        screen_names, output_files = DataWriter.write_data_fio(client=compute_client,
                                                               fio_configuration={'io_size': cls.AMOUNT_TO_WRITE,
                                                                                  'configuration': configuration},
                                                               edge_configuration=edge_configuration)
        logger.info('Doing IO for {0}s before bringing down the node.'.format(cls.IO_TIME))
        ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                              threads=io_thread_pairs,
                                              shared_resource=monitoring_data,
                                              duration=cls.IO_TIME)
        # Threads are ready for monitoring at this point
        #############################################
        # Bring the original owner of the volume down
        #############################################
        try:
            logger.info('Stopping {0}.'.format(vm_to_stop))
            VMHandler.stop_vm(hypervisor=parent_hypervisor, vmid=vm_to_stop)
            downed_time = time.time()
            vm_downed = True
        except Exception as ex:
            logger.error('Failed to stop. Got {0}'.format(str(ex)))
            raise
        time.sleep(cls.IO_REFRESH_RATE * 2)
        # Start IO polling to verify nothing went down
        ThreadingHandler.poll_io(r_semaphore=io_r_semaphore,
                                 required_thread_amount=len(io_thread_pairs),
                                 shared_resource=monitoring_data,
                                 downed_time=downed_time,
                                 timeout=timeout,
                                 output_files=output_files,
                                 client=compute_client,
                                 disk_amount=disk_amount)
        cls._validate(values_to_check, monitoring_data)
    except Exception as ex:
        failed_configurations.append({'configuration': configuration, 'reason': str(ex)})
    finally:
        for thread_category, thread_collection in threads['evented'].iteritems():
            ThreadHelper.stop_evented_threads(thread_collection['pairs'], thread_collection['r_semaphore'])
        if vm_downed is True:
            VMHandler.start_vm(parent_hypervisor, vm_to_stop)
            SystemHelper.idle_till_ovs_is_up(source_storagedriver.storage_ip, **cls.get_shell_user())
            # @TODO: Remove when https://github.com/openvstorage/integrationtests/issues/540 is fixed
            FwkHandler.restart_all()
        if screen_names:
            for screen_name in screen_names:
                compute_client.run(['screen', '-S', screen_name, '-X', 'quit'])
        for vdisk in vdisk_info.values():
            VDiskRemover.remove_vdisk(vdisk.guid)
    assert len(failed_configurations) == 0, 'Certain configurations failed: {0}'.format(failed_configurations)
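# Hedged invocation sketch. The 'cluster_info' keys below are exactly the ones
# test_ha_fio reads; the class name HATester, the variable names and the fio
# binary path are illustrative assumptions:
#
#     cluster_info = {'storagedrivers': {'source': source_std, 'destination': destination_std},
#                     'storagerouters': {'compute': compute_storagerouter}}
#     HATester.test_ha_fio(fio_bin_path='/tmp/fio.bin', cluster_info=cluster_info,
#                          is_ee=False, disk_amount=2)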
@classmethod
def start_io_polling_threads(cls, volume_bundle, logger=LOGGER):
    """
    Will start the IO polling threads
    :param volume_bundle: bundle of volumes {vdiskname: vdisk object}
    :type volume_bundle: dict
    :param logger: logger instance
    :type logger: ovs.log.log_handler.LogHandler
    :return: threads, monitoring_data, r_semaphore
    :rtype: tuple(list, dict, ci.api_lib.helpers.thread.Waiter)
    """
    required_thread_amount = int(math.ceil(float(len(volume_bundle.keys())) / cls.VDISK_THREAD_LIMIT))  # Amount of threads we will need
    r_semaphore = Waiter(required_thread_amount + 1, auto_reset=True)  # Add another target to let this thread control the semaphore
    threads = []
    monitoring_data = {}
    current_thread_bundle = {'index': 1, 'vdisks': []}
    logger.info('Starting threads.')  # Separate log entry: creating the vdisks takes a while, creating the threads does not
    try:
        for index, (vdisk_name, vdisk_object) in enumerate(volume_bundle.iteritems(), 1):
            vdisks = current_thread_bundle['vdisks']
            volume_number_range = '{0}-{1}'.format(current_thread_bundle['index'], index)
            vdisks.append(vdisk_object)
            if index % cls.VDISK_THREAD_LIMIT == 0 or index == len(volume_bundle.keys()):
                # New thread bundle
                monitor_resource = {'general': {'io': [], 'edge_clients': {}}}
                # noinspection PyTypeChecker
                for vdisk in vdisks:
                    monitor_resource[vdisk.name] = {'io': {'down': [], 'descending': [], 'rising': [], 'highest': None, 'lowest': None},
                                                    'edge_clients': {'down': [], 'up': []}}
                monitoring_data[volume_number_range] = monitor_resource
                threads.append(ThreadHelper.start_thread_with_event(target=cls.monitor_changes,
                                                                    name='iops_{0}'.format(current_thread_bundle['index']),
                                                                    args=(monitor_resource, vdisks, r_semaphore)))
                current_thread_bundle['index'] = index + 1
                current_thread_bundle['vdisks'] = []
    except Exception:
        for thread_pair in threads:  # Attempt to clean up the current in-flight threads
            if thread_pair[0].isAlive():
                thread_pair[1].set()
        while r_semaphore.get_counter() < len(threads):  # Wait for the number of threads we currently have
            time.sleep(0.05)
        r_semaphore.wait()  # Unlock them to let them stop (the event is set, so they won't loop again)
        for thread_pair in threads:  # Wait for the threads to die
            thread_pair[0].join()
        raise
    return threads, monitoring_data, r_semaphore
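# Hedged usage sketch, mirroring the pattern every test below follows: start the
# pollers, keep them spinning while IO runs, then hand the same r_semaphore to
# poll_io after the failure event. 'vdisk_info' and the 60s duration are
# illustrative placeholders:
#
#     io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
#     ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
#                                           threads=io_thread_pairs,
#                                           shared_resource=monitoring_data,
#                                           duration=60)
#     ...  # trigger the failure, then poll_io(...) and finally stop_evented_threads(...)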
@classmethod
def run_test(cls, vm_info, cluster_info, logger=LOGGER):
    """
    Tests the HA using a virtual machine which will write to its own filesystem
    :param cluster_info: information about the cluster, contains all dal objects
    :type cluster_info: dict
    :param vm_info: info about the vms
    :type vm_info: dict
    :param logger: logging instance
    :return: None
    :rtype: NoneType
    """
    compute_client = SSHClient(cluster_info['storagerouters']['compute'], username='******')
    failed_configurations = []
    destination_storagedriver = cluster_info['storagedrivers']['destination']
    source_storagedriver = cluster_info['storagedrivers']['source']
    # Cache to validate properties
    values_to_check = {'source_std': source_storagedriver.serialize(),
                       'target_std': destination_storagedriver.serialize()}
    vm_to_stop = cls.HYPERVISOR_INFO['vms'][source_storagedriver.storage_ip]['name']
    parent_hypervisor = HypervisorFactory().get()
    # Extract vdisk info from vm_info
    vdisk_info = {}
    disk_amount = 0
    for vm_name, vm_object in vm_info.iteritems():
        for vdisk in vm_object['vdisks']:
            # Ignore the cd vdisk as no IO will come from it
            if vdisk.name == vm_object['cd_path'].replace('.raw', '').split('/')[-1]:
                continue
            disk_amount += 1
            vdisk_info.update({vdisk.name: vdisk})
    with remote(compute_client.ip, [SSHClient]) as rem:
        configuration = random.choice(cls.DATA_TEST_CASES)
        threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
        output_files = []
        vm_downed = False
        try:
            logger.info('Starting the following configuration: {0}'.format(configuration))
            for vm_name, vm_data in vm_info.iteritems():
                vm_client = rem.SSHClient(vm_data['ip'], cls.VM_USERNAME, cls.VM_PASSWORD)
                vm_client.file_create('/mnt/data/{0}.raw'.format(vm_data['create_msg']))
                vm_data['client'] = vm_client
            io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
            threads['evented']['io']['pairs'] = io_thread_pairs
            threads['evented']['io']['r_semaphore'] = io_r_semaphore
            for vm_name, vm_data in vm_info.iteritems():  # Write data
                screen_names, output_files = DataWriter.write_data_fio(client=vm_data['client'],
                                                                       fio_configuration={'io_size': cls.AMOUNT_TO_WRITE,
                                                                                          'configuration': configuration},
                                                                       file_locations=['/mnt/data/{0}.raw'.format(vm_data['create_msg'])])
                vm_data['screen_names'] = screen_names
            logger.info('Doing IO for {0}s before bringing down the node.'.format(cls.IO_TIME))
            ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                  threads=io_thread_pairs,
                                                  shared_resource=monitoring_data,
                                                  duration=cls.IO_TIME)
            # Threads are ready for monitoring at this point
            #############################################
            # Bring the original owner of the volume down
            #############################################
            try:
                logger.info('Stopping {0}.'.format(vm_to_stop))
                VMHandler.stop_vm(hypervisor=parent_hypervisor, vmid=vm_to_stop)
                vm_downed = True
            except Exception as ex:
                logger.error('Failed to stop. Got {0}'.format(str(ex)))
                raise
            downed_time = time.time()
            time.sleep(cls.IO_REFRESH_RATE * 2)
            # Start IO polling to verify nothing went down
            ThreadingHandler.poll_io(r_semaphore=io_r_semaphore,
                                     required_thread_amount=len(io_thread_pairs),
                                     shared_resource=monitoring_data,
                                     downed_time=downed_time,
                                     timeout=cls.HA_TIMEOUT,
                                     output_files=output_files,
                                     client=compute_client,
                                     disk_amount=disk_amount)
            cls._validate(values_to_check, monitoring_data)
        except Exception as ex:
            logger.error('Running the test for configuration {0} has failed because {1}'.format(configuration, str(ex)))
            failed_configurations.append({'configuration': configuration, 'reason': str(ex)})
        finally:
            for thread_category, thread_collection in threads['evented'].iteritems():
                ThreadHelper.stop_evented_threads(thread_collection['pairs'], thread_collection['r_semaphore'])
            if vm_downed is True:
                VMHandler.start_vm(parent_hypervisor, vm_to_stop)
                logger.debug('Started {0}'.format(vm_to_stop))
                SystemHelper.idle_till_ovs_is_up(source_storagedriver.storage_ip, **cls.get_shell_user())
                # @TODO: Remove when https://github.com/openvstorage/integrationtests/issues/540 is fixed
                FwkHandler.restart_all()
            for vm_name, vm_data in vm_info.iteritems():
                for screen_name in vm_data.get('screen_names', []):
                    logger.debug('Stopping screen {0} on {1}.'.format(screen_name, vm_data['client'].ip))
                    vm_data['client'].run(['screen', '-S', screen_name, '-X', 'quit'])
                vm_data['screen_names'] = []
    assert len(failed_configurations) == 0, 'Certain configurations failed: {0}'.format(failed_configurations)
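# Hedged sketch of the 'vm_info' shape this method expects (keys inferred from
# the reads above; names and values are illustrative placeholders):
#
#     vm_info = {'HA-vm-000': {'ip': '192.168.122.10',            # hypothetical guest IP
#                              'create_msg': 'ha_vm_000_marker',   # unique marker; also names the raw data file
#                              'cd_path': '/mnt/data/cd.raw',      # the cd vdisk is skipped for IO monitoring
#                              'vdisks': [boot_vdisk, data_vdisk]}}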
@classmethod
def run_test(cls, vm_info, cluster_info, logger=LOGGER):
    """
    Tests the DTL using a virtual machine which will write to its own filesystem
    Expects the last data to be pulled from the DTL and not from the backend
    :param cluster_info: information about the cluster, contains all dal objects
    :type cluster_info: dict
    :param vm_info: info about the vms
    :type vm_info: dict
    :param logger: logging instance
    :return: None
    :rtype: NoneType
    """
    source_std = cluster_info['storagedrivers']['source']
    source_client = SSHClient(source_std.storagerouter, username='******')
    compute_str = cluster_info['storagerouters']['compute']
    compute_client = SSHClient(compute_str)
    # Setup hypervisor details
    parent_hypervisor = HypervisorFactory().get()
    vm_to_stop = cls.HYPERVISOR_INFO['vms'][source_std.storage_ip]['name']
    vdisk_info = {}
    disk_amount = 0
    for vm_name, vm_object in vm_info.iteritems():
        for vdisk in vm_object['vdisks']:
            # Ignore the cd vdisk as no IO will come from it
            if vdisk.name == vm_object['cd_path'].replace('.raw', '').split('/')[-1]:
                continue
            disk_amount += 1
            vdisk_info.update({vdisk.name: vdisk})
    # Cache to validate properties
    values_to_check = {'source_std': source_std.serialize(),
                       'vdisks': vdisk_info.values()}
    with remote(compute_str.ip, [SSHClient]) as rem:
        threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
        vm_downed = False
        output_files = []
        try:
            for vm_name, vm_data in vm_info.iteritems():
                vm_client = rem.SSHClient(vm_data['ip'], cls.VM_USERNAME, cls.VM_PASSWORD)
                vm_client.file_create('/mnt/data/{0}.raw'.format(vm_data['create_msg']))
                vm_data['client'] = vm_client
                # Load dd, md5sum, screen & fio in memory
                vm_data['client'].run(['dd', 'if=/dev/urandom', 'of={0}'.format(cls.VM_RANDOM), 'bs=1M', 'count=2'])
                vm_data['client'].run(['md5sum', cls.VM_RANDOM])
            logger.info('Restarting proxy services.')  # The restart takes the proxies offline for the writes below
            service_manager = ServiceFactory.get_manager()
            for proxy in source_std.alba_proxies:
                service_manager.restart_service(proxy.service.name, client=source_client)
            logger.info('Starting to WRITE a file while the proxy is offline. All data should be stored in the DTL!')
            for vm_name, vm_data in vm_info.iteritems():
                vm_data['client'].run('dd if=/dev/urandom of={0} bs=1M count=2'.format(cls.VM_FILENAME).split())
                original_md5sum = ' '.join(vm_data['client'].run(['md5sum', cls.VM_FILENAME]).split())
                vm_data['original_md5sum'] = original_md5sum
                logger.info('Original MD5SUM for VM {0}: {1}.'.format(vm_name, original_md5sum))
            logger.info('Finished writing the file while the proxy is offline!')
            logger.info('Starting fio to generate IO for failing over.')
            io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
            threads['evented']['io']['pairs'] = io_thread_pairs
            threads['evented']['io']['r_semaphore'] = io_r_semaphore
            for vm_name, vm_data in vm_info.iteritems():  # Write data
                screen_names, output_files = DataWriter.write_data_fio(client=vm_data['client'],
                                                                       fio_configuration={'io_size': cls.AMOUNT_TO_WRITE,
                                                                                          'configuration': cls.IO_PATTERN},
                                                                       file_locations=['/mnt/data/{0}.raw'.format(vm_data['create_msg'])])
                vm_data['screen_names'] = screen_names
            logger.info('Doing IO for {0}s before bringing down the node.'.format(cls.IO_TIME))
            ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                  threads=io_thread_pairs,
                                                  shared_resource=monitoring_data,
                                                  duration=cls.IO_TIME)
            ##############################################
            # Bringing original owner of the volume down #
            ##############################################
            VMHandler.stop_vm(hypervisor=parent_hypervisor, vmid=vm_to_stop)
            vm_downed = True
            downed_time = time.time()
            time.sleep(cls.IO_REFRESH_RATE * 2)
            # Start IO polling to verify nothing went down
            ThreadingHandler.poll_io(r_semaphore=io_r_semaphore,
                                     required_thread_amount=len(io_thread_pairs),
                                     shared_resource=monitoring_data,
                                     downed_time=downed_time,
                                     timeout=cls.HA_TIMEOUT,
                                     output_files=output_files,
                                     client=compute_client,
                                     disk_amount=disk_amount)
            logger.info('Starting to validate the move...')
            cls._validate_move(values_to_check)
            logger.info('Finished validating the move!')
            logger.info('Validating whether the DTL is working correctly!')
            unmatching_checksum_vms = []
            for vm_name, vm_data in vm_info.iteritems():
                current_md5sum = ' '.join(vm_data['client'].run(['md5sum', cls.VM_FILENAME]).split())
                if vm_data['original_md5sum'] != current_md5sum:
                    unmatching_checksum_vms.append(vm_name)
            assert len(unmatching_checksum_vms) == 0, 'Not all data was read from the DTL. Checksums do not line up for {0}'.format(', '.join(unmatching_checksum_vms))
            logger.info('DTL is working correctly!')
        finally:
            for thread_category, thread_collection in threads['evented'].iteritems():
                ThreadHelper.stop_evented_threads(thread_collection['pairs'], thread_collection['r_semaphore'])
            if vm_downed is True:
                VMHandler.start_vm(parent_hypervisor, vm_to_stop)
                logger.debug('Started {0}'.format(vm_to_stop))
                SystemHelper.idle_till_ovs_is_up(source_std.storage_ip, **cls.get_shell_user())
                # @TODO: Remove when https://github.com/openvstorage/integrationtests/issues/540 is fixed
                FwkHandler.restart_all()
            for vm_name, vm_data in vm_info.iteritems():
                for screen_name in vm_data.get('screen_names', []):
                    logger.debug('Stopping screen {0} on {1}.'.format(screen_name, vm_data['client'].ip))
                    vm_data['client'].run(['screen', '-S', screen_name, '-X', 'quit'])
                vm_data['screen_names'] = []
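# The DTL verification above boils down to this invariant, sketched here with
# hypothetical names: a file written while the proxy could not reach the backend
# must read back identically after failover, because the edge replays it from the DTL.
#
#     before = vm_client.run(['md5sum', cls.VM_FILENAME])
#     ...  # fail the node over
#     after = vm_client.run(['md5sum', cls.VM_FILENAME])
#     assert before.split()[0] == after.split()[0], 'DTL did not replay the last writes'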
@classmethod
def run_test(cls, cluster_info, compute_client, vm_info, vm_username=CIConstants.VM_USERNAME, vm_password=CIConstants.VM_PASSWORD,
             timeout=TEST_TIMEOUT, data_test_cases=CIConstants.DATA_TEST_CASES, logger=LOGGER):
    """
    Runs the test as described in https://github.com/openvstorage/dev_ops/issues/64
    :param cluster_info: information about the cluster
    :param compute_client: SSHClient of the compute node
    :param vm_info: vm information
    :param vm_username: username to log in on all vms
    :param vm_password: password to log in on all vms
    :param timeout: timeout in seconds
    :param data_test_cases: data rw ratios to test
    :param logger: logging instance
    :return: None
    """
    compute_str = cluster_info['storagerouters']['compute']
    destination_storagedriver = cluster_info['storagedrivers']['destination']
    source_storagedriver = cluster_info['storagedrivers']['source']
    # Cache to validate properties
    values_to_check = {'source_std': source_storagedriver.serialize(),
                       'target_std': destination_storagedriver.serialize()}
    # Prep VM listener
    failed_configurations = []
    # Extract vdisk info from vm_info - only take the data disks
    vdisk_info = {}
    disk_amount = 0
    for vm_name, vm_object in vm_info.iteritems():
        for vdisk in vm_object['vdisks']:
            if 'vdisk_data' in vdisk.name:
                vdisk_info.update({vdisk.name: vdisk})
                disk_amount += 1
    try:
        cls._adjust_automatic_scrubbing(disable=True)
        with remote(compute_str.ip, [SSHClient]) as rem:
            configuration = random.choice(data_test_cases)
            threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None},
                                   'snapshots': {'pairs': [], 'r_semaphore': None}}}
            output_files = []
            safety_set = False
            try:
                logger.info('Starting the following configuration: {0}'.format(configuration))
                for vm_name, vm_data in vm_info.iteritems():
                    vm_client = rem.SSHClient(vm_data['ip'], vm_username, vm_password)
                    vm_client.file_create('/mnt/data/{0}.raw'.format(vm_data['create_msg']))
                    vm_data['client'] = vm_client
                cls._set_mds_safety(source_storagedriver.vpool, 1, checkup=True)  # Set the safety to trigger the mds
                safety_set = True
                io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
                threads['evented']['io']['pairs'] = io_thread_pairs
                threads['evented']['io']['r_semaphore'] = io_r_semaphore
                # @todo snapshot every minute
                threads['evented']['snapshots']['pairs'] = ThreadingHandler.start_snapshotting_threads(volume_bundle=vdisk_info,
                                                                                                       kwargs={'interval': 15})
                for vm_name, vm_data in vm_info.iteritems():  # Write data
                    screen_names, output_files = DataWriter.write_data_fio(client=vm_data['client'],
                                                                           fio_configuration={'io_size': cls.AMOUNT_TO_WRITE,
                                                                                              'configuration': configuration},
                                                                           file_locations=['/mnt/data/{0}.raw'.format(vm_data['create_msg'])])
                    vm_data['screen_names'] = screen_names
                logger.info('Doing IO for {0}s before bringing down the node.'.format(cls.IO_TIME))
                ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                      threads=io_thread_pairs,
                                                      shared_resource=monitoring_data,
                                                      duration=cls.IO_TIME / 2)
                ThreadHelper.stop_evented_threads(threads['evented']['snapshots']['pairs'],
                                                  threads['evented']['snapshots']['r_semaphore'])  # Stop snapshotting
                cls._delete_snapshots(volume_bundle=vdisk_info)
                # Start the scrubbing thread
                async_scrubbing = cls.start_scrubbing(volume_bundle=vdisk_info)  # Starting to scrub
                cls._trigger_mds_issue(cluster_info['vpool'], vdisk_info, destination_storagedriver.storagerouter.guid)  # Trigger mds failover while the scrubber is busy
                # Keep monitoring for another 60s
                ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                      threads=io_thread_pairs,
                                                      shared_resource=monitoring_data,
                                                      duration=cls.IO_TIME / 2)
                time.sleep(cls.IO_REFRESH_RATE * 2)
                downed_time = time.time()
                # Start IO polling to verify nothing went down
                ThreadingHandler.poll_io(r_semaphore=io_r_semaphore,
                                         required_thread_amount=len(io_thread_pairs),
                                         shared_resource=monitoring_data,
                                         downed_time=downed_time,
                                         timeout=timeout,
                                         output_files=output_files,
                                         client=compute_client,
                                         disk_amount=disk_amount)
                possible_scrub_errors = async_scrubbing.get()  # Wait until the scrubbing calls have returned a result
                assert len(possible_scrub_errors) == 0, 'Scrubbing has encountered some errors: {0}'.format(', '.join(possible_scrub_errors))
                cls._validate(values_to_check, monitoring_data)
            except Exception as ex:
                logger.error('Running the test for configuration {0} has failed because {1}'.format(configuration, str(ex)))
                failed_configurations.append({'configuration': configuration, 'reason': str(ex)})
                raise
            finally:
                for thread_category, thread_collection in threads['evented'].iteritems():
                    ThreadHelper.stop_evented_threads(thread_collection['pairs'], thread_collection['r_semaphore'])
                for vm_name, vm_data in vm_info.iteritems():
                    for screen_name in vm_data.get('screen_names', []):
                        logger.debug('Stopping screen {0} on {1}.'.format(screen_name, vm_data['client'].ip))
                        vm_data['client'].run(['screen', '-S', screen_name, '-X', 'quit'])
                    vm_data['screen_names'] = []
                if safety_set is True:
                    cls._set_mds_safety(source_storagedriver.vpool, len(StorageRouterList.get_masters()), checkup=True)
    finally:
        cls._adjust_automatic_scrubbing(disable=False)
    assert len(failed_configurations) == 0, 'Certain configurations failed: {0}'.format(failed_configurations)
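# Hedged timeline sketch of the scenario above; the helper names are the ones the
# method itself calls, only the ordering commentary is added:
#
#     cls._set_mds_safety(vpool, 1, checkup=True)        # 1. single MDS copy, so failover matters
#     ThreadingHandler.start_io_polling_threads(...)     # 2. baseline IO monitoring
#     ThreadingHandler.start_snapshotting_threads(...)   # 3. snapshot every 15s under IO
#     ThreadingHandler.keep_threads_running(...)         # 4. run for IO_TIME / 2
#     cls._delete_snapshots(...); cls.start_scrubbing(...)   # 5. give the scrubber work
#     cls._trigger_mds_issue(...)                        # 6. MDS failover while the scrubber is busy
#     ThreadingHandler.poll_io(...); async_scrubbing.get()   # 7. verify IO survived and scrubbing is clean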
@classmethod
def test_reroute_fio(cls, fio_bin_path, cluster_info, disk_amount=1, timeout=CIConstants.HA_TIMEOUT, is_ee=False, logger=LOGGER):
    """
    Uses a modified fio to work with the openvstorage protocol
    :param fio_bin_path: path of the fio binary
    :type fio_bin_path: str
    :param cluster_info: information about the cluster, contains all dal objects
    :type cluster_info: dict
    :param disk_amount: amount of disks to test failover with
    :type disk_amount: int
    :param timeout: timeout in seconds
    :type timeout: int
    :param is_ee: is it the enterprise edition
    :type is_ee: bool
    :param logger: logger instance
    :type logger: ovs.log.log_handler.LogHandler
    :return: None
    :rtype: NoneType
    """
    compute_client = SSHClient(cluster_info['storagerouters']['compute'], username='******')
    destination_std = cluster_info['storagedrivers']['destination']
    source_std = cluster_info['storagedrivers']['source']  # Will be downed
    vpool = source_std.vpool
    values_to_check = {'source_std': source_std.serialize(),
                       'target_std': destination_std.serialize(),
                       'vdisks': []}
    # Create vdisks
    protocol = source_std.cluster_node_config['network_server_uri'].split(':')[0]
    edge_configuration = {'fio_bin_location': fio_bin_path,
                          'hostname': source_std.storage_ip,
                          'port': source_std.ports['edge'],
                          'protocol': protocol,
                          'volumenames': []}
    vdisk_info = {}
    failed_configurations = []
    if is_ee is True:
        edge_configuration.update(cls.get_shell_user())
    for index in xrange(0, disk_amount):
        try:
            vdisk_name = '{0}_vdisk{1}'.format(EdgeTester.TEST_NAME, str(index).zfill(4))
            data_vdisk = VDiskHelper.get_vdisk_by_guid(VDiskSetup.create_vdisk(vdisk_name, vpool.name, EdgeTester.AMOUNT_TO_WRITE * 2, source_std.storage_ip))
            vdisk_info[vdisk_name] = data_vdisk
            edge_configuration['volumenames'].append(data_vdisk.devicename.rsplit('.', 1)[0].split('/', 1)[1])
            values_to_check['vdisks'].append(data_vdisk.serialize())
        except RuntimeError as ex:
            logger.error('Could not create the vdisk. Got {0}'.format(str(ex)))
            raise
    configuration = random.choice(cls.DATA_TEST_CASES)
    threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
    screen_names = []
    adjusted = False
    try:
        io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
        threads['evented']['io']['pairs'] = io_thread_pairs
        threads['evented']['io']['r_semaphore'] = io_r_semaphore
        screen_names, output_files = DataWriter.write_data_fio(client=compute_client,
                                                               fio_configuration={'io_size': cls.AMOUNT_TO_WRITE,
                                                                                  'configuration': configuration},
                                                               edge_configuration=edge_configuration)
        logger.info('Doing IO for {0}s before bringing down the node.'.format(cls.IO_TIME))
        ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                              threads=io_thread_pairs,
                                              shared_resource=monitoring_data,
                                              duration=cls.IO_TIME)
        # Threads are ready for monitoring at this point; they are waiting to resume
        EdgeTester.adjust_for_reroute(source_std.storagerouter,
                                      trigger_rerout=True,
                                      ip_to_block=compute_client.ip,
                                      additional_ports=[edge_configuration['port']])
        adjusted = True
        downed_time = time.time()
        logger.info('Now waiting two refresh-rate intervals to avoid caching. In total {0}s'.format(EdgeTester.IO_REFRESH_RATE * 2))
        time.sleep(cls.IO_REFRESH_RATE * 2)
        ThreadingHandler.poll_io(r_semaphore=io_r_semaphore,
                                 required_thread_amount=len(io_thread_pairs),
                                 shared_resource=monitoring_data,
                                 downed_time=downed_time,
                                 timeout=timeout,
                                 output_files=output_files,
                                 client=compute_client,
                                 disk_amount=disk_amount)
        EdgeTester._validate_dal(values_to_check)  # Validate
    except Exception as ex:
        logger.error('Got an exception while running configuration {0}. Namely: {1}'.format(configuration, str(ex)))
        failed_configurations.append({'configuration': configuration, 'reason': str(ex)})
    finally:
        if adjusted is True:
            EdgeTester.adjust_for_reroute(source_std.storagerouter,
                                          trigger_rerout=False,
                                          ip_to_block=compute_client.ip,
                                          additional_ports=[edge_configuration['port']])
        for screen_name in screen_names:
            compute_client.run(['screen', '-S', screen_name, '-X', 'quit'])
        for thread_category, thread_collection in threads['evented'].iteritems():
            ThreadHelper.stop_evented_threads(thread_collection['pairs'], thread_collection['r_semaphore'])
        for vdisk in vdisk_info.values():
            VDiskRemover.remove_vdisk(vdisk.guid)
    assert len(failed_configurations) == 0, 'Certain configurations failed: {0}'.format(failed_configurations)
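# Hedged invocation sketch; the cluster_info keys mirror what test_reroute_fio
# reads, and adjust_for_reroute is this module's own helper (assumed to block
# traffic on the source node so the edge client is forced to reroute):
#
#     EdgeTester.test_reroute_fio(fio_bin_path='/tmp/fio.bin',
#                                 cluster_info={'storagedrivers': {'source': source_std,
#                                                                  'destination': destination_std},
#                                               'storagerouters': {'compute': compute_storagerouter}},
#                                 disk_amount=2, is_ee=False)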
@classmethod
def live_migrate(cls, vm_info, cluster_info, disk_amount, hypervisor_info, logger=LOGGER):
    """
    Execute the live migration test
    Migrates the vm away using the libvirt migrate call
    Expects the DAL to be updated, as the IO causes the volumedriver to move the volume
    :param vm_info: info about the vms
    :param cluster_info: information about the cluster, contains all dal objects
    :param disk_amount: amount of disks to monitor
    :param hypervisor_info: hypervisor credentials ('user', 'password', 'type')
    :param logger: logging instance
    :return: None
    """
    failed_configurations = []
    destination_storagedriver = cluster_info['storagedrivers']['destination']
    source_storagedriver = cluster_info['storagedrivers']['source']
    hv_credentials = HypervisorCredentials(ip=source_storagedriver.storage_ip,
                                           user=hypervisor_info['user'],
                                           password=hypervisor_info['password'],
                                           type=hypervisor_info['type'])
    source_hypervisor = HypervisorFactory().get(hv_credentials=hv_credentials)
    client = SSHClient(source_storagedriver.storagerouter)
    # Cache to validate properties
    values_to_check = {'source_std': source_storagedriver.serialize(),
                       'target_std': destination_storagedriver.serialize()}
    # Extract vdisk info from vm_info
    vdisk_info = {}
    for vm_name, vm_object in vm_info.iteritems():
        for vdisk in vm_object['vdisks']:
            vdisk_info.update({vdisk.name: vdisk})
    with remote(source_storagedriver.storage_ip, [SSHClient]) as rem:
        test_run_nr = 0
        configuration = random.choice(cls.DATA_TEST_CASES)
        threads = {'evented': {'io': {'pairs': [], 'r_semaphore': None}}}
        output_files = []
        try:
            logger.info('Starting the following configuration: {0}'.format(configuration))
            if test_run_nr == 0:  # Build reusable ssh clients
                for vm_name, vm_data in vm_info.iteritems():
                    vm_client = rem.SSHClient(vm_data['ip'], cls.VM_USERNAME, cls.VM_PASSWORD)
                    vm_client.file_create('/mnt/data/{0}.raw'.format(vm_data['create_msg']))
                    vm_data['client'] = vm_client
            else:
                for vm_name, vm_data in vm_info.iteritems():
                    vm_data['client'].run(['rm', '/mnt/data/{0}.raw'.format(vm_data['create_msg'])])
            io_thread_pairs, monitoring_data, io_r_semaphore = ThreadingHandler.start_io_polling_threads(volume_bundle=vdisk_info)
            threads['evented']['io']['pairs'] = io_thread_pairs
            threads['evented']['io']['r_semaphore'] = io_r_semaphore
            for vm_name, vm_data in vm_info.iteritems():  # Write data
                screen_names, output_files = DataWriter.write_data_fio(client=vm_data['client'],
                                                                       fio_configuration={'io_size': cls.AMOUNT_TO_WRITE,
                                                                                          'configuration': configuration},
                                                                       file_locations=['/mnt/data/{0}.raw'.format(vm_data['create_msg'])])
                vm_data['screen_names'] = screen_names
            logger.info('Doing IO for {0}s before bringing down the node.'.format(cls.IO_TIME))
            ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                  threads=io_thread_pairs,
                                                  shared_resource=monitoring_data,
                                                  duration=cls.IO_TIME)
            # Threads are ready for monitoring at this point
            ###################
            # Migrate the VMs
            ###################
            try:
                logger.info('Migrating the VM.')
                for vm_name in vm_info:
                    source_hypervisor.sdk.migrate(vm_name, destination_storagedriver.storage_ip, hypervisor_info['user'])
            except Exception as ex:
                logger.error('Failed to migrate. Got {0}'.format(str(ex)))
                raise
            downed_time = time.time()
            time.sleep(cls.IO_REFRESH_RATE * 2)
            # Start IO polling to verify nothing went down
            ThreadingHandler.poll_io(r_semaphore=io_r_semaphore,
                                     required_thread_amount=len(io_thread_pairs),
                                     shared_resource=monitoring_data,
                                     downed_time=downed_time,
                                     timeout=cls.FAILOVER_TIMEOUT,
                                     output_files=output_files,
                                     client=client,
                                     disk_amount=disk_amount)
            # Do some more IO to trigger the ownership migration
            ThreadingHandler.keep_threads_running(r_semaphore=io_r_semaphore,
                                                  threads=io_thread_pairs,
                                                  shared_resource=monitoring_data,
                                                  duration=cls.IO_TIME)
            cls._validate_move(values_to_check)
        except Exception as ex:
            logger.error('Running the test for configuration {0} has failed because {1}'.format(configuration, str(ex)))
            failed_configurations.append({'configuration': configuration, 'reason': str(ex)})
        finally:
            for thread_category, thread_collection in threads['evented'].iteritems():
                ThreadHelper.stop_evented_threads(thread_collection['pairs'], thread_collection['r_semaphore'])
            for vm_name, vm_data in vm_info.iteritems():
                for screen_name in vm_data.get('screen_names', []):
                    logger.debug('Stopping screen {0} on {1}.'.format(screen_name, vm_data['client'].ip))
                    vm_data['client'].run(['screen', '-S', screen_name, '-X', 'quit'])
                vm_data['screen_names'] = []
    assert len(failed_configurations) == 0, 'Certain configurations failed: {0}'.format(failed_configurations)
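# Hedged sketch of the 'hypervisor_info' argument (keys taken from the reads
# above; values are illustrative for a libvirt/KVM setup, and 'MigrateTester' is
# a hypothetical class name used only for this example):
#
#     hypervisor_info = {'user': 'root', 'password': '******', 'type': 'KVM'}
#     MigrateTester.live_migrate(vm_info, cluster_info, disk_amount=2,
#                                hypervisor_info=hypervisor_info)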