def revert_snapshot(self, name, skip_timesync=False, skip_slaves_check=False):
    """Revert the devops environment to a named snapshot and resume it.

    :param name: snapshot name to revert to
    :param skip_timesync: when True, do not sync time on the nodes
    :param skip_slaves_check: when True, do not wait for slave readiness
    :return: True if the snapshot existed and was reverted, False otherwise
    """
    if not self.d_env.has_snapshot(name):
        return False
    logger.info('We have snapshot with such name: {:s}'.format(name))
    logger.info("Reverting the snapshot '{0}' ....".format(name))
    self.d_env.revert(name)
    logger.info("Resuming the snapshot '{0}' ....".format(name))
    self.resume_environment()
    if not skip_timesync:
        self.sync_time()
    try:
        # Poll the nailgun API until it answers; EnvironmentError is the
        # "keep retrying" condition for _wait here.
        _wait(self.fuel_web.client.get_releases,
              expected=EnvironmentError, timeout=300)
    except exceptions.Unauthorized:
        # Keystone credentials may differ after revert: reset the admin
        # password and retry one API call to confirm access.
        self.set_admin_keystone_password()
        self.fuel_web.get_nailgun_version()
    if not skip_slaves_check:
        _wait(lambda: self.check_slaves_are_ready(), timeout=60 * 6)
    return True
def check_mysql(remote, node_name):
    """Wait until mysqld, its CRM resource and the Galera sync state are
    all healthy on *node_name*; raise on timeout."""
    daemon_alive_cmd = 'pkill -0 -x mysqld'
    crm_status_cmd = ('crm resource status clone_p_mysql |'
                      ' grep -q "is running on: $HOSTNAME"')
    galera_state_cmd = ("mysql --connect_timeout=5 -sse \"SELECT"
                        " VARIABLE_VALUE FROM"
                        " information_schema.GLOBAL_STATUS"
                        " WHERE VARIABLE_NAME"
                        " = 'wsrep_local_state_comment';\"")

    def galera_state():
        # wsrep_local_state_comment, e.g. 'Synced' when replication is ok.
        return ''.join(remote.execute(galera_state_cmd)['stdout']).rstrip()

    try:
        wait(lambda: remote.execute(daemon_alive_cmd)['exit_code'] == 0,
             timeout=300)
        logger.info('MySQL daemon is started on {0}'.format(node_name))
    except TimeoutError:
        logger.error('MySQL daemon is down on {0}'.format(node_name))
        raise
    _wait(lambda: assert_equal(
        remote.execute(crm_status_cmd)['exit_code'], 0,
        'MySQL resource is NOT running on {0}'.format(node_name)),
        timeout=60)
    try:
        wait(lambda: galera_state() == 'Synced', timeout=600)
    except TimeoutError:
        logger.error('galera status is {0}'.format(galera_state()))
        raise
def check_mysql(remote, node_name):
    """Verify mysqld is up, the clone_p_mysql CRM resource runs on this
    host, and Galera reports the 'Synced' state."""
    pkill_probe = 'pkill -0 -x mysqld'
    crm_probe = ('crm resource status clone_p_mysql |'
                 ' grep -q "is running on: $HOSTNAME"')
    galera_probe = ("mysql --connect_timeout=5 -sse \"SELECT"
                    " VARIABLE_VALUE FROM"
                    " information_schema.GLOBAL_STATUS"
                    " WHERE VARIABLE_NAME"
                    " = 'wsrep_local_state_comment';\"")

    def _galera_status():
        return ''.join(remote.execute(galera_probe)['stdout']).rstrip()

    def _crm_running():
        assert_equal(remote.execute(crm_probe)['exit_code'], 0,
                     'MySQL resource is NOT running on {0}'.format(node_name))

    try:
        wait(lambda: remote.execute(pkill_probe)['exit_code'] == 0,
             timeout=10 * 60)
        logger.info('MySQL daemon is started on {0}'.format(node_name))
    except TimeoutError:
        logger.error('MySQL daemon is down on {0}'.format(node_name))
        raise
    _wait(_crm_running, timeout=60)
    try:
        wait(lambda: _galera_status() == 'Synced', timeout=600)
    except TimeoutError:
        logger.error('galera status is {0}'.format(_galera_status()))
        raise
def rollback_automatically_ha_env(self):
    """Rollback manually simple deployed cluster

    Scenario:
        1. Revert snapshot with simple neutron gre ha env
        2. Add raise exception to openstack.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Add 1 cinder node and re-deploy cluster
        7. Run OSTF

    """
    if not self.env.get_virtual_environment().has_snapshot(
            'deploy_neutron_gre_ha'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre_ha")
    cluster_id = self.fuel_web.get_last_created_cluster()
    # Deliver the upgrade tarball to the master node and unpack it.
    checkers.upload_tarball(self.env.get_admin_remote(),
                            hlp_data.TARBALL_PATH, '/var')
    checkers.check_tarball_exists(self.env.get_admin_remote(),
                                  os.path.basename(hlp_data.TARBALL_PATH),
                                  '/var')
    checkers.untar(self.env.get_admin_remote(),
                   os.path.basename(hlp_data.TARBALL_PATH), '/var')
    # Inject a failure into the openstack upgrade engine (sed-style "61i"
    # insert) so that the upgrade fails and rollback is triggered.
    self.fuel_web.modify_python_file(self.env.get_admin_remote(),
                                     "61i \ \ \ \ \ \ \ \ raise errors."
                                     "ExecutedErrorNonZeroExitCode('{0}')"
                                     .format('Some bad error'),
                                     '/var/upgrade/site-packages/'
                                     'fuel_upgrade/engines/'
                                     'openstack.py')
    # Exit code 255 is expected: the upgrade script fails and rolls back.
    checkers.run_script(self.env.get_admin_remote(), '/var', 'upgrade.sh',
                        password=hlp_data.KEYSTONE_CREDS['password'],
                        rollback=True, exit_code=255)
    checkers.wait_rollback_is_done(self.env.get_admin_remote(), 3000)
    checkers.check_upgraded_containers(self.env.get_admin_remote(),
                                       hlp_data.UPGRADE_FUEL_TO,
                                       hlp_data.UPGRADE_FUEL_FROM)
    logger.debug("all containers are ok")
    # Wait until nailgun sees the first slave again after rollback.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.nodes().slaves[0]), timeout=120)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(self.env.nodes().slaves[:5])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    # The rolled-back cluster must still be operable: scale it out.
    self.env.bootstrap_nodes(self.env.nodes().slaves[5:6])
    self.fuel_web.update_nodes(
        cluster_id, {'slave-06': ['cinder']},
        True, False
    )
    self.fuel_web.deploy_cluster_wait(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id)
    self.env.make_snapshot("rollback_automatic_ha")
def ha_pacemaker_restart_heat_engine(self):
    """Check that pacemaker restarts heat-engine after its AMQP
    connections are blocked and then restored."""
    if not self.env.d_env.has_snapshot(self.snapshot_name):
        raise SkipTest()
    self.env.revert_snapshot(self.snapshot_name)
    ocf_success = "DEBUG: OpenStack Orchestration Engine" \
                  " (heat-engine) monitor succeeded"
    ocf_error = "ERROR: OpenStack Heat Engine is not connected to the" \
                " AMQP server: AMQP connection test returned 1"
    heat_name = 'heat-engine'
    # Invoke the OCF monitor action directly to query resource health.
    ocf_status = \
        'script -q -c "OCF_ROOT=/usr/lib/ocf' \
        ' /usr/lib/ocf/resource.d/fuel/{0}' \
        ' monitor 2>&1"'.format(heat_name)
    remote = self.fuel_web.get_ssh_for_node(
        self.env.d_env.nodes().slaves[0].name)
    pid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
    get_ocf_status = ''.join(
        remote.execute(ocf_status)['stdout']).rstrip()
    assert_true(ocf_success in get_ocf_status,
                "heat engine is not succeeded, status is {0}".format(
                    get_ocf_status))
    # heat-engine must hold at least one AMQP (port 5673) connection.
    assert_true(len(remote.execute(
        "netstat -nap | grep {0} | grep :5673".
        format(pid))['stdout']) > 0, 'There is no amqp connections')
    # Drop all traffic from the 'heat' user to cut AMQP connections.
    remote.execute("iptables -I OUTPUT 1 -m owner --uid-owner heat -m"
                   " state --state NEW,ESTABLISHED,RELATED -j DROP")
    cmd = "netstat -nap | grep {0} | grep :5673".format(pid)
    wait(lambda: len(remote.execute(cmd)['stdout']) == 0, timeout=300)
    get_ocf_status = ''.join(
        remote.execute(ocf_status)['stdout']).rstrip()
    logger.info('ocf status after blocking is {0}'.format(
        get_ocf_status))
    assert_true(ocf_error in get_ocf_status,
                "heat engine is running, status is {0}".format(
                    get_ocf_status))
    # Remove the firewall rule; pacemaker should recover heat-engine.
    remote.execute("iptables -D OUTPUT 1 -m owner --uid-owner heat -m"
                   " state --state NEW,ESTABLISHED,RELATED")
    _wait(lambda: assert_true(ocf_success in ''.join(
        remote.execute(ocf_status)['stdout']).rstrip()), timeout=240)
    newpid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
    # A different pid proves the service was restarted, not reconnected.
    assert_true(pid != newpid, "heat pid is still the same")
    get_ocf_status = ''.join(remote.execute(
        ocf_status)['stdout']).rstrip()
    assert_true(ocf_success in get_ocf_status,
                "heat engine is not succeeded, status is {0}".format(
                    get_ocf_status))
    assert_true(len(
        remote.execute("netstat -nap | grep {0} | grep :5673".format(
            newpid))['stdout']) > 0)
    cluster_id = self.fuel_web.get_last_created_cluster()
    self.fuel_web.run_ostf(cluster_id=cluster_id)
def assertClusterReady(self, node_name, smiles_count,
                       networks_count=1, timeout=300):
    """Block until the cluster reports the expected smiles/networks
    counters, re-resolving the node address on every retry."""
    def _cluster_status():
        devops_node = self.ci().environment().node_by_name(node_name)
        node_ip = self.get_node_by_devops_node(devops_node)['ip']
        return self.get_cluster_status(node_ip,
                                       smiles_count=smiles_count,
                                       networks_count=networks_count)

    _wait(_cluster_status, timeout=timeout)
def rollback_automatically_ha_env(self):
    """Rollback manually simple deployed cluster

    Scenario:
        1. Revert snapshot with simple neutron gre ha env
        2. Add raise exception to openstack.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Add 1 cinder node and re-deploy cluster
        7. Run OSTF

    """
    if not self.env.get_virtual_environment().has_snapshot(
            'deploy_neutron_gre_ha'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre_ha")
    cluster_id = self.fuel_web.get_last_created_cluster()
    # Deliver the upgrade tarball to the master node and unpack it.
    checkers.upload_tarball(self.env.get_admin_remote(),
                            hlp_data.TARBALL_PATH, '/var')
    checkers.check_tarball_exists(self.env.get_admin_remote(),
                                  os.path.basename(hlp_data.TARBALL_PATH),
                                  '/var')
    checkers.untar(self.env.get_admin_remote(),
                   os.path.basename(hlp_data.TARBALL_PATH), '/var')
    # Corrupt the release metadata (sed-style "2i" insert) so the
    # upgrade fails and the automatic rollback is triggered.
    self.fuel_web.modify_python_file(self.env.get_admin_remote(),
                                     "2i \ \ \ \ 2014.2-6.0: blah-blah",
                                     '/var/upgrade/releases/'
                                     'metadata.yaml')
    # Exit code 255 is expected: the upgrade script fails and rolls back.
    checkers.run_script(self.env.get_admin_remote(), '/var', 'upgrade.sh',
                        password=hlp_data.KEYSTONE_CREDS['password'],
                        rollback=True, exit_code=255)
    checkers.wait_rollback_is_done(self.env.get_admin_remote(), 3000)
    checkers.check_upgraded_containers(self.env.get_admin_remote(),
                                       hlp_data.UPGRADE_FUEL_TO,
                                       hlp_data.UPGRADE_FUEL_FROM)
    logger.debug("all containers are ok")
    # Wait until nailgun sees the first slave again after rollback.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.nodes().slaves[0]), timeout=120)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(self.env.nodes().slaves[:5])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    # The rolled-back cluster must still be operable: scale it out.
    self.env.bootstrap_nodes(self.env.nodes().slaves[5:6])
    self.fuel_web.update_nodes(
        cluster_id, {'slave-06': ['cinder']},
        True, False
    )
    self.fuel_web.deploy_cluster_wait(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id)
    self.env.make_snapshot("rollback_automatic_ha")
def assert_cluster_ready(self, node_name, smiles_count,
                         networks_count=1, timeout=300):
    """Wait until cluster status reports the expected counters over the
    SSH connection to *node_name*."""
    logger.info('Assert cluster services are UP')
    remote = self.environment.get_ssh_to_remote_by_name(node_name)

    def _probe():
        return self.get_cluster_status(remote,
                                       smiles_count=smiles_count,
                                       networks_count=networks_count)

    _wait(_probe, timeout=timeout)
def assert_cluster_ready(self, node_name, smiles_count,
                         networks_count=1, timeout=300):
    """Poll cluster status on *node_name* until the expected smiles and
    networks counters are reached."""
    logger.info('Assert cluster services are UP')
    remote = self.environment.get_ssh_to_remote_by_name(node_name)
    expected = dict(smiles_count=smiles_count,
                    networks_count=networks_count)
    _wait(lambda: self.get_cluster_status(remote, **expected),
          timeout=timeout)
def assertClusterReady(self, node_name, smiles_count,
                       networks_count=1, timeout=300):
    """Retry the cluster-status check until the smiles/networks counters
    match; the node IP is looked up afresh on each attempt."""
    def _check():
        ci_node = self.ci().environment().node_by_name(node_name)
        ip = self.get_node_by_devops_node(ci_node)['ip']
        return self.get_cluster_status(ip,
                                       smiles_count=smiles_count,
                                       networks_count=networks_count)

    _wait(_check, timeout=timeout)
def assert_cluster_ready(self, node_name, smiles_count,
                         networks_count=1, timeout=300):
    """Open an SSH connection to the nailgun node behind *node_name* and
    poll the cluster status until the expected counters are reached."""
    venv = self.environment.get_virtual_environment()
    nailgun_node = self.get_nailgun_node_by_devops_node(
        venv.node_by_name(node_name))
    remote = self.environment.get_ssh_to_remote(nailgun_node["ip"])

    def _status():
        return self.get_cluster_status(remote,
                                       smiles_count=smiles_count,
                                       networks_count=networks_count)

    _wait(_status, timeout=timeout)
def rollback_automatically_delete_node(self):
    """Rollback automatically ha one controller deployed cluster
    and delete node from cluster

    Scenario:
        1. Revert snapshot with deploy Neutron GRE 6.1 env
        2. Add raise exception to docker_engine.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Run network verification
        7. Run OSTF
        8. Delete 1 node and re-deploy cluster
        9. Run OSTF

    """
    # TODO(ddmitriev): change snapshot name to actual when reverting 7.0
    if not self.env.d_env.has_snapshot('deploy_neutron_gre'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre")
    cluster_id = self.fuel_web.get_last_created_cluster()
    # The upgrade is expected to fail and roll back automatically.
    self.env.admin_actions.upgrade_master_node(rollback=True)
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])
    # Mark slave-03 for deletion and re-deploy.
    nailgun_nodes = self.fuel_web.update_nodes(
        cluster_id, {'slave-03': ['compute', 'cinder']}, False, True)
    task = self.fuel_web.deploy_cluster(cluster_id)
    self.fuel_web.assert_task_success(task)
    # list() keeps this working on Python 3, where filter() is lazy.
    nodes = list(filter(lambda x: x["pending_deletion"] is True,
                        nailgun_nodes))
    # BUGFIX: the old failure message claimed "timeout 10 *60" while the
    # wait actually used 5 * 60; report the timeout really used.
    node_wait_timeout = 5 * 60
    try:
        wait(lambda: len(self.fuel_web.client.list_nodes()) == 3,
             timeout=node_wait_timeout)
    except TimeoutError:
        # Re-check once more to produce a precise assertion message.
        assert_true(
            len(self.fuel_web.client.list_nodes()) == 3,
            'Node {0} is not discovered in timeout {1} seconds'.format(
                nodes[0], node_wait_timeout))
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'],
                           should_fail=1)
    self.env.make_snapshot("rollback_automatically_delete_node")
def rollback_automatically_ha_one_controller(self):
    """Rollback automatically ha one controller deployed cluster

    Scenario:
        1. Revert snapshot with deploy Neutron VXLAN env
        2. Add raise exception to docker_engine.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Run network verification
        7. Run OSTF
        8. Add 1 ceph node and re-deploy cluster
        9. Run OSTF

    """
    if not self.env.d_env.has_snapshot('ceph_ha_one_controller_compact'):
        raise SkipTest()
    self.env.revert_snapshot('ceph_ha_one_controller_compact')
    cluster_id = self.fuel_web.get_last_created_cluster()
    # Remember the slave kernel version to verify it after rollback.
    _ip = self.fuel_web.get_nailgun_node_by_name('slave-01')['ip']
    with self.env.d_env.get_ssh_to_remote(_ip) as remote:
        expected_kernel = UpgradeFuelMaster.get_slave_kernel(remote)
    # The upgrade is expected to fail and roll back automatically.
    self.env.admin_actions.upgrade_master_node(rollback=True)
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])
    # The rolled-back cluster must still scale: add one ceph-osd node.
    self.env.bootstrap_nodes(
        self.env.d_env.nodes().slaves[3:4])
    self.fuel_web.update_nodes(
        cluster_id, {'slave-04': ['ceph-osd']},
        True, False
    )
    self.fuel_web.deploy_cluster_wait(cluster_id)
    if hlp_data.OPENSTACK_RELEASE_UBUNTU in hlp_data.OPENSTACK_RELEASE:
        # On Ubuntu the node added after rollback must boot the same
        # kernel as the pre-upgrade nodes.
        _ip = self.fuel_web.get_nailgun_node_by_name('slave-04')['ip']
        with self.env.d_env.get_ssh_to_remote(_ip) as remote:
            kernel = UpgradeFuelMaster.get_slave_kernel(remote)
            checkers.check_kernel(kernel, expected_kernel)
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])
    self.env.make_snapshot("rollback_automatically_ha_one_controller")
def rollback_automatically_delete_node(self):
    """Rollback automatically ha one controller deployed cluster
    and delete node from cluster

    Scenario:
        1. Revert snapshot with deploy Neutron GRE 6.1 env
        2. Add raise exception to docker_engine.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Run network verification
        7. Run OSTF
        8. Delete 1 node and re-deploy cluster
        9. Run OSTF

    """
    # TODO(ddmitriev): change snapshot name to actual when reverting 7.0
    if not self.env.d_env.has_snapshot('deploy_neutron_gre'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre")
    cluster_id = self.fuel_web.get_last_created_cluster()
    # The upgrade is expected to fail and roll back automatically.
    self.env.admin_actions.upgrade_master_node(rollback=True)
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])
    # Mark slave-03 for deletion and re-deploy.
    nailgun_nodes = self.fuel_web.update_nodes(
        cluster_id, {'slave-03': ['compute', 'cinder']}, False, True)
    task = self.fuel_web.deploy_cluster(cluster_id)
    self.fuel_web.assert_task_success(task)
    # list() keeps this working on Python 3, where filter() is lazy.
    nodes = list(filter(lambda x: x["pending_deletion"] is True,
                        nailgun_nodes))
    # BUGFIX: the old failure message claimed "timeout 10 *60" while the
    # wait actually used 5 * 60; report the timeout really used.
    node_wait_timeout = 5 * 60
    try:
        wait(lambda: len(self.fuel_web.client.list_nodes()) == 3,
             timeout=node_wait_timeout)
    except TimeoutError:
        # Re-check once more to produce a precise assertion message.
        assert_true(
            len(self.fuel_web.client.list_nodes()) == 3,
            'Node {0} is not discovered in timeout {1} seconds'.format(
                nodes[0], node_wait_timeout))
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'],
                           should_fail=1)
    self.env.make_snapshot("rollback_automatically_delete_node")
def revert_snapshot(self, name):
    """Revert the environment to snapshot *name* and bring it back up.

    Retries the admin node boot once if it does not come up, then waits
    for nailgun, syncs time everywhere and re-runs the nailgun agent on
    the slaves.

    :param name: snapshot name
    :return: True if the snapshot existed and was reverted, else False
    """
    if self.get_virtual_environment().has_snapshot(name):
        logger.info('We have snapshot with such name %s' % name)
        self.get_virtual_environment().revert(name)
        logger.info('Starting snapshot reverting ....')
        self.get_virtual_environment().resume()
        logger.info('Starting snapshot resuming ...')
        admin = self.nodes().admin
        try:
            # NOTE: '.await' is a devops node method (pre-Python-3.7
            # code, where 'await' was not a keyword).
            admin.await(self.admin_net, timeout=10 * 60, by_port=8000)
        except Exception as e:
            # Admin did not come up: destroy and cold-start it once more.
            logger.warning("From first time admin isn't reverted: "
                           "{0}".format(e))
            admin.destroy()
            logger.info('Admin node was destroyed. Wait 10 sec.')
            time.sleep(10)
            self.get_virtual_environment().start(self.nodes().admins)
            logger.info('Admin node started second time.')
            self.nodes().admin.await(self.admin_net,
                                     timeout=10 * 60, by_port=8000)
        self.set_admin_ssh_password()
        try:
            # Poll nailgun until it answers; EnvironmentError means
            # "keep retrying" for _wait.
            _wait(self._fuel_web.client.get_releases,
                  expected=EnvironmentError, timeout=300)
        except exceptions.Unauthorized:
            self.set_admin_keystone_password()
            self._fuel_web.get_nailgun_version()
        self.sync_time_admin_node()
        for node in self.nodes().slaves:
            if not node.driver.node_active(node):
                continue
            try:
                logger.info("Sync time on revert for node %s" % node.name)
                self.sync_node_time(
                    self.get_ssh_to_remote_by_name(node.name))
            except Exception as e:
                # Best-effort: a slave that fails timesync is logged,
                # not fatal.
                logger.warning(
                    'Exception caught while trying to sync time on {0}:'
                    ' {1}'.format(node.name, e))
            self.run_nailgun_agent(
                self.get_ssh_to_remote_by_name(node.name))
        return True
    return False
def revert_snapshot(self, name):
    """Revert the environment to snapshot *name* and resume it.

    Gives the admin node a second boot attempt if the first one fails,
    waits for the nailgun API, then syncs time and re-registers every
    active slave with nailgun.

    :param name: snapshot name
    :return: True if the snapshot existed and was reverted, else False
    """
    if self.get_virtual_environment().has_snapshot(name):
        logger.info('We have snapshot with such name %s' % name)
        self.get_virtual_environment().revert(name)
        logger.info('Starting snapshot reverting ....')
        self.get_virtual_environment().resume()
        logger.info('Starting snapshot resuming ...')
        admin = self.nodes().admin
        try:
            # NOTE: '.await' is a devops node method (pre-Python-3.7
            # code, where 'await' was not a keyword).
            admin.await(
                self.admin_net, timeout=10 * 60, by_port=8000)
        except Exception as e:
            # Admin did not come up: destroy and cold-start it once more.
            logger.warning("From first time admin isn't reverted: "
                           "{0}".format(e))
            admin.destroy()
            logger.info('Admin node was destroyed. Wait 10 sec.')
            time.sleep(10)
            self.get_virtual_environment().start(self.nodes().admins)
            logger.info('Admin node started second time.')
            self.nodes().admin.await(
                self.admin_net, timeout=10 * 60, by_port=8000)
        self.set_admin_ssh_password()
        try:
            # Poll nailgun until it answers; EnvironmentError means
            # "keep retrying" for _wait.
            _wait(self._fuel_web.client.get_releases,
                  expected=EnvironmentError, timeout=300)
        except exceptions.Unauthorized:
            self.set_admin_keystone_password()
            self._fuel_web.get_nailgun_version()
        self.sync_time_admin_node()
        for node in self.nodes().slaves:
            if not node.driver.node_active(node):
                continue
            try:
                logger.info("Sync time on revert for node %s" % node.name)
                self.sync_node_time(
                    self.get_ssh_to_remote_by_name(node.name))
            except Exception as e:
                # Best-effort: a slave that fails timesync is logged,
                # not fatal.
                logger.warning(
                    'Exception caught while trying to sync time on {0}:'
                    ' {1}'.format(node.name, e))
            self.run_nailgun_agent(
                self.get_ssh_to_remote_by_name(node.name))
        return True
    return False
def check_mysql(remote, node_name):
    """Verify the MySQL daemon and its CRM clone resource are up on the
    given node, raising on timeout."""
    daemon_cmd = 'pkill -0 -x mysqld'
    crm_cmd = ('crm resource status clone_p_mysql |'
               ' grep -q "is running on: $HOSTNAME"')

    def _crm_resource_running():
        assert_equal(
            remote.execute(crm_cmd)['exit_code'], 0,
            'MySQL resource is NOT running on {0}'.format(node_name))

    try:
        wait(lambda: remote.execute(daemon_cmd)['exit_code'] == 0,
             timeout=300)
        logger.info('MySQL daemon is started on {0}'.format(node_name))
    except TimeoutError:
        logger.error('MySQL daemon is down on {0}'.format(node_name))
        raise
    _wait(_crm_resource_running, timeout=60)
def rollback_automatically_ha(self): """Rollback manually ha deployed cluster Scenario: 1. Revert snapshot with Neutron GRE HA 6.1 env 2. Add raise exception to openstack.py file 3. Run upgrade on master 4. Check that rollback starts automatically 5. Check that cluster was not upgraded 6. Run network verification 7. Run OSTF 8. Add 1 cinder node and re-deploy cluster 9. Run OSTF """ # TODO(ddmitriev): change snapshot name to actual when reverting 7.0 if not self.env.d_env.has_snapshot('deploy_neutron_gre_ha'): raise SkipTest() self.env.revert_snapshot("deploy_neutron_gre_ha") cluster_id = self.fuel_web.get_last_created_cluster() self.env.admin_actions.upgrade_master_node(rollback=True) _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node( self.env.d_env.nodes().slaves[0]), timeout=8 * 60) logger.debug("all services are up now") self.fuel_web.wait_nodes_get_online_state( self.env.d_env.nodes().slaves[:5]) self.fuel_web.assert_nodes_in_ready_state(cluster_id) self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM) self.fuel_web.verify_network(cluster_id) self.fuel_web.run_ostf(cluster_id=cluster_id, test_sets=['ha', 'smoke', 'sanity']) self.env.bootstrap_nodes( self.env.d_env.nodes().slaves[5:6]) self.fuel_web.update_nodes( cluster_id, {'slave-06': ['cinder']}, True, False ) self.fuel_web.deploy_cluster_wait(cluster_id) self.fuel_web.run_ostf(cluster_id=cluster_id, test_sets=['ha', 'smoke', 'sanity']) self.env.make_snapshot("rollback_automatically_ha")
def check_mysql(remote, node_name):
    """Check that mysqld is running (per its distro-specific pidfile)
    and that the clone_p_mysql CRM resource runs on this host."""
    pidfile = ('/var/run/mysqld/mysqld.pid'
               if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE
               else '/var/run/mysql/mysqld.pid')
    alive_cmd = '[ -r {0} ] && pkill -0 -F {0}'.format(pidfile)
    crm_cmd = ('crm resource status clone_p_mysql |'
               ' grep -q "is running on: $HOSTNAME"')
    try:
        wait(lambda: remote.execute(alive_cmd)['exit_code'] == 0,
             timeout=300)
    except TimeoutError:
        logger.error('MySQL daemon is down on {0}'.format(node_name))
        raise
    logger.info('MySQL daemon is started on {0}'.format(node_name))
    _wait(lambda: assert_equal(
        remote.execute(crm_cmd)['exit_code'], 0,
        'MySQL resource is NOT running on {0}'.format(node_name)),
        timeout=60)
def check_mysql(remote, node_name):
    """Ensure mysqld is alive via its pidfile and its CRM clone resource
    reports running on this host."""
    if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE:
        pid_path = '/var/run/mysqld/mysqld.pid'
    else:
        pid_path = '/var/run/mysql/mysqld.pid'
    daemon_probe = '[ -r {0} ] && pkill -0 -F {0}'.format(pid_path)
    crm_probe = ('crm resource status clone_p_mysql |'
                 ' grep -q "is running on: $HOSTNAME"')

    def _resource_on_host():
        assert_equal(
            remote.execute(crm_probe)['exit_code'], 0,
            'MySQL resource is NOT running on {0}'.format(node_name))

    try:
        wait(lambda: remote.execute(daemon_probe)['exit_code'] == 0,
             timeout=300)
        logger.info('MySQL daemon is started on {0}'.format(node_name))
    except TimeoutError:
        logger.error('MySQL daemon is down on {0}'.format(node_name))
        raise
    _wait(_resource_on_host, timeout=60)
def rollback_automatically_ha_one_controller_env(self):
    """Rollback automatically ha one controller deployed cluster

    Scenario:
        1. Revert snapshot with deploy neutron gre env
        2. Add raise exception to docker_engine.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Run network verification
        7. Run OSTF
        8. Add 1 ceph node and re-deploy cluster
        9. Run OSTF

    """
    if not self.env.d_env.has_snapshot('ceph_multinode_compact'):
        raise SkipTest()
    self.env.revert_snapshot("ceph_multinode_compact")
    cluster_id = self.fuel_web.get_last_created_cluster()
    # Remember the slave kernel version to compare after rollback.
    _ip = self.fuel_web.get_nailgun_node_by_name('slave-01')['ip']
    remote = self.env.d_env.get_ssh_to_remote(_ip)
    expected_kernel = UpgradeFuelMaster.get_slave_kernel(remote)
    # Deliver the upgrade tarball to the master node and unpack it.
    checkers.upload_tarball(self.env.d_env.get_admin_remote(),
                            hlp_data.TARBALL_PATH, '/var')
    checkers.check_tarball_exists(self.env.d_env.get_admin_remote(),
                                  os.path.basename(hlp_data.TARBALL_PATH),
                                  '/var')
    checkers.untar(self.env.d_env.get_admin_remote(),
                   os.path.basename(hlp_data.TARBALL_PATH), '/var')
    # we expect 255 exit code here because upgrade failed
    # and exit status is 255
    checkers.run_script(self.env.d_env.get_admin_remote(), '/var',
                        'upgrade.sh',
                        password=hlp_data.KEYSTONE_CREDS['password'],
                        rollback=True, exit_code=255)
    checkers.wait_rollback_is_done(self.env.d_env.get_admin_remote(), 3000)
    checkers.check_upgraded_containers(self.env.d_env.get_admin_remote(),
                                       hlp_data.UPGRADE_FUEL_TO,
                                       hlp_data.UPGRADE_FUEL_FROM)
    logger.debug("all containers are ok")
    # Wait until nailgun sees the first slave again after rollback.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id)
    # The rolled-back cluster must still scale: add one ceph-osd node.
    self.env.bootstrap_nodes(
        self.env.d_env.nodes().slaves[3:4])
    self.fuel_web.update_nodes(
        cluster_id, {'slave-04': ['ceph-osd']},
        True, False
    )
    self.fuel_web.deploy_cluster_wait(cluster_id)
    if hlp_data.OPENSTACK_RELEASE_UBUNTU in hlp_data.OPENSTACK_RELEASE:
        # On Ubuntu the node added after rollback must boot the same
        # kernel as the pre-upgrade nodes.
        _ip = self.fuel_web.get_nailgun_node_by_name('slave-04')['ip']
        remote = self.env.d_env.get_ssh_to_remote(_ip)
        kernel = UpgradeFuelMaster.get_slave_kernel(remote)
        checkers.check_kernel(kernel, expected_kernel)
    self.fuel_web.run_ostf(cluster_id=cluster_id)
    self.env.make_snapshot("rollback_automatic_ha_one_controller")
def ha_pacemaker_restart_heat_engine(self):
    """Verify heat engine service is restarted
    by pacemaker on amqp connection loss

    Scenario:
        1. SSH to any controller
        2. Check heat-engine status
        3. Block heat-engine amqp connections
        4. Check heat-engine was stopped on current controller
        5. Unblock heat-engine amqp connections
        6. Check heat-engine process is running with new pid
        7. Check amqp connection re-appears for heat-engine

    Snapshot ha_pacemaker_restart_heat_engine

    """
    self.env.revert_snapshot("deploy_ha")
    ocf_success = "DEBUG: OpenStack Orchestration Engine" \
                  " (heat-engine) monitor succeeded"
    ocf_error = "ERROR: OpenStack Heat Engine is not connected to the" \
                " AMQP server: AMQP connection test returned 1"
    heat_name = 'heat-engine'
    # Invoke the OCF monitor action directly to query resource health.
    ocf_status = \
        'script -q -c "OCF_ROOT=/usr/lib/ocf' \
        ' /usr/lib/ocf/resource.d/mirantis/{0}' \
        ' monitor 2>&1"'.format(heat_name)
    remote = self.fuel_web.get_ssh_for_node(
        self.env.nodes().slaves[0].name)
    pid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
    get_ocf_status = ''.join(
        remote.execute(ocf_status)['stdout']).rstrip()
    assert_true(ocf_success in get_ocf_status,
                "heat engine is not succeeded, status is {0}".format(
                    get_ocf_status))
    # heat-engine must hold at least one AMQP (port 5673) connection.
    assert_true(len(remote.execute(
        "netstat -nap | grep {0} | grep :5673".
        format(pid))['stdout']) > 0, 'There is no amqp connections')
    # Drop all traffic from the 'heat' user to cut AMQP connections.
    remote.execute("iptables -I OUTPUT 1 -m owner --uid-owner heat -m"
                   " state --state NEW,ESTABLISHED,RELATED -j DROP")
    wait(lambda: len(remote.execute
                     ("netstat -nap | grep {0} | grep :5673".
                      format(pid))['stdout']) == 0, timeout=300)
    get_ocf_status = ''.join(
        remote.execute(ocf_status)['stdout']).rstrip()
    logger.info('ocf status after blocking is {0}'.format(
        get_ocf_status))
    assert_true(ocf_error in get_ocf_status,
                "heat engine is running, status is {0}".format(
                    get_ocf_status))
    # Remove the firewall rule; pacemaker should recover heat-engine.
    remote.execute("iptables -D OUTPUT 1 -m owner --uid-owner heat -m"
                   " state --state NEW,ESTABLISHED,RELATED")
    _wait(lambda: assert_true(ocf_success in ''.join(
        remote.execute(ocf_status)['stdout']).rstrip()), timeout=240)
    newpid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
    # A different pid proves the service was restarted, not reconnected.
    assert_true(pid != newpid, "heat pid is still the same")
    get_ocf_status = ''.join(remote.execute(
        ocf_status)['stdout']).rstrip()
    assert_true(ocf_success in get_ocf_status,
                "heat engine is not succeeded, status is {0}".format(
                    get_ocf_status))
    assert_true(len(
        remote.execute("netstat -nap | grep {0} | grep :5673".format(
            newpid))['stdout']) > 0)
    cluster_id = self.fuel_web.get_last_created_cluster()
    self.fuel_web.run_ostf(cluster_id=cluster_id)
def ha_corosync_stability_check(self):
    """Repeatedly kill and restart corosync on one controller and verify
    the pacemaker cluster never split-brains."""

    @logwrap
    def _get_pcm_nodes(remote, pure=False):
        # Parse `pcs status nodes` output into {status: [node names]};
        # with pure=False hostnames are mapped to FQDNs via nailgun.
        nodes = {}
        pcs_status = remote.execute('pcs status nodes')['stdout']
        pcm_nodes = yaml.load(''.join(pcs_status).strip())
        for status in ('Online', 'Offline', 'Standby'):
            list_nodes = (pcm_nodes['Pacemaker Nodes'][status] or
                          '').split()
            if not pure:
                nodes[status] = [
                    self.fuel_web.get_fqdn_by_hostname(x)
                    for x in list_nodes
                ]
            else:
                nodes[status] = list_nodes
        return nodes

    def _check_all_pcs_nodes_status(ctrl_remotes, pcs_nodes_online,
                                    status):
        # True only if every controller sees exactly pcs_nodes_online
        # in the given pacemaker status.
        for remote in ctrl_remotes:
            pcs_nodes = _get_pcm_nodes(remote)
            # NOTE(review): 'node' below is not defined in this function;
            # it leaks in from the enclosing scope's comprehensions, so
            # the logged name is likely wrong (NameError on Python 3) -
            # looks like a latent bug, confirm before relying on the log.
            logger.debug(
                "Status of pacemaker nodes on node {0}: {1}".format(
                    node['name'], pcs_nodes))
            if set(pcs_nodes_online) != set(pcs_nodes[status]):
                return False
        return True

    if not self.env.d_env.has_snapshot(self.snapshot_name):
        raise SkipTest()
    self.env.revert_snapshot(self.snapshot_name)
    devops_name = self.env.d_env.nodes().slaves[0].name
    controller_node = self.fuel_web.get_nailgun_node_by_name(devops_name)
    with self.fuel_web.get_ssh_for_node(devops_name) as remote_controller:
        # Precondition: no pacemaker node may be offline before we start.
        pcs_nodes = self.fuel_web.get_pcm_nodes(devops_name)
        assert_true(
            not pcs_nodes['Offline'],
            "There are offline nodes: {0}".format(pcs_nodes['Offline']))
        pcs_nodes_online = pcs_nodes['Online']
        cluster_id = self.fuel_web.get_last_created_cluster()
        ctrl_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
            cluster_id, ['controller'])
        # Controllers other than the one whose corosync we will kill.
        alive_corosync_nodes = [
            node for node in ctrl_nodes
            if node['mac'] != controller_node['mac']
        ]
        ctrl_remotes = [
            self.env.d_env.get_ssh_to_remote(node['ip'])
            for node in ctrl_nodes
        ]
        live_remotes = [
            self.env.d_env.get_ssh_to_remote(node['ip'])
            for node in alive_corosync_nodes
        ]
        for count in xrange(500):
            logger.debug('Checking splitbrain in the loop, '
                         'count number: {0}'.format(count))
            # Kill corosync on the chosen controller...
            _wait(lambda: assert_equal(
                remote_controller.execute('killall -TERM corosync')
                ['exit_code'], 0,
                'Corosync was not killed on controller, '
                'see debug log, count-{0}'.format(count)),
                timeout=20)
            # ...the survivors must see exactly that node go Offline...
            _wait(lambda: assert_true(
                _check_all_pcs_nodes_status(live_remotes, [
                    controller_node['fqdn']
                ], 'Offline'),
                'Caught splitbrain, see debug log, '
                'count-{0}'.format(count)),
                timeout=20)
            # ...then restart corosync/pacemaker on it...
            _wait(lambda: assert_equal(
                remote_controller.execute(
                    'service corosync start && service pacemaker '
                    'restart')['exit_code'], 0,
                'Corosync was not started, see debug log,'
                ' count-{0}'.format(count)),
                timeout=20)
            # ...and everyone must agree the full set is Online again.
            _wait(lambda: assert_true(
                _check_all_pcs_nodes_status(ctrl_remotes,
                                            pcs_nodes_online, 'Online'),
                'Corosync was not started on controller, see debug '
                'log, count: {0}'.format(count)),
                timeout=20)
        for remote in ctrl_remotes:
            remote.clear()
        for remote in live_remotes:
            remote.clear()
def fuel_migration(self):
    """Fuel master migration to VM

    Scenario:
        1. Create cluster
        2. Run OSTF tests
        3. Run Network check
        4. Migrate fuel-master to VM
        5. Run OSTF tests
        6. Run Network check
        7. Check statuses for master services

    Duration 210m
    """
    self.env.revert_snapshot("ready_with_3_slaves")
    data = {
        'net_provider': 'neutron',
        'net_segment_type': settings.NEUTRON_SEGMENT_TYPE
    }
    cluster_id = self.fuel_web.create_cluster(
        name=self.__class__.__name__,
        mode=settings.DEPLOYMENT_MODE_HA,
        settings=data)
    self.fuel_web.update_nodes(cluster_id, {
        'slave-01': ['controller'],
        'slave-02': ['compute']
    })
    # Check network
    self.fuel_web.verify_network(cluster_id)
    # Cluster deploy
    self.fuel_web.deploy_cluster_wait(cluster_id)
    # Check network
    self.fuel_web.verify_network(cluster_id)
    # Fuel migration: start fuel-migrate in the background on the master,
    # targeting the compute node's IP as the new host.
    remote = self.env.d_env.get_admin_remote()
    logger.info('Fuel migration on compute slave-02')
    result = remote.execute(
        'fuel-migrate ' +
        self.fuel_web.get_nailgun_node_by_name('slave-02')['ip'] +
        ' >/dev/null &')
    assert_equal(
        result['exit_code'], 0,
        'Failed to execute "{0}" on remote host: {1}'.format(
            'fuel-migrate' + self.env.d_env.nodes().slaves[0].name,
            result))
    checkers.wait_phrase_in_log(remote, 60 * 60, interval=0.2,
                                phrase='Rebooting to begin '
                                       'the data sync process',
                                log_path='/var/log/fuel-migrate.log')
    remote.clear()
    logger.info('Rebooting to begin the data sync process for fuel '
                'migrate')
    # Master goes down for the sync reboot, then must come back online.
    wait(lambda: not icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15,
         timeout_msg='Master node has not become offline '
                     'after rebooting')
    wait(lambda: icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15,
         timeout_msg='Master node has not become online '
                     'after rebooting')
    # NOTE(review): 'self.d_env' here (vs 'self.env.d_env' used
    # everywhere else) looks inconsistent - confirm the attribute
    # actually exists on this test class.
    self.env.d_env.nodes().admin.await(
        network_name=self.d_env.admin_net, timeout=60 * 15)
    with self.env.d_env.get_admin_remote() as remote:
        checkers.wait_phrase_in_log(remote, 60 * 90, interval=0.1,
                                    phrase='Stop network and up with '
                                           'new settings',
                                    log_path='/var/log/fuel-migrate.log')
    logger.info('Shutting down network')
    # Second offline/online cycle while the migrated master reconfigures
    # its network.
    wait(lambda: not icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15, interval=0.1,
         timeout_msg='Master node has not become offline shutting network')
    wait(lambda: icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15,
         timeout_msg='Master node has not become online shutting network')
    self.env.d_env.nodes().admin.await(
        network_name=self.d_env.admin_net, timeout=60 * 10)
    logger.info("Check containers")
    self.env.docker_actions.wait_for_ready_containers(timeout=60 * 30)
    logger.info("Check services")
    cluster_id = self.fuel_web.get_last_created_cluster()
    self.fuel_web.assert_ha_services_ready(cluster_id)
    self.fuel_web.assert_os_services_ready(cluster_id)
    # Check network
    self.fuel_web.verify_network(cluster_id)
    # Run ostf
    _wait(lambda: self.fuel_web.run_ostf(cluster_id,
                                         test_sets=['smoke', 'sanity']),
          timeout=1500)
    logger.debug("OSTF tests are pass now")
def upgrade_fuel_after_rollback(self):
    """Upgrade Fuel after rollback and deploy new cluster

    Scenario:
        1. Revert deploy_neutron_gre snapshot with 6.1 env
        2. Upgrade with rollback
        3. Run OSTF
        4. Run network verification
        5. Upgrade fuel master
        6. Check upgrading was successful
        7. Deploy 6.1 cluster with 3 nodes and neutron vlan
        8. Run OSTF for new cluster
        9. Run network verification
    """
    # TODO(ddmitriev): change snapshot name to actual when reverting 7.0
    if not self.env.d_env.has_snapshot('deploy_neutron_gre'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre")

    available_releases_before = self.fuel_web.get_releases_list_for_os(
        release_name=hlp_data.OPENSTACK_RELEASE)

    cluster_id = self.fuel_web.get_last_created_cluster()

    self.env.admin_actions.upgrade_master_node(rollback=True)

    # Nailgun needs some time after the rollback before it can report
    # slave nodes again.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id, test_sets=['ha', 'smoke', 'sanity'])

    self.env.admin_actions.upgrade_master_node(file_upload=False)

    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_TO)
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nailgun_upgrade_migration()

    # Deploy new cluster
    available_releases_after = self.fuel_web.get_releases_list_for_os(
        release_name=hlp_data.OPENSTACK_RELEASE)
    # BUG FIX: the comprehension variable was named `id`, shadowing the
    # builtin; renamed for consistency with the sibling implementation.
    added_release = [release_id for release_id in available_releases_after
                     if release_id not in available_releases_before]

    self.env.bootstrap_nodes(
        self.env.d_env.nodes().slaves[3:6])

    new_cluster_id = self.fuel_web.create_cluster(
        name=self.__class__.__name__,
        release_id=added_release[0],
        mode=hlp_data.DEPLOYMENT_MODE,
        settings={
            'net_provider': 'neutron',
            'net_segment_type': hlp_data.NEUTRON_SEGMENT['vlan']
        }
    )
    self.fuel_web.update_nodes(
        new_cluster_id, {
            'slave-04': ['controller'],
            'slave-05': ['compute'],
            'slave-06': ['cinder']
        }
    )
    self.fuel_web.run_network_verify(new_cluster_id)
    self.fuel_web.deploy_cluster_wait(new_cluster_id)
    self.fuel_web.run_ostf(new_cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])
    self.fuel_web.run_network_verify(new_cluster_id)

    self.env.make_snapshot("upgrade_fuel_after_rollback")
def negative_auto_cic_maintenance_mode(self):
    """Check negative scenario for auto maintenance mode

    Scenario:
        1. Revert snapshot
        2. Disable UMM
        3. Change UMM.CONF
        4. Unexpected reboot
        5. Check the controller not switching in maintenance mode
        6. Check the controller become available

    Duration 85m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Operate on every controller of the last-created cluster.
    n_ctrls = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        cluster_id, ['controller'])
    d_ctrls = self.fuel_web.get_devops_nodes_by_nailgun_nodes(n_ctrls)

    for devops_node in d_ctrls:
        _ip = self.fuel_web.get_nailgun_node_by_name(
            devops_node.name)['ip']
        # Maintenance mode must be available before we disable it.
        asserts.assert_true('True' in checkers.check_available_mode(_ip),
                            "Maintenance mode is not available")

        logger.info('Change UMM.CONF on node {0}'.format(devops_node.name))
        # REBOOT_COUNT=0 means even a single unexpected reboot would
        # normally trigger maintenance mode -- but UMM is disabled below.
        command1 = ("echo -e 'UMM=yes\nREBOOT_COUNT=0\n"
                    "COUNTER_RESET_TIME=10' > /etc/umm.conf")
        self.ssh_manager.execute_on_remote(ip=_ip, cmd=command1)

        self.ssh_manager.execute_on_remote(ip=_ip, cmd="umm disable")

        asserts.assert_false('True' in checkers.check_available_mode(_ip),
                             "Maintenance mode should not be available")

        command2 = 'reboot --force >/dev/null & '

        logger.info('Unexpected reboot on node {0}'.format(
            devops_node.name))
        self.ssh_manager.execute_on_remote(ip=_ip, cmd=command2)

        # First wait for the node to actually drop off the network.
        wait(lambda:
             not checkers.check_ping(self.env.get_admin_node_ip(), _ip),
             timeout=60 * 10)

        # Node don't have enough time for set offline status
        # after reboot --force
        # Just waiting
        asserts.assert_true(
            checkers.check_ping(self.env.get_admin_node_ip(),
                                _ip,
                                deadline=600),
            "Host {0} is not reachable by ping during 600 sec".format(_ip))

        logger.info('Wait a {0} node online status after unexpected '
                    'reboot'.format(devops_node.name))
        self.fuel_web.wait_nodes_get_online_state([devops_node])

        logger.info('Check that {0} node not in maintenance mode after'
                    ' unexpected reboot'.format(devops_node.name))
        # With UMM disabled the forced reboot must NOT have switched the
        # node into maintenance mode.
        asserts.assert_false('True' in checkers.check_auto_mode(_ip),
                             "Maintenance mode should not switched")

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up([n.name for n in d_ctrls])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up([n.name for n in d_ctrls])

    _wait(lambda: self.fuel_web.run_single_ostf_test(
        cluster_id, test_sets=['sanity'],
        test_name=ostf_test_mapping.OSTF_TEST_MAPPING.get(
            'Check that required services are running')),
        timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda: self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    # One best-effort retry: services may still be settling after the
    # reboots, so a first OSTF failure gets a second chance after 10 min.
    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
def negative_auto_cic_maintenance_mode(self):
    """Check negative scenario for auto maintenance mode

    Scenario:
        1. Revert snapshot
        2. Disable UMM
        3. Change UMM.CONF
        4. Unexpected reboot
        5. Check the controller not switching in maintenance mode
        6. Check the controller become available

    Duration 85m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    # The first three slaves are the controllers in this snapshot.
    for nailgun_node in self.env.d_env.nodes().slaves[0:3]:
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_available_mode(remote),
                        "Maintenance mode is not available")

            logger.info('Change UMM.CONF on node %s', nailgun_node.name)
            command1 = ("echo -e 'UMM=yes\nREBOOT_COUNT=0\n"
                        "COUNTER_RESET_TIME=10' > /etc/umm.conf")

            result = remote.execute(command1)
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    command1, result))

            result = remote.execute('umm disable')
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    'umm disable', result))

            assert_false('True' in check_available_mode(remote),
                         "Maintenance mode should not be available")

            logger.info('Unexpected reboot on node %s', nailgun_node.name)
            command2 = ('reboot --force >/dev/null & ')
            result = remote.execute(command2)
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    command2, result))

        # Node don't have enough time for set offline status
        # after reboot --force
        # Just waiting
        _ip = self.fuel_web.get_nailgun_node_by_name(
            nailgun_node.name)['ip']
        # Wait for SSH to answer again after the forced reboot.
        _wait(lambda: _tcp_ping(_ip, 22), timeout=120)

        logger.info(
            'Wait a %s node online status after unexpected '
            'reboot', nailgun_node.name)
        self.fuel_web.wait_nodes_get_online_state([nailgun_node])

        logger.info(
            'Check that %s node not in maintenance mode after'
            ' unexpected reboot', nailgun_node.name)
        # UMM was disabled, so the reboot must not have switched the node
        # into maintenance mode.
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_false('True' in check_auto_mode(remote),
                         "Maintenance mode should not switched")

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    _wait(lambda: self.fuel_web.run_single_ostf_test(
        cluster_id, test_sets=['sanity'],
        test_name=map_ostf.OSTF_TEST_MAPPING.get(
            'Check that required services are running')),
        timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda: self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    # Best-effort retry: give the cluster 10 minutes to settle before the
    # final (decisive) OSTF run.
    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
def manual_cic_maintenance_mode(self):
    """Check manual maintenance mode for controller

    Scenario:
        1. Revert snapshot
        2. Switch in maintenance mode
        3. Wait until controller is rebooting
        4. Exit maintenance mode
        5. Check the controller become available

    Duration 155m
    """
    self.env.revert_snapshot("cic_maintenance_mode")

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Exercise every controller of the cluster in turn.
    n_ctrls = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        cluster_id, ["controller"])
    d_ctrls = self.fuel_web.get_devops_nodes_by_nailgun_nodes(n_ctrls)

    for devops_node in d_ctrls:
        with self.fuel_web.get_ssh_for_node(devops_node.name) as remote:
            assert_true("True" in check_available_mode(remote),
                        "Maintenance mode is not available")

            logger.info("Maintenance mode for node %s", devops_node.name)
            # 'umm on' reboots the node into maintenance mode.
            result = remote.execute("umm on")
            assert_equal(
                result["exit_code"], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    "umm on", result))

        logger.info("Wait a %s node offline status after switching "
                    "maintenance mode ", devops_node.name)
        try:
            wait(lambda:
                 not self.fuel_web.get_nailgun_node_by_devops_node(
                     devops_node)["online"], timeout=60 * 10)
        except TimeoutError:
            # Re-check once: the wait may have raced the status update.
            assert_false(
                self.fuel_web.get_nailgun_node_by_devops_node(
                    devops_node)["online"],
                "Node {0} has not become offline after"
                "switching maintenance mode".format(devops_node.name))

        logger.info("Check that %s node in maintenance mode after "
                    "switching", devops_node.name)

        _ip = self.fuel_web.get_nailgun_node_by_name(
            devops_node.name)["ip"]
        # SSH comes back up once the node has rebooted into maintenance.
        wait(lambda: tcp_ping(_ip, 22), timeout=60 * 10)
        with self.fuel_web.get_ssh_for_node(devops_node.name) as remote:
            assert_true("True" in check_auto_mode(remote),
                        "Maintenance mode is not switch")

            result = remote.execute("umm off")
            assert_equal(
                result["exit_code"], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    "umm off", result))

        logger.info("Wait a %s node online status", devops_node.name)
        try:
            wait(lambda:
                 self.fuel_web.get_nailgun_node_by_devops_node(
                     devops_node)["online"], timeout=60 * 10)
        except TimeoutError:
            # Re-check once before failing for good.
            assert_true(
                self.fuel_web.get_nailgun_node_by_devops_node(
                    devops_node)["online"],
                "Node {0} has not become online after "
                "exiting maintenance mode".format(devops_node.name))

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up([n.name for n in d_ctrls])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up([n.name for n in d_ctrls])

    _wait(
        lambda: self.fuel_web.run_single_ostf_test(
            cluster_id,
            test_sets=["sanity"],
            test_name=map_ostf.OSTF_TEST_MAPPING.get(
                "Check that required services are running"),
        ),
        timeout=1500,
    )
    logger.debug("Required services are running")

    _wait(lambda: self.fuel_web.run_ostf(cluster_id, test_sets=["ha"]),
          timeout=1500)
    logger.debug("HA tests are pass now")

    # Best-effort retry after 10 minutes of settling time.
    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=["smoke", "sanity"])
    except AssertionError:
        logger.debug(
            "Test failed from first probe,"
            " we sleep 600 second try one more time"
            " and if it fails again - test will fails "
        )
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=["smoke", "sanity"])
def wait_for_provisioning(self, timeout=5 * 60):
    """Block until the admin node accepts TCP connections on port 22.

    Generalized: the previously hard-coded five-minute limit is now a
    defaulted parameter, matching the sibling implementation that takes
    a configurable timeout.

    :param timeout: seconds to wait for SSH to come up (default 5 min)
    """
    _wait(lambda: _tcp_ping(
        self.nodes().admin.get_ip_address_by_network_name(self.admin_net),
        22), timeout=timeout)
def await_node_deploy(ip, name):
    """Poll Cobbler on `ip` until node `name` reports ready (30 min cap)."""
    cobbler = CobblerClient(ip)
    auth_token = cobbler.login('cobbler', 'cobbler')
    _wait(lambda: check_node_ready(cobbler, auth_token, name),
          timeout=30 * 60)
def ha_corosync_stability_check(self):
    """Repeatedly kill and restart corosync on one controller and verify
    that the cluster never split-brains (500 iterations).
    """

    @logwrap
    def _get_pcm_nodes(remote, pure=False):
        # Parse `pcs status nodes` output into
        # {'Online': [...], 'Offline': [...], 'Standby': [...]}.
        # With pure=False hostnames are resolved to FQDNs.
        nodes = {}
        pcs_status = remote.execute('pcs status nodes')['stdout']
        pcm_nodes = yaml.load(''.join(pcs_status).strip())
        for status in ('Online', 'Offline', 'Standby'):
            list_nodes = (pcm_nodes['Pacemaker Nodes']
                          [status] or '').split()
            if not pure:
                nodes[status] = [self.fuel_web.get_fqdn_by_hostname(x)
                                 for x in list_nodes]
            else:
                nodes[status] = list_nodes
        return nodes

    def _check_all_pcs_nodes_status(ctrl_remotes, pcs_nodes_online,
                                    status):
        # True only if EVERY remote sees exactly `pcs_nodes_online` in
        # the given pacemaker status.
        for remote in ctrl_remotes:
            pcs_nodes = _get_pcm_nodes(remote)
            # BUG FIX: the original logged `node['name']` -- `node` is not
            # defined in this function's scope (it only resolved through
            # py2 list-comprehension leakage in the caller, logging an
            # unrelated node).  Use the remote's own host instead.
            logger.debug(
                "Status of pacemaker nodes on node {0}: {1}".
                format(remote.host, pcs_nodes))
            if set(pcs_nodes_online) != set(pcs_nodes[status]):
                return False
        return True

    if not self.env.d_env.has_snapshot(self.snapshot_name):
        raise SkipTest()
    self.env.revert_snapshot(self.snapshot_name)

    devops_name = self.env.d_env.nodes().slaves[0].name
    controller_node = self.fuel_web.get_nailgun_node_by_name(devops_name)
    with self.fuel_web.get_ssh_for_node(
            devops_name) as remote_controller:

        pcs_nodes = self.fuel_web.get_pcm_nodes(devops_name)
        assert_true(
            not pcs_nodes['Offline'], "There are offline nodes: {0}".
            format(pcs_nodes['Offline']))
        pcs_nodes_online = pcs_nodes['Online']

        cluster_id = self.fuel_web.get_last_created_cluster()
        ctrl_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
            cluster_id, ['controller'])
        # Controllers that survive when corosync dies on the first one.
        alive_corosync_nodes = [node for node in ctrl_nodes
                                if node['mac'] != controller_node['mac']]
        ctrl_remotes = [self.env.d_env.get_ssh_to_remote(node['ip'])
                        for node in ctrl_nodes]
        live_remotes = [self.env.d_env.get_ssh_to_remote(node['ip'])
                        for node in alive_corosync_nodes]

        for count in xrange(500):
            logger.debug('Checking splitbrain in the loop, '
                         'count number: {0}'.format(count))
            # Kill corosync on the first controller ...
            _wait(
                lambda: assert_equal(
                    remote_controller.execute(
                        'killall -TERM corosync')['exit_code'], 0,
                    'Corosync was not killed on controller, '
                    'see debug log, count-{0}'.format(count)),
                timeout=20)
            # ... the survivors must agree it went Offline ...
            _wait(
                lambda: assert_true(
                    _check_all_pcs_nodes_status(
                        live_remotes, [controller_node['fqdn']],
                        'Offline'),
                    'Caught splitbrain, see debug log, '
                    'count-{0}'.format(count)),
                timeout=20)
            # ... bring corosync/pacemaker back ...
            _wait(
                lambda: assert_equal(
                    remote_controller.execute(
                        'service corosync start && service pacemaker '
                        'restart')['exit_code'], 0,
                    'Corosync was not started, see debug log,'
                    ' count-{0}'.format(count)),
                timeout=20)
            # ... and the whole membership must converge to Online again.
            _wait(
                lambda: assert_true(
                    _check_all_pcs_nodes_status(
                        ctrl_remotes, pcs_nodes_online, 'Online'),
                    'Corosync was not started on controller, see debug '
                    'log, count: {0}'.format(count)),
                timeout=20)

        # Close all SSH sessions opened above.
        for remote in ctrl_remotes:
            remote.clear()
        for remote in live_remotes:
            remote.clear()
def upgrade_fuel_after_rollback(self):
    """Upgrade Fuel after rollback and deploy new cluster

    Scenario:
        1. Revert deploy_neutron_gre snapshot with 6.1 env
        2. Upgrade with rollback
        3. Run OSTF
        4. Run network verification
        5. Upgrade fuel master
        6. Check upgrading was successful
        7. Deploy 6.1 cluster with 3 nodes and neutron vlan
        8. Run OSTF for new cluster
        9. Run network verification
    """
    # TODO(ddmitriev): change snapshot name to actual when reverting 7.0
    if not self.env.d_env.has_snapshot('deploy_neutron_gre'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre")

    # Remember releases so the newly added one can be identified later.
    available_releases_before = self.fuel_web.get_releases_list_for_os(
        release_name=hlp_data.OPENSTACK_RELEASE)

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Upgrade is expected to fail and roll back automatically.
    self.env.admin_actions.upgrade_master_node(rollback=True)

    # Give nailgun up to 8 minutes after rollback to see slaves again.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    # After rollback the old version must still be in place.
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id, test_sets=['ha', 'smoke', 'sanity'])

    # Real upgrade (tarball already on the master from the first attempt).
    self.env.admin_actions.upgrade_master_node(file_upload=False)

    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_TO)
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nailgun_upgrade_migration()

    # Deploy new cluster
    available_releases_after = self.fuel_web.get_releases_list_for_os(
        release_name=hlp_data.OPENSTACK_RELEASE)
    # The upgrade must have registered exactly the new release(s).
    added_release = [
        release_id for release_id in available_releases_after
        if release_id not in available_releases_before
    ]

    self.env.bootstrap_nodes(self.env.d_env.nodes().slaves[3:6])

    new_cluster_id = self.fuel_web.create_cluster(
        name=self.__class__.__name__,
        release_id=added_release[0],
        mode=hlp_data.DEPLOYMENT_MODE,
        settings={
            'net_provider': 'neutron',
            'net_segment_type': hlp_data.NEUTRON_SEGMENT['vlan']
        })
    self.fuel_web.update_nodes(
        new_cluster_id, {
            'slave-04': ['controller'],
            'slave-05': ['compute'],
            'slave-06': ['cinder']
        })
    self.fuel_web.run_network_verify(new_cluster_id)
    self.fuel_web.deploy_cluster_wait(new_cluster_id)
    self.fuel_web.run_ostf(new_cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])
    self.fuel_web.run_network_verify(new_cluster_id)

    self.env.make_snapshot("upgrade_fuel_after_rollback")
def upgrade_fuel_after_rollback(self):
    """Upgrade Fuel after rollback and deploy new cluster

    Scenario:
        1. Revert deploy_neutron_gre snapshot with 6.1 env
        2. Upgrade with rollback
        3. Run OSTF
        4. Run network verification
        5. Upgrade fuel master
        6. Check upgrading was successful
        7. Deploy 6.1 cluster with 3 nodes and neutron vlan
        8. Run OSTF for new cluster
        9. Run network verification
    """
    # TODO(ddmitriev): change the snapshot name to actual when
    # reverting 7.0
    if not self.env.d_env.has_snapshot('deploy_neutron_gre'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre")

    # Remember releases so the newly added one can be identified later.
    available_releases_before = self.fuel_web.get_releases_list_for_os(
        release_name=hlp_data.OPENSTACK_RELEASE)

    remote = self.env.d_env.get_admin_remote

    cluster_id = self.fuel_web.get_last_created_cluster()

    checkers.upload_tarball(remote(), hlp_data.TARBALL_PATH, '/var')
    checkers.check_file_exists(remote(),
                               os.path.join('/var',
                                            os.path.basename(
                                                hlp_data.TARBALL_PATH)))
    checkers.untar(remote(), os.path.basename(hlp_data.TARBALL_PATH),
                   '/var')

    # Upgrade with rollback; upgrade.sh exits 255 when it fails and
    # rolls back, which is exactly what this step expects.
    keystone_pass = hlp_data.KEYSTONE_CREDS['password']
    checkers.run_script(remote(), '/var', 'upgrade.sh',
                        password=keystone_pass, rollback=True,
                        exit_code=255)
    checkers.wait_rollback_is_done(remote(), 3000)
    checkers.check_upgraded_containers(remote(),
                                       hlp_data.UPGRADE_FUEL_TO,
                                       hlp_data.UPGRADE_FUEL_FROM)
    logger.debug("all containers are ok")
    # Give nailgun up to 8 minutes after rollback to see slaves again.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    # After rollback the old version must still be in place.
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id, test_sets=['ha', 'smoke', 'sanity'])

    # Upgrade fuel master
    checkers.run_script(remote(), '/var', 'upgrade.sh',
                        password=keystone_pass)
    checkers.wait_upgrade_is_done(remote(), 3000,
                                  phrase='*** UPGRADING MASTER NODE'
                                         ' DONE SUCCESSFULLY')
    checkers.check_upgraded_containers(remote(),
                                       hlp_data.UPGRADE_FUEL_FROM,
                                       hlp_data.UPGRADE_FUEL_TO)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_TO)
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nailgun_upgrade_migration()

    # Deploy new cluster
    available_releases_after = self.fuel_web.get_releases_list_for_os(
        release_name=hlp_data.OPENSTACK_RELEASE)
    # BUG FIX: the comprehension variable was named `id`, shadowing the
    # builtin; renamed for consistency with the sibling implementation.
    added_release = [release_id for release_id in available_releases_after
                     if release_id not in available_releases_before]

    self.env.bootstrap_nodes(
        self.env.d_env.nodes().slaves[3:6])

    new_cluster_id = self.fuel_web.create_cluster(
        name=self.__class__.__name__,
        release_id=added_release[0],
        mode=hlp_data.DEPLOYMENT_MODE,
        settings={
            'net_provider': 'neutron',
            'net_segment_type': hlp_data.NEUTRON_SEGMENT['vlan']
        }
    )
    self.fuel_web.update_nodes(
        new_cluster_id, {
            'slave-04': ['controller'],
            'slave-05': ['compute'],
            'slave-06': ['cinder']
        }
    )
    self.fuel_web.run_network_verify(new_cluster_id)
    self.fuel_web.deploy_cluster_wait(new_cluster_id)
    self.fuel_web.run_ostf(new_cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])
    self.fuel_web.run_network_verify(new_cluster_id)

    self.env.make_snapshot("upgrade_fuel_after_rollback")
def wait_for_provisioning(self,
                          timeout=settings.WAIT_FOR_PROVISIONING_TIMEOUT):
    """Wait until SSH (port 22) answers on the admin node's address."""
    def _ssh_is_up():
        # Re-resolve the address on every probe, as the original did.
        admin = self.d_env.nodes().admin
        address = admin.get_ip_address_by_network_name(
            self.d_env.admin_net)
        return _tcp_ping(address, 22)

    _wait(_ssh_is_up, timeout=timeout)
def create_component_and_env_configdb(self):
    """ Install and check ConfigDB

    Scenario:
        1. Revert snapshot empty
        2. Install configDB extension
        3. Create components
        4. Create environment with component
        5. Get and check created data
        6. Make snapshot

    Duration: 5 min
    Snapshot: create_component_and_env_configdb
    """

    self.check_run('create_component_and_env_configdb')
    self.show_step(1)
    self.env.revert_snapshot('empty')
    self.show_step(2)
    install_configdb(master_node_ip=self.ssh_manager.admin_ip)

    logger.debug('Waiting for ConfigDB')
    # The API answering at all is the readiness signal.
    _wait(lambda: self.fuel_web.client.get_components(), timeout=45)

    logger.debug('Get env and component data')
    components = self.fuel_web.client.get_components()
    envs = self.fuel_web.client.get_environments()

    # A fresh installation must expose no components or environments.
    assert_false(components,
                 "Components is not empty after tuningbox installation")
    assert_false(envs,
                 "Environments is not empty after tuningbox installation")

    # Uploaded data
    component = {
        "name": "comp1",
        "resource_definitions": [
            {"name": self.RESOURCE_NAME_1, "content": {}},
            {"name": self.SLASHED_RESOURCE, "content": {}}
        ]
    }

    environment = {
        "name": "env1",
        "components": ["comp1"],
        "hierarchy_levels": ["nodes"]
    }
    self.show_step(3)
    self.fuel_web.client.create_component(component)
    self.show_step(4)
    self.fuel_web.client.create_environment(environment)
    self.show_step(5)
    comp = self.fuel_web.client.get_components(comp_id=1)
    env = self.fuel_web.client.get_environments(env_id=1)

    # Expected API echoes -- ids are assigned sequentially from 1.
    expected_comp = {
        'resource_definitions': [
            {'content': {}, 'component_id': 1, 'id': 1,
             'name': self.RESOURCE_NAME_1},
            {'content': {}, 'component_id': 1, 'id': 2,
             'name': self.SLASHED_RESOURCE}
        ],
        'id': 1,
        'name': "comp1"
    }
    expected_env = {
        'hierarchy_levels': ["nodes"],
        'id': 1,
        'components': [1]
    }
    logger.debug('Compare original component with '
                 'received component from API')
    assert_equal(comp, expected_comp)
    logger.debug('Compare original env with received env from API')
    assert_equal(env, expected_env)
    self.show_step(6)
    self.env.make_snapshot('create_component_and_env_configdb',
                           is_make=True)
def negative_auto_cic_maintenance_mode(self):
    """Check negative scenario for auto maintenance mode

    Scenario:
        1. Revert snapshot
        2. Disable UMM
        3. Change UMM.CONF
        4. Unexpected reboot
        5. Check the controller not switching in maintenance mode
        6. Check the controller become available

    Duration 85m
    """
    self.env.revert_snapshot("cic_maintenance_mode")

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Operate on every controller of the cluster.
    n_ctrls = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        cluster_id, ["controller"])
    d_ctrls = self.fuel_web.get_devops_nodes_by_nailgun_nodes(n_ctrls)

    for devops_node in d_ctrls:
        _ip = self.fuel_web.get_nailgun_node_by_name(
            devops_node.name)["ip"]
        with self.fuel_web.get_ssh_for_node(devops_node.name) as remote:
            assert_true("True" in check_available_mode(remote),
                        "Maintenance mode is not available")

            logger.info("Change UMM.CONF on node %s", devops_node.name)
            # Adjacent string literals concatenate implicitly into one
            # shell command.
            command1 = "echo -e 'UMM=yes\nREBOOT_COUNT=0\n" "COUNTER_RESET_TIME=10' > /etc/umm.conf"
            result = remote.execute(command1)
            assert_equal(
                result["exit_code"], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    command1, result))

            result = remote.execute("umm disable")
            assert_equal(
                result["exit_code"], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    "umm disable", result))

            assert_false("True" in check_available_mode(remote),
                         "Maintenance mode should not be available")

            logger.info("Unexpected reboot on node %s", devops_node.name)
            command2 = "reboot --force >/dev/null & "
            result = remote.execute(command2)
            assert_equal(
                result["exit_code"], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    command2, result))
            # SSH dropping confirms the reboot actually started.
            wait(lambda: not tcp_ping(_ip, 22), timeout=60 * 10)

        # Node don't have enough time for set offline status
        # after reboot --force
        # Just waiting
        wait(lambda: tcp_ping(_ip, 22), timeout=60 * 10)

        logger.info("Wait a %s node online status after unexpected "
                    "reboot", devops_node.name)
        self.fuel_web.wait_nodes_get_online_state([devops_node])

        logger.info("Check that %s node not in maintenance mode after"
                    " unexpected reboot", devops_node.name)
        # UMM was disabled, so the forced reboot must not have switched
        # the node into maintenance mode.
        with self.fuel_web.get_ssh_for_node(devops_node.name) as remote:
            assert_false("True" in check_auto_mode(remote),
                         "Maintenance mode should not switched")

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up([n.name for n in d_ctrls])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up([n.name for n in d_ctrls])

    _wait(
        lambda: self.fuel_web.run_single_ostf_test(
            cluster_id,
            test_sets=["sanity"],
            test_name=map_ostf.OSTF_TEST_MAPPING.get(
                "Check that required services are running"),
        ),
        timeout=1500,
    )
    logger.debug("Required services are running")

    _wait(lambda: self.fuel_web.run_ostf(cluster_id, test_sets=["ha"]),
          timeout=1500)
    logger.debug("HA tests are pass now")

    # Best-effort retry after 10 minutes of settling time.
    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=["smoke", "sanity"])
    except AssertionError:
        logger.debug(
            "Test failed from first probe,"
            " we sleep 600 second try one more time"
            " and if it fails again - test will fails "
        )
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=["smoke", "sanity"])
def rollback_automatically_delete_node(self):
    """Rollback automatically ha one controller deployed cluster
    and delete node from cluster

    Scenario:
        1. Revert snapshot with deploy neutron gre env
        2. Add raise exception to docker_engine.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Run network verification
        7. Run OSTF
        8. Delete 1 node and re-deploy cluster
        9. Run OSTF
    """
    if not self.env.d_env.has_snapshot('deploy_neutron_gre'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre")
    cluster_id = self.fuel_web.get_last_created_cluster()

    checkers.upload_tarball(self.env.d_env.get_admin_remote(),
                            hlp_data.TARBALL_PATH, '/var')
    checkers.check_tarball_exists(self.env.d_env.get_admin_remote(),
                                  os.path.basename(
                                      hlp_data.TARBALL_PATH), '/var')
    checkers.untar(self.env.d_env.get_admin_remote(),
                   os.path.basename(hlp_data.TARBALL_PATH), '/var')
    # we expect 255 exit code here because upgrade failed
    # and exit status is 255
    checkers.run_script(self.env.d_env.get_admin_remote(),
                        '/var', 'upgrade.sh',
                        password=hlp_data.KEYSTONE_CREDS['password'],
                        rollback=True, exit_code=255)
    checkers.wait_rollback_is_done(self.env.d_env.get_admin_remote(),
                                   3000)
    checkers.check_upgraded_containers(self.env.d_env.get_admin_remote(),
                                       hlp_data.UPGRADE_FUEL_TO,
                                       hlp_data.UPGRADE_FUEL_FROM)
    logger.debug("all containers are ok")
    # Give nailgun up to 8 minutes after rollback to see slaves again.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    # After rollback the old version must still be in place.
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id)

    # Mark slave-03 for deletion and re-deploy.
    nailgun_nodes = self.fuel_web.update_nodes(
        cluster_id, {'slave-03': ['compute', 'cinder']}, False, True)
    task = self.fuel_web.deploy_cluster(cluster_id)
    self.fuel_web.assert_task_success(task)
    nodes = filter(lambda x: x["pending_deletion"] is True, nailgun_nodes)
    try:
        wait(lambda: len(self.fuel_web.client.list_nodes()) == 3,
             timeout=5 * 60)
    except TimeoutError:
        # BUG FIX: the message claimed a "10 *60" timeout while the wait
        # above actually uses 5 * 60 seconds.
        assert_true(len(self.fuel_web.client.list_nodes()) == 3,
                    'Node {0} is not discovered in timeout 5 * 60'.format(
                        nodes[0]))
    self.fuel_web.run_ostf(cluster_id=cluster_id, should_fail=1)

    # NOTE(review): snapshot name says "..._delete_mode" while the test
    # is "..._delete_node" -- looks like a typo, but other jobs may
    # revert this snapshot by name, so it is left unchanged.
    self.env.make_snapshot("rollback_automatically_delete_mode")
def auto_cic_maintenance_mode(self):
    """Check auto maintenance mode for controller

    Scenario:
        1. Revert snapshot
        2. Unexpected reboot
        3. Wait until controller is switching in maintenance mode
        4. Exit maintenance mode
        5. Check the controller become available

    Duration 155m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    # The first three slaves are the controllers in this snapshot.
    for nailgun_node in self.env.d_env.nodes().slaves[0:3]:
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_available_mode(remote),
                        "Maintenance mode is not available")

            logger.info('Change UMM.CONF on node %s', nailgun_node.name)
            # REBOOT_COUNT=0: a single unexpected reboot triggers
            # automatic maintenance mode.
            command1 = ("echo -e 'UMM=yes\nREBOOT_COUNT=0\n"
                        "COUNTER_RESET_TIME=10' > /etc/umm.conf")

            result = remote.execute(command1)
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    command1, result))

            logger.info('Unexpected reboot on node %s', nailgun_node.name)
            command2 = ('reboot --force >/dev/null & ')
            result = remote.execute(command2)
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    command2, result))

        logger.info(
            'Wait a %s node offline status after unexpected '
            'reboot', nailgun_node.name)
        try:
            wait(lambda:
                 not self.fuel_web.get_nailgun_node_by_devops_node(
                     nailgun_node)['online'], timeout=60 * 10)
        except TimeoutError:
            # Re-check once: the wait may have raced the status update.
            assert_false(
                self.fuel_web.get_nailgun_node_by_devops_node(
                    nailgun_node)['online'],
                'Node {0} has not become offline after unexpected'
                'reboot'.format(nailgun_node.name))

        logger.info(
            'Check that %s node in maintenance mode after'
            ' unexpected reboot', nailgun_node.name)

        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_auto_mode(remote),
                        "Maintenance mode is not switch")

            result = remote.execute('umm off')
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    'umm off', result))
            # Wait umm stops
            time.sleep(30)
            # Restore a sane reboot budget so subsequent reboots do not
            # re-trigger maintenance mode.
            command3 = ("echo -e 'UMM=yes\nREBOOT_COUNT=2\n"
                        "COUNTER_RESET_TIME=10' > /etc/umm.conf")
            result = remote.execute(command3)
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    command3, result))

        logger.info('Wait a %s node online status', nailgun_node.name)
        try:
            wait(lambda:
                 self.fuel_web.get_nailgun_node_by_devops_node(
                     nailgun_node)['online'], timeout=90 * 10)
        except TimeoutError:
            # Re-check once before failing for good.
            assert_true(
                self.fuel_web.get_nailgun_node_by_devops_node(
                    nailgun_node)['online'],
                'Node {0} has not become online after umm off'.format(
                    nailgun_node.name))

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    _wait(lambda: self.fuel_web.run_single_ostf_test(
        cluster_id, test_sets=['sanity'],
        test_name=map_ostf.OSTF_TEST_MAPPING.get(
            'Check that required services are running')),
        timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda: self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    # Best-effort retry after 10 minutes of settling time.
    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
"""Create SSH-connection to the network :rtype : SSHClient """ return SSHClient( self.get_ip_address_by_network_name(network_name), username=login, password=password, private_keys=private_keys, ) def send_keys(self, keys): self.driver.node_send_keys(self, keys) def await(self, network_name, timeout=120, by_port=22): _wait(lambda: _tcp_ping(self.get_ip_address_by_network_name(network_name), by_port), timeout=timeout) def define(self): self.driver.node_define(self) self.save() def start(self): self.create(verbose=False) def create(self, verbose=False): if verbose or not self.driver.node_active(self): self.driver.node_create(self) def destroy(self, verbose=False): if verbose or self.driver.node_active(self): self.driver.node_destroy(self)
def manual_cic_maintenance_mode(self):
    """Check manual maintenance mode for controller

    Scenario:
        1. Revert snapshot
        2. Switch in maintenance mode
        3. Wait until controller is rebooting
        4. Exit maintenance mode
        5. Check the controller become available

    Duration 155m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Exercise maintenance mode on each of the three controllers in turn.
    for nailgun_node in self.env.d_env.nodes().slaves[0:3]:
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_available_mode(remote),
                        "Maintenance mode is not available")

            logger.info('Maintenance mode for node %s', nailgun_node.name)
            result = remote.execute('umm on')
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    'umm on', result))
        logger.info(
            'Wait a %s node offline status after switching '
            'maintenance mode ', nailgun_node.name)
        try:
            wait(lambda: not self.fuel_web.get_nailgun_node_by_devops_node(
                nailgun_node)['online'], timeout=60 * 10)
        except TimeoutError:
            # Fix: trailing space added so the concatenated message does
            # not render as "afterswitching".
            assert_false(
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'],
                'Node {0} has not become offline after '
                'switching maintenance mode'.format(nailgun_node.name))

        logger.info(
            'Check that %s node in maintenance mode after '
            'switching', nailgun_node.name)
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_auto_mode(remote),
                        "Maintenance mode is not switch")

            result = remote.execute('umm off')
            assert_equal(
                result['exit_code'], 0,
                'Failed to execute "{0}" on remote host: {1}'.format(
                    'umm off', result))

        logger.info('Wait a %s node online status', nailgun_node.name)
        try:
            wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
                nailgun_node)['online'], timeout=60 * 10)
        except TimeoutError:
            assert_true(
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'],
                'Node {0} has not become online after '
                'exiting maintenance mode'.format(nailgun_node.name))

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Services may still be settling after the maintenance-mode churn, so
    # keep retrying the 'required services' OSTF check (25 min cap).
    _wait(lambda:
          self.fuel_web.run_single_ostf_test(
              cluster_id, test_sets=['sanity'],
              test_name=map_ostf.OSTF_TEST_MAPPING.get(
                  'Check that required services are running')),
          timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda:
          self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        # Best-effort retry: give the cluster 10 more minutes to settle.
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
def auto_cic_maintenance_mode(self):
    """Check auto maintenance mode for controller

    Scenario:
        1. Revert snapshot
        2. Unexpected reboot
        3. Wait until controller is switching in maintenance mode
        4. Exit maintenance mode
        5. Check the controller become available

    Duration 155m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Repeat the unexpected-reboot scenario on each of the 3 controllers.
    for nailgun_node in self.env.d_env.nodes().slaves[0:3]:
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_available_mode(remote),
                        "Maintenance mode is not available")
            logger.info('Change UMM.CONF on node %s', nailgun_node.name)
            # REBOOT_COUNT=0: UMM enters maintenance mode on the very
            # first unexpected reboot.
            command1 = ("echo -e 'UMM=yes\nREBOOT_COUNT=0\n"
                        "COUNTER_RESET_TIME=10' > /etc/umm.conf")
            result = remote.execute(command1)
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format(command1, result))

            logger.info('Unexpected reboot on node %s', nailgun_node.name)
            # Backgrounded so the SSH command returns before the node dies.
            command2 = ('reboot --force >/dev/null & ')
            result = remote.execute(command2)
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format(command2, result))

        logger.info('Wait a %s node offline status after unexpected '
                    'reboot', nailgun_node.name)
        try:
            wait(
                lambda: not
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'], timeout=60 * 10)
        except TimeoutError:
            # NOTE(review): the adjacent literals below render as
            # "unexpectedreboot" (missing space) — message-only typo.
            assert_false(
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'],
                'Node {0} has not become offline after unexpected'
                'reboot'.format(nailgun_node.name))

        logger.info('Check that %s node in maintenance mode after'
                    ' unexpected reboot', nailgun_node.name)

        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_auto_mode(remote),
                        "Maintenance mode is not switch")

            result = remote.execute('umm off')
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format('umm off', result))
            # Wait umm stops
            time.sleep(30)
            # Restore a tolerant UMM config (2 reboots allowed) so later
            # tests are not tripped into maintenance mode accidentally.
            command3 = ("echo -e 'UMM=yes\nREBOOT_COUNT=2\n"
                        "COUNTER_RESET_TIME=10' > /etc/umm.conf")
            result = remote.execute(command3)
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format(command3, result))

        logger.info('Wait a %s node online status', nailgun_node.name)
        try:
            # Longer timeout (15 min) than the offline wait: the node has
            # to fully boot back up after leaving maintenance mode.
            wait(
                lambda:
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'], timeout=90 * 10)
        except TimeoutError:
            assert_true(
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'],
                'Node {0} has not become online after umm off'.format(
                    nailgun_node.name))

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Keep retrying the 'required services' OSTF check while the cluster
    # settles (25 min cap).
    _wait(lambda:
          self.fuel_web.run_single_ostf_test(
              cluster_id, test_sets=['sanity'],
              test_name=map_ostf.OSTF_TEST_MAPPING.get(
                  'Check that required services are running')),
          timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda:
          self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        # Best-effort retry after letting the cluster settle for 10 min.
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
def manual_cic_maintenance_mode(self):
    """Check manual maintenance mode for controller

    Scenario:
        1. Revert snapshot
        2. Switch in maintenance mode
        3. Wait until controller is rebooting
        4. Exit maintenance mode
        5. Check the controller become available

    Duration 155m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Resolve the cluster's controllers and map them to devops nodes so
    # their online state can be tracked through nailgun.
    n_ctrls = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        cluster_id, ['controller'])
    d_ctrls = self.fuel_web.get_devops_nodes_by_nailgun_nodes(n_ctrls)

    for devops_node in d_ctrls:
        _ip = self.fuel_web.get_nailgun_node_by_name(
            devops_node.name)['ip']
        logger.info('Maintenance mode for node {0}'.format(
            devops_node.name))
        asserts.assert_true('True' in checkers.check_available_mode(_ip),
                            "Maintenance mode is not available")
        self.ssh_manager.execute_on_remote(ip=_ip, cmd="umm on")

        logger.info('Wait a {0} node offline status after switching '
                    'maintenance mode '.format(devops_node.name))
        try:
            wait(lambda: not self.fuel_web.get_nailgun_node_by_devops_node(
                devops_node)['online'], timeout=60 * 10)
        except TimeoutError:
            # NOTE(review): adjacent literals render as "afterswitching"
            # (missing space) — message-only typo.
            asserts.assert_false(
                self.fuel_web.get_nailgun_node_by_devops_node(devops_node)
                ['online'],
                'Node {0} has not become offline after'
                'switching maintenance mode'.format(devops_node.name))

        logger.info('Check that {0} node in maintenance mode after '
                    'switching'.format(devops_node.name))
        # Re-resolve the IP after the reboot into maintenance mode.
        _ip = self.fuel_web.get_nailgun_node_by_name(
            devops_node.name)['ip']
        asserts.assert_true(
            checkers.check_ping(self.env.get_admin_node_ip(),
                                _ip,
                                deadline=600),
            "Host {0} is not reachable by ping during 600 sec".format(_ip))
        asserts.assert_true('True' in checkers.check_auto_mode(_ip),
                            "Maintenance mode is not switch")
        self.ssh_manager.execute_on_remote(ip=_ip, cmd="umm off")

        logger.info('Wait a {0} node online status'.format(
            devops_node.name))
        try:
            wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
                devops_node)['online'], timeout=60 * 10)
        except TimeoutError:
            asserts.assert_true(
                self.fuel_web.get_nailgun_node_by_devops_node(devops_node)
                ['online'],
                'Node {0} has not become online after '
                'exiting maintenance mode'.format(devops_node.name))

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up([n.name for n in d_ctrls])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up([n.name for n in d_ctrls])

    # Keep retrying the 'required services' OSTF check while the cluster
    # settles (25 min cap).
    _wait(lambda:
          self.fuel_web.run_single_ostf_test(
              cluster_id, test_sets=['sanity'],
              test_name=ostf_test_mapping.OSTF_TEST_MAPPING.get(
                  'Check that required services are running')),
          timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda:
          self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        # Best-effort retry after letting the cluster settle for 10 min.
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
def negative_auto_cic_maintenance_mode(self):
    """Check negative scenario for auto maintenance mode

    Scenario:
        1. Revert snapshot
        2. Disable UMM
        3. Change UMM.CONF
        4. Unexpected reboot
        5. Check the controller not switching in maintenance mode
        6. Check the controller become available

    Duration 85m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    for nailgun_node in self.env.d_env.nodes().slaves[0:3]:
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_available_mode(remote),
                        "Maintenance mode is not available")
            logger.info('Change UMM.CONF on node %s', nailgun_node.name)
            # REBOOT_COUNT=0 would trigger maintenance mode on the first
            # unexpected reboot — but UMM is disabled below, so it must
            # NOT trigger.
            command1 = ("echo -e 'UMM=yes\nREBOOT_COUNT=0\n"
                        "COUNTER_RESET_TIME=10' > /etc/umm.conf")
            result = remote.execute(command1)
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format(command1, result))

            result = remote.execute('umm disable')
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format('umm disable', result))

            assert_false('True' in check_available_mode(remote),
                         "Maintenance mode should not be available")

            logger.info('Unexpected reboot on node %s', nailgun_node.name)
            # Backgrounded so the SSH command returns before the node dies.
            command2 = ('reboot --force >/dev/null & ')
            result = remote.execute(command2)
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format(command2, result))

        # Node don't have enough time for set offline status
        # after reboot --force
        # Just waiting
        _ip = self.fuel_web.get_nailgun_node_by_name(
            nailgun_node.name)['ip']
        _wait(lambda: _tcp_ping(_ip, 22), timeout=120)

        logger.info('Wait a %s node online status after unexpected '
                    'reboot', nailgun_node.name)
        self.fuel_web.wait_nodes_get_online_state([nailgun_node])

        logger.info('Check that %s node not in maintenance mode after'
                    ' unexpected reboot', nailgun_node.name)

        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_false('True' in check_auto_mode(remote),
                         "Maintenance mode should not switched")

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Keep retrying the 'required services' OSTF check while the cluster
    # settles (25 min cap).
    _wait(lambda:
          self.fuel_web.run_single_ostf_test(
              cluster_id, test_sets=['sanity'],
              test_name=map_ostf.OSTF_TEST_MAPPING.get(
                  'Check that required services are running')),
          timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda:
          self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        # Best-effort retry after letting the cluster settle for 10 min.
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
def manual_cic_maintenance_mode(self):
    """Check manual maintenance mode for controller

    Scenario:
        1. Revert snapshot
        2. Switch in maintenance mode
        3. Wait until controller is rebooting
        4. Exit maintenance mode
        5. Check the controller become available

    Duration 155m
    """
    self.env.revert_snapshot('cic_maintenance_mode')

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Exercise maintenance mode on each of the three controllers in turn.
    for nailgun_node in self.env.d_env.nodes().slaves[0:3]:
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_available_mode(remote),
                        "Maintenance mode is not available")

            logger.info('Maintenance mode for node %s', nailgun_node.name)
            result = remote.execute('umm on')
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format('umm on', result))
        logger.info('Wait a %s node offline status after switching '
                    'maintenance mode ', nailgun_node.name)
        try:
            wait(
                lambda: not
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'], timeout=60 * 10)
        except TimeoutError:
            # NOTE(review): adjacent literals render as "afterswitching"
            # (missing space) — message-only typo.
            assert_false(
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'],
                'Node {0} has not become offline after'
                'switching maintenance mode'.format(nailgun_node.name))

        logger.info('Check that %s node in maintenance mode after '
                    'switching', nailgun_node.name)
        with self.fuel_web.get_ssh_for_node(nailgun_node.name) as remote:
            assert_true('True' in check_auto_mode(remote),
                        "Maintenance mode is not switch")

            result = remote.execute('umm off')
            assert_equal(result['exit_code'], 0,
                         'Failed to execute "{0}" on remote host: {1}'.
                         format('umm off', result))

        logger.info('Wait a %s node online status', nailgun_node.name)
        try:
            wait(
                lambda:
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'], timeout=60 * 10)
        except TimeoutError:
            assert_true(
                self.fuel_web.get_nailgun_node_by_devops_node(nailgun_node)
                ['online'],
                'Node {0} has not become online after '
                'exiting maintenance mode'.format(nailgun_node.name))

    # Wait until MySQL Galera is UP on some controller
    self.fuel_web.wait_mysql_galera_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Wait until Cinder services UP on a controller
    self.fuel_web.wait_cinder_is_up(
        [n.name for n in self.env.d_env.nodes().slaves[0:3]])

    # Keep retrying the 'required services' OSTF check while the cluster
    # settles (25 min cap).
    _wait(lambda:
          self.fuel_web.run_single_ostf_test(
              cluster_id, test_sets=['sanity'],
              test_name=map_ostf.OSTF_TEST_MAPPING.get(
                  'Check that required services are running')),
          timeout=1500)
    logger.debug("Required services are running")

    _wait(lambda:
          self.fuel_web.run_ostf(cluster_id, test_sets=['ha']),
          timeout=1500)
    logger.debug("HA tests are pass now")

    try:
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
    except AssertionError:
        # Best-effort retry after letting the cluster settle for 10 min.
        logger.debug("Test failed from first probe,"
                     " we sleep 600 second try one more time"
                     " and if it fails again - test will fails ")
        time.sleep(600)
        self.fuel_web.run_ostf(cluster_id, test_sets=['smoke', 'sanity'])
def await (self, network_name, timeout=120): _wait(lambda: _tcp_ping( self.get_ip_address_by_network_name(network_name), 22), timeout=timeout)
def ha_pacemaker_restart_heat_engine(self):
    """Verify pacemaker restarts heat-engine when its AMQP link is cut.

    Blocks heat's outbound AMQP traffic with iptables, checks the OCF
    monitor reports failure, unblocks, and checks heat-engine comes back
    under a new pid with a live AMQP connection.
    """
    if not self.env.d_env.has_snapshot(self.snapshot_name):
        raise SkipTest()
    self.env.revert_snapshot(self.snapshot_name)

    # Expected OCF monitor outputs for the healthy / broken states.
    ocf_success = "DEBUG: OpenStack Orchestration Engine" \
                  " (heat-engine) monitor succeeded"
    ocf_error = "ERROR: OpenStack Heat Engine is not connected to the" \
                " AMQP server: AMQP connection test returned 1"

    heat_name = 'heat-engine'
    # Invoke the OCF resource agent's monitor action directly; 'script'
    # provides a pty so the agent's output is captured.
    ocf_status = \
        'script -q -c "OCF_ROOT=/usr/lib/ocf' \
        ' /usr/lib/ocf/resource.d/fuel/{0}' \
        ' monitor 2>&1"'.format(heat_name)

    node_name = self.env.d_env.nodes().slaves[0].name

    with self.fuel_web.get_ssh_for_node(node_name) as remote:
        pid = ''.join(
            remote.execute('pgrep {0}'.format(heat_name))['stdout'])
        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
    assert_true(ocf_success in get_ocf_status,
                "heat engine is not succeeded, status is {0}".format(
                    get_ocf_status))

    # 5673 is the pacemaker-managed RabbitMQ port.
    with self.fuel_web.get_ssh_for_node(node_name) as remote:
        amqp_con = len(remote.execute(
            "netstat -nap | grep {0} | grep :5673".format(pid))['stdout'])
    assert_true(amqp_con > 0, 'There is no amqp connections')

    with self.fuel_web.get_ssh_for_node(node_name) as remote:
        # Drop heat's outbound AMQP traffic (match by process owner).
        remote.execute("iptables -I OUTPUT 1 -m owner --uid-owner heat -m"
                       " state --state NEW,ESTABLISHED,RELATED -j DROP")
        cmd = "netstat -nap | grep {0} | grep :5673".format(pid)
        wait(lambda: len(remote.execute(cmd)['stdout']) == 0, timeout=300)
        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
    logger.info('ocf status after blocking is {0}'.format(get_ocf_status))
    assert_true(ocf_error in get_ocf_status,
                "heat engine is running, status is {0}".format(
                    get_ocf_status))

    with self.fuel_web.get_ssh_for_node(node_name) as remote:
        # Remove the DROP rule and wait until the OCF monitor is green.
        remote.execute("iptables -D OUTPUT 1 -m owner --uid-owner heat -m"
                       " state --state NEW,ESTABLISHED,RELATED")
        _wait(lambda: assert_true(ocf_success in ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()), timeout=240)
        newpid = ''.join(
            remote.execute('pgrep {0}'.format(heat_name))['stdout'])
        # A changed pid proves pacemaker actually restarted the service.
        assert_true(pid != newpid, "heat pid is still the same")
        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
    assert_true(ocf_success in get_ocf_status,
                "heat engine is not succeeded, status is {0}".format(
                    get_ocf_status))

    with self.fuel_web.get_ssh_for_node(node_name) as remote:
        heat = len(remote.execute(
            "netstat -nap | grep {0} | grep :5673".format(
                newpid))['stdout'])
    assert_true(heat > 0)
    cluster_id = self.fuel_web.get_last_created_cluster()
    self.fuel_web.run_ostf(cluster_id=cluster_id)
def rollback_automatically_ha(self):
    """Rollback manually ha deployed cluster

    Scenario:
        1. Revert snapshot with Neutron GRE HA 6.1 env
        2. Add raise exception to openstack.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded
        6. Run network verification
        7. Run OSTF
        8. Add 1 cinder node and re-deploy cluster
        9. Run OSTF
    """
    # (ddmitriev)TODO: change the snapshot name to actual when
    # reverting 7.0
    if not self.env.d_env.has_snapshot('deploy_neutron_gre_ha'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre_ha")

    cluster_id = self.fuel_web.get_last_created_cluster()

    # Stage the upgrade tarball on the master node and unpack it.
    checkers.upload_tarball(self.env.d_env.get_admin_remote(),
                            hlp_data.TARBALL_PATH, '/var')
    checkers.check_file_exists(self.env.d_env.get_admin_remote(),
                               os.path.join('/var',
                                            os.path.basename(
                                                hlp_data.TARBALL_PATH)))
    checkers.untar(self.env.d_env.get_admin_remote(),
                   os.path.basename(hlp_data.TARBALL_PATH), '/var')
    # rollback=True sabotages the upgrade so it fails part-way; 255 is
    # the expected exit status of the failed upgrade.sh run.
    checkers.run_script(self.env.d_env.get_admin_remote(),
                        '/var', 'upgrade.sh',
                        password=hlp_data.KEYSTONE_CREDS['password'],
                        rollback=True, exit_code=255)
    checkers.wait_rollback_is_done(self.env.d_env.get_admin_remote(), 3000)
    # After the rollback the containers must be back on the FROM version.
    checkers.check_upgraded_containers(self.env.d_env.get_admin_remote(),
                                       hlp_data.UPGRADE_FUEL_TO,
                                       hlp_data.UPGRADE_FUEL_FROM)
    logger.debug("all containers are ok")
    # Keep polling nailgun until it answers again after the rollback.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.d_env.nodes().slaves[0]), timeout=8 * 60)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(
        self.env.d_env.nodes().slaves[:5])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)

    self.fuel_web.verify_network(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])

    # Prove the rolled-back master can still extend the cluster.
    self.env.bootstrap_nodes(
        self.env.d_env.nodes().slaves[5:6])
    self.fuel_web.update_nodes(
        cluster_id, {'slave-06': ['cinder']},
        True, False
    )
    self.fuel_web.deploy_cluster_wait(cluster_id)
    self.fuel_web.run_ostf(cluster_id=cluster_id,
                           test_sets=['ha', 'smoke', 'sanity'])

    self.env.make_snapshot("rollback_automatically_ha")
def fuel_migration(self):
    """Fuel master migration to VM

    Scenario:
        1. Create cluster
        2. Run OSTF tests
        3. Run Network check
        4. Migrate fuel-master to VM
        5. Run OSTF tests
        6. Run Network check
        7. Check statuses for master services

    Duration 210m
    """
    self.env.revert_snapshot("ready_with_3_slaves")
    data = {
        'net_provider': 'neutron',
        'net_segment_type': settings.NEUTRON_SEGMENT_TYPE
    }

    cluster_id = self.fuel_web.create_cluster(
        name=self.__class__.__name__,
        mode=settings.DEPLOYMENT_MODE_HA,
        settings=data)

    self.fuel_web.update_nodes(
        cluster_id,
        {
            'slave-01': ['controller'],
            'slave-02': ['compute']
        }
    )

    # Check network
    self.fuel_web.verify_network(cluster_id)

    # Cluster deploy
    self.fuel_web.deploy_cluster_wait(cluster_id)

    # Check network
    self.fuel_web.verify_network(cluster_id)

    # Fuel migration: run fuel-migrate on the master, targeting the
    # compute node's IP; backgrounded so the SSH call returns at once.
    remote = self.env.d_env.get_admin_remote()
    logger.info('Fuel migration on compute slave-02')

    result = remote.execute('fuel-migrate ' + self.fuel_web.
                            get_nailgun_node_by_name('slave-02')['ip'] +
                            ' >/dev/null &')
    assert_equal(result['exit_code'], 0,
                 'Failed to execute "{0}" on remote host: {1}'.
                 format('fuel-migrate' + self.env.d_env.nodes().slaves[0].
                        name, result))
    checkers.wait_phrase_in_log(remote, 60 * 60, interval=0.2,
                                phrase='Rebooting to begin '
                                       'the data sync process',
                                log_path='/var/log/fuel-migrate.log')
    # Drop the cached SSH session: the master is about to reboot.
    remote.clear()
    logger.info('Rebooting to begin the data sync process for fuel '
                'migrate')

    # Wait for the master to go down and come back after the reboot.
    wait(lambda: not icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15, timeout_msg='Master node has not become offline '
                                      'after rebooting')
    wait(lambda: icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15, timeout_msg='Master node has not become online '
                                      'after rebooting')
    # NOTE(review): 'self.d_env' looks inconsistent with the
    # 'self.env.d_env' used everywhere else in this method — confirm the
    # test class really exposes a d_env attribute directly.
    self.env.d_env.nodes().admin.await(network_name=self.d_env.admin_net,
                                       timeout=60 * 15)

    with self.env.d_env.get_admin_remote() as remote:
        checkers.wait_phrase_in_log(remote,
                                    60 * 90, interval=0.1,
                                    phrase='Stop network and up with '
                                           'new settings',
                                    log_path='/var/log/fuel-migrate.log')
    logger.info('Shutting down network')

    # The migrated master re-plumbs its network; wait out the blip.
    wait(lambda: not icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15, interval=0.1,
         timeout_msg='Master node has not become offline shutting network')
    wait(lambda: icmp_ping(self.env.get_admin_node_ip()),
         timeout=60 * 15,
         timeout_msg='Master node has not become online shutting network')

    self.env.d_env.nodes().admin.await(network_name=self.d_env.admin_net,
                                       timeout=60 * 10)

    logger.info("Check containers")
    self.env.docker_actions.wait_for_ready_containers(timeout=60 * 30)

    logger.info("Check services")
    cluster_id = self.fuel_web.get_last_created_cluster()
    self.fuel_web.assert_ha_services_ready(cluster_id)
    self.fuel_web.assert_os_services_ready(cluster_id)

    # Check network
    self.fuel_web.verify_network(cluster_id)

    # Run ostf
    _wait(lambda:
          self.fuel_web.run_ostf(cluster_id,
                                 test_sets=['smoke', 'sanity']),
          timeout=1500)
    logger.debug("OSTF tests are pass now")
def wait_for_provisioning(self):
    """Wait up to 5 minutes until the admin node answers on SSH (22)."""
    def _admin_ssh_reachable():
        admin = self.nodes().admin
        address = admin.get_ip_address_by_network_name(self.admin_net)
        return _tcp_ping(address, 22)

    _wait(_admin_ssh_reachable, timeout=300)
def rollback_automatically_simple_env(self):
    """Rollback automatically simple deployed cluster

    Scenario:
        1. Revert snapshot with simple neutron gre env
        2. Add raise exception to docker_engine.py file
        3. Run upgrade on master
        4. Check that rollback starts automatically
        5. Check that cluster was not upgraded and run OSTf
        6. Add 1 cinder node and re-deploy cluster
        7. Run OSTF
    """
    if not self.env.get_virtual_environment().has_snapshot(
            'deploy_neutron_gre'):
        raise SkipTest()
    self.env.revert_snapshot("deploy_neutron_gre")
    cluster_id = self.fuel_web.get_last_created_cluster()
    # Record the slave kernel before the upgrade so we can verify a node
    # added after rollback boots the same kernel.
    remote = self.env.get_ssh_to_remote_by_name('slave-01')
    expected_kernel = UpgradeFuelMaster.get_slave_kernel(remote)

    # Stage and unpack the upgrade tarball on the master node.
    checkers.upload_tarball(self.env.get_admin_remote(),
                            hlp_data.TARBALL_PATH, '/var')
    checkers.check_tarball_exists(self.env.get_admin_remote(),
                                  os.path.basename(hlp_data.TARBALL_PATH),
                                  '/var')
    checkers.untar(self.env.get_admin_remote(),
                   os.path.basename(hlp_data.TARBALL_PATH), '/var')
    # we expect 255 exit code here because upgrade failed
    # and exit status is 255
    checkers.run_script(self.env.get_admin_remote(),
                        '/var', 'upgrade.sh',
                        password=hlp_data.KEYSTONE_CREDS['password'],
                        rollback=True, exit_code=255)
    checkers.wait_rollback_is_done(self.env.get_admin_remote(), 3000)
    # After rollback the containers must be back on the FROM version.
    checkers.check_upgraded_containers(self.env.get_admin_remote(),
                                       hlp_data.UPGRADE_FUEL_TO,
                                       hlp_data.UPGRADE_FUEL_FROM)
    logger.debug("all containers are ok")
    # Keep polling nailgun until it answers again after the rollback.
    _wait(lambda: self.fuel_web.get_nailgun_node_by_devops_node(
        self.env.nodes().slaves[0]), timeout=120)
    logger.debug("all services are up now")
    self.fuel_web.wait_nodes_get_online_state(self.env.nodes().slaves[:3])
    self.fuel_web.assert_nodes_in_ready_state(cluster_id)
    self.fuel_web.assert_fuel_version(hlp_data.UPGRADE_FUEL_FROM)
    self.fuel_web.run_ostf(cluster_id=cluster_id)
    # Prove the rolled-back master can still extend the cluster.
    self.env.bootstrap_nodes(self.env.nodes().slaves[3:4])
    self.fuel_web.update_nodes(cluster_id, {'slave-04': ['cinder']},
                               True, False)
    self.fuel_web.deploy_cluster_wait(cluster_id)
    if hlp_data.OPENSTACK_RELEASE_UBUNTU in hlp_data.OPENSTACK_RELEASE:
        # On Ubuntu the new node must run the pre-upgrade kernel.
        remote = self.env.get_ssh_to_remote_by_name('slave-04')
        kernel = UpgradeFuelMaster.get_slave_kernel(remote)
        checkers.check_kernel(kernel, expected_kernel)
    self.fuel_web.run_ostf(cluster_id=cluster_id)

    self.env.make_snapshot("rollback_automatic_simple")
def wait_for_provisioning(self,
                          timeout=settings.WAIT_FOR_PROVISIONING_TIMEOUT):
    """Wait until the admin node answers on SSH (port 22).

    :param timeout: seconds to keep polling; the default is bound from
        settings.WAIT_FOR_PROVISIONING_TIMEOUT at import time.
    """
    def _admin_ssh_reachable():
        admin = self.d_env.nodes().admin
        address = admin.get_ip_address_by_network_name(self.d_env.admin_net)
        return _tcp_ping(address, 22)

    _wait(_admin_ssh_reachable, timeout=timeout)
def ha_pacemaker_restart_heat_engine(self):
    """Verify heat engine service is restarted by pacemaker on amqp connection loss

    Scenario:
        1. SSH to any controller
        2. Check heat-engine status
        3. Block heat-engine amqp connections
        4. Check heat-engine was stopped on current controller
        5. Unblock heat-engine amqp connections
        6. Check heat-engine process is running with new pid
        7. Check amqp connection re-appears for heat-engine

    Snapshot ha_pacemaker_restart_heat_engine
    """
    self.env.revert_snapshot("deploy_ha")

    # Expected OCF monitor outputs for the healthy / broken states.
    ocf_success = "DEBUG: OpenStack Orchestration Engine" \
                  " (heat-engine) monitor succeeded"
    ocf_error = "ERROR: OpenStack Heat Engine is not connected to the" \
                " AMQP server: AMQP connection test returned 1"

    heat_name = 'heat-engine'
    # Invoke the OCF resource agent's monitor action directly; 'script'
    # provides a pty so the agent's output is captured.
    ocf_status = \
        'script -q -c "OCF_ROOT=/usr/lib/ocf' \
        ' /usr/lib/ocf/resource.d/fuel/{0}' \
        ' monitor 2>&1"'.format(heat_name)

    # Fix: use the connection as a context manager so the SSH session is
    # always closed (the previous version leaked it), matching the
    # pattern of the sibling implementation of this test.
    with self.fuel_web.get_ssh_for_node(
            self.env.nodes().slaves[0].name) as remote:
        pid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
        assert_true(
            ocf_success in get_ocf_status,
            "heat engine is not succeeded, status is {0}".format(
                get_ocf_status))
        # 5673 is the pacemaker-managed RabbitMQ port.
        assert_true(
            len(
                remote.execute("netstat -nap | grep {0} | grep :5673".
                               format(pid))['stdout']) > 0,
            'There is no amqp connections')
        # Drop heat's outbound AMQP traffic (match by process owner).
        remote.execute("iptables -I OUTPUT 1 -m owner --uid-owner heat -m"
                       " state --state NEW,ESTABLISHED,RELATED -j DROP")

        wait(lambda: len(
            remote.execute("netstat -nap | grep {0} | grep :5673".
                           format(pid))['stdout']) == 0, timeout=300)

        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
        logger.info(
            'ocf status after blocking is {0}'.format(get_ocf_status))
        assert_true(
            ocf_error in get_ocf_status,
            "heat engine is running, status is {0}".format(get_ocf_status))

        # Remove the DROP rule and wait for the monitor to go green.
        remote.execute("iptables -D OUTPUT 1 -m owner --uid-owner heat -m"
                       " state --state NEW,ESTABLISHED,RELATED")

        _wait(lambda: assert_true(ocf_success in ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()), timeout=240)
        # A changed pid proves pacemaker actually restarted the service.
        newpid = ''.join(remote.execute('pgrep heat-engine')['stdout'])
        assert_true(pid != newpid, "heat pid is still the same")
        get_ocf_status = ''.join(
            remote.execute(ocf_status)['stdout']).rstrip()
        assert_true(
            ocf_success in get_ocf_status,
            "heat engine is not succeeded, status is {0}".format(
                get_ocf_status))
        assert_true(
            len(
                remote.execute("netstat -nap | grep {0} | grep :5673".
                               format(newpid))['stdout']) > 0)
    cluster_id = self.fuel_web.get_last_created_cluster()
    self.fuel_web.run_ostf(cluster_id=cluster_id)