class GpRecoversegRegressionTests(unittest.TestCase):

    def setUp(self):
        self.gprec = GpRecover()
        self.gphome = os.environ.get('GPHOME')

    def failover(self, type='mirror'):
        """Inject a fault so that segments fail over.

        type='mirror' faults the filerep consumer on all mirrors;
        anything else panics the postmaster on all primaries.
        """
        if type == 'mirror':
            fault_str = 'source %s/greenplum_path.sh;gpfaultinjector -f filerep_consumer -m async -y fault -r mirror -H ALL' % self.gphome
        else:
            fault_str = 'source %s/greenplum_path.sh;gpfaultinjector -f postmaster -m async -y panic -r primary -H ALL' % self.gphome
        return run_shell_command(fault_str, cmdname='Run fault injector to failover')

    def test_incr_gprecoverseg(self):
        self.gprec.wait_till_insync_transition()
        if self.failover():
            self.assertTrue(self.gprec.incremental())

    def test_full_gprecoverseg(self):
        self.gprec.wait_till_insync_transition()
        if self.failover():
            self.assertTrue(self.gprec.full())

    def test_gprecoverseg_rebalance(self):
        self.gprec.wait_till_insync_transition()
        if self.failover('primary'):
            PSQL.run_sql_file(local_path('mirror_failover_trigger.sql'))
            self.gprec.incremental()
            if self.gprec.wait_till_insync_transition():
                self.assertTrue(self.gprec.rebalance())

    def test_wait_till_insync(self):
        self.gprec.wait_till_insync_transition()
        if self.failover():
            self.gprec.incremental()
            self.assertTrue(self.gprec.wait_till_insync_transition())
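
# Hedged runner sketch, not part of the original suite: the class above is a
# stock unittest.TestCase, so it can be driven with the standard loader.
# Assumes GPHOME and PGPORT point at a running, mirrored Greenplum cluster.
if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromTestCase(GpRecoversegRegressionTests)
    unittest.TextTestRunner(verbosity=2).run(suite)
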
class FilerepTestCase(MPPTestCase):

    def __init__(self, methodName):
        self.pgport = os.environ.get('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()
        self.config = GPDBConfig()
        self.gpr = GpRecover(self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FilerepTestCase, self).__init__(methodName)

    def sleep(self, seconds=60):
        time.sleep(seconds)

    def create_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('create a file', 'touch %s' % file_path,
                      ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('remove a file', 'rm %s' % file_path,
                      ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('check timestamp',
                      """ python -c "import os; print os.stat('%s').st_mtime" """ % file_path,
                      ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)
        res = cmd.get_results().stdout.strip()
        return res

    def verify_file_exists(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('check if file exists', 'test -f %s' % file_path,
                      ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def handle_ext_cases(self, file):
        """
        @file: wet (writable external table) sql file to rewrite with this machine's env
        """
        host = str(socket.gethostbyname(socket.gethostname()))  # must be an IP
        querystring = "gpfdist://" + host + ":8088"
        if os.path.isfile(file):
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub('gpfdist.+8088', querystring, line)
                print str(re.sub('\n', '', line))

    def handle_hybrid_part_cases(self, file):
        """
        @file: hybrid sql file to rewrite with this machine's env
        """
        querystring = "FROM '" + local_path('hybrid_part.data') + "'"
        if os.path.isfile(file):
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub(r"FROM\s'.+hybrid_part.data'", querystring, line)
                print str(re.sub('\n', '', line))

    def preprocess(self):
        """
        Replace the hard-coded information in the sql files with the
        correct hostname, ip address, etc. for this machine.
        """
        list_workload_dir = ['set_sync1', 'sync1', 'set_ck_sync1', 'ck_sync1',
                             'set_ct', 'ct', 'set_resync', 'resync',
                             'set_sync2', 'sync2']
        for dir in list_workload_dir:
            sql_path = os.path.join(local_path(dir), 'sql')
            ans_path = os.path.join(local_path(dir), 'expected')
            for file in os.listdir(sql_path):
                if file.find('wet_ret') >= 0:
                    self.handle_ext_cases(os.path.join(sql_path, file))
                if file.find('hybrid_part') >= 0:
                    self.handle_hybrid_part_cases(os.path.join(sql_path, file))
            for file in os.listdir(ans_path):
                if file.find('wet_ret') >= 0:
                    self.handle_ext_cases(os.path.join(ans_path, file))
                if file.find('hybrid_part') >= 0:
                    self.handle_hybrid_part_cases(os.path.join(ans_path, file))

    def clean_data(self):
        """
        Remove the external table data files; otherwise each sql file run
        appends more data to the same external table.
        """
        test = local_path("")
        test = str(test) + "data/*.*"
        cmd = 'rm -rfv ' + test
        run_shell_command(cmd)

    def anydownsegments(self):
        """
        Returns True if no segments are down.
        """
        tinctest.logger.info("Checking if any segments are down")
        num_segments_down = self.count_of_nodes_down()
        return int(num_segments_down) == 0

    def stop_start_validate(self, stopValidate=True):
        """
        Do gpstop -i, gpstart and check whether all segments come back up.
        """
        tinctest.logger.info("Performing stop start validate")
        tinctest.logger.info("Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i', validate=stopValidate)
        if not ok and stopValidate:
            raise Exception('Problem while shutting down the cluster')
        tinctest.logger.info("Successfully shutdown the cluster.")
        tinctest.logger.info("Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failed to bring the cluster back up')
        tinctest.logger.info("Successfully restarted the cluster.")
        if not self.anydownsegments():
            raise Exception("segments were marked down")
        return (True, "All segments are up")

    def method_reset_fault_injection(self):
        """
        Resets fault injection
        Return: (True, [result]) if OK; raises on failure
        """
        tinctest.logger.info("Resetting fault injection")
        (ok1, out1) = self.util.inject_fault(f='filerep_resync', m='async',
                                             y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to reset resync")
        return (True, str(out1))

    def method_resume_filerep_resync(self):
        """
        Resumes the resync process.
        """
        tinctest.logger.info("Resuming Resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync', m='async',
                                           y='resume', r='primary', H='ALL')
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done resuming resync")
        return (ok, out)

    def run_method_suspendresync(self):
        """
        Stops the cluster from going into resync.
        """
        tinctest.logger.info("Suspending resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync', m='async',
                                           y='suspend', r='primary', H='ALL')
        tinctest.logger.info('output from suspend resync %s' % out)
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (ok, out)

    def count_of_masters(self):
        """
        Returns the number of master nodes in the cluster.
        """
        tinctest.logger.info("Count the number of masters")
        cmd = "select count(*) from gp_segment_configuration where content = -1"
        out = PSQL.run_sql_command(cmd)
        num_master = out.split('\n')[3].strip()
        return num_master

    def count_of_nodes(self):
        """
        Returns the total number of nodes in the cluster.
        """
        tinctest.logger.info("Counting number of nodes")
        cmd = "select count(*) from gp_segment_configuration"
        num_cl = PSQL.run_sql_command(cmd)
        total_num_rows = num_cl.split('\n')[3].strip()
        return total_num_rows

    def count_of_nodes_in_ct(self):
        """
        Returns the number of nodes in change tracking.
        """
        tinctest.logger.info("Counting number of nodes in ct")
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 'c'"
        num_cl = PSQL.run_sql_command(sqlcmd)
        num_cl = num_cl.split('\n')[3].strip()
        return num_cl

    def count_of_nodes_down(self):
        """
        Returns the number of nodes marked as down.
        """
        tinctest.logger.info("Counting the number of nodes down")
        sqlcmd = "select count(*) from gp_segment_configuration where status = 'd'"
        num_down = PSQL.run_sql_command(sqlcmd)
        num_down = num_down.split('\n')[3].strip()
        return num_down

    def count_of_nodes_sync(self):
        """
        Returns the number of nodes in sync.
        """
        tinctest.logger.info("Counting the number of nodes in sync")
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 's'"
        num_sync = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync

    def count_of_nodes_not_sync(self):
        """
        Returns the number of nodes not in sync.
        """
        tinctest.logger.info("Counting number of nodes not in sync")
        sqlcmd = "select count(*) from gp_segment_configuration where mode <> 's'"
        num_sync = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync

    def inject_fault_on_first_primary(self):
        """
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        tinctest.logger.info("\n Injecting faults on first primary")
        (ok, out) = self.util.inject_fault(f='filerep_immediate_shutdown_request',
                                           m='async', y='infinite_loop',
                                           r='primary', seg_id=2, sleeptime=300)
        if not ok:
            raise Exception("Fault filerep_immediate_shutdown_request injection failed")
        (ok, out) = self.util.inject_fault(f='fileRep_is_operation_completed',
                                           m='async', y='infinite_loop',
                                           r='primary', seg_id=2)
        if not ok:
            raise Exception("Fault fileRep_is_operation_completed injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def inject_fault_on_first_mirror(self):
        """
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        sqlcmd = "select dbid from gp_segment_configuration where content=0 and role='m'"
        first_mirror_dbid = PSQL.run_sql_command(sqlcmd)
        first_mirror_dbid = first_mirror_dbid.split('\n')[3].strip()
        tinctest.logger.info("\n Injecting faults on first mirror")
        flag = self.util.check_fault_status(fault_name='fileRep_is_operation_completed',
                                            status='triggered', max_cycle=100)
        if not flag:
            raise Exception("Fault fileRep_is_operation_completed didn't trigger")
        (ok, out) = self.util.inject_fault(f='filerep_consumer', m='async',
                                           y='panic', r='mirror',
                                           seg_id=first_mirror_dbid)
        if not ok:
            raise Exception("Fault filerep_consumer injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        gpfdist.startGpfdist(' -t 30 -m 1048576 -d ' + path)
        return True

    def cleanupGpfdist(self, port, path):
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        return True

    def hostIP(self):
        ok = run_shell_command('which gpfdist')
        if not ok:
            raise GPtestError("Error:'which gpfdist' command failed.")
        hostname = socket.gethostname()
        if hostname.find('mdw') > 0:
            host = 'mdw'
        else:
            host = str(socket.gethostbyname(socket.gethostname()))  # must be an IP
        tinctest.logger.info('current host is %s' % host)
        return host

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs = Gpfilespace()
        gpfs.create_filespace('filerep_fs_a')
        gpfs.create_filespace('filerep_fs_b')
        gpfs.create_filespace('filerep_fs_c')
        gpfs.create_filespace('filerep_fs_z')
        gpfs.create_filespace('sync1_fs_1')
        # Set max_resource_queues to 100
        cmd = 'gpconfig -c max_resource_queues -v 100 '
        ok = run_shell_command(cmd)
        if not ok:
            raise Exception('Failed to set max_resource_queues to 100 using gpconfig')
        # Restart the cluster so the setting takes effect
        self.gpstop.run_gpstop_cmd(immediate='i')
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failure during restarting the cluster')
        return True

    def get_ext_table_query_from_gpstate(self):
        outfile = local_path("gpstate_tmp")
        ok = run_shell_command("gpstate --printSampleExternalTableSql >" + outfile)
        querystring = ""
        flag = 'false'
        out = open(outfile, 'r').readlines()
        for line in out:
            if line.find('DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status') >= 0:
                flag = 'true'
            if flag == 'true':
                querystring = querystring + line
        return querystring

    ############ RUN QUERY ############

    def check_gpstate(self, type, phase):
        """
        Perform gpstate checks for each transition state.
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """
        if phase == 'sync1':
            state_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'")
            sync1_num = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")
            if int(sync1_num) != int(state_num):
                raise Exception("gpstate in Sync state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        elif phase == 'ct':
            p_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking' and role = 'Primary' and status_in_config='Up' and instance_status='Up'")
            m_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync' and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' ")
            if int(p_num) != int(m_num):
                raise Exception("gpstate in CT state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        elif phase == 'resync_incr':
            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
            resync_incr_num = self.query_select_count(query)
            query_num_rows = "select count(*) from gp_segment_configuration where content <> -1"
            num_rows = self.query_select_count(query_num_rows)
            if int(resync_incr_num) != int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception("gpstate in Resync Incremental state failed. resync_incr_num %s != num_rows %s" % (resync_incr_num, num_rows))
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        elif phase == 'resync_full':
            num_rows = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")
            if type == 'primary':
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Full'")
            else:
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Full'")
            if int(resync_full_num) != int(num_rows):
                raise Exception("gpstate in Resync Full state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        return True

    def trigger_transition(self):
        PSQL.run_sql_file(local_path('mirrors.sql'))

    def run_gpstate(self, type, phase):
        """
        Run gpstate and verify its output for the given transition state.
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """
        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()
        file1 = local_path('create_table_gpstate.sql')
        f1 = open(file1, 'w')
        f1.write(querystring)
        f1.write('\n')
        f1.close()
        PSQL.run_sql_file(local_path('create_table_gpstate.sql'))
        gpstate_outfile = local_path('gpstate_out')
        cmd = 'gpstate -s -a > %s 2>&1' % (gpstate_outfile)
        ok = run_shell_command(cmd)
        self.check_gpstate(type, phase)
        return ok

    def check_mirror_seg(self, master=False):
        tinctest.logger.info("running check mirror")
        self.dbstate.check_mirrorintegrity()

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False,
                      outputFile='checkcat.out', outdir=None):
        tinctest.logger.info("running gpcheckcat")
        self.dbstate.check_catalog(outputFile=outputFile)

    def query_select_count(self, sqlcmd):
        num = PSQL.run_sql_command(sqlcmd)
        num = num.split('\n')[3].strip()
        return num

    def method_run_failover(self, type):
        """
        Inject a fault to fail over nodes.
        @type: primary [induces fault in mirror]
               mirror  [creates panic in primary]
        Return: True (fault injection output is logged, not returned)
        """
        if type == 'primary':
            tinctest.logger.info("\n primary failover")
            (ok, out) = self.util.inject_fault(f='filerep_consumer', m='async',
                                               y='fault', r='mirror', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")
        elif type == 'mirror':
            tinctest.logger.info("\n Mirror failover")
            (ok, out) = self.util.inject_fault(f='postmaster', m='async',
                                               y='panic', r='primary', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        self.util.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        self.gpr.wait_till_insync_transition()

    def run_gprecoverseg(self, recover_mode):
        if recover_mode == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

    def run_gpconfig(self, parameter, master_value, segment_value):
        if parameter is not None:
            self.gpconfig.setParameter(parameter, master_value, segment_value)
            self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self, fault=None, mode=None, operation=None,
                     prim_mirr=None, host='All', table=None, database=None,
                     seg_id=None, sleeptime=None, occurence=None):
        # Note: faults are always injected on all hosts (H='ALL'); the host
        # parameter is accepted but not used.
        if fault is None or mode is None or operation is None or prim_mirr is None:
            raise Exception('Incorrect parameters provided for inject fault')
        (ok, out) = self.util.inject_fault(f=fault, m=mode, y=operation,
                                           r=prim_mirr, H='ALL', table=table,
                                           database=database, sleeptime=sleeptime,
                                           o=occurence, seg_id=seg_id)
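
# Hedged usage sketch, not part of the original suite: a minimal failover and
# recovery pass composed from the FilerepTestCase helpers above. The function
# name is illustrative; it assumes a mirrored cluster that starts in sync.
def example_failover_and_recover(test):
    """test is a FilerepTestCase instance (illustrative only)."""
    test.method_run_failover('primary')          # fault the mirrors so they go down
    test.wait_till_change_tracking_transition()  # primaries enter change tracking ('c')
    test.run_gprecoverseg('incr')                # anything but 'full' runs incremental recovery
    test.wait_till_insync_transition()           # wait until every segment reports mode 's'
    test.check_mirror_seg()                      # verify primary/mirror integrity
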
def test_recovery_full(self):
    gprecover = GpRecover()
    gprecover.full()
    gprecover.wait_till_insync_transition()
def full_recoverseg(self):
    gprecover = GpRecover(GPDBConfig())
    gprecover.full()
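
# Hedged companion sketch (not in the original source): the incremental variant
# of full_recoverseg above. GpRecover.incremental() and
# wait_till_insync_transition() are both used elsewhere in this suite.
def incr_recoverseg(self):
    gprecover = GpRecover(GPDBConfig())
    gprecover.incremental()
    gprecover.wait_till_insync_transition()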
class SubTransactionLimitRemovalTestCase(MPPTestCase):

    def __init__(self, methodName):
        super(SubTransactionLimitRemovalTestCase, self).__init__(methodName)

    def check_system(self):
        '''
        @summary: Check whether the system is up and in sync. Raise an exception if not.
        '''
        tinctest.logger.info("[STLRTest] Running check_system")
        tinctest.logger.info("[STLRTest] Check whether the system is up and sync")
        cmd = "select count(*) from gp_segment_configuration where content <> -1;"
        num_cl = PSQL.run_sql_command(cmd)
        count_all = num_cl.split('\n')[3].strip()
        cmd = "select count(*) from gp_segment_configuration where content <> -1 and mode = 's' and status = 'u';"
        num_cl = PSQL.run_sql_command(cmd)
        count_up_and_sync = num_cl.split('\n')[3].strip()
        tinctest.logger.info("[STLRTest] printing gp segment configuration")
        gp_seg_conf = PSQL.run_sql_command("select * from gp_segment_configuration order by dbid")
        tinctest.logger.info(gp_seg_conf)
        if count_all != count_up_and_sync:
            raise Exception("[STLRTest] System not in sync and up. Exiting test")
        else:
            tinctest.logger.info("[STLRTest] Starting New Test: System is up and in sync...")

    def run_sqls(self, test):
        '''
        @summary: Run the given sql file (usually from a worker thread)
        @param test: the sql file to run
        '''
        tinctest.logger.info("[STLRTest] Running run_sqls")
        tinctest.logger.info("[STLRTest]Starting new thread to run sql %s" % (test))
        PSQL.run_sql_file(local_path(test))

    def suspend_faults(self, fault_name):
        '''
        @summary: Suspend the specified fault; reset it before issuing suspend
        @param fault_name: Name of the fault to suspend
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")
        self.util = Filerepe2e_Util()
        (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='reset',
                                             r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting the %s fault" % (fault_name))
        (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='suspend',
                                             r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done suspending the %s fault" % (fault_name))

    def check_fault_status(self, fault_name=None, status=None, max_cycle=10):
        '''
        Poll until the fault reaches the given status.
        @param fault_name: Fault name
        @param status: Status to be checked - triggered/completed
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running check_fault_status %s", status)
        if (not fault_name) or (not status):
            raise Exception("[STLRTest]Need a value for fault_name and status to continue")
        poll = 0
        while poll < max_cycle:
            (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='status',
                                                 r='primary', H='ALL')
            poll += 1
            for line in out1.splitlines():
                if line.find(fault_name) > 0 and line.find(status) > 0:
                    tinctest.logger.info("[STLRTest]Fault %s is %s " % (fault_name, status))
                    poll = 0
                    tinctest.logger.info("[STLRTest] Running check_fault_status %s TRUE", status)
                    return True
            # sleep a while before polling again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE", status)
        return False

    def filerep_fault(self, trans_state):
        '''
        @summary: Inject the filerep fault supplied
        @param trans_state: type of transition
        '''
        tinctest.logger.info("[STLRTest] Running filerep_fault")
        self.util = Filerepe2e_Util()
        if trans_state == 'failover_to_primary':
            tinctest.logger.info("[STLRTest] primary failover")
            (ok1, out1) = self.util.inject_fault(f='filerep_consumer', m='async',
                                                 y='fault', r='mirror', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done primary failover fault")
        elif trans_state == 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for postmaster panic")
            (ok1, out1) = self.util.inject_fault(f='postmaster', m='async',
                                                 y='panic', r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done postmaster panic fault")
        elif trans_state == 'postmaster_reset':
            tinctest.logger.info("[STLRTest] fault for filerep_sender panic")
            (ok1, out1) = self.util.inject_fault(f='filerep_sender', m='async',
                                                 y='panic', r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done filerep_sender panic fault")
        tinctest.logger.info("[STLRTest] Done Injecting Fault")

    def resume_faults(self, fault_name, trans_state):
        '''
        @summary: Resume the fault and check its status
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running resume_faults")
        if not trans_state == 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name)
            (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='resume',
                                                 r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault resume failed")
            tinctest.logger.info("[STLRTest]Done fault for %s resume" % fault_name)
        if trans_state == 'postmaster_reset':
            (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='resume',
                                                 r='mirror', H='ALL')
            if not ok1:
                tinctest.logger.info("[STLRTest]Failed fault for %s resume on mirror" % fault_name)
        if trans_state == 'failover_to_primary':
            self.check_fault_status(fault_name, 'completed')

    def checkPSQLRun(self, test):
        '''
        @summary: Check if the psql run started in parallel is over before
                  running the _post.sql
        '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")
        cmd_str = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
        while True:
            is_running = 0
            (rc, out) = shell.run(cmd_str)
            for line in out:
                if '%s' % test in line:
                    is_running = 1
            if is_running == 0:
                return True
            else:
                sleep(10)

    def resume_filerep_resync(self):
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running resume_filerep_resync")
        tinctest.logger.info("[STLRTest] fault for failover_to_mirror resume")
        (ok1, out1) = self.util.inject_fault(f='filerep_resync', m='async',
                                             y='resume', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        """
        Do gpstop -i, gpstart and check whether all segments come back up.
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")
        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        if not expect_down_segments:
            if not ok:
                raise Exception('[STLRTest]Problem while shutting down the cluster')
            tinctest.logger.info("[STLRTest]Successfully shutdown the cluster.")
        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")
        # anydownsegments() is expected to be provided by the concrete test
        # class (see FilerepTestCase above).
        if not self.anydownsegments():
            raise Exception("[STLRTest]segments were marked down")
        return (True, "All segments are up")

    def run_gprecoverseg(self, recover_option):
        '''
        @summary: Call gprecoverseg full or incremental to bring the cluster back in sync
        '''
        self.gpr = GpRecover()
        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")
        if recover_option == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()
        self.gpr.wait_till_insync_transition()

    def run_restart_database(self):
        '''
        @summary: Restart the database
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running run_restart_database")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        tinctest.logger.info(ok)
        ok = self.gpstart.run_gpstart_cmd()
        tinctest.logger.info(ok)

    def reset_faults(self, fault_name, current_cluster_state):
        '''
        @summary: Reset the faults at the end of test
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running reset_faults")
        tinctest.logger.info("[STLRTest] Resetting fault before ending test")
        (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='reset',
                                             r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting %s fault" % (fault_name))
        if current_cluster_state == 'resync':
            (ok1, out1) = self.util.inject_fault(f='filerep_resync', m='async',
                                                 y='reset', r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done resetting filerep_resync fault")
        (ok1, out1) = self.util.inject_fault(f='checkpoint', m='async', y='reset',
                                             r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting checkpoint fault")

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False,
                      outputFile='checkcat.out', outdir=None):
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        self.dbstate.check_catalog()
        return True

    def _validation(self):
        '''
        @summary: gpcheckcat and gpcheckmirrorintegrity
        '''
        ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT;CHECKPOINT; CHECKPOINT;")
        ###sleep(30) # sleep for some time for the segments to be in sync before validation
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running _validation")
        outfile = local_path("subt_checkcat.out")
        self.dbstate.check_catalog(outputFile=outfile)
        self.dbstate.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)
        if trans_state == 'failover_to_mirror':
            PSQL.run_sql_file(local_path('test_while_ct.sql'))
        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name='', trans_state=''):
        PSQL.wait_for_database_up()
        if trans_state == 'failover_to_primary' or trans_state == '':
            post_sql = "failover_sql/subt_create_table_ao_post_commit"
        else:
            post_sql = "failover_sql/subt_create_table_ao_post_abort"
        sql_file = post_sql + ".sql"
        ans_file = post_sql + ".ans"
        out_file = post_sql + ".out"
        PSQL.run_sql_file(sql_file=local_path(sql_file), out_file=local_path(out_file))
        diff_res = Gpdiff.are_files_equal(local_path(out_file), local_path(ans_file))
        if not diff_res:
            self.fail("[STLRTest]Gpdiff failed for : %s %s" % (fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary: Reset all faults on primary and mirror
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")
        self.util = Filerepe2e_Util()
        (ok1, out1) = self.util.inject_fault(f='all', m='async', y='reset',
                                             r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting all faults on primary")
        (ok1, out1) = self.util.inject_fault(f='all', m='async', y='reset',
                                             r='mirror', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting all faults on mirror")

    def kill_zombies(self):
        '''
        @summary: There are stray zombie processes running after each test.
                  This method clears them.
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
        # List "ppid#pid" pairs and keep the ones whose parent is init (ppid 1),
        # i.e. orphaned processes left behind by the test.
        cmd_str = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        cmd = Command("shell_command", cmd_str)
        tinctest.logger.info('Executing command: %s : %s' % ("shell command", cmd_str))
        cmd.run()
        result = cmd.get_results()
        out = result.stdout
        lines = out.split('\n')
        for line in lines:
            pids = line.split('#')
            if pids[0] == '1':
                kill_str = "kill -9 %s" % (pids[1])
                cmd2 = Command("kill_command", kill_str)
                cmd2.run()

    def skip_checkpoint(self):
        '''
        @summary: Inject a fault that makes checkpoints get skipped
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running skip_checkpoint")
        (ok1, out1) = self.util.inject_fault(f='checkpoint', m='async', y='reset',
                                             r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting the checkpoint fault")
        (ok1, out1) = self.util.inject_fault(f='checkpoint', m='async', y='skip',
                                             r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done skipping the checkpoint fault")

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs = Gpfilespace()
        gpfs.create_filespace('subt_filespace_a')

    def cleandb(self):
        db = Database()
        db.setupDatabase('gptest')
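
# Hedged usage sketch, not part of the original suite: one way the helpers above
# compose into a failover scenario. The workload sql path and the wrapper name
# are hypothetical; 'filerep_resync' and 'failover_to_primary' come from the
# helpers themselves. The workload runs on a thread (it blocks on the suspended
# fault) while the main thread drives the fault machinery.
def example_subtransaction_failover(case):
    """case is a SubTransactionLimitRemovalTestCase instance (illustrative only)."""
    import threading
    case.check_system()                    # abort unless all segments are up and in sync
    case.skip_checkpoint()                 # keep checkpoints from closing the fault window
    case.suspend_faults('filerep_resync')  # reset, then suspend the fault under test
    worker = threading.Thread(
        target=case.run_sqls,
        args=('sub_transaction_limit_removal/workload.sql',))  # hypothetical sql file
    worker.start()
    case.inject_and_resume_fault('filerep_resync', 'failover_to_primary')
    worker.join()
    case.run_post_sqls('filerep_resync', 'failover_to_primary')  # diff .out vs .ans
    case._validation()                     # gpcheckcat + mirror integrity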