def test_pg_aocsseg_corruption(self):
    """Corrupt pg_aocsseg on one segment for a column-oriented append-only
    table and verify the observed behavior against the answer file.
    """
    # Column-oriented (row=False) append-only tables are the target here.
    self.create_appendonly_tables(row=False)
    seg_host, seg_port = GPDBConfig().get_hostandport_of_segment()
    # Instantiate the corruption SQL template for table 'co1'.
    self.transform_sql_file(
        os.path.join(self.sql_dir, 'corrupt_pg_aocsseg.sql.t'), 'co1')
    out_file = os.path.join(self.output_dir, 'corrupt_pg_aocsseg.out')
    ans_file = os.path.join(self.ans_dir, 'corrupt_pg_aocsseg.ans')
    sql_file = os.path.join(self.sql_dir, 'corrupt_pg_aocsseg.sql')
    # Utility mode connects directly to the segment, bypassing the master.
    PSQL.run_sql_file_utility_mode(sql_file, out_file=out_file,
                                   host=seg_host, port=seg_port,
                                   dbname=os.environ['PGDATABASE'])
    files_match = Gpdiff.are_files_equal(
        out_file, ans_file, match_sub=[local_path('sql/init_file')])
    if not files_match:
        raise Exception(
            'Corruption test of pg_aocsseg failed for appendonly tables !')
def run_test(self, data_file, sql_file, out_file, ans_file, table):
    """Load data via *data_file*, then run *sql_file* in utility mode
    against the segment hosting *table*, diffing each output against its
    answer file.

    All file arguments are bare filenames, resolved relative to this test
    module's directory (sql/, output/, expected/ subdirectories).
    """
    base_dir = os.path.dirname(sys.modules[self.__module__].__file__)
    out_file = os.path.join(base_dir, 'output', out_file)
    ans_file = os.path.join(base_dir, 'expected', ans_file)
    sql_file = os.path.join(base_dir, 'sql', sql_file)
    # BUGFIX: the original used data_file.strip('.sql'), but str.strip
    # removes any of the characters {'.', 's', 'q', 'l'} from BOTH ends
    # (e.g. 'sql_data.sql' -> '_data'); splitext drops only the extension.
    data_base = os.path.splitext(data_file)[0]
    data_out_file = os.path.join(base_dir, 'output', data_base + '.out')
    data_ans_file = os.path.join(base_dir, 'expected', data_base + '.ans')
    data_file = os.path.join(base_dir, 'sql', data_file)
    PSQL.run_sql_file(data_file, out_file=data_out_file)
    self.assertTrue(Gpdiff.are_files_equal(data_out_file, data_ans_file))
    host, port = self._get_host_and_port_for_table(table)
    PSQL.run_sql_file_utility_mode(sql_file, host=host, port=port,
                                   out_file=out_file)
    self.assertTrue(Gpdiff.are_files_equal(out_file, ans_file))
def test_pg_aocsseg_corruption(self):
    """Corrupt pg_aocsseg for a CO append-only table on a single segment
    and diff the resulting output against the expected answer file.
    """
    self.create_appendonly_tables(row=False)
    config = GPDBConfig()
    host, port = config.get_hostandport_of_segment()
    # Materialize the templated corruption script for table 'co1'.
    template = os.path.join(self.sql_dir, 'corrupt_pg_aocsseg.sql.t')
    self.transform_sql_file(template, 'co1')
    paths = {
        'out': os.path.join(self.output_dir, 'corrupt_pg_aocsseg.out'),
        'ans': os.path.join(self.ans_dir, 'corrupt_pg_aocsseg.ans'),
        'sql': os.path.join(self.sql_dir, 'corrupt_pg_aocsseg.sql'),
    }
    # Run directly on the segment (utility mode) to bypass the master.
    PSQL.run_sql_file_utility_mode(paths['sql'], out_file=paths['out'],
                                   host=host, port=port,
                                   dbname=os.environ['PGDATABASE'])
    if not Gpdiff.are_files_equal(paths['out'], paths['ans'],
                                  match_sub=[local_path('sql/init_file')]):
        raise Exception(
            'Corruption test of pg_aocsseg failed for appendonly tables !')
def run_sql_utility_mode(self, filename, host=None, port=None):
    """Run *filename* in utility mode against host:port and validate the
    output file against the corresponding answer file.

    The .out file is written next to the sql file (sql/ dir); the .ans
    file is read from expected/.
    """
    sql_file = self.get_sql_files(filename)
    out_file = os.path.join(self.base_dir, 'sql', filename + '.out')
    ans_file = os.path.join(self.base_dir, 'expected', filename + '.ans')
    tinctest.logger.info('\n==============================')
    tinctest.logger.info(sql_file)
    tinctest.logger.info(out_file)
    tinctest.logger.info(ans_file)
    tinctest.logger.info('==============================')
    # The boolean result of the psql run is intentionally ignored;
    # validation happens by diffing the output against the answer file.
    # (Removed the unused 'result' local and the pointless 'fname' alias;
    # replaced manual string path concatenation with os.path.join.)
    PSQL.run_sql_file_utility_mode(sql_file, out_file=out_file,
                                   host=host, port=port)
    self.validate_sql(ans_file, out_file)
def test_run_sql_file_utility_mode(self):
    """run_sql_file_utility_mode should succeed, create the out file, and
    the output should mention 'utility'."""
    here = os.path.dirname(inspect.getfile(self.__class__))
    sql_file = os.path.join(here, 'test_utility_mode.sql')
    out_file = os.path.join(here, 'test_utility_mode.out')
    self.assertFalse(os.path.exists(out_file))
    try:
        self.assertTrue(PSQL.run_sql_file_utility_mode(sql_file=sql_file,
                                                       out_file=out_file))
        self.assertTrue(os.path.exists(out_file))
        with open(out_file, 'r') as f:
            output = f.read()
        self.assertIsNotNone(re.search('utility', output))
    finally:
        # BUGFIX: guard the cleanup. If the psql call failed before the
        # out file was created, an unconditional os.remove would raise
        # OSError here and mask the real assertion failure.
        if os.path.exists(out_file):
            os.remove(out_file)
        self.assertFalse(os.path.exists(out_file))
def test_singledb_corruption(self):
    """Test that gpcheckcat reports errors and it generates the verify
    file for a single corrupted database."""
    dbname = 'test_singledb_corruption'
    PSQL.run_sql_command('DROP DATABASE IF EXISTS %s' % dbname)
    stdout = PSQL.run_sql_command('CREATE DATABASE %s' % dbname)
    if not stdout.endswith('CREATE DATABASE\n'):
        self.fail('failed to create database: %s' % stdout)

    if not PSQL.run_sql_file(local_path('sql/create_tables.sql'),
                             dbname=dbname, output_to_file=False):
        self.fail('failed to create tables')

    # Corrupt the catalog directly on one segment via utility mode.
    host, port = self.config.get_hostandport_of_segment()
    if not PSQL.run_sql_file_utility_mode(
            local_path('sql/catalog_corruption.sql'), dbname=dbname,
            host=host, port=port, output_to_file=False):
        self.fail('failed to introduce catalog corruption')

    # gpcheckcat is expected to detect the corruption (exit code 3).
    res = {'rc': 0, 'stdout': '', 'stderr': ''}
    run_shell_command(
        "cd %s && $GPHOME/bin/lib/gpcheckcat -p %s %s" %
        (self.gpcheckcat_test_dir, self.master_port, dbname),
        results=res)
    self.assertEqual(3, res['rc'])

    # A verify file for this database must have been produced.
    verify_file_pat = 'gpcheckcat.verify.%s.*' % dbname
    matches = fnmatch.filter(os.listdir(self.gpcheckcat_test_dir),
                             verify_file_pat)
    self.assertTrue(len(matches) > 0)

    # Run the most recently written verify file end-to-end.
    mtime = lambda f: os.stat(
        os.path.join(self.gpcheckcat_test_dir, f)).st_mtime
    newest = sorted(matches, key=mtime)[-1]
    if not PSQL.run_sql_file(
            os.path.join(self.gpcheckcat_test_dir, newest),
            output_to_file=False):
        self.fail('failed to run verify file for database %s' % dbname)
def test_run_sql_file_utility_mode(self):
    """PSQL.run_sql_file_utility_mode returns True, produces the out file,
    and the captured output contains the string 'utility'."""
    test_dir = os.path.dirname(inspect.getfile(self.__class__))
    sql_file = os.path.join(test_dir, 'test_utility_mode.sql')
    out_file = os.path.join(test_dir, 'test_utility_mode.out')
    self.assertFalse(os.path.exists(out_file))
    try:
        self.assertTrue(
            PSQL.run_sql_file_utility_mode(sql_file=sql_file,
                                           out_file=out_file))
        self.assertTrue(os.path.exists(out_file))
        with open(out_file, 'r') as f:
            output = f.read()
        self.assertIsNotNone(re.search('utility', output))
    finally:
        # BUGFIX: only remove the file if it exists; an unconditional
        # os.remove would raise OSError when the psql call failed before
        # creating the file, masking the original failure.
        if os.path.exists(out_file):
            os.remove(out_file)
        self.assertFalse(os.path.exists(out_file))
def test_error(self):
    """Test for errors during the generation of verify file.

    gpcheckcat is run from a read-only directory so it cannot write its
    verify file; we expect the corruption exit code (3) but no verify
    file on disk.
    """
    dbname = 'test_error'
    PSQL.run_sql_command('DROP DATABASE IF EXISTS %s' % dbname)
    stdout = PSQL.run_sql_command('CREATE DATABASE %s' % dbname)
    if not stdout.endswith('CREATE DATABASE\n'):
        self.fail('failed to create database: %s' % stdout)

    # Remove old verify files before running the test.
    if not run_shell_command('rm -f %s/gpcheckcat.verify.%s.*' %
                             (self.gpcheckcat_test_dir, dbname)):
        self.fail('failed to remove old verify files')

    sql_file = local_path('sql/create_tables.sql')
    if not PSQL.run_sql_file(sql_file, dbname=dbname, output_to_file=False):
        self.fail('failed to create tables')
    host, port = self.config.get_hostandport_of_segment()
    sql_file = local_path('sql/catalog_corruption.sql')
    if not PSQL.run_sql_file_utility_mode(
            sql_file, dbname=dbname, host=host, port=port,
            output_to_file=False):
        self.fail('failed to introduce catalog corruption')

    # Make the test dir read-only so gpcheckcat cannot write the verify
    # file. BUGFIX: restore the original permissions afterwards; the
    # directory was previously left read-only, breaking later tests that
    # need to write into it.
    orig_mode = os.stat(self.gpcheckcat_test_dir).st_mode & 0o7777
    os.chmod(self.gpcheckcat_test_dir, 0o555)
    try:
        res = {'rc': 0, 'stdout': '', 'stderr': ''}
        run_shell_command(
            "cd %s && $GPHOME/bin/lib/gpcheckcat -p %s %s" %
            (self.gpcheckcat_test_dir, self.master_port, dbname),
            results=res)
        self.assertEqual(3, res['rc'])
        for f in os.listdir(self.gpcheckcat_test_dir):
            if fnmatch.fnmatch(f, 'gpcheckcat.verify.%s.*' % dbname):
                self.fail('found verify file when not expecting it')
    finally:
        os.chmod(self.gpcheckcat_test_dir, orig_mode)
def test_resync_ct_blocks_per_query(self):
    '''Catch a bug in resync that manifests only after rebalance.

    The logic used by a resync worker to obtain changed blocks from CT
    log had a bug.  The SQL query used to obtain a batch of changed
    blocks from CT log was incorrectly using LSN to filter out changed
    blocks.  All of the following must be true for the bug to occur:

     * More than gp_filerep_ct_batch_size blocks of a relation are
       changed on a segment in changetracking.

     * A block with a higher number is changed earlier (lower LSN) than
       lower numbered blocks.

     * The first batch of changed blocks obtained by resync worker from
       CT log for this relation contains only lower (according to block
       number) blocks.  The higher block with lower LSN is not included
       in this batch.  Another query must be run against CT log to
       obtain this block.

     * The SQL query used to obtain next batch of changed blocks for
       this relation contains incorrect WHERE clause involving a filter
       based on LSN of previously obtained blocks.  The higher numbered
       block is missed out - not returned by the query as changed block
       for the relation.  The block is never shipped from primary to
       mirror, resulting in data loss.  The test aims to verify that
       this doesn't happen as the bug is now fixed.
    '''
    config = GPDBConfig()
    assert (config.is_not_insync_segments() &
            config.is_balanced_segments()), 'cluster not in-sync and balanced'

    # Create table and insert data so that adequate number of
    # blocks are occupied.
    self.run_sql('resync_bug_setup')

    # Bring down primaries and transition mirrors to changetracking.
    filerep = Filerepe2e_Util()
    filerep.inject_fault(y='fault', f='segment_probe_response',
                         r='primary')

    # Trigger the fault by running a sql file.
    PSQL.run_sql_file(local_path('test_ddl.sql'))
    filerep.wait_till_change_tracking_transition()

    # Set gp_filerep_ct_batch_size = 3.
    cmd = Command('reduce resync batch size',
                  'gpconfig -c gp_filerep_ct_batch_size -v 3')
    cmd.run()
    assert cmd.get_results().rc == 0, 'gpconfig failed'
    cmd = Command('load updated config', 'gpstop -au')
    cmd.run()
    assert cmd.get_results().rc == 0, '"gpstop -au" failed'
    self.run_sql('change_blocks_in_ct')

    # Capture change tracking log contents from the segment of
    # interest for debugging, in case the test fails.
    (host, port) = GPDBConfig().get_hostandport_of_segment(0, 'p')
    # BUGFIX: the assert message previously referenced the undefined
    # name 'sql_file' (only used as a keyword argument), so a failing
    # assertion raised NameError instead of reporting the failure.
    assert PSQL.run_sql_file_utility_mode(
        sql_file=local_path('sql/ct_log_contents.sql'),
        out_file=local_path('output/ct_log_contents.out'),
        host=host, port=port), 'failed to dump CT log contents'

    gprecover = GpRecover(GPDBConfig())
    gprecover.incremental(False)
    gprecover.wait_till_insync_transition()

    # Rebalance, so that original primary is back in the role
    gprecover = GpRecover(GPDBConfig())
    gprecover.rebalance()
    gprecover.wait_till_insync_transition()

    # Reset gp_filerep_ct_batch_size
    cmd = Command('reset resync batch size',
                  'gpconfig -r gp_filerep_ct_batch_size')
    cmd.run()
    assert cmd.get_results().rc == 0, 'gpconfig failed'
    cmd = Command('load updated config', 'gpstop -au')
    cmd.run()
    assert cmd.get_results().rc == 0, '"gpstop -au" failed'
    self.run_sql('select_after_rebalance')
def test_resync_ct_blocks_per_query(self):
    '''Catch a bug in resync that manifests only after rebalance.

    The logic used by a resync worker to obtain changed blocks from CT
    log had a bug.  The SQL query used to obtain a batch of changed
    blocks from CT log was incorrectly using LSN to filter out changed
    blocks.  All of the following must be true for the bug to occur:

     * More than gp_filerep_ct_batch_size blocks of a relation are
       changed on a segment in changetracking.

     * A block with a higher number is changed earlier (lower LSN) than
       lower numbered blocks.

     * The first batch of changed blocks obtained by resync worker from
       CT log for this relation contains only lower (according to block
       number) blocks.  The higher block with lower LSN is not included
       in this batch.  Another query must be run against CT log to
       obtain this block.

     * The SQL query used to obtain next batch of changed blocks for
       this relation contains incorrect WHERE clause involving a filter
       based on LSN of previously obtained blocks.  The higher numbered
       block is missed out - not returned by the query as changed block
       for the relation.  The block is never shipped from primary to
       mirror, resulting in data loss.  The test aims to verify that
       this doesn't happen as the bug is now fixed.
    '''
    config = GPDBConfig()
    assert (config.is_not_insync_segments() &
            config.is_balanced_segments()), 'cluster not in-sync and balanced'

    # Create table and insert data so that adequate number of
    # blocks are occupied.
    self.run_sql('resync_bug_setup')

    # Bring down primaries and transition mirrors to changetracking.
    filerep = Filerepe2e_Util()
    filerep.inject_fault(y='fault', f='segment_probe_response',
                         r='primary')

    # Trigger the fault by running a sql file.
    PSQL.run_sql_file(local_path('test_ddl.sql'))
    filerep.wait_till_change_tracking_transition()

    # Set gp_filerep_ct_batch_size = 3.
    cmd = Command('reduce resync batch size',
                  'gpconfig -c gp_filerep_ct_batch_size -v 3')
    cmd.run()
    assert cmd.get_results().rc == 0, 'gpconfig failed'
    cmd = Command('load updated config', 'gpstop -au')
    cmd.run()
    assert cmd.get_results().rc == 0, '"gpstop -au" failed'
    self.run_sql('change_blocks_in_ct')

    # Capture change tracking log contents from the segment of
    # interest for debugging, in case the test fails.
    (host, port) = GPDBConfig().get_hostandport_of_segment(0, 'p')
    # BUGFIX: the assert message previously referenced the undefined
    # name 'sql_file' (only used as a keyword argument), so a failing
    # assertion raised NameError instead of reporting the failure.
    assert PSQL.run_sql_file_utility_mode(
        sql_file=local_path('sql/ct_log_contents.sql'),
        out_file=local_path('output/ct_log_contents.out'),
        host=host, port=port), 'failed to dump CT log contents'

    gprecover = GpRecover(GPDBConfig())
    gprecover.incremental(False)
    gprecover.wait_till_insync_transition()

    # Rebalance, so that original primary is back in the role
    gprecover = GpRecover(GPDBConfig())
    gprecover.rebalance()
    gprecover.wait_till_insync_transition()

    # Reset gp_filerep_ct_batch_size
    cmd = Command('reset resync batch size',
                  'gpconfig -r gp_filerep_ct_batch_size')
    cmd.run()
    assert cmd.get_results().rc == 0, 'gpconfig failed'
    cmd = Command('load updated config', 'gpstop -au')
    cmd.run()
    assert cmd.get_results().rc == 0, '"gpstop -au" failed'
    self.run_sql('select_after_rebalance')
def test_multidb_corruption(self):
    """Test that gpcheckcat reports errors and it generates the verify
    file for each of two corrupted databases."""
    dbnames = ('test_multidb_corruption1', 'test_multidb_corruption2')

    # Recreate both databases from scratch.
    for db in dbnames:
        PSQL.run_sql_command('DROP DATABASE IF EXISTS %s' % db)
        stdout = PSQL.run_sql_command('CREATE DATABASE %s' % db)
        if not stdout.endswith('CREATE DATABASE\n'):
            self.fail('failed to create database: %s' % stdout)

    create_sql = local_path('sql/create_tables.sql')
    for db in dbnames:
        if not PSQL.run_sql_file(create_sql, dbname=db,
                                 output_to_file=False):
            self.fail('failed to create tables in database %s' % db)

    # Corrupt the catalog of both databases on one segment.
    host, port = self.config.get_hostandport_of_segment()
    corrupt_sql = local_path('sql/catalog_corruption.sql')
    for db in dbnames:
        if not PSQL.run_sql_file_utility_mode(
                corrupt_sql, dbname=db, host=host, port=port,
                output_to_file=False):
            self.fail('failed to introduce corruption in database %s' % db)

    # gpcheckcat must report the corruption for each database.
    res = {'rc': 0, 'stdout': '', 'stderr': ''}
    for db in dbnames:
        run_shell_command(
            "cd %s && $GPHOME/bin/lib/gpcheckcat -p %s %s" %
            (self.gpcheckcat_test_dir, self.master_port, db),
            results=res)
        self.assertTrue(res['rc'] > 0)

    # A verify file must exist for each database.
    for db in dbnames:
        pat = 'gpcheckcat.verify.%s.*' % db
        self.assertTrue(any(
            fnmatch.fnmatch(f, pat)
            for f in os.listdir(self.gpcheckcat_test_dir)))

    # Ensure that the most recent verify file for each database can be
    # run.  It is difficult to assert the SQL output against an expected
    # answer file because the output mostly has OIDs, so that level of
    # assertion is skipped for now.
    mtime = lambda f: os.stat(
        os.path.join(self.gpcheckcat_test_dir, f)).st_mtime
    for db in dbnames:
        pat = 'gpcheckcat.verify.%s.*' % db
        newest = sorted(
            fnmatch.filter(os.listdir(self.gpcheckcat_test_dir), pat),
            key=mtime)[-1]
        if not PSQL.run_sql_file(
                os.path.join(self.gpcheckcat_test_dir, newest),
                output_to_file=False):
            self.fail('failed to run verify file for database %s' % db)