class GpRecoversegRegressionTests(unittest.TestCase):
    """Regression tests for gprecoverseg: incremental, full and rebalance.

    Each test first waits for the cluster to reach in-sync, injects a
    fault with ``gpfaultinjector`` and then drives recovery through a
    ``GpRecover`` helper.
    """

    def setUp(self):
        """Create the recovery helper and read GPHOME from the environment."""
        self.gprec = GpRecover()
        self.gphome = os.environ.get('GPHOME')

    def failover(self, type='mirror'):
        """Run gpfaultinjector on all hosts to trigger a failover.

        type -- 'mirror' (default) injects a ``filerep_consumer`` fault
                on mirrors; any other value injects a ``postmaster``
                panic on primaries.  (The parameter name shadows the
                builtin but is kept for existing callers.)

        Returns the result of ``run_shell_command``; the tests in this
        class treat a falsy result as a failed injection.
        """
        if type == 'mirror':
            fault_str = ('source %s/greenplum_path.sh;gpfaultinjector '
                         '-f filerep_consumer -m async -y fault '
                         '-r mirror -H ALL') % self.gphome
        else:
            fault_str = ('source %s/greenplum_path.sh;gpfaultinjector '
                         '-f postmaster -m async -y panic '
                         '-r primary -H ALL') % self.gphome
        return run_shell_command(
            fault_str, cmdname='Run fault injector to failover')

    def _assert_failover(self, role='mirror'):
        """Inject the fault and fail the test if the injection failed.

        Previously the tests wrapped their assertions in
        ``if self.failover():``, so a failed injection made the test
        pass without verifying anything.
        """
        self.assertTrue(
            self.failover(role),
            'fault injection (%s) failed; recovery was not exercised' % role)

    def test_incr_gprecoverseg(self):
        """Incremental recovery succeeds after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover()
        self.assertTrue(self.gprec.incremental())

    def test_full_gprecoverseg(self):
        """Full recovery succeeds after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover()
        self.assertTrue(self.gprec.full())

    def test_gprecoverseg_rebalance(self):
        """Rebalance succeeds after a primary fault and incremental recovery."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover('primary')
        # Run a SQL file after the injection so the fault is actually hit.
        PSQL.run_sql_file(local_path('mirror_failover_trigger.sql'))
        self.gprec.incremental()
        # Rebalance is only meaningful once the cluster is back in sync;
        # fail instead of silently skipping the rebalance check.
        self.assertTrue(
            self.gprec.wait_till_insync_transition(),
            'cluster did not reach in-sync after incremental recovery')
        self.assertTrue(self.gprec.rebalance())

    def test_wait_till_insync(self):
        """Cluster reaches in-sync after a mirror fault and recovery."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover()
        self.gprec.incremental()
        self.assertTrue(self.gprec.wait_till_insync_transition())
class GpRecoversegRegressionTests(unittest.TestCase):
    """Regression tests for gprecoverseg: incremental, full and rebalance.

    Each test first waits for the cluster to reach in-sync, injects a
    fault with ``gpfaultinjector`` and then drives recovery through a
    ``GpRecover`` helper.
    """

    def setUp(self):
        """Create the recovery helper and read GPHOME from the environment."""
        self.gprec = GpRecover()
        self.gphome = os.environ.get('GPHOME')

    def failover(self, type='mirror'):
        """Run gpfaultinjector on all hosts to trigger a failover.

        type -- 'mirror' (default) injects a ``filerep_consumer`` fault
                on mirrors; any other value injects a ``postmaster``
                panic on primaries.  (The parameter name shadows the
                builtin but is kept for existing callers.)

        Returns the result of ``run_shell_command``; the tests in this
        class treat a falsy result as a failed injection.
        """
        if type == 'mirror':
            fault_str = ('source %s/greenplum_path.sh;gpfaultinjector '
                         '-f filerep_consumer -m async -y fault '
                         '-r mirror -H ALL') % self.gphome
        else:
            fault_str = ('source %s/greenplum_path.sh;gpfaultinjector '
                         '-f postmaster -m async -y panic '
                         '-r primary -H ALL') % self.gphome
        return run_shell_command(
            fault_str, cmdname='Run fault injector to failover')

    def _assert_failover(self, role='mirror'):
        """Inject the fault and fail the test if the injection failed.

        Previously the tests wrapped their assertions in
        ``if self.failover():``, so a failed injection made the test
        pass without verifying anything.
        """
        self.assertTrue(
            self.failover(role),
            'fault injection (%s) failed; recovery was not exercised' % role)

    def test_incr_gprecoverseg(self):
        """Incremental recovery succeeds after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover()
        self.assertTrue(self.gprec.incremental())

    def test_full_gprecoverseg(self):
        """Full recovery succeeds after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover()
        self.assertTrue(self.gprec.full())

    def test_gprecoverseg_rebalance(self):
        """Rebalance succeeds after a primary fault and incremental recovery."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover('primary')
        # Run a SQL file after the injection so the fault is actually hit.
        PSQL.run_sql_file(local_path('mirror_failover_trigger.sql'))
        self.gprec.incremental()
        # Rebalance is only meaningful once the cluster is back in sync;
        # fail instead of silently skipping the rebalance check.
        self.assertTrue(
            self.gprec.wait_till_insync_transition(),
            'cluster did not reach in-sync after incremental recovery')
        self.assertTrue(self.gprec.rebalance())

    def test_wait_till_insync(self):
        """Cluster reaches in-sync after a mirror fault and recovery."""
        self.gprec.wait_till_insync_transition()
        self._assert_failover()
        self.gprec.incremental()
        self.assertTrue(self.gprec.wait_till_insync_transition())
def test_resync_ct_blocks_per_query(self):
    '''Catch a bug in resync that manifests only after rebalance.

    The logic used by a resync worker to obtain changed blocks from
    CT log had a bug.  The SQL query used to obtain a batch of
    changed blocks from CT log was incorrectly using LSN to filter
    out changed blocks.  All of the following must be true for the
    bug to occur:

     * More than gp_filerep_ct_batch_size blocks of a relation are
       changed on a segment in changetracking.

     * A block with a higher number is changed earlier (lower LSN)
       than lower numbered blocks.

     * The first batch of changed blocks obtained by resync worker
       from CT log for this relation contains only lower (according
       to block number) blocks.  The higher block with lower LSN is
       not included in this batch.  Another query must be run against
       CT log to obtain this block.

     * The SQL query used to obtain next batch of changed blocks for
       this relation contains incorrect WHERE clause involving a
       filter based on LSN of previously obtained blocks.  The higher
       numbered block is missed out - not returned by the query as
       changed block for the relation.  The block is never shipped
       from primary to mirror, resulting in data loss.

    The test aims to verify that this doesn't happen as the bug is
    now fixed.
    '''

    def run_checked(name, cmd_str, failure_msg):
        # Run a cluster utility command and fail the test on non-zero rc.
        cmd = Command(name, cmd_str)
        cmd.run()
        assert cmd.get_results().rc == 0, failure_msg

    config = GPDBConfig()
    # NOTE(review): the message implies is_not_insync_segments() is
    # truthy when the cluster IS in sync -- confirm against GPDBConfig.
    assert (config.is_not_insync_segments() &
            config.is_balanced_segments()), 'cluster not in-sync and balanced'

    # Create table and insert data so that adequate number of
    # blocks are occupied.
    self.run_sql('resync_bug_setup')

    # Bring down primaries and transition mirrors to
    # changetracking.
    filerep = Filerepe2e_Util()
    filerep.inject_fault(y='fault', f='segment_probe_response',
                         r='primary')
    # Trigger the fault by running a sql file.
    PSQL.run_sql_file(local_path('test_ddl.sql'))
    filerep.wait_till_change_tracking_transition()

    # Set gp_filerep_ct_batch_size = 3.
    run_checked('reduce resync batch size',
                'gpconfig -c gp_filerep_ct_batch_size -v 3',
                'gpconfig failed')
    run_checked('load updated config', 'gpstop -au',
                '"gpstop -au" failed')

    self.run_sql('change_blocks_in_ct')

    # Capture change tracking log contents from the segment of
    # interest for debugging, in case the test fails.
    (host, port) = GPDBConfig().get_hostandport_of_segment(0, 'p')
    # Bind the path to a local: the assert message used to reference an
    # undefined name, turning a failure here into a NameError.
    sql_file = local_path('sql/ct_log_contents.sql')
    assert PSQL.run_sql_file_utility_mode(
        sql_file=sql_file,
        out_file=local_path('output/ct_log_contents.out'),
        host=host, port=port), sql_file

    gprecover = GpRecover(GPDBConfig())
    gprecover.incremental(False)
    gprecover.wait_till_insync_transition()

    # Rebalance, so that original primary is back in the role
    gprecover = GpRecover(GPDBConfig())
    gprecover.rebalance()
    gprecover.wait_till_insync_transition()

    # Reset gp_filerep_ct_batch_size
    run_checked('reset resync batch size',
                'gpconfig -r gp_filerep_ct_batch_size',
                'gpconfig failed')
    run_checked('load updated config', 'gpstop -au',
                '"gpstop -au" failed')

    self.run_sql('select_after_rebalance')
def test_resync_ct_blocks_per_query(self):
    '''Catch a bug in resync that manifests only after rebalance.

    The logic used by a resync worker to obtain changed blocks from
    CT log had a bug.  The SQL query used to obtain a batch of
    changed blocks from CT log was incorrectly using LSN to filter
    out changed blocks.  All of the following must be true for the
    bug to occur:

     * More than gp_filerep_ct_batch_size blocks of a relation are
       changed on a segment in changetracking.

     * A block with a higher number is changed earlier (lower LSN)
       than lower numbered blocks.

     * The first batch of changed blocks obtained by resync worker
       from CT log for this relation contains only lower (according
       to block number) blocks.  The higher block with lower LSN is
       not included in this batch.  Another query must be run against
       CT log to obtain this block.

     * The SQL query used to obtain next batch of changed blocks for
       this relation contains incorrect WHERE clause involving a
       filter based on LSN of previously obtained blocks.  The higher
       numbered block is missed out - not returned by the query as
       changed block for the relation.  The block is never shipped
       from primary to mirror, resulting in data loss.

    The test aims to verify that this doesn't happen as the bug is
    now fixed.
    '''

    def run_checked(name, cmd_str, failure_msg):
        # Run a cluster utility command and fail the test on non-zero rc.
        cmd = Command(name, cmd_str)
        cmd.run()
        assert cmd.get_results().rc == 0, failure_msg

    config = GPDBConfig()
    # NOTE(review): the message implies is_not_insync_segments() is
    # truthy when the cluster IS in sync -- confirm against GPDBConfig.
    assert (config.is_not_insync_segments() &
            config.is_balanced_segments()), 'cluster not in-sync and balanced'

    # Create table and insert data so that adequate number of
    # blocks are occupied.
    self.run_sql('resync_bug_setup')

    # Bring down primaries and transition mirrors to
    # changetracking.
    filerep = Filerepe2e_Util()
    filerep.inject_fault(y='fault', f='segment_probe_response',
                         r='primary')
    # Trigger the fault by running a sql file.
    PSQL.run_sql_file(local_path('test_ddl.sql'))
    filerep.wait_till_change_tracking_transition()

    # Set gp_filerep_ct_batch_size = 3.
    run_checked('reduce resync batch size',
                'gpconfig -c gp_filerep_ct_batch_size -v 3',
                'gpconfig failed')
    run_checked('load updated config', 'gpstop -au',
                '"gpstop -au" failed')

    self.run_sql('change_blocks_in_ct')

    # Capture change tracking log contents from the segment of
    # interest for debugging, in case the test fails.
    (host, port) = GPDBConfig().get_hostandport_of_segment(0, 'p')
    # Bind the path to a local: the assert message used to reference an
    # undefined name, turning a failure here into a NameError.
    sql_file = local_path('sql/ct_log_contents.sql')
    assert PSQL.run_sql_file_utility_mode(
        sql_file=sql_file,
        out_file=local_path('output/ct_log_contents.out'),
        host=host, port=port), sql_file

    gprecover = GpRecover(GPDBConfig())
    gprecover.incremental(False)
    gprecover.wait_till_insync_transition()

    # Rebalance, so that original primary is back in the role
    gprecover = GpRecover(GPDBConfig())
    gprecover.rebalance()
    gprecover.wait_till_insync_transition()

    # Reset gp_filerep_ct_batch_size
    run_checked('reset resync batch size',
                'gpconfig -r gp_filerep_ct_batch_size',
                'gpconfig failed')
    run_checked('load updated config', 'gpstop -au',
                '"gpstop -au" failed')

    self.run_sql('select_after_rebalance')