def tearDown(self):
    # each test should wait for the wal sender to disappear
    for i in walrepl.polling(30, 0.5):
        with PGconn("") as conn:
            cnt = self.count_walsender(conn)
            if cnt <= 0:
                break
def wait_triggered(self, fault_name):
    search = "fault injection state:'triggered'"
    for i in walrepl.polling(10, 3):
        result = self.fault_status(fault_name)
        stdout = result.stdout
        if stdout.find(search) > 0:
            return True
    return False
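# A hedged sketch of the fault_status helper assumed by wait_triggered above:
# it should return a result object whose stdout contains the fault injection
# state.  The gpfaultinjector invocation below is an assumption about how the
# status is queried, not a confirmed part of this test suite; Command and
# get_results() are the same wrappers used elsewhere in these tests.
def fault_status(self, fault_name):
    cmd = Command('check fault status',
                  'gpfaultinjector -f %s -y status' % fault_name)
    cmd.run(validateAfter=True)
    return cmd.get_results()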
def wait_for_walsender(self, nretry=30, interval=0.5):
    """
    Wait for a walsender to become active, retrying up to nretry times
    with the given interval.  Returns the number of active walsenders,
    which is 0 if the wait loop times out.
    """
    num_walsender = 0
    for i in walrepl.polling(nretry, interval):
        num_walsender = self.count_walsender()
        if num_walsender > 0:
            break
    return num_walsender
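# walrepl.polling is the retry-loop helper used throughout these tests.  A
# minimal sketch of what it is assumed to do (yield up to max_try times,
# sleeping `interval` seconds between iterations); the real implementation
# lives in the walrepl module.
import time

def polling(max_try, interval):
    for i in range(max_try):
        yield i
        time.sleep(interval)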
def check_pg_stat_replication(self):
    '''Check the state and sync_state from pg_stat_replication'''
    for i in walrepl.polling(max_try=20, interval=0.5):
        res = self.get_pg_stat_replication()
        if len(res) == 0:
            continue
        elif res[0].state == 'streaming' and res[0].sync_state == 'sync':
            tinctest.logger.info('pg_stat_replication is updated with the information')
            return True
        else:
            continue
    return False
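# A hedged sketch of the get_pg_stat_replication helper assumed above: it is
# expected to return one row object per walsender, each carrying at least the
# `state` and `sync_state` columns of pg_stat_replication.  The PGconn wrapper
# and pyobjects() accessor are the same ones used by the other tests here.
def get_pg_stat_replication(self):
    with PGconn("") as conn:
        res = conn.execute("SELECT * FROM pg_stat_replication")
        return res.pyobjects()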
def test_unixsocket(self):
    """
    Connect via unix socket to walsender.

    @tags sanity
    """
    with PGconn("") as conn:
        # gp_libpq_fe doesn't use PGHOST env var, so without giving
        # explicit hostname, it should use unix domain socket.
        res = conn.execute("SELECT test_connect('')")
        self.assertEqual(res.status(), PGRES_TUPLES_OK)

        res = conn.execute("SELECT * FROM pg_stat_replication")
        tup = res.pyobjects().pop(0)
        # client_addr will be NULL in unix socket case
        self.assertIsNone(tup.client_addr)
        # client_port will be -1 in unix socket case
        self.assertEqual(tup.client_port, -1)
        self.assertIsNotNone(tup.sent_location)

        # catch up to the primary
        for i in walrepl.polling(100, 0.5):
            res = conn.execute("SELECT test_receive()")
            if not res.getpyvalue(0, 0):
                break

        res = conn.execute("SELECT pg_current_xlog_location()")
        current_xlog_location = res.getvalue(0, 0)
        res = conn.execute("SELECT * FROM pg_stat_replication")
        tup = res.pyobjects().pop(0)
        # sent_location follows current_xlog_location.
        # They don't necessarily match, as there should be some
        # window between write and send.
        self.assertTrue(tup.sent_location <= current_xlog_location)
def test_async(self):
    """
    Run sendtest in async mode.

    @tags sanity
    """
    PSQL.run_sql_command('DROP TABLE IF EXISTS foo')
    with WalClient("replication=true") as client:
        self.assertEqual(client.status(), CONNECTION_OK)

        (sysid, tli, xpos) = client.identify_system()
        xpos_ptr = XLogRecPtr.from_string(xpos)
        client.start_replication(xpos_ptr, sync=False)

        # wouldn't block since it's an async connection.
        PSQL.run_sql_command('CREATE TABLE foo(a int, b int)')

        # it may take time to get the complete message
        for i in walrepl.polling(10, 0.5):
            msg = client.receive(1000)
            if isinstance(msg, WalMessageData):
                break
        self.assertIsInstance(msg, WalMessageData)
def test_disconnect(self):
    """
    @tags sanity
    """
    with PGconn("") as conn:
        # check disconnect does not fail even before connect
        res = conn.execute("SELECT test_disconnect()")
        self.assertEqual(res.status(), PGRES_TUPLES_OK)
        self.assertTrue(res.getvalue(0, 0, convert=True))

        # check if disconnect clears the walsender
        res = conn.execute("SELECT test_connect('')")
        self.assertEqual(res.status(), PGRES_TUPLES_OK)
        res = conn.execute("SELECT test_disconnect()")
        self.assertEqual(res.status(), PGRES_TUPLES_OK)

        # wait for the backend to terminate...
        for i in walrepl.polling(10, 0.5):
            cnt = self.count_walsender(conn)
            if cnt == 0:
                break
        self.assertEqual(cnt, 0)
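# A hedged sketch of the count_walsender helper used above.  Assumption: it
# counts the active walsender backends via pg_stat_replication; the optional
# `conn` parameter mirrors how the tests call it both with an already-open
# PGconn and without one.
def count_walsender(self, conn=None):
    query = "SELECT count(*) FROM pg_stat_replication"
    if conn is not None:
        return int(conn.execute(query).getvalue(0, 0))
    with PGconn("") as newconn:
        return int(newconn.execute(query).getvalue(0, 0))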
def test_smart_shutdown(self):
    # 1. Verify if the system is UP and there is no WAL receiver running
    # 2. Perform basebackup and deploy it into some dest. directory
    # 3. Copy recovery.conf into the dest. directory to be used by Standby
    # 4. Initiate the Standby using the Master (primary) postmaster
    #    parameters
    # 5. Perform some transaction to generate xlog. Then do a smart shutdown
    # 6. Once the primary DB is down, find the last checkpoint from pg_control
    #    on primary. Check the last modified xlog seg from the standby and find
    #    if the last checkpoint from primary exists
    # 7. It should be present there!

    # 0. Stop standby if it's running
    PSQL.run_sql_command('DROP table if exists foo')
    standby = Standby('base', 5433)
    standby.stop()

    # 1. Verify if the system is UP and there is no WAL sender running
    self.assertEqual(self.count_walsender(), 0)
    logger.info('No active WAL sender found')

    # 2. Perform basebackup and deploy it into some dest.
    #    (currently hardcoded 'base') directory
    shutil.rmtree('base', True)
    logger.info('Performing and deploying base backup ...')
    standby.create()

    # 3. Copy recovery.conf into the dest. directory to be used by Standby
    logger.info('Deploying recovery.conf...')

    # 4. Initiate the Standby using the Master (primary) postmaster
    #    parameters
    logger.info('Initiating Standby...')
    res = standby.start()
    self.assertTrue(res.wasSuccessful())
    num_walsender = 0
    for i in polling(10, 0.5):
        num_walsender = self.count_walsender()
        if num_walsender > 0:
            break
    self.assertEqual(num_walsender, 1)
    logger.info('Activated WAL Receiver...')

    # 5. Perform some transaction to generate xlog. Then do a smart shutdown
    logger.info('Perform some transaction to generate some XLOG')
    PSQL.run_sql_command('Create table foo (a int)')
    logger.info('Now perform smart shutdown (gpstop -a)')
    cmd = Command(name="gpstop smart",
                  cmdStr="source %s/greenplum_path.sh; gpstop -a"
                  % os.environ["GPHOME"])
    cmd.run(validateAfter=True)

    # 6. Once the primary DB is down, find the last checkpoint from pg_control
    #    on primary. Check the last modified xlog seg from the standby and find
    #    if the last checkpoint from primary exists
    logger.info('Read the pg_control from primary, find the last checkpoint '
                '& see if it made it to the standby')
    standby_xlog_path = os.path.join('base', 'pg_xlog')
    cmd = Command(name='pg_controldata ' + os.environ.get('MASTER_DATA_DIRECTORY'),
                  cmdStr='pg_controldata ' + os.environ.get('MASTER_DATA_DIRECTORY'))
    cmd.run(validateAfter=True)
    primary_last_ckpt_lsn = self.last_ckpt_lsn(cmd.get_results().stdout)
    logger.info("Primary last checkpoint LSN = " + primary_last_ckpt_lsn)
    standby_last_mod_xlog = self.last_mod_file(standby_xlog_path)
    logger.info("Last mod standby XLOG = " + standby_last_mod_xlog)
    cmd = Command(name='xlogdump standby last modified xlog',
                  cmdStr="xlogdump " + standby_last_mod_xlog)
    cmd.run(validateAfter=True)

    # 7. The shutdown checkpoint LSN should be present in that XLOG seg file
    logger.info('See if we find the shutdown LSN in the XLOG seg file')
    lines = cmd.get_results().stdout.splitlines()
    flag = False
    for line in lines:
        if line.find(primary_last_ckpt_lsn) > -1:
            self.assertTrue(line.find("checkpoint") > -1)
            self.assertTrue(line.find("shutdown") > -1)
            flag = True
            break
    self.assertTrue(flag)
    logger.info('PASS')

    # Re-start the database
    logger.info('Now restart the DB (gpstart -a)')
    cmd = Command(name="gpstart",
                  cmdStr="source %s/greenplum_path.sh; gpstart -a"
                  % os.environ["GPHOME"])
    cmd.run(validateAfter=True)

    # Cleanup.  Currently we don't have a clean way of making the WAL
    # receiver die, so kill the standby processes directly.
    logger.info('Kill the standby processes as clean standby killing is not supported')
    cmd = Command(name="kill standby",
                  cmdStr="kill -9 `ps -ef | grep 5433 | grep -v grep | awk '{print $2}'`")
    cmd.run(validateAfter=True)
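# Hedged sketches of the last_ckpt_lsn and last_mod_file helpers relied on by
# test_smart_shutdown above.  Assumptions: last_ckpt_lsn scrapes the
# "Latest checkpoint location" line out of pg_controldata output, and
# last_mod_file returns the most recently modified file under a directory;
# neither implementation is given in the original tests.
import os

def last_ckpt_lsn(self, controldata_output):
    # pg_controldata prints "Latest checkpoint location: <LSN>"
    for line in controldata_output.splitlines():
        if line.startswith("Latest checkpoint location"):
            return line.split(':', 1)[1].strip()
    return None

def last_mod_file(self, dirname):
    # pick the file with the newest mtime in the given directory
    paths = [os.path.join(dirname, f) for f in os.listdir(dirname)]
    files = [p for p in paths if os.path.isfile(p)]
    return max(files, key=os.path.getmtime)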
def test_icg(self):
    # 1. Verify if the system is UP and there is no WAL receiver running
    # 2. Perform basebackup and deploy it into some dest. directory
    # 3. Copy recovery.conf into the dest. directory to be used by Standby
    # 4. Initiate the Standby using the Master (primary) postmaster
    #    parameters
    # 5. Once the WAL receiver waits for the next record to arrive, perform
    #    installcheck-good

    # 0. Stop standby if it's running
    standby = Standby('base', 5433)
    standby.stop()

    # 1. Verify if the system is UP and there is no WAL sender running
    self.assertEqual(self.count_wal_sender(), 0)
    logger.info('No active WAL sender found')

    # Set environment variable GPSRC for the make installcheck step
    source_file = sys.modules[self.__class__.__module__].__file__
    source_dir = os.path.dirname(source_file)
    os.environ['GPSRC'] = os.path.join(source_dir, '../../../../')

    # 2. Perform basebackup and deploy it into some dest.
    #    (currently hardcoded 'base') directory
    shutil.rmtree('base', True)
    logger.info('Performing and deploying base backup ...')
    standby.create()

    # 3. Copy recovery.conf into the dest. directory to be used by Standby
    logger.info('Deploying recovery.conf...')

    # 4. Initiate the Standby using the Master (primary) postmaster
    #    parameters
    logger.info('Initiating Standby...')
    res = standby.start()
    self.assertTrue(res.wasSuccessful())
    num_walsender = 0
    for i in polling(10, 0.5):
        num_walsender = self.count_wal_sender()
        if num_walsender > 0:
            break
    self.assertEqual(num_walsender, 1)
    logger.info('Activated WAL Receiver...')

    # 5. Run installcheck-good
    self.assertTrue(os.environ["GPSRC"])
    installCheckGoodPath = os.path.join(os.environ.get('GPSRC'),
                                        'test', 'regress')
    self.assertTrue(os.path.exists(installCheckGoodPath))
    os.chdir(installCheckGoodPath)
    regression_diffs = os.path.join(installCheckGoodPath, 'regression.diffs')
    if os.path.exists(regression_diffs):
        os.remove(regression_diffs)
    logger.info('Perform InstallCheck-Good...')
    subprocess.check_call('make installcheck-good', shell=True)

    # Verify the installcheck result by checking that regression.diffs is absent.
    self.assertTrue(not os.path.exists(regression_diffs))
def test_negative(self):
    """
    This test verifies that if the WAL sender dies for any reason, it
    should not keep backends blocked forever in the sync-rep
    replication waiting queue.
    """
    # Verify if the database is up. Run some sql.
    logger.info('Verify the DB is up...')
    PSQL.run_sql_command('DROP table if exists foo',
                         dbname=os.environ.get('PGDATABASE'))

    Command('remove standby', 'gpinitstandby -ra').run()
    self.assertEqual(self.standby.create(), 0)

    # Trigger file cleanup
    if os.path.exists(os.path.join(self.standby.datadir, 'wal_rcv_test')):
        os.remove(os.path.join(self.standby.datadir, 'wal_rcv_test'))

    # Set up a standby
    logger.info('Setting up standby...')
    res = self.standby.start()
    self.assertTrue(res.wasSuccessful())

    # Wait for the walreceiver to start
    num_walsender = self.wait_for_walsender()
    self.assertEqual(num_walsender, 1)
    logger.info('Activated WAL Receiver...')

    # Once the WAL receiver starts, signal it to suspend
    wal_rcv_pid = self.get_pid_having_keyword('wal receiver process')
    trigger_content = 'wait_before_send'
    logger.info('Suspending WAL Receiver(' + str(wal_rcv_pid) + ') ' +
                'with ' + trigger_content + '...')
    self.generate_trigger_file(trigger_content)
    os.kill(wal_rcv_pid, signal.SIGUSR2)

    # Once suspended, spawn a new backend so that it gets blocked,
    # then confirm that it is blocked
    result = False
    for i in walrepl.polling(20, 1):
        sql = 'CREATE TABLE foo%s(a int, b int)' % str(i)
        subprocess.Popen(['psql', '-c', sql],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
        num_blocked = self.count_blocked()
        if num_blocked >= 1:
            result = True
            break
    self.assertTrue(result, "no backend blocked")
    logger.info('The backend is blocked for replication (see pg_stat_activity)...')

    logger.info('Shut down the standby to break the connection with the Master...')
    # Stop the standby as it is of no use anymore
    self.standby.stop()

    # Verify that the WAL sender died
    num_walsender = self.count_walsender()
    self.assertEqual(num_walsender, 0, "WAL sender has not gone")
    logger.info('WAL sender is now dead...')

    # Confirm that there are no more backends waiting on replication.
    # That's the actual test.
    num_blocked = self.count_blocked()
    self.assertEqual(num_blocked, 0,
                     str(num_blocked) + " backends still blocked")
    logger.info('The backend is no longer blocked...')
def test_negative(self):
    """
    This test verifies that if the WAL sender dies for any reason, it
    should not keep backends blocked forever in the sync-rep
    replication waiting queue.
    """
    # Verify if the database is up. Run some sql.
    logger.info('Verify the DB is up...')
    PSQL.run_sql_command('DROP table if exists foo',
                         dbname=os.environ.get('PGDATABASE'))

    Command('remove standby', 'gpinitstandby -ra').run()
    self.assertEqual(self.standby.create(), 0)

    # Trigger file cleanup
    if os.path.exists(os.path.join(self.standby.datadir, 'wal_rcv_test')):
        os.remove(os.path.join(self.standby.datadir, 'wal_rcv_test'))

    # Set up a standby
    logger.info('Setting up standby...')
    res = self.standby.start()
    self.assertTrue(res.wasSuccessful())

    # Wait for the walreceiver to start
    num_walsender = self.wait_for_walsender()
    self.assertEqual(num_walsender, 1)
    logger.info('Activated WAL Receiver...')

    # Once the WAL receiver starts, signal it to suspend
    wal_rcv_pid = self.get_pid_having_keyword('wal receiver process')
    trigger_content = 'wait_before_send'
    logger.info('Suspending WAL Receiver(' + str(wal_rcv_pid) + ') ' +
                'with ' + trigger_content + '...')
    self.generate_trigger_file(trigger_content)
    os.kill(wal_rcv_pid, signal.SIGUSR2)

    # Once suspended, spawn a new backend so that it gets blocked,
    # then confirm that it is blocked
    result = False
    for i in walrepl.polling(20, 1):
        sql = 'CREATE TABLE foo%s(a int, b int)' % str(i)
        subprocess.Popen(['psql', '-c', sql],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
        num_blocked = self.count_blocked()
        if num_blocked >= 1:
            result = True
            break
    self.assertTrue(result, "no backend blocked")
    logger.info('The backend is blocked for replication (see pg_stat_activity)...')

    logger.info('Shut down the standby to break the connection with the Master...')
    # Stop the standby as it is of no use anymore
    self.standby.stop()

    # Verify that the WAL sender died, waiting at most about a minute
    for retry in range(1, 30):
        num_walsender = self.count_walsender()
        if num_walsender == 0:
            break
        logger.info('WAL sender still exists, retrying ...' + str(retry))
        time.sleep(2)
    self.assertEqual(num_walsender, 0, "WAL sender has not gone")
    logger.info('WAL sender is now dead...')

    # Confirm that there are no more backends waiting on replication.
    # That's the actual test.
    num_blocked = self.count_blocked()
    self.assertEqual(num_blocked, 0,
                     str(num_blocked) + " backends still blocked")
    logger.info('The backend is no longer blocked...')
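# Hedged sketches of the count_blocked and generate_trigger_file helpers used
# by test_negative above.  Assumptions: backends blocked on synchronous
# replication are assumed to be visible in pg_stat_activity with a
# waiting_reason of 'replication', and the trigger file is assumed to be the
# 'wal_rcv_test' file under the standby data directory that the earlier
# cleanup step removes.
import os

def count_blocked(self):
    with PGconn("") as conn:
        res = conn.execute(
            "SELECT count(*) FROM pg_stat_activity "
            "WHERE waiting_reason = 'replication'")
        return int(res.getvalue(0, 0))

def generate_trigger_file(self, content):
    path = os.path.join(self.standby.datadir, 'wal_rcv_test')
    with open(path, 'w') as f:
        f.write(content)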