Пример #1
0
class GpRecoversegRegressionTests(unittest.TestCase):
    """Regression tests that run gprecoverseg after an injected failover.

    Each test waits for the in-sync transition, injects a fault with
    gpfaultinjector and then exercises one GpRecover operation.  A failed
    fault injection fails the test instead of letting it pass without having
    exercised any recovery.
    """

    def setUp(self):
        self.gprec = GpRecover()
        # GPHOME locates greenplum_path.sh, which failover() sources before
        # running gpfaultinjector; this is None when GPHOME is not set.
        self.gphome = os.environ.get('GPHOME')

    def failover(self, type = 'mirror'):
        """Run gpfaultinjector on all hosts to force a failover.

        type: 'mirror' injects a 'fault' into filerep_consumer on the
              mirrors; any other value injects a 'panic' into the postmaster
              on the primaries.
        Returns whatever run_shell_command returns for the injector command.
        """
        if type == 'mirror':
            fault_str = 'source %s/greenplum_path.sh;gpfaultinjector -f filerep_consumer  -m async -y fault -r mirror -H ALL' % self.gphome
        else:
            fault_str = 'source %s/greenplum_path.sh;gpfaultinjector -f postmaster -m async -y panic -r primary -H ALL' % self.gphome
        return run_shell_command(fault_str, cmdname = 'Run fault injector to failover')

    def test_incr_gprecoverseg(self):
        """Incremental recovery succeeds after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        # Previously an `if` guard: a failed injection made the test pass
        # without ever calling incremental().
        self.assertTrue(self.failover(),
                        'fault injection failed; incremental recovery was not exercised')
        self.assertTrue(self.gprec.incremental())

    def test_full_gprecoverseg(self):
        """Full recovery succeeds after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        self.assertTrue(self.failover(),
                        'fault injection failed; full recovery was not exercised')
        self.assertTrue(self.gprec.full())

    def test_gprecoverseg_rebalance(self):
        """Rebalance succeeds once the cluster is back in sync after a primary panic."""
        self.gprec.wait_till_insync_transition()
        self.assertTrue(self.failover('primary'),
                        'fault injection failed; rebalance was not exercised')
        PSQL.run_sql_file(local_path('mirror_failover_trigger.sql'))
        self.gprec.incremental()
        # Also previously an `if` guard that skipped the rebalance assertion.
        self.assertTrue(self.gprec.wait_till_insync_transition(),
                        'cluster did not return to sync before rebalance')
        self.assertTrue(self.gprec.rebalance())

    def test_wait_till_insync(self):
        """The cluster reports the in-sync transition after incremental recovery."""
        self.gprec.wait_till_insync_transition()
        self.assertTrue(self.failover(),
                        'fault injection failed; in-sync wait was not exercised')
        self.gprec.incremental()
        self.assertTrue(self.gprec.wait_till_insync_transition())
Пример #2
0
class GpRecoversegRegressionTests(unittest.TestCase):
    """gprecoverseg regression tests driven by gpfaultinjector failovers.

    The tests used to wrap their assertions in ``if self.failover():``, so a
    failed fault injection produced a passing test that checked nothing.
    They now fail explicitly when the injection does not succeed.
    """

    def setUp(self):
        self.gprec = GpRecover()
        # May be None when GPHOME is not set in the environment.
        self.gphome = os.environ.get('GPHOME')

    def failover(self, type='mirror'):
        """Build and run the gpfaultinjector command that forces a failover.

        type: 'mirror' faults filerep_consumer on the mirrors; anything else
              panics the postmaster on the primaries.
        Returns the result of run_shell_command.
        """
        if type == 'mirror':
            injector_args = '-f filerep_consumer  -m async -y fault -r mirror -H ALL'
        else:
            injector_args = '-f postmaster -m async -y panic -r primary -H ALL'
        fault_str = 'source %s/greenplum_path.sh;gpfaultinjector %s' % (
            self.gphome, injector_args)
        return run_shell_command(fault_str,
                                 cmdname='Run fault injector to failover')

    def _require_failover(self, type='mirror'):
        """Inject the failover fault and fail the running test if it did not succeed."""
        self.assertTrue(
            self.failover(type),
            'fault injection (%s) failed; recovery was not exercised' % type)

    def test_incr_gprecoverseg(self):
        """Incremental recovery reports success after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        self._require_failover()
        self.assertTrue(self.gprec.incremental())

    def test_full_gprecoverseg(self):
        """Full recovery reports success after a mirror fault."""
        self.gprec.wait_till_insync_transition()
        self._require_failover()
        self.assertTrue(self.gprec.full())

    def test_gprecoverseg_rebalance(self):
        """Rebalance reports success after recovering from a primary panic."""
        self.gprec.wait_till_insync_transition()
        self._require_failover('primary')
        PSQL.run_sql_file(local_path('mirror_failover_trigger.sql'))
        self.gprec.incremental()
        # This was an `if` guard too; a cluster that never got back in sync
        # silently skipped the rebalance check.
        self.assertTrue(self.gprec.wait_till_insync_transition(),
                        'cluster did not return to sync before rebalance')
        self.assertTrue(self.gprec.rebalance())

    def test_wait_till_insync(self):
        """The in-sync transition is reported after incremental recovery."""
        self.gprec.wait_till_insync_transition()
        self._require_failover()
        self.gprec.incremental()
        self.assertTrue(self.gprec.wait_till_insync_transition())
Пример #3
0
class FilerepTestCase(MPPTestCase):
    def __init__(self, methodName):
        """Create the helper objects used by the filerep methods.

        The helpers are assigned before the base-class initializer is
        invoked; GpRecover and DbStateClass share the same GPDBConfig.
        """
        self.pgport = os.environ.get('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()
        cluster_config = GPDBConfig()
        self.config = cluster_config
        self.gpr = GpRecover(cluster_config)
        self.dbstate = DbStateClass('run_validation', cluster_config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FilerepTestCase, self).__init__(methodName)

    def sleep(self, seconds=60):
        """Pause the caller for `seconds` seconds (60 when omitted)."""
        pause = seconds
        time.sleep(pause)

    def create_file_in_datadir(self, content, role, filename):
        """Touch `filename` in the data directory of one segment.

        The segment is resolved through the cluster config from `content`
        and `role`; the command is run on that segment's host with
        validateAfter=True.
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        location = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        seg_host, seg_datadir = location
        target = os.path.join(seg_datadir, filename)
        touch_cmd = Command('create a file',
                            'touch %s' % target,
                            ctxt=REMOTE,
                            remoteHost=seg_host)
        touch_cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        """Remove `filename` from the data directory of one segment.

        The segment is resolved through the cluster config from `content`
        and `role`; `rm` is run on that segment's host with
        validateAfter=True.
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        location = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        seg_host, seg_datadir = location
        target = os.path.join(seg_datadir, filename)
        rm_cmd = Command('remove a file',
                         'rm %s' % target,
                         ctxt=REMOTE,
                         remoteHost=seg_host)
        rm_cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        """Fetch the st_mtime of `filename` in a segment's data directory.

        A one-line python command is run on the segment host to print
        os.stat(...).st_mtime.

        Return: the command's stdout with surrounding whitespace stripped
                (a string, not a number)
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        location = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        seg_host, seg_datadir = location
        target = os.path.join(seg_datadir, filename)
        stat_script = """ python -c "import os; print os.stat('%s').st_mtime" """ % target
        stat_cmd = Command('check timestamp',
                           stat_script,
                           ctxt=REMOTE,
                           remoteHost=seg_host)
        stat_cmd.run(validateAfter=True)
        return stat_cmd.get_results().stdout.strip()

    def verify_file_exists(self, content, role, filename):
        """Run `test -f` for `filename` in a segment's data directory.

        The command is run on the segment host with validateAfter=True;
        presumably Command raises when the file is missing -- confirm
        against Command.run.
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        location = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        seg_host, seg_datadir = location
        target = os.path.join(seg_datadir, filename)
        exists_cmd = Command('check if file exists',
                             'test -f %s' % target,
                             ctxt=REMOTE,
                             remoteHost=seg_host)
        exists_cmd.run(validateAfter=True)

    def handle_ext_cases(self, file):
        """
        Point the gpfdist URLs of a sql/answer file at this machine.

        @file: path of the file to rewrite in place; a path that is not a
               regular file is ignored

        On every line, the text matching 'gpfdist.+8088' is replaced with
        'gpfdist://<IP of this host>:8088'.
        """

        host = str(socket.gethostbyname(socket.gethostname()))  # Must be an IP
        querystring = "gpfdist://" + host + ":8088"

        if not os.path.isfile(file):
            return

        # With inplace=1, fileinput redirects standard output into the file
        # being rewritten, so every print below emits one output line.
        rewriter = fileinput.FileInput(file, inplace=1)
        try:
            for line in rewriter:
                line = re.sub('gpfdist.+8088', querystring, line)
                # Single-argument print(...) behaves the same on Python 2
                # and 3; the newline is stripped because print adds one.
                print(line.replace('\n', ''))
        finally:
            # Close explicitly so standard output is restored even when
            # processing a line raises.
            rewriter.close()

    def handle_hybrid_part_cases(self, file):
        """
        Point the hybrid_part.data path of a sql/answer file at this checkout.

        @file: path of the file to rewrite in place; a path that is not a
               regular file is ignored

        On every line, the text matching FROM '<anything>hybrid_part.data'
        is replaced with FROM '<local_path('hybrid_part.data')>'.
        """

        querystring = "FROM '" + local_path('hybrid_part.data') + "'"
        if not os.path.isfile(file):
            return

        # Raw string: same pattern as before, without relying on Python
        # keeping the unknown '\s' escape intact.
        pattern = r"FROM\s'.+hybrid_part.data'"

        # With inplace=1, fileinput redirects standard output into the file
        # being rewritten, so every print below emits one output line.
        rewriter = fileinput.FileInput(file, inplace=1)
        try:
            for line in rewriter:
                line = re.sub(pattern, querystring, line)
                # Single-argument print(...) behaves the same on Python 2
                # and 3; the newline is stripped because print adds one.
                print(line.replace('\n', ''))
        finally:
            # Close explicitly so standard output is restored even when
            # processing a line raises.
            rewriter.close()

    def preprocess(self):
        """
        Rewrite machine-specific values in the workload sql and answer files.

        For every workload directory, the files under 'sql' and then under
        'expected' are scanned by name: names containing 'wet_ret' go through
        handle_ext_cases and names containing 'hybrid_part' go through
        handle_hybrid_part_cases.
        """

        workload_dirs = (
            'set_sync1', 'sync1', 'set_ck_sync1', 'ck_sync1', 'set_ct', 'ct',
            'set_resync', 'resync', 'set_sync2', 'sync2',
        )
        for workload in workload_dirs:
            folders = [
                os.path.join(local_path(workload), sub)
                for sub in ('sql', 'expected')
            ]
            for folder in folders:
                for entry in os.listdir(folder):
                    entry_path = os.path.join(folder, entry)
                    if 'wet_ret' in entry:
                        self.handle_ext_cases(entry_path)
                    if 'hybrid_part' in entry:
                        self.handle_hybrid_part_cases(entry_path)

    def clean_data(self):
        """
        Delete the files matching data/*.* under local_path("").

        The original note explains why: the external table data must be
        removed, otherwise running several sql files keeps appending to the
        same external table.
        """

        data_glob = str(local_path("")) + "data/*.*"
        run_shell_command('rm -rfv ' + data_glob)

    def anydownsegments(self):
        """
        Check whether any segments are marked down.

        Return: True when count_of_nodes_down() reports zero, False
                otherwise.  Note that, despite the method name, True means
                that NO segment is down.
        """

        tinctest.logger.info("Checking if any segments are down")
        down_count = int(self.count_of_nodes_down())
        return down_count == 0

    def stop_start_validate(self, stopValidate=True):
        """
        Stop the cluster with immediate='i', start it again and check that
        no segment is marked down afterwards.

        stopValidate is forwarded to run_gpstop_cmd as `validate`; a failed
        stop is only treated as an error when it is true.

        Return: (True, "All segments are up")
        Raises: Exception when the stop (if validated) or the start fails,
                or when segments are marked down after the restart
        """

        tinctest.logger.info("Performing stop start validate")
        tinctest.logger.info("Shutting down the cluster")
        stopped = self.gpstop.run_gpstop_cmd(immediate='i',
                                             validate=stopValidate)
        if stopValidate and not stopped:
            raise Exception('Problem while shutting down the cluster')
        tinctest.logger.info("Successfully shutdown the cluster.")

        tinctest.logger.info("Restarting the cluster.")
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('Failed to bring the cluster back up')
        tinctest.logger.info("Successfully restarted the cluster.")

        # anydownsegments() returns True when nothing is down.
        if self.anydownsegments():
            return (True, "All segments are up")
        raise Exception("segments were marked down")

    def method_reset_fault_injection(self):
        """
        Reset the filerep_resync fault on the primaries of all hosts.

        Return: (True, str(output of the fault injection))
        Raises: Exception when the fault injection reports failure
        """

        tinctest.logger.info("Resetting fault injection")

        status, output = self.util.inject_fault(f='filerep_resync',
                                                m='async',
                                                y='reset',
                                                r='primary',
                                                H='ALL')
        if not status:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault  to reset resync")

        return (True, str(output))

    def method_resume_filerep_resync(self):
        """
        Resume resync by injecting 'resume' for the filerep_resync fault on
        the primaries of all hosts.

        Return: the (ok, out) pair returned by the fault injection
        Raises: Exception when the fault injection reports failure
        """

        tinctest.logger.info("Resuming Resync")
        result = self.util.inject_fault(f='filerep_resync',
                                        m='async',
                                        y='resume',
                                        r='primary',
                                        H='ALL')
        status, output = result
        if not status:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done resuming resync")
        return (status, output)

    def run_method_suspendresync(self):
        """
        Keep the cluster from going to resync by injecting 'suspend' for the
        filerep_resync fault on the primaries of all hosts.

        Return: the (ok, out) pair returned by the fault injection
        Raises: Exception when the fault injection reports failure
        """

        tinctest.logger.info("Suspending resync")
        result = self.util.inject_fault(f='filerep_resync',
                                        m='async',
                                        y='suspend',
                                        r='primary',
                                        H='ALL')
        status, output = result
        # The output is logged before the status check so it is available
        # even when the injection failed.
        tinctest.logger.info('output from suspend resync %s' % output)
        if not status:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (status, output)

    def count_of_masters(self):
        """
        Count the gp_segment_configuration rows with content = -1.

        Return: the count as text, taken from the fourth line of the psql
                output and stripped (a string, not an int)
        """

        tinctest.logger.info("Count the number of masters")
        # query_select_count performs the same run/split/strip steps.
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where content = -1")

    def count_of_nodes(self):
        """
        Count all rows of gp_segment_configuration.

        Return: the count as text, taken from the fourth line of the psql
                output and stripped (a string, not an int)
        """

        tinctest.logger.info("Counting number of nodes")
        # query_select_count performs the same run/split/strip steps.
        return self.query_select_count(
            "select count(*) from gp_segment_configuration")

    def count_of_nodes_in_ct(self):
        """
        Count the nodes in change tracking (mode = 'c').

        Return: the count as text, taken from the fourth line of the psql
                output and stripped (a string, not an int)
        """

        tinctest.logger.info("Counting number of nodes in ct")
        # query_select_count performs the same run/split/strip steps.
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where mode = 'c'")

    def count_of_nodes_down(self):
        """
        Count the nodes marked as down (status = 'd').

        Return: the count as text, taken from the fourth line of the psql
                output and stripped (a string, not an int)
        """

        tinctest.logger.info("Counting the number of nodes down")
        # query_select_count performs the same run/split/strip steps.
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where status = 'd'")

    def count_of_nodes_sync(self):
        """
        Count the nodes in sync (mode = 's').

        Return: the count as text, taken from the fourth line of the psql
                output and stripped (a string, not an int)
        """

        tinctest.logger.info("Counting the number of nodes in sync")
        # query_select_count performs the same run/split/strip steps.
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where mode = 's'")

    def count_of_nodes_not_sync(self):
        """
        Count the nodes that are not in sync (mode <> 's').

        Return: the count as text, taken from the fourth line of the psql
                output and stripped (a string, not an int)
        """

        tinctest.logger.info("Counting number of nodes not in sync")
        # query_select_count performs the same run/split/strip steps.
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where mode <> 's'")

    def inject_fault_on_first_primary(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        tinctest.logger.info("\n Injecting faults on first primary")
        (ok,
         out) = self.util.inject_fault(f='filerep_immediate_shutdown_request',
                                       m='async',
                                       y='infinite_loop',
                                       r='primary',
                                       seg_id=2,
                                       sleeptime=300)
        if not ok:
            raise Exception(
                "Fault filerep_immediate_shutdown_request injection failed")

        (ok, out) = self.util.inject_fault(f='fileRep_is_operation_completed',
                                           m='async',
                                           y='infinite_loop',
                                           r='primary',
                                           seg_id=2)
        if not ok:
            raise Exception(
                "Fault fileRep_is_operation_completed injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def inject_fault_on_first_mirror(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        sqlcmd = "select dbid from gp_segment_configuration where content=0 and role='m'"
        (first_mirror_dbid) = PSQL.run_sql_command(sqlcmd)
        first_mirror_dbid = first_mirror_dbid.split('\n')[3].strip()

        tinctest.logger.info("\n Injecting faults on first mirror")
        flag = self.util.check_fault_status(
            fault_name='fileRep_is_operation_completed',
            status='triggered',
            max_cycle=100)
        if not flag:
            raise Exception(
                "Fault fileRep_is_operation_completed didn't trigger")

        (ok, out) = self.util.inject_fault(f='filerep_consumer',
                                           m='async',
                                           y='panic',
                                           r='mirror',
                                           seg_id=first_mirror_dbid)
        if not ok:
            raise Exception("Fault filerep_consumer injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
        """Kill any gpfdist for `port` on hostIP() and start one serving `path`.

        Return: True
        """
        server = Gpfdist(port, self.hostIP())
        server.killGpfdist()
        start_options = ' -t 30 -m 1048576 -d ' + path
        server.startGpfdist(start_options)
        return True

    def cleanupGpfdist(self, port, path):
        """Kill the gpfdist for `port` on hostIP().

        `path` is accepted but not used.

        Return: True
        """
        server = Gpfdist(port, self.hostIP())
        server.killGpfdist()
        return True

    def hostIP(self):
        """Return the host string to use for gpfdist on this machine.

        Return: 'mdw' when the local hostname contains 'mdw' after its first
                character, otherwise the IP address of the local hostname
        Raises: GPtestError when `which gpfdist` fails
        """
        if not run_shell_command('which gpfdist'):
            raise GPtestError("Error:'which gpfdist' command failed.")
        local_name = socket.gethostname()
        # NOTE(review): find() > 0 is False for a hostname that *starts*
        # with 'mdw' (find() returns 0), so such hosts use the IP branch.
        # Confirm whether >= 0 was intended.
        if local_name.find('mdw') > 0:
            address = 'mdw'
        else:
            # Must be an IP
            address = str(socket.gethostbyname(socket.gethostname()))
        tinctest.logger.info('current host is %s' % address)
        return address

    def method_setup(self):
        """Create the filespaces, raise max_resource_queues and restart.

        Return: True
        Raises: Exception when gpconfig or the cluster start fails
        """
        tinctest.logger.info("Performing setup tasks")
        filespaces = Gpfilespace()
        for fs_name in ('filerep_fs_a', 'filerep_fs_b', 'filerep_fs_c',
                        'filerep_fs_z', 'sync1_fs_1'):
            filespaces.create_filespace(fs_name)

        # Set max_resource_queues to 100
        if not run_shell_command('gpconfig -c max_resource_queues -v 100 '):
            raise Exception(
                'Failure during setting the max_resource_queues value to 100 using gpconfig tool'
            )

        # Restart the cluster; the result of the stop is not checked.
        self.gpstop.run_gpstop_cmd(immediate='i')
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('Failure during restarting the cluster')
        return True

    def get_ext_table_query_from_gpstate(self):
        """Extract the sample external-table SQL printed by gpstate.

        `gpstate --printSampleExternalTableSql` is redirected into the
        gpstate_tmp file under local_path; the exit status of that command
        is not checked.

        Return: the text of that file from the first line containing
                'DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status'
                through the end, or '' when no line contains it
        """
        outfile = local_path("gpstate_tmp")
        run_shell_command("gpstate --printSampleExternalTableSql >" + outfile)

        marker = 'DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status'
        kept_lines = []
        found_marker = False
        # The context manager closes the file; the old code left the handle
        # to the garbage collector.
        with open(outfile, 'r') as gpstate_output:
            for line in gpstate_output:
                if not found_marker and marker in line:
                    found_marker = True
                if found_marker:
                    # Lines are kept unmodified, newline included.
                    kept_lines.append(line)
        return ''.join(kept_lines)

    def check_gpstate(self, type, phase):
        """
        Validate the gpstate_segment_status table for one transition phase.

        @type: failover type; only the resync phases look at it, where
               'primary' adds a role = preferred_role filter to the query
        @phase: one of sync1, ct, resync_incr, resync_full; any other value
                (for example ck_sync1 or sync2) is accepted without a check

        Return: True when the check for the phase passes or the phase has
                no check
        Raises: Exception when the counts compared for the phase differ
        """

        # Number of non-master entries; several phases compare against it.
        segment_count_sql = "select count(*) from gp_segment_configuration where content <> -1"

        # All comparisons use != because the <> operator does not exist in
        # Python 3; both spellings mean the same thing in Python 2.
        if phase == 'sync1':
            state_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'"
            )
            sync1_num = self.query_select_count(segment_count_sql)
            if int(sync1_num) != int(state_num):
                raise Exception("gpstate in Sync state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'ct':
            # Up primaries in change tracking must pair up with down mirrors.
            p_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking'  and role = 'Primary' and status_in_config='Up' and instance_status='Up'"
            )
            m_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync'  and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' "
            )

            if int(p_num) != int(m_num):
                raise Exception("gpstate in CT state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'resync_incr':

            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Incremental'"
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
            resync_incr_num = self.query_select_count(query)

            query_num_rows = segment_count_sql
            num_rows = self.query_select_count(query_num_rows)

            if int(resync_incr_num) != int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception(
                    "gpstate in Resync Incremental  state failed. resync_incr_num %s <> num_rows %s"
                    % (resync_incr_num, num_rows))
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'resync_full':
            num_rows = self.query_select_count(segment_count_sql)

            if type == 'primary':
                resync_full_num = self.query_select_count(
                    "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'"
                )
            else:
                resync_full_num = self.query_select_count(
                    "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'"
                )

            if int(resync_full_num) != int(num_rows):
                raise Exception("gptate in Resync Full state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        return True

    def trigger_transition(self):
        """Run the mirrors.sql file found under local_path through PSQL."""
        sql_file = local_path('mirrors.sql')
        PSQL.run_sql_file(sql_file)

    def run_gpstate(self, type, phase):
        """
        Create the gpstate external table, run gpstate and validate the phase.

        @type: failover type, forwarded to check_gpstate
        @phase: transition stage, forwarded to check_gpstate

        Return: the result of running `gpstate -s -a`
        Raises: whatever check_gpstate raises when the phase check fails
        """

        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()
        sql_file = local_path('create_table_gpstate.sql')
        # The context manager closes the file even when a write fails.
        with open(sql_file, 'w') as sql_out:
            sql_out.write(querystring)
            sql_out.write('\n')
        PSQL.run_sql_file(sql_file)

        gpstate_outfile = local_path('gpstate_out')
        cmd = 'gpstate -s -a > %s 2>&1' % (gpstate_outfile)

        ok = run_shell_command(cmd)
        self.check_gpstate(type, phase)
        return ok

    def check_mirror_seg(self, master=False):
        """Run the mirror integrity check of the dbstate helper.

        `master` is accepted but not used.
        """
        tinctest.logger.info("running check mirror")
        checker = self.dbstate
        checker.check_mirrorintegrity()

    def do_gpcheckcat(self,
                      dbname=None,
                      alldb=False,
                      online=False,
                      outputFile='checkcat.out',
                      outdir=None):
        """Run the catalog check of the dbstate helper.

        Only `outputFile` is forwarded; `dbname`, `alldb`, `online` and
        `outdir` are accepted but not used.
        """
        tinctest.logger.info("running gpcheckcat")
        checker = self.dbstate
        checker.check_catalog(outputFile=outputFile)

    def query_select_count(self, sqlcmd):
        """Run `sqlcmd` through PSQL and return its single result value.

        Return: the fourth line of the psql output, stripped (a string)
        """
        raw_output = PSQL.run_sql_command(sqlcmd)
        output_lines = raw_output.split('\n')
        return output_lines[3].strip()

    def method_run_failover(self, type):
        """
        Inject the fault that fails nodes over.

        @type: primary [injects 'fault' into filerep_consumer on the mirrors]
               mirror [injects 'panic' into the postmaster on the primaries]
               any other value injects nothing

        Return: True in every case; the result of the fault injection is
                not checked
        """

        if type == 'primary':
            banner = "\n primary failover"
            fault_args = dict(f='filerep_consumer', y='fault', r='mirror')
        elif type == 'mirror':
            banner = "\n Mirror failover"
            fault_args = dict(f='postmaster', y='panic', r='primary')
        else:
            return True

        tinctest.logger.info(banner)
        self.util.inject_fault(m='async', H='ALL', **fault_args)
        tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        """Delegate to Filerepe2e_Util.wait_till_change_tracking_transition.

        The helper's result is not returned.
        """
        helper = self.util
        helper.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        """Delegate to GpRecover.wait_till_insync_transition.

        The helper's result is not returned.
        """
        recoverer = self.gpr
        recoverer.wait_till_insync_transition()

    def run_gprecoverseg(self, recover_mode):
        """Run a full recovery when recover_mode is 'full', else incremental."""
        recover = self.gpr.full if recover_mode == 'full' else self.gpr.incremental
        recover()

    def run_gpconfig(self, parameter, master_value, segment_value):
        """Set a configuration parameter and restart the cluster.

        Nothing happens when `parameter` is None.
        """
        if parameter is None:
            return
        self.gpconfig.setParameter(parameter, master_value, segment_value)
        self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self,
                     fault=None,
                     mode=None,
                     operation=None,
                     prim_mirr=None,
                     host='All',
                     table=None,
                     database=None,
                     seg_id=None,
                     sleeptime=None,
                     occurence=None):
        """Inject an arbitrary fault through the filerep utility helper.

        fault, mode, operation and prim_mirr are required and are passed on
        as f, m, y and r; table, database, sleeptime, occurence (as o) and
        seg_id are forwarded unchanged.

        NOTE(review): `host` is accepted but ignored -- the injection is
        always issued with H='ALL'.  The result of the injection is neither
        checked nor returned.

        Raises: Exception when any required argument is None
        """
        required = (fault, mode, operation, prim_mirr)
        # Identity comparison is the correct test for None (PEP 8).
        if any(value is None for value in required):
            raise Exception('Incorrect parameters provided for inject fault')

        self.util.inject_fault(f=fault,
                               m=mode,
                               y=operation,
                               r=prim_mirr,
                               H='ALL',
                               table=table,
                               database=database,
                               sleeptime=sleeptime,
                               o=occurence,
                               seg_id=seg_id)
Пример #4
0
 def test_recovery_full(self):
     """Run a full recovery, then call GpRecover.wait_till_insync_transition()."""
     recovery = GpRecover()
     recovery.full()
     recovery.wait_till_insync_transition()
Пример #5
0
 def full_recoverseg(self):
     """Run a full recovery using a GpRecover built on a fresh GPDBConfig."""
     cluster_config = GPDBConfig()
     GpRecover(cluster_config).full()
Пример #6
0
 def test_recovery_full(self):
     """Run a full recovery, then call GpRecover.wait_till_insync_transition()."""
     recovery = GpRecover()
     recovery.full()
     recovery.wait_till_insync_transition()
Пример #7
0
class FilerepTestCase(MPPTestCase):

    def __init__(self, methodName):
        """
        Create the helper objects the test methods rely on, then run the
        parent-class initialiser.
        @methodName: forwarded unchanged to the parent __init__
        """
        # PGPORT comes from the environment; the attribute is None when unset
        self.pgport = os.environ.get('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()
        # One GPDBConfig instance is shared by the recover and dbstate helpers
        self.config = GPDBConfig()
        self.gpr = GpRecover(self.config)
        self.dbstate = DbStateClass('run_validation',self.config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        # The parent initialiser runs last, after all attributes are set
        super(FilerepTestCase,self).__init__(methodName)

    def sleep(self, seconds=60):
        """
        Pause via time.sleep.
        @seconds: duration passed to time.sleep (defaults to 60)
        """
        time.sleep(seconds)

    def create_file_in_datadir(self, content, role, filename):
        """
        Run `touch` for filename inside a segment's data directory.
        The segment is resolved through self.config from content and role;
        the command is executed with ctxt=REMOTE on that segment's host and
        run with validateAfter=True.
        @content: content id used to look up the dbid
        @role: segment role used to look up the dbid
        @filename: name joined onto the segment's data directory
        """
        seg_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(dbid=seg_dbid)
        target = os.path.join(seg_datadir, filename)
        touch_cmd = Command('create a file', 'touch %s' % target, ctxt=REMOTE, remoteHost=seg_host)
        touch_cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        """
        Run `rm` for filename inside a segment's data directory.
        The segment is resolved through self.config from content and role;
        the command is executed with ctxt=REMOTE on that segment's host and
        run with validateAfter=True.
        @content: content id used to look up the dbid
        @role: segment role used to look up the dbid
        @filename: name joined onto the segment's data directory
        """
        seg_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(dbid=seg_dbid)
        target = os.path.join(seg_datadir, filename)
        rm_cmd = Command('remove a file', 'rm %s' % target, ctxt=REMOTE, remoteHost=seg_host)
        rm_cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        """
        Fetch st_mtime of filename inside a segment's data directory.
        A one-line python command printing os.stat(...).st_mtime is executed
        with ctxt=REMOTE on the segment's host.
        @content: content id used to look up the dbid
        @role: segment role used to look up the dbid
        @filename: name joined onto the segment's data directory
        Return: the command's stdout with surrounding whitespace stripped
        """
        seg_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(dbid=seg_dbid)
        target = os.path.join(seg_datadir, filename)
        stat_cmd = Command('check timestamp',
                           """ python -c "import os; print os.stat('%s').st_mtime" """ % target,
                           ctxt=REMOTE, remoteHost=seg_host)
        stat_cmd.run(validateAfter=True)
        return stat_cmd.get_results().stdout.strip()

    def verify_file_exists(self, content, role, filename):
        """
        Run `test -f` for filename inside a segment's data directory.
        The command is executed with ctxt=REMOTE on the segment's host and
        run with validateAfter=True; nothing is returned.
        @content: content id used to look up the dbid
        @role: segment role used to look up the dbid
        @filename: name joined onto the segment's data directory
        """
        seg_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(dbid=seg_dbid)
        target = os.path.join(seg_datadir, filename)
        check_cmd = Command('check if file exists', 'test -f %s' % target, ctxt=REMOTE, remoteHost=seg_host)
        check_cmd.run(validateAfter=True)

    def handle_ext_cases(self,file):
        """
        Rewrite a file in place so that every 'gpfdist...8088' span points
        at this machine's IP address.
        @file: wet sql file to replace with specific machine env; a path
               that is not an existing file is silently ignored
        """
        local_ip = str(socket.gethostbyname(socket.gethostname()))  # Must be an IP
        gpfdist_url = "gpfdist://" + local_ip + ":8088"

        if not os.path.isfile(file):
            return

        # With inplace=1 fileinput redirects stdout into the file being read,
        # so each printed line becomes the new content of the file.
        for row in fileinput.FileInput(file, inplace=1):
            row = re.sub('gpfdist.+8088', gpfdist_url, row)
            print(str(re.sub('\n', '', row)))

    def handle_hybrid_part_cases(self, file):
        """
        Rewrite a file in place so that every FROM '...hybrid_part.data'
        clause points at local_path('hybrid_part.data').
        @file: hybrid sql file to replace with specific machine env; a path
               that is not an existing file is silently ignored
        """
        from_clause = "FROM '" + local_path('hybrid_part.data') + "'"
        if not os.path.isfile(file):
            return

        # With inplace=1 fileinput redirects stdout into the file being read,
        # so each printed line becomes the new content of the file.
        for row in fileinput.FileInput(file, inplace=1):
            row = re.sub('FROM\s\'.+hybrid_part.data\'', from_clause, row)
            print(str(re.sub('\n', '', row)))


    def preprocess(self):
        """
        Replace the hard-coded information in the workload sql and expected
        files with values for the current machine.
        For every workload directory, files in its 'sql' and then its
        'expected' sub-directory are passed to handle_ext_cases when their
        name contains 'wet_ret' and to handle_hybrid_part_cases when it
        contains 'hybrid_part'.
        """
        workload_dirs = ['set_sync1', 'sync1', 'set_ck_sync1', 'ck_sync1',
                         'set_ct', 'ct', 'set_resync', 'resync', 'set_sync2', 'sync2']
        for workload in workload_dirs:
            # Both paths are resolved up front; sql files are handled first
            targets = [os.path.join(local_path(workload), 'sql'),
                       os.path.join(local_path(workload), 'expected')]
            for target_dir in targets:
                for entry in os.listdir(target_dir):
                    if 'wet_ret' in entry:
                        self.handle_ext_cases(os.path.join(target_dir, entry))
                    if 'hybrid_part' in entry:
                        self.handle_hybrid_part_cases(os.path.join(target_dir, entry))


    def clean_data(self):
        """
        Clean the data by removing the external table files; otherwise more
        data would be appended to the same external table when multiple sql
        files are run.
        Runs `rm -rfv` on the pattern local_path("") + "data/*.*".
        """
        data_glob = str(local_path("")) + "data/*.*"
        run_shell_command('rm -rfv ' + data_glob)

    def anydownsegments(self):
        """
        Checks if any segments are down.
        Return: True when count_of_nodes_down() reports 0, False otherwise.
                Note the result is True when NO segment is down, despite
                the method name.
        """
        tinctest.logger.info("Checking if any segments are down")
        down_count = int(self.count_of_nodes_down())
        return down_count == 0

    def stop_start_validate(self, stopValidate=True):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine.
        @stopValidate: passed to run_gpstop_cmd as validate; when false a
                       failed stop does not raise
        Return: (True, "All segments are up")
        Raises: Exception when the stop fails (only if stopValidate), the
                start fails, or anydownsegments() is false afterwards
        """
        log = tinctest.logger
        log.info("Performing stop start validate")
        log.info("Shutting down the cluster")
        stopped = self.gpstop.run_gpstop_cmd(immediate='i', validate=stopValidate)
        if stopValidate and not stopped:
            raise Exception('Problem while shutting down the cluster')
        log.info("Successfully shutdown the cluster.")

        log.info("Restarting the cluster.")
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('Failed to bring the cluster back up')
        log.info("Successfully restarted the cluster.")

        if self.anydownsegments():
            return (True, "All segments are up")
        raise Exception("segments were marked down")


    def method_reset_fault_injection(self):
        """
        Resets the filerep_resync fault on the primaries.
        Return: (True, [result as a string]) if OK
        Raises: Exception when the injection call reports failure
        """
        tinctest.logger.info("Resetting fault injection")

        (succeeded, output) = self.util.inject_fault(f='filerep_resync', m='async', y='reset',
                                                     r='primary', H='ALL')
        if not succeeded:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault  to reset resync")

        return (True, str(output))


    def method_resume_filerep_resync(self):
        """
        Resumes the process of resync by injecting 'resume' for the
        filerep_resync fault on the primaries.
        Return: the (ok, out) pair from the injection call
        Raises: Exception when the injection call reports failure
        """
        tinctest.logger.info("Resuming Resync")
        (succeeded, output) = self.util.inject_fault(f='filerep_resync', m='async', y='resume',
                                                     r='primary', H='ALL')
        if not succeeded:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done resuming resync")
        return (succeeded, output)

    def run_method_suspendresync(self):
        """
        Stops the cluster from going to resync by injecting 'suspend' for
        the filerep_resync fault on the primaries.
        Return: the (ok, out) pair from the injection call
        Raises: Exception when the injection call reports failure
        """
        tinctest.logger.info("Suspending resync")
        (succeeded, output) = self.util.inject_fault(f='filerep_resync', m='async', y='suspend',
                                                     r='primary', H='ALL')
        # The output is logged before the status is checked
        tinctest.logger.info('output from suspend resync %s'%output)
        if not succeeded:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (succeeded, output)
      

    def count_of_masters(self):
        """
        Gives count of gp_segment_configuration rows with content = -1
        (the master entries).
        Return: the count as a string, taken from the stripped fourth line
                of the query output
        """
        tinctest.logger.info("Count the number of masters")
        master_sql = "select count(*) from gp_segment_configuration where content = -1"
        return self.query_select_count(master_sql)


    def count_of_nodes(self):
        """
        Gives count of all rows in gp_segment_configuration.
        Return: the count as a string, taken from the stripped fourth line
                of the query output
        """
        tinctest.logger.info("Counting number of nodes")
        all_nodes_sql = "select count(*) from gp_segment_configuration"
        return self.query_select_count(all_nodes_sql)


    def count_of_nodes_in_ct(self):
        """
        Gives count of nodes in change tracking (mode = 'c').
        Return: the count as a string, taken from the stripped fourth line
                of the query output
        """
        tinctest.logger.info("Counting number of nodes in ct")
        ct_sql = "select count(*) from gp_segment_configuration where mode = 'c'"
        return self.query_select_count(ct_sql)


    def count_of_nodes_down(self):
        """
        Gives count of nodes marked as down (status = 'd').
        Return: the count as a string, taken from the stripped fourth line
                of the query output
        """
        tinctest.logger.info("Counting the number of nodes down")
        down_sql = "select count(*) from gp_segment_configuration where status = 'd'"
        return self.query_select_count(down_sql)


    def count_of_nodes_sync(self):
        """
        Gives count of nodes in sync (mode = 's').
        Return: the count as a string, taken from the stripped fourth line
                of the query output
        """
        tinctest.logger.info("Counting the number of nodes in sync")
        sync_sql = "select count(*) from gp_segment_configuration where mode = 's'"
        return self.query_select_count(sync_sql)


    def count_of_nodes_not_sync(self):
        """
        Gives count of nodes not in sync (mode <> 's').
        Return: the count as a string, taken from the stripped fourth line
                of the query output
        """
        tinctest.logger.info("Counting number of nodes not in sync")
        not_sync_sql = "select count(*) from gp_segment_configuration where mode <> 's'"
        return self.query_select_count(not_sync_sql)

    def inject_fault_on_first_primary(self):
        """
        Inject two 'infinite_loop' faults on the primary with seg_id=2:
        filerep_immediate_shutdown_request (sleeptime=300) followed by
        fileRep_is_operation_completed.
        Raises: Exception when either injection reports failure
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        tinctest.logger.info("\n Injecting faults on first primary")

        (shutdown_ok, _out) = self.util.inject_fault(f='filerep_immediate_shutdown_request', m='async',
                                                     y='infinite_loop', r='primary', seg_id=2,
                                                     sleeptime=300)
        if not shutdown_ok:
            raise Exception("Fault filerep_immediate_shutdown_request injection failed")

        (completed_ok, _out) = self.util.inject_fault(f='fileRep_is_operation_completed', m='async',
                                                      y='infinite_loop', r='primary', seg_id=2)
        if not completed_ok:
            raise Exception("Fault fileRep_is_operation_completed injection failed")
        tinctest.logger.info("\n Done Injecting Fault")


    def inject_fault_on_first_mirror(self):
        """
        Panic the filerep_consumer on the mirror of content 0.
        The mirror's dbid is queried first; the method then requires the
        fileRep_is_operation_completed fault to reach status 'triggered'
        (checked with max_cycle=100) before injecting.
        Raises: Exception when the fault did not trigger or the injection
                reports failure
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        dbid_sql = "select dbid from gp_segment_configuration where content=0 and role='m'"
        mirror_dbid = self.query_select_count(dbid_sql)

        tinctest.logger.info("\n Injecting faults on first mirror")
        triggered = self.util.check_fault_status(fault_name='fileRep_is_operation_completed',
                                                 status='triggered', max_cycle=100)
        if not triggered:
            raise Exception("Fault fileRep_is_operation_completed didn't trigger")

        (injected, _out) = self.util.inject_fault(f='filerep_consumer', m='async', y='panic',
                                                  r='mirror', seg_id=mirror_dbid)
        if not injected:
            raise Exception("Fault filerep_consumer injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
        """
        Kill and then start gpfdist on self.hostIP().
        @port: port handed to the Gpfdist helper
        @path: directory appended to the ' -t 30 -m 1048576 -d ' options
        Return: True
        """
        server = Gpfdist(port, self.hostIP())
        server.killGpfdist()
        start_options = ' -t 30 -m 1048576 -d ' + path
        server.startGpfdist(start_options)
        return True

    def cleanupGpfdist(self, port,path):
        """
        Kill gpfdist on self.hostIP().
        @port: port handed to the Gpfdist helper
        @path: accepted but not used by this method
        Return: True
        """
        server = Gpfdist(port, self.hostIP())
        server.killGpfdist()
        return True

    def hostIP(self):
        """
        Work out the host name to use for gpfdist on this machine.
        Return: 'mdw' when the local hostname contains 'mdw' anywhere
                (including at the very start), otherwise the IP address the
                local hostname resolves to, as a string
        Raises: GPtestError when `which gpfdist` fails
        """
        if not run_shell_command('which gpfdist'):
            raise GPtestError("Error:'which gpfdist' command failed.")

        hostname = socket.gethostname()
        # The former `find('mdw') > 0` test skipped hostnames that START
        # with 'mdw' (index 0), e.g. the plain name 'mdw'.
        if 'mdw' in hostname:
            host = 'mdw'
        else:
            host = str(socket.gethostbyname(hostname))  # Must be an IP
        tinctest.logger.info('current host is %s'%host)
        return host

    def method_setup(self):
        """
        Perform the setup tasks: create five filespaces, set
        max_resource_queues to 100 with gpconfig and restart the cluster.
        Return: True
        Raises: Exception when gpconfig or gpstart reports failure (the
                result of the gpstop call is not checked)
        """
        tinctest.logger.info("Performing setup tasks")
        filespaces = Gpfilespace()
        for fs_name in ('filerep_fs_a', 'filerep_fs_b', 'filerep_fs_c',
                        'filerep_fs_z', 'sync1_fs_1'):
            filespaces.create_filespace(fs_name)

        # Set max_resource_queues to 100
        if not run_shell_command('gpconfig -c max_resource_queues -v 100 '):
            raise Exception('Failure during setting the max_resource_queues value to 100 using gpconfig tool')

        # Restart the cluster
        self.gpstop.run_gpstop_cmd(immediate='i')
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('Failure during restarting the cluster')
        return True


    def get_ext_table_query_from_gpstate(self):
        """
        Capture the sample external-table SQL printed by
        `gpstate --printSampleExternalTableSql`.
        The command output is redirected to local_path("gpstate_tmp"); the
        result is every line of that file from the first line containing
        'DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status' to the end.
        Return: the collected lines concatenated, or '' when the marker
                line is absent. The shell command's status is not checked.
        """
        outfile = local_path("gpstate_tmp")
        run_shell_command("gpstate --printSampleExternalTableSql >" + outfile)

        marker = 'DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status'
        collected = []
        capturing = False
        # The context manager closes the handle; it used to be left open
        with open(outfile, 'r') as handle:
            for line in handle:
                if not capturing and marker in line:
                    capturing = True
                if capturing:
                    collected.append(line)
        return ''.join(collected)

    def check_gpstate(self, type, phase):
        """
        Compare row counts from the gpstate_segment_status table with the
        counts expected for one transition phase.
        @type: failover type; 'primary' adds a "role = preferred_role"
               filter to the resync queries, any other value omits it
        @phase: transition stage; 'sync1', 'ct', 'resync_incr' and
                'resync_full' are checked, any other value skips all checks
        Return: True
        Raises: Exception when the compared counts differ
        """
        count = self.query_select_count

        if phase == 'sync1':
            state_num = count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'")
            sync1_num = count("select count(*) from gp_segment_configuration where content <> -1")
            if int(sync1_num) != int(state_num):
                raise Exception("gpstate in Sync state failed")

        elif phase == 'ct':
            # Primaries in change tracking must pair up with mirrors that are down
            p_num = count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking'  and role = 'Primary' and status_in_config='Up' and instance_status='Up'")
            m_num = count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync'  and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' ")
            if int(p_num) != int(m_num):
                raise Exception("gpstate in CT state failed")

        elif phase == 'resync_incr':
            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Incremental'"
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
            resync_incr_num = count(query)

            query_num_rows = "select count(*) from gp_segment_configuration where content <> -1"
            num_rows = count(query_num_rows)

            if int(resync_incr_num) != int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception("gpstate in Resync Incremental  state failed. resync_incr_num %s <> num_rows %s" % (resync_incr_num, num_rows))

        elif phase == 'resync_full':
            num_rows = count("select count(*) from gp_segment_configuration where content <> -1")

            if type == 'primary':
                resync_full_num = count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'")
            else:
                resync_full_num = count("select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'")

            if int(resync_full_num) != int(num_rows):
                raise Exception("gptate in Resync Full state failed")

        else:
            # Unrecognised phases are accepted without checks or logging
            return True

        tinctest.logger.info("Done Running gpstate in %s phase " %(phase))
        return True
    
    def trigger_transition(self):
        """Run the sql file local_path('mirrors.sql') through PSQL."""
        mirrors_sql = local_path('mirrors.sql')
        PSQL.run_sql_file(mirrors_sql)
        

    def run_gpstate(self, type, phase):
        """
        Perform gpstate for one transition state.
        The sample external-table SQL from gpstate is written to
        create_table_gpstate.sql and executed, `gpstate -s -a` output is
        saved to local_path('gpstate_out'), and check_gpstate(type, phase)
        validates the result.
        @type: failover type, forwarded to check_gpstate
        @phase: transition stage, forwarded to check_gpstate
        Return: the result of the `gpstate -s -a` shell command
        """
        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()

        sql_file = local_path('create_table_gpstate.sql')
        # The context manager closes the file even when a write fails
        with open(sql_file, 'w') as handle:
            handle.write(querystring)
            handle.write('\n')
        PSQL.run_sql_file(sql_file)

        gpstate_outfile = local_path('gpstate_out')
        ok = run_shell_command('gpstate -s -a > %s 2>&1' % (gpstate_outfile))
        self.check_gpstate(type, phase)
        return ok


    def check_mirror_seg(self, master=False):
        """
        Call self.dbstate.check_mirrorintegrity().
        @master: accepted but not used by this method
        """
        tinctest.logger.info("running check mirror")
        validator = self.dbstate
        validator.check_mirrorintegrity()

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        """
        Call self.dbstate.check_catalog with the given output file.
        @outputFile: forwarded to check_catalog
        @dbname, @alldb, @online, @outdir: accepted but not used
        """
        tinctest.logger.info("running gpcheckcat")
        validator = self.dbstate
        validator.check_catalog(outputFile=outputFile)

    def query_select_count(self,sqlcmd):
        """
        Run a single-value query and extract its result.
        @sqlcmd: sql text passed to PSQL.run_sql_command
        Return: the fourth line (index 3) of the command output, stripped;
                it is returned as a string, not converted to int
        """
        raw_output = PSQL.run_sql_command(sqlcmd)
        output_lines = raw_output.split('\n')
        return output_lines[3].strip()
    
    def method_run_failover(self,type):
        """
        Inject fault to failover nodes
        @type: primary [induces fault in mirror] mirror [creates panic in primary];
               any other value injects nothing
        Return: True in every case; the outcome of the fault injection is
                not checked
        """
        if type == 'primary':
            banner = "\n primary failover"
            fault_args = dict(f='filerep_consumer', m='async', y='fault', r='mirror', H='ALL')
        elif type == 'mirror':
            banner = "\n Mirror failover"
            fault_args = dict(f='postmaster', m='async', y='panic', r='primary', H='ALL')
        else:
            return True

        tinctest.logger.info(banner)
        (_ok, _out) = self.util.inject_fault(**fault_args)
        tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        """
        Forward to self.util.wait_till_change_tracking_transition().
        Return: None (whatever the delegate returns is discarded)
        """
        helper = self.util
        helper.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        """
        Forward to self.gpr.wait_till_insync_transition().
        Return: None (whatever the delegate returns is discarded)
        """
        recoverer = self.gpr
        recoverer.wait_till_insync_transition()
   
    def run_gprecoverseg(self,recover_mode):
        """
        Run a recovery through self.gpr.
        @recover_mode: 'full' selects self.gpr.full(); every other value
                       selects self.gpr.incremental()
        Return: None
        """
        recover = self.gpr.full if recover_mode == 'full' else self.gpr.incremental
        recover()

    def run_gpconfig(self, parameter, master_value, segment_value):
        """
        Set a parameter through self.gpconfig.setParameter and then call
        self.gpstop.run_gpstop_cmd(restart='r').
        Nothing is done when parameter is None.
        @parameter: parameter name passed to setParameter
        @master_value: second argument passed to setParameter
        @segment_value: third argument passed to setParameter
        """
        if parameter is None:
            return
        self.gpconfig.setParameter(parameter, master_value, segment_value)
        self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self, fault = None, mode = None, operation = None, prim_mirr = None, host = 'All', table = None, database = None, seg_id = None, sleeptime = None, occurence = None):
        """
        Validate the mandatory arguments and forward the request to
        self.util.inject_fault.
        @fault: fault name (forwarded as f); required
        @mode: forwarded as m; required
        @operation: forwarded as y; required
        @prim_mirr: forwarded as r; required
        @host: accepted for interface compatibility but NOT forwarded;
               the call always uses H='ALL'
        @table, @database, @seg_id, @sleeptime: forwarded under the same names
        @occurence: forwarded as o
        Return: the (ok, out) pair produced by self.util.inject_fault, so
                that callers can check whether the injection succeeded
        Raises: Exception when any of the four required arguments is None
        """
        required = (fault, mode, operation, prim_mirr)
        if any(value is None for value in required):
            raise Exception('Incorrect parameters provided for inject fault')

        (ok, out) = self.util.inject_fault(f=fault, m=mode, y=operation, r=prim_mirr, H='ALL',
                                           table=table, database=database, sleeptime=sleeptime,
                                           o=occurence, seg_id=seg_id)
        # Previously the result was dropped; hand it back instead.
        return (ok, out)
Пример #8
0
class SubTransactionLimitRemovalTestCase(MPPTestCase):

    def __init__(self, methodName):
        """
        Forward methodName to the parent-class initialiser; no attributes
        are set here (helper objects are created inside the methods).
        """
        super(SubTransactionLimitRemovalTestCase,self).__init__(methodName)
   
    def check_system(self):
        '''
        @summary: Check whether the system is up and sync. Exit out if not.
                  The number of non-master rows in gp_segment_configuration
                  is compared with the number of those rows having
                  mode = 's' and status = 'u'; the full table is logged
                  before the comparison.
        '''
        log = tinctest.logger
        log.info("[STLRTest] Running check_system")
        log.info("[STLRTest] Check whether the system is up and sync")

        all_sql = "select count(*) from gp_segment_configuration where content<> -1 ;"
        count_all = PSQL.run_sql_command(all_sql).split('\n')[3].strip()

        up_sync_sql = "select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';"
        count_up_and_sync = PSQL.run_sql_command(up_sync_sql).split('\n')[3].strip()

        log.info("[STLRTest] printing gp segment configuration")
        log.info(PSQL.run_sql_command("select * from gp_segment_configuration order by dbid"))

        # The two counts are compared as strings taken from the psql output
        if count_all == count_up_and_sync:
            log.info("[STLRTest] Starting New Test: System is up and in sync...")
        else:
            raise Exception("[STLRTest] System not in sync and up. Exiting test")

    def run_sqls(self,test):
        '''
        @summary : Run the sql file local_path(test) through PSQL
        @param test: the sql file, relative to local_path
        '''
        log = tinctest.logger
        log.info("[STLRTest] Running run_sqls")
        log.info("[STLRTest]Starting new thread to run sql %s"%(test))
        sql_path = local_path(test)
        PSQL.run_sql_file(sql_path)
            
    def suspend_faults(self,fault_name):
        '''
        @summary : Suspend the specified fault: reset it before issuing suspend
        @param fault_name : Name of the fault to suspend
        Raises Exception when either injection reports failure.
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")

        self.util = Filerepe2e_Util()

        # 'reset' is issued first, then 'suspend'; each step logs on success
        steps = (('reset', "[STLRTest]Done resetting the %s fault"),
                 ('suspend', "[STLRTest]Done suspending the %s fault"))
        for action, done_template in steps:
            (succeeded, _out) = self.util.inject_fault(f=fault_name, m='async', y=action,
                                                       r='primary', H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_template%(fault_name))
        
    def check_fault_status(self,fault_name = None, status = None, max_cycle=10):
        '''
        Check whether a fault has reached a status. Poll till it has, or
        until max_cycle attempts were made, sleeping 10 seconds after every
        unsuccessful attempt.
        @param fault_name : Fault name
        @param status : Status to be checked - triggered/completed
        @param max_cycle : maximum number of polls (default 10)
        Return: True as soon as one output line contains both fault_name
                and status, False when all polls were used up
        Raises: Exception when fault_name or status is empty
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running check_fault_status %s", status)

        if (not fault_name) or (not status) :
            raise Exception("[STLRTest]Need a value for fault_name and status to continue")

        poll = 0
        while(poll < max_cycle):
            (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'status', r = 'primary', H ='ALL')
            poll += 1
            for line in out1.splitlines():
                # Containment test: the former `find(...) > 0` ignored a
                # match located at the very start of the line.
                if fault_name in line and status in line:
                    tinctest.logger.info("[STLRTest]Fault %s is %s " % (fault_name,status))
                    tinctest.logger.info("[STLRTest] Running check_fault_status %s TRUE", status)
                    return True

            #sleep a while before start polling again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE", status)
        return False
        
        
    def filerep_fault(self,trans_state):
        '''
        @summary : Inject the filerep fault that matches the transition
        @param trans_state : type of transition; 'failover_to_primary',
                             'failover_to_mirror' and 'postmaster_reset'
                             each inject one fault, any other value injects
                             nothing
        Raises Exception when the injection reports failure.
        '''
        tinctest.logger.info("[STLRTest] Running filerep_fault")
        self.util = Filerepe2e_Util()

        # plan = (start message, fault, fault type, role, done message)
        if trans_state == 'failover_to_primary':
            plan = ("[STLRTest] primary failover", 'filerep_consumer', 'fault', 'mirror',
                    "[STLRTest]Done primary failover fault")
        elif trans_state == 'failover_to_mirror':
            plan = ("[STLRTest] fault for postmaster panic", 'postmaster', 'panic', 'primary',
                    "[STLRTest]Done postmaster panic fault")
        elif trans_state == 'postmaster_reset':
            plan = ("[STLRTest] fault for filerep_sender panic", 'filerep_sender', 'panic', 'primary',
                    "[STLRTest]Done filerep_sender panic fault")
        else:
            plan = None

        if plan is not None:
            start_msg, fault, fault_type, role, done_msg = plan
            tinctest.logger.info(start_msg)
            (succeeded, _out) = self.util.inject_fault(f=fault, m='async', y=fault_type,
                                                       r=role, H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)

        tinctest.logger.info("[STLRTest] Done Injecting Fault")

    def resume_faults(self,fault_name,trans_state):
        '''
        @summary : Resume the fault and check status
        @param fault_name : fault to resume
        @param trans_state : every state except 'failover_to_mirror' resumes
                             the fault on the primary (failure raises);
                             'postmaster_reset' also resumes it on the
                             mirror (failure is only logged);
                             'failover_to_primary' finally polls for the
                             'completed' status
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_faults")

        if trans_state != 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name)
            (primary_ok, _out) = self.util.inject_fault(f=fault_name, m='async', y='resume',
                                                        r='primary', H='ALL')
            if not primary_ok:
                raise Exception("[STLRTest]Fault resume failed")
            tinctest.logger.info("[STLRTest]Done fault for %s resume" % fault_name)

        if trans_state == 'postmaster_reset':
            (mirror_ok, _out) = self.util.inject_fault(f=fault_name, m='async', y='resume',
                                                       r='mirror', H='ALL')
            if not mirror_ok:
                tinctest.logger.info("[STLRTest]Failed fault for %s resume on mirror" % fault_name)

        if trans_state == 'failover_to_primary':
            # The polling result is not inspected
            self.check_fault_status(fault_name, 'completed')

    def checkPSQLRun(self, test):
        '''
        Check if the psql run started in parallel is over before running the _post.sql.
        Loops, sleeping 10 seconds between attempts, until no element of
        the `ps` pipeline output contains the test name.
        NOTE(review): `out` is iterated element by element; this presumably
        expects shell.run to return a list of lines - confirm.
        Return: True (the loop has no other exit)
        '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")
        ps_pipeline = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
        while True:
            (rc, out) = shell.run(ps_pipeline)
            still_running = False
            for entry in out:
                if '%s' %test in entry:
                    still_running = True
            if not still_running:
                return True
            sleep(10)
        

    def resume_filerep_resync(self):
        '''
        @summary : Inject 'resume' for the filerep_resync fault on the
                   primaries, then sleep 10 seconds
        Raises Exception when the injection reports failure.
        '''
        self.util = Filerepe2e_Util()

        log = tinctest.logger
        log.info("[STLRTest] Running resume_filerep_resync")
        log.info("[STLRTest] fault for failover_to_mirror resume")

        (succeeded, _out) = self.util.inject_fault(f='filerep_resync', m='async', y='resume',
                                                   r='primary', H='ALL')
        if not succeeded:
            raise Exception("[STLRTest]Fault injection failed")
        log.info("[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine.
        @expect_down_segments: when true, the result of the stop command is
                               neither checked nor logged as a success
        Return: (True, "All segments are up")
        Raises: Exception when the stop fails (unless expect_down_segments),
                the start fails, or self.anydownsegments() is false
        NOTE(review): anydownsegments is not defined in the visible part of
        this class - confirm where it comes from.
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        log = tinctest.logger
        log.info("[STLRTest] Running stop_start_validate")

        log.info("[STLRTest]Shutting down the cluster")
        stopped = self.gpstop.run_gpstop_cmd(immediate='i')
        if not expect_down_segments:
            if not stopped:
                raise Exception('[STLRTest]Problem while shutting down the cluster')
            log.info("[STLRTest]Successfully shutdown the cluster.")

        log.info("[STLRTest]Restarting the cluster.")
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        log.info("[STLRTest]Successfully restarted the cluster.")

        if self.anydownsegments():
            return (True, "All segments are up")
        raise Exception("[STLRTest]segments were marked down")

    def run_gprecoverseg(self,recover_option):
        '''
        @summary : Call gprecoverseg full or incremental to bring back the cluster to sync
        @param recover_option : 'full' calls full(); any other value calls
                                incremental(). wait_till_insync_transition()
                                is called afterwards in both cases.
        '''
        self.gpr = GpRecover()

        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")

        recover = self.gpr.full if recover_option == 'full' else self.gpr.incremental
        recover()

        self.gpr.wait_till_insync_transition()
        
    def run_restart_database(self):
        '''
        @summary : Restart the database: gpstop with immediate='i' followed
                   by gpstart. Both results are logged, neither is checked.
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        log = tinctest.logger
        log.info("[STLRTest] Running run_restart_database")
        stop_result = self.gpstop.run_gpstop_cmd(immediate='i')
        log.info(stop_result)
        start_result = self.gpstart.run_gpstart_cmd()
        log.info(start_result)
       
    def reset_faults(self,fault_name,current_cluster_state):
        '''
        @summary : Reset the faults at the end of test
        @param fault_name : fault that is reset first
        @param current_cluster_state : 'resync' additionally resets the
                                       filerep_resync fault
        The checkpoint fault is always reset last. Raises Exception when
        any reset reports failure.
        '''
        self.util = Filerepe2e_Util()
        log = tinctest.logger
        log.info("[STLRTest] Running reset_faults")
        log.info("[STLRTest] Resetting fault before ending test")

        def _reset_on_primary(name):
            # One 'reset' injection on the primaries; failure aborts
            (succeeded, _out) = self.util.inject_fault(f=name, m='async', y='reset',
                                                       r='primary', H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")

        _reset_on_primary(fault_name)
        log.info("[STLRTest]Done resetting %s fault" %(fault_name))

        if current_cluster_state == 'resync':
            _reset_on_primary('filerep_resync')
            log.info("[STLRTest]Done filerep_resync fault")

        _reset_on_primary('checkpoint')
        log.info("[STLRTest]Done resetting checkpoint fault" )
        
    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        '''
        @summary : Call check_catalog() on a new DbStateClass('run_validation')
        None of the parameters is used; check_catalog is called without
        arguments.
        Return: True
        '''
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        validator = self.dbstate
        validator.check_catalog()
        return True

    def _validation(self):
        '''
        @summary :gpcheckcat and gpcheckmirrorintegrity, run through a new
                  DbStateClass('run_validation'); catalog output goes to
                  local_path("subt_checkcat.out")
        '''

        ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT;CHECKPOINT; CHECKPOINT;")
        ###sleep(30) # sleep for some time for the segments to be in sync before validation

        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running _validation")

        catalog_report = local_path("subt_checkcat.out")
        validator = self.dbstate
        validator.check_catalog(outputFile=catalog_report)
        validator.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        '''
        @summary : Poll for the fault to be 'triggered', inject the filerep
                   fault for the transition, then resume the fault.
        @param fault_name : Name of the fault being polled and resumed
        @param trans_state : Transition type; for 'failover_to_mirror'
                             test_while_ct.sql is run before resuming
        The result of check_fault_status() is not inspected, so the
        sequence continues even when the fault was never seen as triggered.
        '''
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)

        failing_over_to_mirror = (trans_state == 'failover_to_mirror')
        if failing_over_to_mirror:
            PSQL.run_sql_file(local_path('test_while_ct.sql'))

        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name='', trans_state=''):
        '''
        @summary : Run the post-transition sql and diff its output against
                   the expected answer file.
        @param fault_name : Only used in the failure message
        @param trans_state : 'failover_to_primary' or '' selects the
                             post_commit files, anything else post_abort
        Waits for the database to be up first. Fails the test (self.fail)
        when Gpdiff reports that the .out and .ans files differ.
        '''
        PSQL.wait_for_database_up()

        # The .sql, .out and .ans files all share this base name.
        base = ("failover_sql/subt_create_table_ao_post_commit"
                if trans_state in ('failover_to_primary', '')
                else "failover_sql/subt_create_table_ao_post_abort")
        sql_name = base + ".sql"
        ans_name = base + ".ans"
        out_name = base + ".out"

        PSQL.run_sql_file(sql_file=local_path(sql_name),
                          out_file=local_path(out_name))

        if Gpdiff.are_files_equal(local_path(out_name), local_path(ans_name)):
            return
        self.fail("[STLRTest]Gpdiff failed for : %s %s" %
                  (fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary : Reset all faults (f='all'), first with r='primary' and
                   then with r='mirror'.
        Raises Exception on the first reset that reports failure.
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")
        self.util = Filerepe2e_Util()

        # (role, message logged after that role's reset succeeded)
        steps = (
            ('primary', "[STLRTest]Done resetting all faults on primary"),
            ('mirror', "[STLRTest]Done resetting all faults fault on mirror"),
        )
        for role, done_msg in steps:
            succeeded, _ = self.util.inject_fault(f='all', m='async',
                                                  y='reset', r=role, H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)

    def kill_zombies(self):
        '''
        @summary : There are stray processes left running after each test.
                   This method kills them with "kill -9".
        The ps/grep/awk pipeline prints one "<col 3>#<col 2>" pair per
        matching `ps -ef` line (presumably PPID#PID -- confirm against the
        platform's ps layout). Every entry whose first field is exactly '1'
        has its second field passed to kill -9.
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
        ps_pipeline = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        lister = Command("shell_command", ps_pipeline)
        tinctest.logger.info('Executing command: %s : %s' %
                             ("shell command", ps_pipeline))
        lister.run()

        for entry in lister.get_results().stdout.split('\n'):
            fields = entry.split('#')
            # `grep -w 1` also lets through lines where 1 is the second
            # field, so the first field is checked explicitly.
            if fields[0] != '1':
                continue
            Command("kill_command", "kill -9 %s" % (fields[1])).run()


    def skip_checkpoint(self):
        '''
        @summary : Routine to inject fault that skips checkpointing.
        The checkpoint fault is first reset and then injected with type
        'skip', both with r='primary', H='ALL'. Raises Exception when
        either request reports failure.
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running skip_checkpoint")

        # (fault type, message logged after it was applied), in order.
        steps = (
            ('reset', "[STLRTest]Done resetting the checkpoint fault"),
            ('skip', "[STLRTest]Done skipping the checkpoint fault"),
        )
        for fault_type, done_msg in steps:
            succeeded, _ = self.util.inject_fault(f='checkpoint', m='async',
                                                  y=fault_type, r='primary',
                                                  H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)

    def method_setup(self):
        '''
        @summary : Setup step: create the filespace 'subt_filespace_a'
                   through Gpfilespace.
        '''
        tinctest.logger.info("Performing setup tasks")
        Gpfilespace().create_filespace('subt_filespace_a')

    def cleandb(self):
        '''
        @summary : Call Database.setupDatabase for the 'gptest' database.
        NOTE(review): presumably this recreates/cleans the database, as the
        method name suggests -- confirm against Database.setupDatabase.
        '''
        Database().setupDatabase('gptest')
Пример #9
0
class SubTransactionLimitRemovalTestCase(MPPTestCase):
    def __init__(self, methodName):
        '''
        Forward methodName unchanged to the parent class constructor.
        @param methodName : Passed through to MPPTestCase.__init__
        '''
        super(SubTransactionLimitRemovalTestCase, self).__init__(methodName)

    def check_system(self):
        '''
        @summary: Check whether the system is up and in sync. Raise if not.
        Compares the count of all segment rows (content <> -1) in
        gp_segment_configuration with the count of those that have
        mode = 's' and status = 'u'. The full configuration is logged
        before the comparison. Raises Exception when the counts differ.
        '''
        log = tinctest.logger.info
        log("[STLRTest] Running check_system")

        log("[STLRTest] Check whether the system is up and sync")

        def fetch_count(query):
            # The value is taken from the line at index 3 of the psql
            # output (presumably the first data row -- confirm against the
            # output format of PSQL.run_sql_command).
            return PSQL.run_sql_command(query).split('\n')[3].strip()

        total = fetch_count(
            "select count(*) from gp_segment_configuration where content<> -1 ;")
        healthy = fetch_count(
            "select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';")

        log("[STLRTest] printing gp segment configuration")
        seg_config = PSQL.run_sql_command(
            "select * from gp_segment_configuration order by dbid")
        log(seg_config)

        # The counts are compared as the stripped strings read from psql.
        if total == healthy:
            log("[STLRTest] Starting New Test: System is up and in sync...")
            return
        raise Exception("[STLRTest] System not in sync and up. Exiting test")

    def run_sqls(self, test):
        '''
        @summary : Run one sql file through PSQL.run_sql_file
        @param test : Name of the sql file; resolved with local_path()
        The log message mentions a new thread, but this method does not
        start one itself.
        '''
        log = tinctest.logger.info
        log("[STLRTest] Running run_sqls")
        log("[STLRTest]Starting new thread to run sql %s" % (test))

        sql_path = local_path(test)
        PSQL.run_sql_file(sql_path)

    def suspend_faults(self, fault_name):
        '''
        @summary : Suspend the specified fault: reset it before issuing suspend
        @param fault_name : Name of the fault to suspend
        Both requests are sent with r='primary', H='ALL'. Raises Exception
        when either of them reports failure.
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")

        self.util = Filerepe2e_Util()

        # (fault type, template of the message logged after it succeeded)
        steps = (
            ('reset', "[STLRTest]Done resetting the %s fault"),
            ('suspend', "[STLRTest]Done suspending the %s fault"),
        )
        for fault_type, done_template in steps:
            succeeded, _ = self.util.inject_fault(f=fault_name, m='async',
                                                  y=fault_type, r='primary',
                                                  H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_template % (fault_name))

    def check_fault_status(self, fault_name=None, status=None, max_cycle=10):
        '''
        Check whether a fault has reached a given status. Poll until it has
        or until the polling budget is used up.
        @param fault_name : Fault name
        @param status : Status to be checked - triggered/completed
        @param max_cycle : Maximum number of status queries; every
                           unsuccessful query is followed by a 10 second sleep
        @return : True as soon as one line of the status output contains
                  both fault_name and status, False when max_cycle queries
                  went by without such a line
        Raises Exception when fault_name or status is empty/None.
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running check_fault_status %s",
                             status)

        if (not fault_name) or (not status):
            raise Exception(
                "[STLRTest]Need a value for fault_name and status to continue")

        poll = 0
        while (poll < max_cycle):
            (ok1, out1) = self.util.inject_fault(f=fault_name,
                                                 m='async',
                                                 y='status',
                                                 r='primary',
                                                 H='ALL')
            poll += 1
            for line in out1.splitlines():
                # Plain substring tests: the former `find(...) > 0` checks
                # treated a match at the very start of the line (index 0)
                # as "not found".
                if fault_name in line and status in line:
                    tinctest.logger.info("[STLRTest]Fault %s is %s " %
                                         (fault_name, status))
                    tinctest.logger.info(
                        "[STLRTest] Running check_fault_status %s TRUE",
                        status)
                    return True

            #sleep a while before start polling again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE",
                             status)
        return False

    def filerep_fault(self, trans_state):
        '''
        @summary : Inject the fault that belongs to the given transition
        @param trans_state : type of transition; one of
                             'failover_to_primary', 'failover_to_mirror',
                             'postmaster_reset'
        Any other trans_state injects nothing and only produces the start
        and end log lines. Raises Exception when the injection reports
        failure.
        '''
        log = tinctest.logger.info
        log("[STLRTest] Running filerep_fault")
        self.util = Filerepe2e_Util()

        # (trans_state, fault, fault type, role, message before, message after)
        transitions = (
            ('failover_to_primary', 'filerep_consumer', 'fault', 'mirror',
             "[STLRTest] primary failover",
             "[STLRTest]Done primary failover fault"),
            ('failover_to_mirror', 'postmaster', 'panic', 'primary',
             "[STLRTest] fault for postmaster panic",
             "[STLRTest]Done postmaster panic fault"),
            ('postmaster_reset', 'filerep_sender', 'panic', 'primary',
             "[STLRTest] fault for filerep_sender panic",
             "[STLRTest]Done filerep_sender panic fault"),
        )
        for state, fault, fault_type, role, start_msg, done_msg in transitions:
            if trans_state == state:
                log(start_msg)
                succeeded, _ = self.util.inject_fault(f=fault, m='async',
                                                      y=fault_type, r=role,
                                                      H='ALL')
                if not succeeded:
                    raise Exception("[STLRTest]Fault injection failed")
                log(done_msg)
                # Only the first matching transition is acted upon.
                break

        log("[STLRTest] Done Injecting Fault")

    def resume_faults(self, fault_name, trans_state):
        '''
        @summary : Resume the fault and check status
        @param fault_name : Name of the fault to resume
        @param trans_state : Transition type, which selects the steps:
            - anything but 'failover_to_mirror': resume with r='primary';
              a failure raises Exception
            - 'postmaster_reset': additionally resume with r='mirror';
              a failure is only logged
            - 'failover_to_primary': finally poll for status 'completed'
              (the poll result is not inspected)
        '''
        self.util = Filerepe2e_Util()
        log = tinctest.logger.info

        log("[STLRTest] Running resume_faults")

        def resume_on(role):
            # Send one resume request; report whether it succeeded.
            succeeded, _ = self.util.inject_fault(f=fault_name, m='async',
                                                  y='resume', r=role,
                                                  H='ALL')
            return succeeded

        if trans_state != 'failover_to_mirror':
            log("[STLRTest] fault for %s resume" % fault_name)
            if not resume_on('primary'):
                raise Exception("[STLRTest]Fault resume failed")
            log("[STLRTest]Done fault for %s resume" % fault_name)

        if trans_state == 'postmaster_reset':
            if not resume_on('mirror'):
                log("[STLRTest]Failed fault for %s resume on mirror" %
                    fault_name)

        if trans_state == 'failover_to_primary':
            self.check_fault_status(fault_name, 'completed')

    def checkPSQLRun(self, test):
        '''
        Check if the psql run started in parallel is over before running
        the _post.sql.
        @param test : Text looked for in each item of the ps/grep output
        @return : True once no item of the output contains the text. There
                  is no timeout: the loop re-checks every 10 seconds for as
                  long as a match is found, so False is never returned.
        '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")
        ps_filter = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
        while True:
            (_, listing) = shell.run(ps_filter)
            still_running = any('%s' % test in entry for entry in listing)
            if not still_running:
                return True
            sleep(10)

    def resume_filerep_resync(self):
        '''
        @summary : Resume the filerep_resync fault (r='primary', H='ALL')
                   and then sleep for 10 seconds.
        Raises Exception when the resume request reports failure.
        '''
        self.util = Filerepe2e_Util()
        log = tinctest.logger.info

        log("[STLRTest] Running resume_filerep_resync")

        log("[STLRTest] fault for failover_to_mirror resume")
        resumed, _ = self.util.inject_fault(f='filerep_resync', m='async',
                                            y='resume', r='primary', H='ALL')
        if not resumed:
            raise Exception("[STLRTest]Fault injection failed")

        log("[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        '''
        @summary : Do gpstop -i, gpstart and see if all segments come back
                   up fine
        @param expect_down_segments : When true, the result of gpstop is
                                      not checked (and no success message
                                      is logged for the shutdown)
        @return : (True, "All segments are up")
        Raises Exception when the shutdown fails (unless
        expect_down_segments), when gpstart fails, or when
        self.anydownsegments() returns a falsy value.
        NOTE(review): anydownsegments() is defined outside this block; a
        falsy result is what is reported as "segments were marked down" --
        confirm the helper's polarity.
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        log = tinctest.logger.info
        log("[STLRTest] Running stop_start_validate")

        log("[STLRTest]Shutting down the cluster")
        stopped = self.gpstop.run_gpstop_cmd(immediate='i')
        if not (expect_down_segments or stopped):
            raise Exception(
                '[STLRTest]Problem while shutting down the cluster')
        if not expect_down_segments:
            log("[STLRTest]Successfully shutdown the cluster.")

        log("[STLRTest]Restarting the cluster.")
        started = self.gpstart.run_gpstart_cmd()
        if not started:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        log("[STLRTest]Successfully restarted the cluster.")

        if self.anydownsegments():
            return (True, "All segments are up")
        raise Exception("[STLRTest]segments were marked down")

    def run_gprecoverseg(self, recover_option):
        '''
        @summary : Call gprecoverseg full or incremental to bring back the
                   cluster to sync
        @param recover_option : 'full' runs GpRecover.full(); any other
                                value runs GpRecover.incremental()
        Afterwards waits on wait_till_insync_transition(). No return value
        of the GpRecover calls is checked.
        '''
        recoverer = GpRecover()
        self.gpr = recoverer

        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")

        recover = (recoverer.full if recover_option == 'full'
                   else recoverer.incremental)
        recover()

        recoverer.wait_till_insync_transition()

    def run_restart_database(self):
        '''
        @summary : Restart the database: gpstop with immediate='i', then gpstart.
        The value returned by each command is only written to the log; it is
        not checked here, so a falsy result does not raise from this method.
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        log = tinctest.logger.info
        log("[STLRTest] Running run_restart_database")

        stop_result = self.gpstop.run_gpstop_cmd(immediate='i')
        log(stop_result)

        start_result = self.gpstart.run_gpstart_cmd()
        log(start_result)

    def reset_faults(self, fault_name, current_cluster_state):
        '''
        @summary : Reset the faults at the end of test
        @param fault_name : Name of the fault to reset first
        @param current_cluster_state : When equal to 'resync', the
                                       filerep_resync fault is reset as well
        The checkpoint fault is always reset last. Every reset is issued with
        r='primary', H='ALL'. Raises Exception as soon as one reset reports
        failure; later resets are then not attempted.
        '''
        self.util = Filerepe2e_Util()
        log = tinctest.logger.info
        log("[STLRTest] Running reset_faults")

        log("[STLRTest] Resetting fault before ending test")

        def reset_on_primary(fault):
            # One reset request; a falsy first element of the result is fatal.
            succeeded, _ = self.util.inject_fault(f=fault, m='async',
                                                  y='reset', r='primary',
                                                  H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")

        reset_on_primary(fault_name)
        log("[STLRTest]Done resetting %s fault" % (fault_name))

        if current_cluster_state == 'resync':
            reset_on_primary('filerep_resync')
            log("[STLRTest]Done filerep_resync fault")

        reset_on_primary('checkpoint')
        log("[STLRTest]Done resetting checkpoint fault")

    def do_gpcheckcat(self,
                      dbname=None,
                      alldb=False,
                      online=False,
                      outputFile='checkcat.out',
                      outdir=None):
        '''
        @summary : Run the catalog check through a fresh DbStateClass.
        None of the keyword parameters is used by this implementation;
        check_catalog() is called without arguments.
        @return : Always True (reached only when check_catalog() returns)
        '''
        catalog_checker = DbStateClass('run_validation')
        self.dbstate = catalog_checker
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        catalog_checker.check_catalog()
        return True

    def _validation(self):
        '''
        @summary : Validate the cluster: catalog check, then mirror
                   integrity check, both through a fresh DbStateClass.
        The catalog check is given the path returned by
        local_path("subt_checkcat.out") as its outputFile.
        '''
        # NOTE: an earlier revision issued several CHECKPOINTs and slept for
        # 30 seconds here so the segments could get in sync before
        # validation; that step is disabled.
        validator = DbStateClass('run_validation')
        self.dbstate = validator
        tinctest.logger.info("[STLRTest] Running _validation")

        validator.check_catalog(outputFile=local_path("subt_checkcat.out"))
        validator.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        '''
        @summary : Poll for the fault to be 'triggered', inject the filerep
                   fault for the transition, then resume the fault.
        @param fault_name : Name of the fault being polled and resumed
        @param trans_state : Transition type; for 'failover_to_mirror'
                             test_while_ct.sql is run before resuming
        The result of check_fault_status() is not inspected, so the
        sequence continues even when the fault was never seen as triggered.
        '''
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)

        failing_over_to_mirror = (trans_state == 'failover_to_mirror')
        if failing_over_to_mirror:
            PSQL.run_sql_file(local_path('test_while_ct.sql'))

        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name='', trans_state=''):
        '''
        @summary : Run the post-transition sql and diff its output against
                   the expected answer file.
        @param fault_name : Only used in the failure message
        @param trans_state : 'failover_to_primary' or '' selects the
                             post_commit files, anything else post_abort
        Waits for the database to be up first. Fails the test (self.fail)
        when Gpdiff reports that the .out and .ans files differ.
        '''
        PSQL.wait_for_database_up()

        # The .sql, .out and .ans files all share this base name.
        base = ("failover_sql/subt_create_table_ao_post_commit"
                if trans_state in ('failover_to_primary', '')
                else "failover_sql/subt_create_table_ao_post_abort")
        sql_name = base + ".sql"
        ans_name = base + ".ans"
        out_name = base + ".out"

        PSQL.run_sql_file(sql_file=local_path(sql_name),
                          out_file=local_path(out_name))

        if Gpdiff.are_files_equal(local_path(out_name), local_path(ans_name)):
            return
        self.fail("[STLRTest]Gpdiff failed for : %s %s" %
                  (fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary : Reset all faults (f='all'), first with r='primary' and
                   then with r='mirror'.
        Raises Exception on the first reset that reports failure.
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")
        self.util = Filerepe2e_Util()

        # (role, message logged after that role's reset succeeded)
        steps = (
            ('primary', "[STLRTest]Done resetting all faults on primary"),
            ('mirror', "[STLRTest]Done resetting all faults fault on mirror"),
        )
        for role, done_msg in steps:
            succeeded, _ = self.util.inject_fault(f='all', m='async',
                                                  y='reset', r=role, H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)

    def kill_zombies(self):
        '''
        @summary : There are stray processes left running after each test.
                   This method kills them with "kill -9".
        The ps/grep/awk pipeline prints one "<col 3>#<col 2>" pair per
        matching `ps -ef` line (presumably PPID#PID -- confirm against the
        platform's ps layout). Every entry whose first field is exactly '1'
        has its second field passed to kill -9.
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
        ps_pipeline = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        lister = Command("shell_command", ps_pipeline)
        tinctest.logger.info('Executing command: %s : %s' %
                             ("shell command", ps_pipeline))
        lister.run()

        for entry in lister.get_results().stdout.split('\n'):
            fields = entry.split('#')
            # `grep -w 1` also lets through lines where 1 is the second
            # field, so the first field is checked explicitly.
            if fields[0] != '1':
                continue
            Command("kill_command", "kill -9 %s" % (fields[1])).run()

    def skip_checkpoint(self):
        '''
        @summary : Routine to inject fault that skips checkpointing.
        The checkpoint fault is first reset and then injected with type
        'skip', both with r='primary', H='ALL'. Raises Exception when
        either request reports failure.
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running skip_checkpoint")

        # (fault type, message logged after it was applied), in order.
        steps = (
            ('reset', "[STLRTest]Done resetting the checkpoint fault"),
            ('skip', "[STLRTest]Done skipping the checkpoint fault"),
        )
        for fault_type, done_msg in steps:
            succeeded, _ = self.util.inject_fault(f='checkpoint', m='async',
                                                  y=fault_type, r='primary',
                                                  H='ALL')
            if not succeeded:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)

    def method_setup(self):
        '''
        @summary : Setup step: create the filespace 'subt_filespace_a'
                   through Gpfilespace.
        '''
        tinctest.logger.info("Performing setup tasks")
        Gpfilespace().create_filespace('subt_filespace_a')

    def cleandb(self):
        '''
        @summary : Call Database.setupDatabase for the 'gptest' database.
        NOTE(review): presumably this recreates/cleans the database, as the
        method name suggests -- confirm against Database.setupDatabase.
        '''
        Database().setupDatabase('gptest')