Example #1
File: utils.py Project: satchel9/gpdb
def run_async_command(context, command):
    context.exception = None
    cmd = Command(name='run %s' % command, cmdStr='%s' % command)
    try:
        proc = cmd.runNoWait()
    except ExecutionError as e:
        context.exception = e
Example #2
File: utils.py Project: pf-qiu/gpdb
def run_async_command(context, command):
    context.exception = None
    cmd = Command(name='run %s' % command, cmdStr='%s' % command)
    try:
        proc = cmd.runNoWait()
    except ExecutionError as e:
        context.exception = e
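These helpers follow the Behave step-implementation pattern: a context object is threaded through every step. A minimal sketch of how they might be wired into step definitions; the step phrasing is hypothetical, and only the behave decorators and the context.exception contract come from the code above:

from behave import then, when

@when('the user runs "{command}" in the background')
def impl(context, command):
    # Delegate to the helper above; any ExecutionError is captured on
    # the context instead of propagating.
    run_async_command(context, command)

@then('the background command should start without error')
def impl(context):
    assert context.exception is None, \
        "command raised: %s" % context.exception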
Example #3
File: utils.py Project: satchel9/gpdb
def run_gpcommand_async(context, command):
    cmd = Command(name='run %s' % command, cmdStr='$GPHOME/bin/%s' % (command))
    context.asyncproc = cmd.runNoWait()
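The handle saved on context.asyncproc can be collected later. A minimal sketch, assuming the same communicate2() interface the test below uses; the result attribute names set on context are hypothetical:

def wait_for_async_command(context):
    # Block until the backgrounded command exits; communicate2()
    # returns (rc, stdout, stderr) as in the test below.
    (rc, out, err) = context.asyncproc.communicate2()
    context.ret_code = rc
    context.stdout_message = out
    context.error_message = err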
Example #4
    def test_master_panic_after_phase1(self):
        """PANIC master after recording distributed commit.

        Trigger PANIC in master after completing phase 1 of 2PC,
        right after recording distributed commit in xlog but before
        broadcasting COMMIT PREPARED to segments.  Master's recovery
        cycle should correctly broadcast COMMIT PREPARED because
        master should find distributed commit record in its xlog
        during recovery.  Verify that the transaction is committed
        after recovery.

        JIRA: MPP-19044

        """
        tinctest.logger.info("running test: test_crash_master_after_phase1")
        gparray = GpArray.initFromCatalog(dbconn.DbURL(), utility=True)
        assert len(gparray.getHostList()) == 1, "cannot run on multi-node"
        host = gparray.getHostList()[0]

        # Must have at least one in-sync and up segment.
        primaries = [
            p for p in gparray.get_list_of_primary_segments_on_host(host)
            if p.getSegmentMode() == "s" and p.getSegmentStatus() == "u"
        ]
        assert len(primaries) > 0, "in-sync and up primary not found"
        primary = primaries[0]
        tinctest.logger.info("chose primary: %s" % primary.datadir)

        # Inject suspend fault after recording distributed commit on master.
        cmd = Command("Suspend master post distributed commit",
                      self.faultcmd % "suspend")
        cmd.run(validateAfter=True)
        tinctest.logger.info(cmd.get_results().printResult())

        # Trigger the fault.
        cmd = Command("run DDL",
                      "psql -f %s" % local_path('sql/ao_create.sql'))
        self.proc = cmd.runNoWait()
        tinctest.logger.info("runNoWait: %s, pid: %d" %
                             (cmd.cmdStr, self.proc.pid))

        commitBlocked = self.filereputil.check_fault_status(
            fault_name='dtm_xlog_distributed_commit',
            status="triggered",
            seg_id='1',
            num_times_hit=1)

        # Shutdown of primary (and mirror) should happen only after
        # the commit is blocked due to suspend fault.
        assert commitBlocked, "timeout waiting for commit to be blocked"
        tinctest.logger.info("commit is blocked due to suspend fault")
        # At this point, segments have already recorded the
        # transaction as prepared by writing PREPARE record in xlog.
        # Crash one primary (and its mirror).
        mirror = None
        mirrors = [
            m for m in gparray.get_list_of_mirror_segments_on_host(host)
            if m.getSegmentMode() == "s" and m.getSegmentStatus() == "u"
            and primary.getSegmentContentId() == m.getSegmentContentId()
        ]
        if len(mirrors) > 0:
            mirror = mirrors[0]
            tinctest.logger.info("chose mirror: %s" % mirror.datadir)
            # Pause FTS probes to avoid a failover while we bring down
            # segments.  Note that we bring down both primary and its
            # mirror, thereby causing double failure.  This prevents
            # FTS from making changes to segment configuration, even
            # if FTS probes are unpaused.  It is necessary to unpause
            # FTS probes to prevent gang creation from being blocked.
            PSQL.run_sql_command_utility_mode("SET gp_fts_probe_pause = on")
            tinctest.logger.info("FTS probes paused")
            cmdstr = 'pg_ctl -D %s stop -m immediate' % mirror.datadir
            tinctest.logger.info("bringing down primary: %s" % cmdstr)
            cmd = Command("Shutdown a primary segment", cmdstr)
            cmd.run(validateAfter=True)

        cmdstr = 'pg_ctl -D %s stop -m immediate' % primary.datadir
        tinctest.logger.info("bringing down primary: %s" % cmdstr)
        cmd = Command("Shutdown a primary segment", cmdstr)
        cmd.run(validateAfter=True)

        if mirror is not None:
            PSQL.run_sql_command_utility_mode("SET gp_fts_probe_pause = off")
            tinctest.logger.info("FTS probes unpaused")

        # Resume master.  Master should PANIC and go through crash recovery.
        cmd = Command("resume master", self.faultcmd % "resume")
        cmd.run(validateAfter=True)
        tinctest.logger.info(cmd.get_results().printResult())

        (rc, out, err) = self.proc.communicate2()
        self.proc = None
        tinctest.logger.info("runNoWait rc: %d, output: %s, err: %s" %
                             (rc, out, err))
        # Fail if QD did not PANIC.
        assert (out.find("commit succeeded") == -1
                and err.find("commit succeeded") == -1
                and err.find("PANIC") != -1)
        # Wait for recovery to complete, timeout after ~ 5 mins.
        attempts = 1
        recoveryComplete = False
        while attempts < 600 and not recoveryComplete:
            recoveryComplete = "aaa150" in PSQL.run_sql_command_utility_mode(
                "select 'aaa' || (100+50)")
            time.sleep(0.5)
            attempts = attempts + 1
        assert recoveryComplete, "timeout waiting for master to recover"
        cmdstr = "gpstop -ar"
        cmd = Command("restart", cmdstr)
        tinctest.logger.info("restarting the cluster with '%s'" % cmdstr)
        cmd.run(validateAfter=True)
        tinctest.logger.info("restart complete")
        # Verify table got created (commit was successful).
        assert PSQL.run_sql_file(local_path('sql/ao_select.sql'))

        gpverify = GpdbVerify()
        (errorCode, hasError, gpcheckcat_output,
         repairScript) = gpverify.gpcheckcat()
        assert errorCode == 0, ("gpcheckcat failed: %s" % gpcheckcat_output[0])

        # No need to restart GPDB again in tearDown()
        self.skipRestart = True
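The recovery wait above is a bounded polling loop: keep issuing a trivial sentinel query in utility mode until the master answers. The same pattern as a standalone helper, a sketch assuming the PSQL wrapper from the surrounding code:

import time

def wait_for_master_recovery(attempts=600, interval=0.5):
    # 'aaa150' is the expected output of the sentinel query, so seeing
    # it means the master is accepting connections again.
    for _ in range(attempts):
        if "aaa150" in PSQL.run_sql_command_utility_mode(
                "select 'aaa' || (100+50)"):
            return True
        time.sleep(interval)
    return False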
Example #5
    def test_master_panic_after_phase1(self):
        """PANIC master after recording distributed commit.

        Trigger PANIC in master after completing phase 1 of 2PC,
        right after recording distributed commit in xlog but before
        broadcasting COMMIT PREPARED to segments.  Master's recovery
        cycle should correctly broadcast COMMIT PREPARED because
        master should find distributed commit record in its xlog
        during recovery.  Verify that the transaction is committed
        after recovery.

        JIRA: MPP-19044

        """
        tinctest.logger.info("running test: test_crash_master_after_phase1")
        gparray = GpArray.initFromCatalog(dbconn.DbURL(), utility=True)
        assert len(gparray.getHostList()) == 1, "cannot run on multi-node"
        host = gparray.getHostList()[0]

        # Must have at least one in-sync and up segment.
        primaries = [
            p for p in gparray.get_list_of_primary_segments_on_host(host)
            if p.getSegmentMode() == "s" and p.getSegmentStatus() == "u"]
        assert len(primaries) > 0, "in-sync and up primary not found"
        primary = primaries[0]
        tinctest.logger.info("chose primary: %s" % primary.datadir)

        # Inject suspend fault after recording distributed commit on master.
        cmd = Command("Suspend master post distributed commit",
                      self.faultcmd % "suspend")
        cmd.run(validateAfter=True)
        tinctest.logger.info(cmd.get_results().printResult())

        # Trigger the fault.
        cmd = Command("run DDL", "psql -f %s" %
                      local_path('sql/ao_create.sql'))
        self.proc = cmd.runNoWait()
        tinctest.logger.info("runNoWait: %s, pid: %d" % (cmd.cmdStr, self.proc.pid))

        commitBlocked = self.filereputil.check_fault_status(
            fault_name='dtm_xlog_distributed_commit',
            status="triggered",
            seg_id='1',
            num_times_hit=1)

        # Shutdown of primary (and mirror) should happen only after
        # the commit is blocked due to suspend fault.
        assert commitBlocked, "timeout waiting for commit to be blocked"
        tinctest.logger.info("commit is blocked due to suspend fault")
        # At this point, segments have already recorded the
        # transaction as prepared by writing PREPARE record in xlog.
        # Crash one primary (and its mirror).
        mirror = None
        mirrors = [m for m in gparray.get_list_of_mirror_segments_on_host(host)
                   if m.getSegmentMode() == "s" and m.getSegmentStatus() == "u"
                   and primary.getSegmentContentId() == m.getSegmentContentId()]
        if len(mirrors) > 0:
            mirror = mirrors[0]
            tinctest.logger.info("chose mirror: %s" % mirror.datadir)
            # Pause FTS probes to avoid a failover while we bring down
            # segments.  Note that we bring down both primary and its
            # mirror, thereby causing double failure.  This prevents
            # FTS from making changes to segment configuration, even
            # if FTS probes are unpaused.  It is necessary to unpause
            # FTS probes to prevent gang creation from being blocked.
            PSQL.run_sql_command_utility_mode("SET gp_fts_probe_pause = on")
            tinctest.logger.info("FTS probes paused")
            cmdstr = 'pg_ctl -D %s stop -m immediate' % mirror.datadir
            tinctest.logger.info("bringing down primary: %s" % cmdstr)
            cmd = Command("Shutdown a primary segment", cmdstr)
            cmd.run(validateAfter=True)

        cmdstr = 'pg_ctl -D %s stop -m immediate' % primary.datadir
        tinctest.logger.info("bringing down primary: %s" % cmdstr)
        cmd = Command("Shutdown a primary segment", cmdstr)
        cmd.run(validateAfter=True)

        if mirror is not None:
            PSQL.run_sql_command_utility_mode("SET gp_fts_probe_pause = off")
            tinctest.logger.info("FTS probes unpaused")

        # Resume master.  Master should PANIC and go through crash recovery.
        cmd = Command("resume master", self.faultcmd % "resume")
        cmd.run(validateAfter=True)
        tinctest.logger.info(cmd.get_results().printResult())

        (rc, out, err) = self.proc.communicate2()
        self.proc = None
        tinctest.logger.info("runNoWait rc: %d, output: %s, err: %s" %
                              (rc, out, err))
        # Fail if QD did not PANIC.
        assert (out.find("commit succeeded") == -1 and
                err.find("commit succeeded") == -1 and
                err.find("PANIC") != -1)

        # Wait for a few seconds to ensure that postmaster reset has started
        time.sleep(5)

        # Wait for recovery to complete, timeout after ~ 5 mins.
        attempts = 1
        recoveryComplete = False
        while attempts < 600 and not recoveryComplete:
            recoveryComplete = "aaa150" in PSQL.run_sql_command_utility_mode(
                "select 'aaa' || (100+50)")
            time.sleep(0.5)
            attempts = attempts + 1
        assert recoveryComplete, "timeout waiting for master to recover"
        cmdstr = "gpstop -ar"
        cmd = Command("restart", cmdstr)
        tinctest.logger.info("restarting the cluster with '%s'" % cmdstr)
        cmd.run(validateAfter=True)
        tinctest.logger.info("restart complete")
        # Verify table got created (commit was successful).
        assert PSQL.run_sql_file(local_path('sql/ao_select.sql'))

        gpverify = GpdbVerify()
        (errorCode, hasError, gpcheckcat_output,
         repairScript) = gpverify.gpcheckcat()
        assert errorCode == 0, ("gpcheckcat failed: %s" % gpcheckcat_output[0])

        # No need to restart GPDB again in tearDown()
        self.skipRestart = True
Example #6
File: utils.py Project: pf-qiu/gpdb
def run_gpcommand_async(context, command):
    cmd = Command(name='run %s' % command, cmdStr='$GPHOME/bin/%s' % (command))
    context.asyncproc = cmd.runNoWait()
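Putting the async pieces together, a hypothetical end-to-end use; the bare Context class stands in for Behave's context object, and 'gpstate' is only an illustrative command:

class Context(object):
    pass

context = Context()
run_gpcommand_async(context, 'gpstate')
# Collect the result once the command finishes; communicate2() returns
# (rc, stdout, stderr) as seen in the tests above.
(rc, out, err) = context.asyncproc.communicate2()
print("rc=%d out=%s err=%s" % (rc, out, err))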