def __runWaitAndCheckWorkerPoolForErrorsAndClear(self, cmds, actionVerb, suppressErrorCheck=False,
                                                 progressCmds=[]):
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    elif progressCmds:
        self._join_and_show_segment_progress(progressCmds,
                                             inplace=self.__progressMode == GpMirrorListToBuild.Progress.INPLACE)
    else:
        base.join_and_indicate_progress(self.__pool)

    if not suppressErrorCheck:
        self.__pool.check_results()

    completedRecoveryCmds = list(set(self.__pool.getCompletedItems()) & set(cmds))

    self.__pool.empty_completed_items()

    return completedRecoveryCmds
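# The set intersection above deserves a note: the pool may still hold completed
# items from unrelated work, so intersecting getCompletedItems() with the cmds
# this call added keeps only the commands belonging to this invocation. A toy
# illustration with plain strings (hypothetical values, not real Command objects):
completed_in_pool = {'recover_dbid_2', 'recover_dbid_3', 'stale_command'}
ours = ['recover_dbid_2', 'recover_dbid_3']
assert set(completed_in_pool) & set(ours) == {'recover_dbid_2', 'recover_dbid_3'}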
def __runStartCommand(self, segments, startMethod, numContentsInCluster, resultOut, gpArray, era):
    """
    Put results into the resultOut object
    """
    if len(segments) == 0:
        return

    if startMethod == START_AS_PRIMARY_OR_MIRROR:
        logger.info("Commencing parallel primary and mirror segment instance startup, please wait...")
    else:
        logger.info("Commencing parallel segment instance startup, please wait...")

    dbIdToPeerMap = gpArray.getDbIdToPeerMap()

    mirroringModePreTransition = MIRROR_MODE_MIRRORLESS if startMethod == START_AS_MIRRORLESS else MIRROR_MODE_QUIESCENT

    # launch the start
    for hostName, segments in GpArray.getSegmentsByHostName(segments).iteritems():
        logger.debug("Dispatching command to start segments on host: %s, "
                     "with %s contents in cluster" % (hostName, numContentsInCluster))

        pickledTransitionData = None
        if startMethod == START_AS_PRIMARY_OR_MIRROR:
            mirroringModePerSegment = []
            for seg in segments:
                modeThisSegment = MIRROR_MODE_PRIMARY if seg.isSegmentPrimary(True) else MIRROR_MODE_MIRROR
                mirroringModePerSegment.append(modeThisSegment)
            pickledTransitionData = self.__createPickledTransitionParameters(segments, mirroringModePerSegment,
                                                                             None, dbIdToPeerMap)

        #
        # This will call sbin/gpsegstart.py
        #
        cmd = gp.GpSegStartCmd("remote segment starts on host '%s'" % hostName,
                               self.__gpHome, segments, self.__gpVersion,
                               mirroringModePreTransition, numContentsInCluster, era,
                               self.master_checksum_value,
                               self.__timeout,
                               verbose=logging_is_verbose(),
                               ctxt=base.REMOTE,
                               remoteHost=segments[0].getSegmentAddress(),
                               pickledTransitionData=pickledTransitionData,
                               specialMode=self.__specialMode,
                               wrapper=self.__wrapper,
                               wrapper_args=self.__wrapper_args,
                               parallel=self.__parallel,
                               logfileDirectory=self.logfileDirectory)
        self.__workerPool.addCommand(cmd)

    if self.__quiet:
        self.__workerPool.join()
    else:
        base.join_and_indicate_progress(self.__workerPool)

    # process results
    self.__processStartOrConvertCommands(resultOut)
    self.__workerPool.empty_completed_items()
def run_pg_rewind(self, rewindInfo):
    """
    Run pg_rewind for incremental recovery.
    """
    rewindFailedSegments = []
    # Run pg_rewind on all the targets
    for rewindSeg in list(rewindInfo.values()):
        # Do CHECKPOINT on the source to force the TimeLineID to be updated in
        # pg_control; pg_rewind requires the updated TimeLineID for incremental
        # recovery to succeed.
        self.__logger.debug('Do CHECKPOINT on %s (port: %d) before running pg_rewind.' % (
            rewindSeg.sourceHostname, rewindSeg.sourcePort))
        dburl = dbconn.DbURL(hostname=rewindSeg.sourceHostname,
                             port=rewindSeg.sourcePort,
                             dbname='template1')
        conn = dbconn.connect(dburl, utility=True)
        dbconn.execSQL(conn, "CHECKPOINT")
        conn.close()

        # If the postmaster.pid still exists and another process is actively
        # using that pid, pg_rewind will fail when it tries to start the failed
        # segment in single-user mode. It should be safe to remove the
        # postmaster.pid file, since we do not expect the failed segment to be up.
        self.remove_postmaster_pid_from_remotehost(
            rewindSeg.targetSegment.getSegmentHostName(),
            rewindSeg.targetSegment.getSegmentDataDirectory())

        # Note the command name: we use the dbid later to correlate the command
        # results with the GpMirrorToBuild object.
        cmd = gp.SegmentRewind('rewind dbid: %s' % rewindSeg.targetSegment.getSegmentDbId(),
                               rewindSeg.targetSegment.getSegmentHostName(),
                               rewindSeg.targetSegment.getSegmentDataDirectory(),
                               rewindSeg.sourceHostname,
                               rewindSeg.sourcePort,
                               verbose=gplog.logging_is_verbose())
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    for cmd in self.__pool.getCompletedItems():
        self.__logger.debug('pg_rewind results: %s' % cmd.results)
        if not cmd.was_successful():
            dbid = int(cmd.name.split(':')[1].strip())
            self.__logger.debug("%s failed" % cmd.name)
            self.__logger.warning(cmd.get_stdout())
            self.__logger.warning("Incremental recovery failed for dbid %d. "
                                  "You must use gprecoverseg -F to recover the segment." % dbid)
            rewindFailedSegments.append(rewindInfo[dbid].targetSegment)

    self.__pool.empty_completed_items()

    return rewindFailedSegments
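# The dbid-in-the-command-name trick above is easy to miss: the command name
# doubles as the key back into rewindInfo when a rewind fails. A self-contained
# round trip of that encoding (the dbid value is illustrative only):
dbid = 5
name = 'rewind dbid: %s' % dbid              # how the SegmentRewind command is named
recovered = int(name.split(':')[1].strip())  # how the failure loop parses it back
assert recovered == dbid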
def __runStartCommand(self, segments, startMethod, numContentsInCluster, resultOut, gpArray, era):
    """
    Put results into the resultOut object
    """
    if len(segments) == 0:
        return

    if startMethod == START_AS_PRIMARY_OR_MIRROR:
        logger.info("Commencing parallel primary and mirror segment instance startup, please wait...")
    else:
        logger.info("Commencing parallel segment instance startup, please wait...")

    dbIdToPeerMap = gpArray.getDbIdToPeerMap()

    mirroringModePreTransition = MIRROR_MODE_MIRRORLESS if startMethod == START_AS_MIRRORLESS else MIRROR_MODE_QUIESCENT

    # launch the start
    for hostName, segments in GpArray.getSegmentsByHostName(segments).items():
        logger.debug("Dispatching command to start segments on host: %s, "
                     "with %s contents in cluster" % (hostName, numContentsInCluster))

        pickledTransitionData = None
        if startMethod == START_AS_PRIMARY_OR_MIRROR:
            mirroringModePerSegment = []
            for seg in segments:
                modeThisSegment = MIRROR_MODE_PRIMARY if seg.isSegmentPrimary(True) else MIRROR_MODE_MIRROR
                mirroringModePerSegment.append(modeThisSegment)
            pickledTransitionData = self.__createPickledTransitionParameters(segments, mirroringModePerSegment,
                                                                             None, dbIdToPeerMap)

        #
        # This will call sbin/gpsegstart.py
        #
        cmd = gp.GpSegStartCmd("remote segment starts on host '%s'" % hostName,
                               self.__gpHome, segments, self.__gpVersion,
                               mirroringModePreTransition, numContentsInCluster, era,
                               self.master_checksum_value,
                               self.__timeout,
                               verbose=logging_is_verbose(),
                               ctxt=base.REMOTE,
                               remoteHost=segments[0].getSegmentAddress(),
                               pickledTransitionData=pickledTransitionData,
                               specialMode=self.__specialMode,
                               wrapper=self.__wrapper,
                               wrapper_args=self.__wrapper_args,
                               parallel=self.__parallel,
                               logfileDirectory=self.logfileDirectory)
        self.__workerPool.addCommand(cmd)

    if self.__quiet:
        self.__workerPool.join()
    else:
        base.join_and_indicate_progress(self.__workerPool)

    # process results
    self.__processStartOrConvertCommands(resultOut)
    self.__workerPool.empty_completed_items()
def __runWaitAndCheckWorkerPoolForErrorsAndClear(self, cmds, actionVerb, suppressErrorCheck=False):
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    if not suppressErrorCheck:
        self.__pool.check_results()
    self.__pool.empty_completed_items()
def test_join_and_indicate_progress_flushes_every_dot(self):
    duration = 0.005

    cmd = mock.Mock(spec=Command)

    def wait_for_duration():
        time.sleep(duration)

    cmd.run.side_effect = wait_for_duration
    self.pool.addCommand(cmd)

    stdout = mock.Mock(spec=file)
    join_and_indicate_progress(self.pool, stdout, interval=(duration / 5))

    for i, call in enumerate(stdout.mock_calls):
        # Every written dot should be followed by a flush().
        if call == mock.call.write('.'):
            self.assertEqual(stdout.mock_calls[i + 1], mock.call.flush())
def __runWaitAndCheckWorkerPoolForErrorsAndClear(self, cmds, actionVerb, suppressErrorCheck=False,
                                                 progressCmds=[]):
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    elif progressCmds:
        self._join_and_show_segment_progress(progressCmds,
                                             inplace=self.__progressMode == GpMirrorListToBuild.Progress.INPLACE)
    else:
        base.join_and_indicate_progress(self.__pool)

    if not suppressErrorCheck:
        self.__pool.check_results()
    self.__pool.empty_completed_items()
def test_join_and_indicate_progress_prints_dots_until_pool_is_done(self):
    # To avoid false negatives from the race conditions here, let's set up a
    # situation where we'll print ten dots on average, and verify that there
    # were at least five dots printed.
    duration = 0.01

    cmd = mock.Mock(spec=Command)

    def wait_for_duration():
        time.sleep(duration)

    cmd.run.side_effect = wait_for_duration
    self.pool.addCommand(cmd)

    stdout = StringIO.StringIO()
    join_and_indicate_progress(self.pool, stdout, interval=(duration / 10))

    results = stdout.getvalue()
    self.assertIn('.....', results)
    self.assertTrue(results.endswith('\n'))
def test_join_and_indicate_progress_flushes_every_dot(self):
    duration = 0.005

    cmd = mock.Mock(spec=Command)

    def wait_for_duration():
        time.sleep(duration)

    cmd.run.side_effect = wait_for_duration
    self.pool.addCommand(cmd)

    stdout = mock.Mock(io.StringIO())
    join_and_indicate_progress(self.pool, stdout, interval=(duration / 5))

    for i, call in enumerate(stdout.mock_calls):
        # Every written dot should be followed by a flush().
        if call == mock.call.write('.'):
            self.assertEqual(stdout.mock_calls[i + 1], mock.call.flush())
def _do_setup_for_recovery(self, recovery_info_by_host):
    self.__logger.info('Setting up the required segments for recovery')

    cmds = []
    for host_name, recovery_info_list in recovery_info_by_host.items():
        cmds.append(gp.GpSegSetupRecovery('Run validation checks and setup data directories for recovery',
                                          recoveryinfo.serialize_list(recovery_info_list),
                                          gplog.get_logger_dir(),
                                          verbose=gplog.logging_is_verbose(),
                                          batchSize=self.__parallelPerHost,
                                          remoteHost=host_name,
                                          forceoverwrite=self.__forceoverwrite))
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    completed_results = self.__pool.getCompletedItems()
    self.__pool.empty_completed_items()
    return completed_results
def test_join_and_indicate_progress_prints_nothing_if_pool_is_done(self):
    stdout = StringIO.StringIO()
    join_and_indicate_progress(self.pool, stdout)

    self.assertEqual(stdout.getvalue(), '')
def tmain():
    join_and_indicate_progress(self.pool, write_end, interval=0.001)
    write_end.close()
def test_join_and_indicate_progress_prints_nothing_if_pool_is_done(self):
    stdout = io.StringIO()
    join_and_indicate_progress(self.pool, stdout)

    self.assertEqual(stdout.getvalue(), '')
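# Taken together, the tests above pin down the observable contract of
# join_and_indicate_progress: write a '.' (and flush) every interval while the
# pool is still working, end with a newline once dots were printed, and write
# nothing at all if the pool was already done. A minimal sketch consistent with
# that contract follows -- it assumes pool.join(timeout) blocks for up to
# timeout seconds and returns True once all work has drained, which is an
# assumption about the WorkerPool API, not a verified signature.
import sys

def join_and_indicate_progress_sketch(pool, outfile=sys.stdout, interval=1):
    wrote_dot = False
    while not pool.join(interval):  # assumed: returns True when the pool is drained
        outfile.write('.')
        outfile.flush()             # flush after every dot, per the flush test
        wrote_dot = True
    if wrote_dot:
        outfile.write('\n')         # newline only if progress was shown, per the "prints nothing" test
        outfile.flush()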
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    directives should be composed of GpCopySegmentDirectoryDirective values
    """
    if len(directives) == 0:
        return

    srcSegments = []
    destSegments = []
    isTargetReusedLocation = []
    timeStamp = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
    for directive in directives:
        srcSegment = directive.getSrcSegment()
        destSegment = directive.getDestSegment()
        destSegment.primaryHostname = srcSegment.getSegmentHostName()
        destSegment.primarySegmentPort = srcSegment.getSegmentPort()
        destSegment.progressFile = '%s/pg_basebackup.%s.dbid%s.out' % (gplog.get_logger_dir(),
                                                                       timeStamp,
                                                                       destSegment.getSegmentDbId())
        srcSegments.append(srcSegment)
        destSegments.append(destSegment)
        isTargetReusedLocation.append(directive.isTargetReusedLocation())

    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)

        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      gplog.get_logger_dir(),
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly,
                                      forceoverwrite=self.__forceoverwrite)

    #
    # validate directories for target segments
    #
    self.__logger.info('Validating remote directories')
    cmds = []
    for hostName in list(destSegmentByHost.keys()):
        cmds.append(createConfigureNewSegmentCommand(hostName, 'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # stdoutFromFailure = results.stdout.replace("\n", " ").strip()
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
            else:
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

    # Configure a new segment
    #
    # Recover segments using gpconfigurenewsegment, which uses pg_basebackup.
    # gprecoverseg generates a log filename which is passed to
    # gpconfigurenewsegment as a confinfo parameter. gprecoverseg tails this
    # file to show recovery progress to the user, and removes the file when
    # done. A new file is generated for each run of gprecoverseg based on a
    # timestamp.
    self.__logger.info('Configuring new segments')
    cmds = []
    progressCmds = []
    removeCmds = []
    for hostName in list(destSegmentByHost.keys()):
        for segment in destSegmentByHost[hostName]:
            progressCmd, removeCmd = self.__getProgressAndRemoveCmds(segment.progressFile,
                                                                     segment.getSegmentDbId(),
                                                                     hostName)
            removeCmds.append(removeCmd)
            if progressCmd:
                progressCmds.append(progressCmd)

        cmds.append(createConfigureNewSegmentCommand(hostName, 'configure blank segments', False))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory",
                                                      suppressErrorCheck=False,
                                                      progressCmds=progressCmds)

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(removeCmds,
                                                      "removing pg_basebackup progress logfiles",
                                                      suppressErrorCheck=False)

    #
    # copy dump files from old segment to new segment
    #
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content == destSeg.content:
                src_dump_dir = os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps')
                cmd = base.Command('check existence of db_dumps directory',
                                   'ls %s' % (src_dump_dir),
                                   ctxt=base.REMOTE,
                                   remoteHost=destSeg.getSegmentAddress())
                cmd.run()
                if cmd.results.rc == 0:  # Only try to copy directory if it exists
                    cmd = Scp('copy db_dumps from old segment to new segment',
                              os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps*', '*'),
                              os.path.join(destSeg.getSegmentDataDirectory(), 'db_dumps'),
                              srcSeg.getSegmentAddress(),
                              destSeg.getSegmentAddress(),
                              recursive=True)
                    cmd.run(validateAfter=True)
                    break
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    directives should be composed of GpCopySegmentDirectoryDirective values
    """
    if len(directives) == 0:
        return

    srcSegments = []
    destSegments = []
    isTargetReusedLocation = []
    timeStamp = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
    for directive in directives:
        srcSegment = directive.getSrcSegment()
        destSegment = directive.getDestSegment()
        destSegment.primaryHostname = srcSegment.getSegmentHostName()
        destSegment.primarySegmentPort = srcSegment.getSegmentPort()
        destSegment.progressFile = '%s/pg_basebackup.%s.dbid%s.out' % (gplog.get_logger_dir(),
                                                                       timeStamp,
                                                                       destSegment.getSegmentDbId())
        srcSegments.append(srcSegment)
        destSegments.append(destSegment)
        isTargetReusedLocation.append(directive.isTargetReusedLocation())

    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)

        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      gplog.get_logger_dir(),
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly,
                                      forceoverwrite=self.__forceoverwrite)

    #
    # validate directories for target segments
    #
    self.__logger.info('Validating remote directories')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # stdoutFromFailure = results.stdout.replace("\n", " ").strip()
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
            else:
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

    # Configure a new segment
    #
    # Recover segments using gpconfigurenewsegment, which uses pg_basebackup.
    # gprecoverseg generates a log filename which is passed to
    # gpconfigurenewsegment as a confinfo parameter. gprecoverseg tails this
    # file to show recovery progress to the user, and removes the file when
    # done. A new file is generated for each run of gprecoverseg based on a
    # timestamp.
    #
    # There is a race between when the pg_basebackup log file is created and
    # when the progress command is run. Thus, the progress command touches
    # the file to ensure it's present before tailing.
    self.__logger.info('Configuring new segments')
    cmds = []
    progressCmds = []
    removeCmds = []
    for hostName in destSegmentByHost.keys():
        for segment in destSegmentByHost[hostName]:
            if self.__progressMode != GpMirrorListToBuild.Progress.NONE:
                progressCmds.append(
                    GpMirrorListToBuild.ProgressCommand("tail the last line of the file",
                                                        "set -o pipefail; touch -a {0}; tail -1 {0} | tr '\\r' '\\n' | tail -1".format(
                                                            pipes.quote(segment.progressFile)),
                                                        segment.getSegmentDbId(),
                                                        segment.progressFile,
                                                        ctxt=base.REMOTE,
                                                        remoteHost=hostName))
            removeCmds.append(
                base.Command("remove file",
                             "rm -f %s" % pipes.quote(segment.progressFile),
                             ctxt=base.REMOTE,
                             remoteHost=hostName))

        cmds.append(createConfigureNewSegmentCommand(hostName, 'configure blank segments', False))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory",
                                                      suppressErrorCheck=False,
                                                      progressCmds=progressCmds)

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(removeCmds,
                                                      "removing pg_basebackup progress logfiles",
                                                      suppressErrorCheck=False)

    #
    # copy dump files from old segment to new segment
    #
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content == destSeg.content:
                src_dump_dir = os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps')
                cmd = base.Command('check existence of db_dumps directory',
                                   'ls %s' % (src_dump_dir),
                                   ctxt=base.REMOTE,
                                   remoteHost=destSeg.getSegmentAddress())
                cmd.run()
                if cmd.results.rc == 0:  # Only try to copy directory if it exists
                    cmd = Scp('copy db_dumps from old segment to new segment',
                              os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps*', '*'),
                              os.path.join(destSeg.getSegmentDataDirectory(), 'db_dumps'),
                              srcSeg.getSegmentAddress(),
                              destSeg.getSegmentAddress(),
                              recursive=True)
                    cmd.run(validateAfter=True)
                    break
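# The progress command string above packs three safeguards into one shell line,
# and they are easy to miss inside the format() call. A standalone look at how
# it is assembled, using a hypothetical progress-file path:
import pipes  # pipes.quote is an undocumented alias of shlex.quote on Python 3

progress_file = '/home/gpadmin/gpAdminLogs/pg_basebackup.20200101_120000.dbid2.out'
cmd_str = (
    "set -o pipefail; "                       # fail the whole pipeline if any stage fails
    "touch -a {0}; "                          # create the file if pg_basebackup hasn't yet (closes the race)
    "tail -1 {0} | tr '\\r' '\\n' | tail -1"  # progress rewrites are \r-separated; convert to \n, keep the newest
).format(pipes.quote(progress_file))          # quoting defends against spaces/metacharacters in the path
print(cmd_str)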
def rebalance(self):
    # Get the unbalanced primary segments grouped by hostname
    # These segments are what we will shutdown.
    self.logger.info("Getting unbalanced segments")
    unbalanced_primary_segs = GpArray.getSegmentsByHostName(self.gpArray.get_unbalanced_primary_segdbs())
    pool = base.WorkerPool()

    try:
        # Disable ctrl-c
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        self.logger.info("Stopping unbalanced primary segments...")
        for hostname in unbalanced_primary_segs.keys():
            cmd = GpSegStopCmd("stop unbalanced primary segs",
                               self.gpEnv.getGpHome(),
                               self.gpEnv.getGpVersion(),
                               'fast',
                               unbalanced_primary_segs[hostname],
                               ctxt=base.REMOTE,
                               remoteHost=hostname,
                               timeout=600)
            pool.addCommand(cmd)

        base.join_and_indicate_progress(pool)

        failed_count = 0
        completed = pool.getCompletedItems()
        for res in completed:
            if not res.get_results().wasSuccessful():
                failed_count += 1

        allSegmentsStopped = (failed_count == 0)

        if not allSegmentsStopped:
            self.logger.warn("%d segments failed to stop. A full rebalance of the" % failed_count)
            self.logger.warn("system is not possible at this time. Please check the")
            self.logger.warn("log files, correct the problem, and run gprecoverseg -r")
            self.logger.warn("again.")
            self.logger.info("gprecoverseg will continue with a partial rebalance.")

        pool.empty_completed_items()
        segment_reconfigurer = SegmentReconfigurer(logger=self.logger,
                                                   worker_pool=pool,
                                                   timeout=MIRROR_PROMOTION_TIMEOUT)
        segment_reconfigurer.reconfigure()

        # Final step is to issue a recoverseg operation to resync segments
        self.logger.info("Starting segment synchronization")
        original_sys_args = sys.argv[:]
        try:
            self.logger.info("=============================START ANOTHER RECOVER=========================================")
            # import here because GpRecoverSegmentProgram and GpSegmentRebalanceOperation have a circular dependency
            from gppylib.programs.clsRecoverSegment import GpRecoverSegmentProgram
            sys.argv = ['gprecoverseg', '-a']
            local_parser = GpRecoverSegmentProgram.createParser()
            local_options, args = local_parser.parse_args()
            cmd = GpRecoverSegmentProgram.createProgram(local_options, args)
            cmd.run()
        except SystemExit as e:
            if e.code != 0:
                self.logger.error("Failed to start the synchronization step of the segment rebalance.")
                self.logger.error("Check the gprecoverseg log file, correct any problems, and re-run")
                self.logger.error("'gprecoverseg -a'.")
                raise Exception("Error synchronizing.\nError: %s" % str(e))
        finally:
            if cmd:
                cmd.cleanup()
            sys.argv = original_sys_args
            self.logger.info("==============================END ANOTHER RECOVER==========================================")
    except Exception, ex:
        raise ex
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    directives should be composed of GpCopySegmentDirectoryDirective values
    """
    if len(directives) == 0:
        return

    srcSegments = []
    destSegments = []
    isTargetReusedLocation = []
    for directive in directives:
        srcSegment = directive.getSrcSegment()
        destSegment = directive.getDestSegment()
        destSegment.primaryHostname = srcSegment.getSegmentHostName()
        destSegment.primarySegmentPort = srcSegment.getSegmentPort()
        srcSegments.append(srcSegment)
        destSegments.append(destSegment)
        isTargetReusedLocation.append(directive.isTargetReusedLocation())

    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)

        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      gplog.get_logger_dir(),
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly,
                                      forceoverwrite=self.__forceoverwrite)

    #
    # validate directories for target segments
    #
    self.__logger.info('Validating remote directories')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # stdoutFromFailure = results.stdout.replace("\n", " ").strip()
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
            else:
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

    #
    # unpack and configure new segments
    #
    self.__logger.info('Configuring new segments')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'configure blank segments', False))
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory")

    #
    # copy dump files from old segment to new segment
    #
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content == destSeg.content:
                src_dump_dir = os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps')
                cmd = base.Command('check existence of db_dumps directory',
                                   'ls %s' % (src_dump_dir),
                                   ctxt=base.REMOTE,
                                   remoteHost=destSeg.getSegmentAddress())
                cmd.run()
                if cmd.results.rc == 0:  # Only try to copy directory if it exists
                    cmd = Scp('copy db_dumps from old segment to new segment',
                              os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps*', '*'),
                              os.path.join(destSeg.getSegmentDataDirectory(), 'db_dumps'),
                              srcSeg.getSegmentAddress(),
                              destSeg.getSegmentAddress(),
                              recursive=True)
                    cmd.run(validateAfter=True)
                    break
def rebalance(self):
    self.logger.info("Determining primary and mirror segment pairs to rebalance")

    # The current implementation of rebalance calls "gprecoverseg -a" below.
    # Thus, if another balanced pair is not synchronized, or has a down mirror,
    # that pair will be recovered as a side-effect of rebalancing.
    unbalanced_primary_segs = []
    for segmentPair in self.gpArray.segmentPairs:
        if segmentPair.balanced():
            continue
        if segmentPair.up() and segmentPair.reachable() and segmentPair.synchronized():
            unbalanced_primary_segs.append(segmentPair.primaryDB)
        else:
            self.logger.warning(
                "Not rebalancing primary segment dbid %d with its mirror dbid %d because one is either down, unreachable, or not synchronized"
                % (segmentPair.primaryDB.dbid, segmentPair.mirrorDB.dbid))

    if not len(unbalanced_primary_segs):
        self.logger.info("No segments to rebalance")
        return True

    unbalanced_primary_segs = GpArray.getSegmentsByHostName(unbalanced_primary_segs)

    pool = base.WorkerPool(min(len(unbalanced_primary_segs), self.batch_size))

    try:
        # Disable ctrl-c
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        self.logger.info("Stopping unbalanced primary segments...")
        for hostname in list(unbalanced_primary_segs.keys()):
            cmd = GpSegStopCmd("stop unbalanced primary segs",
                               self.gpEnv.getGpHome(),
                               self.gpEnv.getGpVersion(),
                               'fast',
                               unbalanced_primary_segs[hostname],
                               ctxt=base.REMOTE,
                               remoteHost=hostname,
                               timeout=600,
                               segment_batch_size=self.segment_batch_size)
            pool.addCommand(cmd)

        base.join_and_indicate_progress(pool)

        failed_count = 0
        completed = pool.getCompletedItems()
        for res in completed:
            if not res.get_results().wasSuccessful():
                failed_count += 1

        allSegmentsStopped = (failed_count == 0)

        if not allSegmentsStopped:
            self.logger.warn("%d segments failed to stop. A full rebalance of the" % failed_count)
            self.logger.warn("system is not possible at this time. Please check the")
            self.logger.warn("log files, correct the problem, and run gprecoverseg -r")
            self.logger.warn("again.")
            self.logger.info("gprecoverseg will continue with a partial rebalance.")

        pool.empty_completed_items()
        segment_reconfigurer = SegmentReconfigurer(logger=self.logger,
                                                   worker_pool=pool,
                                                   timeout=MIRROR_PROMOTION_TIMEOUT)
        segment_reconfigurer.reconfigure()

        # Final step is to issue a recoverseg operation to resync segments
        self.logger.info("Starting segment synchronization")
        original_sys_args = sys.argv[:]
        self.logger.info("=============================START ANOTHER RECOVER=========================================")
        # import here because GpRecoverSegmentProgram and GpSegmentRebalanceOperation have a circular dependency
        from gppylib.programs.clsRecoverSegment import GpRecoverSegmentProgram
        cmd_args = ['gprecoverseg', '-a', '-B', str(self.batch_size), '-b', str(self.segment_batch_size)]
        sys.argv = cmd_args[:]
        local_parser = GpRecoverSegmentProgram.createParser()
        local_options, args = local_parser.parse_args()

        recover_cmd = GpRecoverSegmentProgram.createProgram(local_options, args)
        try:
            recover_cmd.run()
        except SystemExit as e:
            if e.code != 0:
                self.logger.error("Failed to start the synchronization step of the segment rebalance.")
                self.logger.error("Check the gprecoverseg log file, correct any problems, and re-run")
                self.logger.error(' '.join(cmd_args))
                raise Exception("Error synchronizing.\nError: %s" % str(e))
        finally:
            if recover_cmd:
                recover_cmd.cleanup()
            sys.argv = original_sys_args
            self.logger.info("==============================END ANOTHER RECOVER==========================================")
    except Exception as ex:
        raise ex
    finally:
        pool.join()
        pool.haltWork()
        pool.joinWorkers()
        signal.signal(signal.SIGINT, signal.default_int_handler)

    return allSegmentsStopped  # if all segments stopped, then a full rebalance was done