def _update_config(self, recovery_info_by_host, gpArray):
    # should use mainUtils.getProgramName but I can't make it work!
    programName = os.path.split(sys.argv[0])[-1]

    full_recovery_dbids = {}
    for host_name, recovery_info_list in recovery_info_by_host.items():
        for ri in recovery_info_list:
            if ri.is_full_recovery:
                full_recovery_dbids[ri.target_segment_dbid] = True

    # Disable Ctrl-C, going to save metadata in database and transition segments
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    backout_map = None
    try:
        self.__logger.info("Updating configuration for mirrors")
        backout_map = configInterface.getConfigurationProvider().updateSystemConfig(
            gpArray,
            "%s: segment config for resync" % programName,
            dbIdToForceMirrorRemoveAdd=full_recovery_dbids,
            useUtilityMode=False,
            allowPrimary=False)

        self.__logger.debug("Generating configuration backout scripts")
    finally:
        # Re-enable Ctrl-C
        signal.signal(signal.SIGINT, signal.default_int_handler)

    return backout_map
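# Illustrative sketch (not part of gppylib): the "ignore Ctrl-C around the
# metadata update" pattern used above, wrapped as a reusable context manager.
# Only stdlib signal/contextlib are assumed; the provider call in the usage
# comment is just a placeholder.
import signal
from contextlib import contextmanager

@contextmanager
def sigint_deferred():
    """Ignore SIGINT while the block runs, then restore the previous handler."""
    previous = signal.signal(signal.SIGINT, signal.SIG_IGN)
    try:
        yield
    finally:
        signal.signal(signal.SIGINT, previous)

# Usage sketch:
# with sigint_deferred():
#     backout_map = provider.updateSystemConfig(...)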
def loadTargetSegments(self):
    targetHost = self.options.targetHost
    targetRole = self.options.targetRole
    targetRegistrationOrder = self.options.targetRegistrationOrder

    if targetHost is None and targetRegistrationOrder is None:
        raise ProgramArgumentValidationException(
            "neither --host nor --registration_order is specified. "
            "Exactly one should be specified.")
    if targetHost is not None and targetRegistrationOrder is not None:
        raise ProgramArgumentValidationException(
            "both --host and --registration_order are specified. "
            "Exactly one should be specified.")
    if targetHost is not None and targetRole is None:
        raise ProgramArgumentValidationException(
            "--role is not specified when --host is specified. "
            "Role is required when targeting a host.")
    if targetRegistrationOrder is not None and targetRole is not None:
        raise ProgramArgumentValidationException(
            "--role is specified when --registration_order is specified. "
            "Role should not be specified when targeting a single registration_order.")

    #
    # load from master db
    #
    masterPort = self.options.masterPort
    if masterPort is None:
        gpEnv = GpMasterEnvironment(self.options.masterDataDirectory, False)
        masterPort = gpEnv.getMasterPort()
    conf = configurationInterface.getConfigurationProvider().initializeProvider(masterPort)
    hawqArray = conf.loadSystemConfig(useUtilityMode=True)
    hawqdbs = hawqArray.getDbList()

    #
    # prune hawqArray according to filter settings
    #
    if targetHost is not None and targetHost != "ALL":
        hawqdbs = [hdb for hdb in hawqdbs if hdb.getHostName() == targetHost]

    if targetRegistrationOrder is not None:
        hawqdbs = hawqArray.getDbList()
        regorder = int(targetRegistrationOrder)
        hawqdbs = [hdb for hdb in hawqdbs if hdb.getRegistrationOrder() == regorder]

    if targetRole is not None:
        hawqdbs = [hdb for hdb in hawqdbs if self.isMatchingRole(targetRole, hdb)]

    # only DOWN segments remaining? Error out
    downhawqdbs = [hdb for hdb in hawqdbs if hdb.getStatus() != 'u']
    if len(downhawqdbs) > 0:
        downhawqdbStr = "\n Down Segment: "
        raise ExceptionNoStackTraceNeeded(
            "Unable to inject fault. At least one segment is marked as down in the database.%s%s"
            % (downhawqdbStr, downhawqdbStr.join([str(downhdb) for downhdb in downhawqdbs])))

    print "### DEBUG: loadTargetSegments"
    print "### DEBUG: HAWQDBS "
    print hawqdbs
    return hawqdbs
def loadTargetSegments(self):
    targetHost = self.options.targetHost
    targetRole = self.options.targetRole
    targetDbId = self.options.targetDbId

    if targetHost is None and targetDbId is None:
        raise ProgramArgumentValidationException(
            "neither --host nor --seg_dbid specified. "
            "Exactly one should be specified."
        )
    if targetHost is not None and targetDbId is not None:
        raise ProgramArgumentValidationException(
            "both --host and --seg_dbid specified. "
            "Exactly one should be specified."
        )
    if targetHost is not None and targetRole is None:
        raise ProgramArgumentValidationException(
            "--role not specified when --host is specified. "
            "Role is required when targeting a host."
        )
    if targetDbId is not None and targetRole is not None:
        raise ProgramArgumentValidationException(
            "--role specified when --seg_dbid is specified. "
            "Role should not be specified when targeting a single dbid."
        )

    #
    # load from master db
    #
    masterPort = self.options.masterPort
    if masterPort is None:
        gpEnv = GpMasterEnvironment(self.options.masterDataDirectory, False)
        masterPort = gpEnv.getMasterPort()
    conf = configurationInterface.getConfigurationProvider().initializeProvider(masterPort)
    gpArray = conf.loadSystemConfig(useUtilityMode=True)
    segments = gpArray.getDbList()

    #
    # prune gpArray according to filter settings
    #
    segments = [seg for seg in segments if seg.isSegmentQE()]
    if targetHost is not None and targetHost != "ALL":
        segments = [seg for seg in segments if seg.getSegmentHostName() == targetHost]

    if targetDbId is not None:
        segments = gpArray.getDbList()
        dbId = int(targetDbId)
        segments = [seg for seg in segments if seg.getSegmentDbId() == dbId]

    if targetRole is not None:
        segments = [seg for seg in segments if self.isMatchingRole(targetRole, seg)]

    # only DOWN segments remaining? Error out
    downSegments = [seg for seg in segments if seg.getSegmentStatus() != "u"]
    if len(downSegments) > 0:
        downSegStr = "\n Down Segment: "
        raise ExceptionNoStackTraceNeeded(
            "Unable to inject fault. At least one segment is marked as down in the database.%s%s"
            % (downSegStr, downSegStr.join([str(downSeg) for downSeg in downSegments]))
        )

    return segments
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree)

    self.__pool = base.WorkerPool(self.__options.parallelDegree)
    gpEnv = GpMasterEnvironment(self.__options.masterDataDirectory, True)

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getMasterPort())
    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())
    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    # check that heap_checksums is consistent across cluster, fail immediately if not
    self.validate_heap_checksums(gpArray)

    # check that we actually have mirrors
    if gpArray.getFaultStrategy() != gparray.FAULT_STRATEGY_NONE:
        raise ExceptionNoStackTraceNeeded(
            "GPDB physical mirroring cannot be added. The cluster is already configured with %s." %
            gparray.getFaultStrategyLabel(gpArray.getFaultStrategy()))

    # figure out what needs to be done
    mirrorBuilder = self.__getMirrorsToBuildBasedOnOptions(gpEnv, gpArray)
    mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.__outputToFile(mirrorBuilder, self.__options.outputSampleConfigFile, gpArray)
        logger.info('Configuration file output to %s successfully.' % self.__options.outputSampleConfigFile)
    else:
        self.__displayAddMirrors(gpEnv, mirrorBuilder, gpArray)
        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with add mirrors procedure", 'N'):
                raise UserAbortedException()

        gpArray.setFaultStrategy(gparray.FAULT_STRATEGY_FILE_REPLICATION)
        mirrorBuilder.buildMirrors("add", gpEnv, gpArray)

        logger.info("******************************************************************")
        logger.info("Mirror segments have been added; data synchronization is in progress.")
        logger.info("Data synchronization will continue in the background.")
        logger.info("")
        logger.info("Use gpstate -s to check the resynchronization progress.")
        logger.info("******************************************************************")

    return 0  # success -- exit code 0!
def loadTargetSegments(self):
    targetHost = self.options.targetHost
    targetRole = self.options.targetRole
    targetDbId = self.options.targetDbId

    if targetHost is None and targetDbId is None:
        raise ProgramArgumentValidationException(
            "neither --host nor --seg_dbid specified. "
            "Exactly one should be specified.")
    if targetHost is not None and targetDbId is not None:
        raise ProgramArgumentValidationException(
            "both --host and --seg_dbid specified. "
            "Exactly one should be specified.")
    if targetHost is not None and targetRole is None:
        raise ProgramArgumentValidationException(
            "--role not specified when --host is specified. "
            "Role is required when targeting a host.")
    if targetDbId is not None and targetRole is not None:
        raise ProgramArgumentValidationException(
            "--role specified when --seg_dbid is specified. "
            "Role should not be specified when targeting a single dbid.")

    #
    # load from master db
    #
    masterPort = self.options.masterPort
    if masterPort is None:
        gpEnv = GpMasterEnvironment(self.options.masterDataDirectory, False, verbose=False)
        masterPort = gpEnv.getMasterPort()
    conf = configurationInterface.getConfigurationProvider().initializeProvider(masterPort)
    gpArray = conf.loadSystemConfig(useUtilityMode=True, verbose=False)
    segments = gpArray.getDbList()

    #
    # prune gpArray according to filter settings
    #
    segments = [seg for seg in segments if seg.isSegmentQE()]
    if targetHost is not None and targetHost != "ALL":
        segments = [seg for seg in segments if seg.getSegmentHostName() == targetHost]

    if targetDbId is not None:
        segments = gpArray.getDbList()
        dbId = int(targetDbId)
        segments = [seg for seg in segments if seg.getSegmentDbId() == dbId]

    if targetRole is not None:
        segments = [seg for seg in segments if self.isMatchingRole(targetRole, seg)]

    # only DOWN segments remaining? Error out
    downSegments = [seg for seg in segments if seg.getSegmentStatus() != 'u']
    if len(downSegments) > 0:
        downSegStr = "\n Down Segment: "
        raise ExceptionNoStackTraceNeeded(
            "Unable to inject fault. At least one segment is marked as down in the database.%s%s"
            % (downSegStr, downSegStr.join([str(downSeg) for downSeg in downSegments])))

    return segments
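# Illustrative sketch (hypothetical helper, not part of the utility above): the
# option rules loadTargetSegments enforces -- exactly one of --host/--seg_dbid,
# and --role only together with --host -- expressed as a standalone validator.
def validate_target_options(target_host, target_role, target_dbid):
    # exactly one of host / dbid must be given
    if (target_host is None) == (target_dbid is None):
        raise ValueError("exactly one of --host or --seg_dbid must be specified")
    # role is required with --host, forbidden with --seg_dbid
    if target_host is not None and target_role is None:
        raise ValueError("--role is required when --host is specified")
    if target_dbid is not None and target_role is not None:
        raise ValueError("--role must not be specified with --seg_dbid")

# validate_target_options("sdw1", "primary", None)   # ok: host + role
# validate_target_options(None, None, "2")           # ok: dbid only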
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree)

    self.__pool = WorkerPool(self.__options.parallelDegree)
    gpEnv = GpMasterEnvironment(self.__options.masterDataDirectory, True)

    # verify "where to recover" options
    optionCnt = 0
    if self.__options.newRecoverHosts is not None:
        optionCnt += 1
    if self.__options.recoveryConfigFile is not None:
        optionCnt += 1
    if self.__options.rebalanceSegments:
        optionCnt += 1
    if optionCnt > 1:
        raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getMasterPort())

    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())

    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    num_workers = min(len(gpArray.get_hostlist()), self.__options.parallelDegree)
    hosts = set(gpArray.get_hostlist(includeMaster=False))
    unreachable_hosts = get_unreachable_segment_hosts(hosts, num_workers)
    for i, segmentPair in enumerate(gpArray.segmentPairs):
        if segmentPair.primaryDB.getSegmentHostName() in unreachable_hosts:
            logger.warning("Not recovering segment %d because %s is unreachable" % (
                segmentPair.primaryDB.dbid, segmentPair.primaryDB.getSegmentHostName()))
            gpArray.segmentPairs[i].primaryDB.unreachable = True

        if segmentPair.mirrorDB.getSegmentHostName() in unreachable_hosts:
            logger.warning("Not recovering segment %d because %s is unreachable" % (
                segmentPair.mirrorDB.dbid, segmentPair.mirrorDB.getSegmentHostName()))
            gpArray.segmentPairs[i].mirrorDB.unreachable = True

    if not gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

    # We have phys-rep/filerep mirrors.

    if self.__options.newRecoverHosts is not None:
        try:
            uniqueHosts = []
            for h in self.__options.newRecoverHosts.split(','):
                if h.strip() not in uniqueHosts:
                    uniqueHosts.append(h.strip())
            self.__options.newRecoverHosts = uniqueHosts
        except Exception, ex:
            raise ProgramArgumentValidationException(
                "Invalid value for recover hosts: %s" % ex)
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree)

    self.__pool = base.WorkerPool(self.__options.parallelDegree)
    gpEnv = GpMasterEnvironment(self.__options.masterDataDirectory, True)

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getMasterPort())

    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())

    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    # check that heap_checksums is consistent across cluster, fail immediately if not
    self.validate_heap_checksums(gpArray)

    self.checkMirrorOffset(gpArray)

    # check that we actually have mirrors
    if gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            "GPDB physical mirroring cannot be added. The cluster is already configured with Mirrors.")

    # figure out what needs to be done (AND update the gpArray!)
    mirrorBuilder = self.__getMirrorsToBuildBasedOnOptions(gpEnv, gpArray)
    mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.__outputToFile(mirrorBuilder, self.__options.outputSampleConfigFile, gpArray)
        logger.info('Configuration file output to %s successfully.' % self.__options.outputSampleConfigFile)
    else:
        self.__displayAddMirrors(gpEnv, mirrorBuilder, gpArray)
        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with add mirrors procedure", 'N'):
                raise UserAbortedException()

        self.config_primaries_for_replication(gpArray)
        if not mirrorBuilder.buildMirrors("add", gpEnv, gpArray):
            return 1

        logger.info("******************************************************************")
        logger.info("Mirror segments have been added; data synchronization is in progress.")
        logger.info("Data synchronization will continue in the background.")
        logger.info("Use gpstate -s to check the resynchronization progress.")
        logger.info("******************************************************************")

    return 0  # success -- exit code 0!
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree
        )

    self.__pool = base.WorkerPool(self.__options.parallelDegree)
    gpEnv = GpMasterEnvironment(self.__options.masterDataDirectory, True)

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getMasterPort())
    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())
    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    # check that we actually have mirrors
    if gpArray.getFaultStrategy() != gparray.FAULT_STRATEGY_NONE:
        raise ExceptionNoStackTraceNeeded(
            "GPDB physical mirroring cannot be added. The cluster is already configured with %s."
            % gparray.getFaultStrategyLabel(gpArray.getFaultStrategy())
        )

    # figure out what needs to be done
    mirrorBuilder = self.__getMirrorsToBuildBasedOnOptions(gpEnv, gpArray)
    mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.__outputToFile(mirrorBuilder, self.__options.outputSampleConfigFile, gpArray)
        logger.info("Configuration file output to %s successfully." % self.__options.outputSampleConfigFile)
    else:
        self.__displayAddMirrors(gpEnv, mirrorBuilder, gpArray)
        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with add mirrors procedure", "N"):
                raise UserAbortedException()

        gpArray.setFaultStrategy(gparray.FAULT_STRATEGY_FILE_REPLICATION)
        mirrorBuilder.buildMirrors("add", gpEnv, gpArray)

        logger.info("******************************************************************")
        logger.info("Mirror segments have been added; data synchronization is in progress.")
        logger.info("Data synchronization will continue in the background.")
        logger.info("")
        logger.info("Use gpstate -s to check the resynchronization progress.")
        logger.info("******************************************************************")

    return 0  # success -- exit code 0!
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree)

    self.__pool = WorkerPool(self.__options.parallelDegree)
    gpEnv = GpMasterEnvironment(self.__options.masterDataDirectory, True)

    # verify "where to recover" options
    optionCnt = 0
    if self.__options.newRecoverHosts is not None:
        optionCnt += 1
    if self.__options.recoveryConfigFile is not None:
        optionCnt += 1
    if self.__options.rebalanceSegments:
        optionCnt += 1
    if optionCnt > 1:
        raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getMasterPort())

    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())

    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    if not gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

    # We have phys-rep/filerep mirrors.

    if self.__options.newRecoverHosts is not None:
        try:
            uniqueHosts = []
            for h in self.__options.newRecoverHosts.split(','):
                if h.strip() not in uniqueHosts:
                    uniqueHosts.append(h.strip())
            self.__options.newRecoverHosts = uniqueHosts
        except Exception, ex:
            raise ProgramArgumentValidationException(
                "Invalid value for recover hosts: %s" % ex)
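# Illustrative sketch (stdlib only, not part of gprecoverseg): de-duplicating
# the comma-separated recover-host value while preserving order, equivalent to
# the uniqueHosts loop above. Written for Python 3.7+, where plain dicts keep
# insertion order.
def parse_recover_hosts(raw):
    hosts = [h.strip() for h in raw.split(',') if h.strip()]
    return list(dict.fromkeys(hosts))  # drop duplicates, keep first occurrence

# parse_recover_hosts("sdw3, sdw4,sdw3")  ->  ['sdw3', 'sdw4']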
def buildMirrors(self, actionName, gpEnv, gpArray):
    """
    Build the mirrors.

    gpArray must have already been altered to have updated directories -- that is, the failoverSegments
    from the mirrorsToBuild must be present in gpArray.
    """
    if len(self.__mirrorsToBuild) == 0:
        self.__logger.info("No segments to " + actionName)
        return True

    self.checkForPortAndDirectoryConflicts(gpArray)
    self.__logger.info("%s segment(s) to %s" % (len(self.__mirrorsToBuild), actionName))

    # make sure the target directories are up-to-date
    #  by cleaning them, if needed, and then copying a basic directory there
    #  the postgresql.conf in that basic directory will need updating (to change the port)
    toStopDirectives = []
    toEnsureMarkedDown = []
    cleanupDirectives = []
    copyDirectives = []
    for toRecover in self.__mirrorsToBuild:

        if toRecover.getFailedSegment() is not None:
            # will stop the failed segment. Note that we do this even if we are recovering to a different location!
            toStopDirectives.append(GpStopSegmentDirectoryDirective(toRecover.getFailedSegment()))
            if toRecover.getFailedSegment().getSegmentStatus() == gparray.STATUS_UP:
                toEnsureMarkedDown.append(toRecover.getFailedSegment())

        if toRecover.isFullSynchronization():

            isTargetReusedLocation = False
            if toRecover.getFailedSegment() is not None and \
                    toRecover.getFailoverSegment() is None:
                #
                # We are recovering a failed segment in-place
                #
                cleanupDirectives.append(GpCleanupSegmentDirectoryDirective(toRecover.getFailedSegment()))
                isTargetReusedLocation = True

            if toRecover.getFailoverSegment() is not None:
                targetSegment = toRecover.getFailoverSegment()
            else:
                targetSegment = toRecover.getFailedSegment()

            d = GpCopySegmentDirectoryDirective(toRecover.getLiveSegment(), targetSegment, isTargetReusedLocation)
            copyDirectives.append(d)

    self.__ensureStopped(gpEnv, toStopDirectives)
    self.__ensureMarkedDown(gpEnv, toEnsureMarkedDown)
    if not self.__forceoverwrite:
        self.__cleanUpSegmentDirectories(cleanupDirectives)
    self.__copySegmentDirectories(gpEnv, gpArray, copyDirectives)

    # update and save metadata in memory
    for toRecover in self.__mirrorsToBuild:

        if toRecover.getFailoverSegment() is None:
            # we are recovering the lost segment in place
            seg = toRecover.getFailedSegment()
        else:
            seg = toRecover.getFailedSegment()
            # no need to update the failed segment's information -- it is
            # being overwritten in the configuration with the failover segment
            for gpArraySegment in gpArray.getDbList():
                if gpArraySegment is seg:
                    raise Exception(
                        "failed segment should not be in the new configuration if failing over to new segment")

            seg = toRecover.getFailoverSegment()

        seg.setSegmentStatus(gparray.STATUS_DOWN)  # down initially, we haven't started it yet
        seg.setSegmentMode(gparray.MODE_NOT_SYNC)

    # figure out what needs to be started or transitioned
    mirrorsToStart = []
    # Map of mirror dbid to GpMirrorListToBuild.RewindSegmentInfo objects
    rewindInfo = {}
    primariesToConvert = []
    convertPrimaryUsingFullResync = []
    fullResyncMirrorDbIds = {}
    timeStamp = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
    for toRecover in self.__mirrorsToBuild:
        seg = toRecover.getFailoverSegment()
        if seg is None:
            seg = toRecover.getFailedSegment()  # we are recovering in place
        mirrorsToStart.append(seg)
        primarySeg = toRecover.getLiveSegment()

        # Add to rewindInfo to execute pg_rewind later if we are not
        # using full recovery. We will run pg_rewind on incremental recovery
        # if the target mirror does not have standby.signal file because
        # segment failover happened. The check for standby.signal file will
        # happen in the same remote SegmentRewind Command call.
        if not toRecover.isFullSynchronization() \
                and seg.getSegmentRole() == gparray.ROLE_MIRROR:
            rewindInfo[seg.getSegmentDbId()] = GpMirrorListToBuild.RewindSegmentInfo(
                seg, primarySeg.getSegmentHostName(), primarySeg.getSegmentPort(), timeStamp)

        # The change in configuration of the mirror to down requires that
        # the primary also be marked as unsynchronized.
        primarySeg.setSegmentMode(gparray.MODE_NOT_SYNC)
        primariesToConvert.append(primarySeg)
        convertPrimaryUsingFullResync.append(toRecover.isFullSynchronization())

        if toRecover.isFullSynchronization() and seg.getSegmentDbId() > 0:
            fullResyncMirrorDbIds[seg.getSegmentDbId()] = True

    # should use mainUtils.getProgramName but I can't make it work!
    programName = os.path.split(sys.argv[0])[-1]

    # Disable Ctrl-C, going to save metadata in database and transition segments
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    rewindFailedSegments = []
    try:
        self.__logger.info("Updating configuration with new mirrors")
        configInterface.getConfigurationProvider().updateSystemConfig(
            gpArray,
            "%s: segment config for resync" % programName,
            dbIdToForceMirrorRemoveAdd=fullResyncMirrorDbIds,
            useUtilityMode=False,
            allowPrimary=False)

        self.__logger.info("Updating mirrors")

        if len(rewindInfo) != 0:
            self.__logger.info("Running pg_rewind on required mirrors")
            rewindFailedSegments = self.run_pg_rewind(rewindInfo)

            # Do not start mirrors that failed pg_rewind
            for failedSegment in rewindFailedSegments:
                mirrorsToStart.remove(failedSegment)

        self.__logger.info("Starting mirrors")
        start_all_successful = self.__startAll(gpEnv, gpArray, mirrorsToStart)
    finally:
        # Re-enable Ctrl-C
        signal.signal(signal.SIGINT, signal.default_int_handler)

    if len(rewindFailedSegments) != 0:
        return False

    return start_all_successful
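# Illustrative sketch (hypothetical shape, not the real
# GpMirrorListToBuild.RewindSegmentInfo class): the rewindInfo map built above
# keys each target mirror's dbid to what pg_rewind needs -- the mirror segment
# itself, its source primary's host and port, and a shared timestamp used to
# tag progress files.
from dataclasses import dataclass

@dataclass
class RewindTarget:
    segment: object      # the mirror Segment object to rewind
    source_host: str     # live primary's hostname
    source_port: int     # live primary's port
    timestamp: str       # e.g. '20240101_120000'

# rewind_info = {seg_dbid: RewindTarget(seg, host, port, ts), ...}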
def run(self):
    if self.__options.batch_size < 1 or self.__options.batch_size > gp.MAX_COORDINATOR_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid batch_size provided with -B argument: %d" % self.__options.batch_size)
    if self.__options.segment_batch_size < 1 or self.__options.segment_batch_size > gp.MAX_SEGHOST_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid segment_batch_size provided with -b argument: %d" % self.__options.segment_batch_size)

    self.__pool = base.WorkerPool(self.__options.batch_size)
    gpEnv = GpCoordinatorEnvironment(self.__options.coordinatorDataDirectory, True)

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getCoordinatorPort())
    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getCoordinatorPort())
    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    # check that heap_checksums is consistent across cluster, fail immediately if not
    self.validate_heap_checksums(gpArray)

    if self.__options.mirrorConfigFile is None:
        self.checkMirrorOffset(gpArray)

    # check that we actually have mirrors
    if gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            "GPDB physical mirroring cannot be added. The cluster is already configured with Mirrors.")

    # figure out what needs to be done (AND update the gpArray!)
    mirrorBuilder = self.__getMirrorsToBuildBasedOnOptions(gpEnv, gpArray)
    mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.__outputToFile(mirrorBuilder, self.__options.outputSampleConfigFile, gpArray)
        logger.info('Configuration file output to %s successfully.' % self.__options.outputSampleConfigFile)
    else:
        self.__displayAddMirrors(gpEnv, mirrorBuilder, gpArray)
        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with add mirrors procedure", 'N'):
                raise UserAbortedException()

        update_pg_hba_on_segments(gpArray, self.__options.hba_hostnames, self.__options.batch_size)
        if not mirrorBuilder.buildMirrors("add", gpEnv, gpArray):
            return 1

        logger.info("******************************************************************")
        logger.info("Mirror segments have been added; data synchronization is in progress.")
        logger.info("Data synchronization will continue in the background.")
        logger.info("Use gpstate -s to check the resynchronization progress.")
        logger.info("******************************************************************")

    return 0  # success -- exit code 0!
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > gp.MAX_COORDINATOR_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree value provided with -B argument: %d" % self.__options.parallelDegree)
    if self.__options.parallelPerHost < 1 or self.__options.parallelPerHost > gp.MAX_SEGHOST_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid parallelPerHost value provided with -b argument: %d" % self.__options.parallelPerHost)

    self.__pool = WorkerPool(self.__options.parallelDegree)
    gpEnv = GpCoordinatorEnvironment(self.__options.coordinatorDataDirectory, True)

    # verify "where to recover" options
    optionCnt = 0
    if self.__options.newRecoverHosts is not None:
        optionCnt += 1
    if self.__options.recoveryConfigFile is not None:
        optionCnt += 1
    if self.__options.rebalanceSegments:
        optionCnt += 1
    if optionCnt > 1:
        raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getCoordinatorPort())

    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getCoordinatorPort())

    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    if not gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

    num_workers = min(len(gpArray.get_hostlist()), self.__options.parallelDegree)
    hosts = set(gpArray.get_hostlist(includeCoordinator=False))
    unreachable_hosts = get_unreachable_segment_hosts(hosts, num_workers)
    update_unreachable_flag_for_segments(gpArray, unreachable_hosts)

    # We have phys-rep/filerep mirrors.

    if self.__options.newRecoverHosts is not None:
        try:
            uniqueHosts = []
            for h in self.__options.newRecoverHosts.split(','):
                if h.strip() not in uniqueHosts:
                    uniqueHosts.append(h.strip())
            self.__options.newRecoverHosts = uniqueHosts
        except Exception as ex:
            raise ProgramArgumentValidationException(
                "Invalid value for recover hosts: %s" % ex)

    # retain list of hosts that were existing in the system prior to getRecoverActions...
    # this will be needed for later calculations that determine whether
    # new hosts were added into the system
    existing_hosts = set(gpArray.getHostList())

    # figure out what needs to be done
    mirrorBuilder = self.getRecoveryActionsBasedOnOptions(gpEnv, gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.outputToFile(mirrorBuilder, gpArray, self.__options.outputSampleConfigFile)
        self.logger.info('Configuration file output to %s successfully.' % self.__options.outputSampleConfigFile)
    elif self.__options.rebalanceSegments:
        assert (isinstance(mirrorBuilder, GpSegmentRebalanceOperation))

        # Make sure we have work to do
        if len(gpArray.get_unbalanced_segdbs()) == 0:
            self.logger.info("No segments are running in their non-preferred role and need to be rebalanced.")
        else:
            self.displayRecovery(mirrorBuilder, gpArray)

            if self.__options.interactive:
                self.logger.warn("This operation will cancel queries that are currently executing.")
                self.logger.warn("Connections to the database however will not be interrupted.")
                if not userinput.ask_yesno(None, "\nContinue with segment rebalance procedure", 'N'):
                    raise UserAbortedException()

            fullRebalanceDone = mirrorBuilder.rebalance()
            self.logger.info("******************************************************************")
            if fullRebalanceDone:
                self.logger.info("The rebalance operation has completed successfully.")
            else:
                self.logger.info("The rebalance operation has completed with WARNINGS."
                                 " Please review the output in the gprecoverseg log.")
            self.logger.info("******************************************************************")
    elif len(mirrorBuilder.getMirrorsToBuild()) == 0:
        self.logger.info('No segments to recover')
    else:
        # TODO this already happens in buildMirrors function
        mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)
        self.validate_heap_checksum_consistency(gpArray, mirrorBuilder)

        self.displayRecovery(mirrorBuilder, gpArray)
        self.__displayRecoveryWarnings(mirrorBuilder)

        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with segment recovery procedure", 'N'):
                raise UserAbortedException()

        # sync packages
        current_hosts = set(gpArray.getHostList())
        new_hosts = current_hosts - existing_hosts
        if new_hosts:
            self.syncPackages(new_hosts)

        contentsToUpdate = [seg.getLiveSegment().getSegmentContentId() for seg in mirrorBuilder.getMirrorsToBuild()]
        update_pg_hba_on_segments(gpArray, self.__options.hba_hostnames, self.__options.parallelDegree,
                                  contentsToUpdate)
        if not mirrorBuilder.recover_mirrors(gpEnv, gpArray):
            self.logger.error("gprecoverseg failed. Please check the output for more details.")
            sys.exit(1)

        self.logger.info("********************************")
        self.logger.info("Segments successfully recovered.")
        self.logger.info("********************************")
        self.logger.info("Recovered mirror segments need to sync WAL with primary segments.")
        self.logger.info("Use 'gpstate -e' to check progress of WAL sync remaining bytes")

    sys.exit(0)
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree)

    self.__pool = WorkerPool(self.__options.parallelDegree)
    gpEnv = GpCoordinatorEnvironment(self.__options.coordinatorDataDirectory, True)

    # verify "where to recover" options
    optionCnt = 0
    if self.__options.newRecoverHosts is not None:
        optionCnt += 1
    if self.__options.recoveryConfigFile is not None:
        optionCnt += 1
    if self.__options.rebalanceSegments:
        optionCnt += 1
    if optionCnt > 1:
        raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getCoordinatorPort())

    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getCoordinatorPort())

    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    num_workers = min(len(gpArray.get_hostlist()), self.__options.parallelDegree)
    hosts = set(gpArray.get_hostlist(includeCoordinator=False))
    unreachable_hosts = get_unreachable_segment_hosts(hosts, num_workers)
    for i, segmentPair in enumerate(gpArray.segmentPairs):
        if segmentPair.primaryDB.getSegmentHostName() in unreachable_hosts:
            logger.warning("Not recovering segment %d because %s is unreachable" % (
                segmentPair.primaryDB.dbid, segmentPair.primaryDB.getSegmentHostName()))
            gpArray.segmentPairs[i].primaryDB.unreachable = True

        if segmentPair.mirrorDB.getSegmentHostName() in unreachable_hosts:
            logger.warning("Not recovering segment %d because %s is unreachable" % (
                segmentPair.mirrorDB.dbid, segmentPair.mirrorDB.getSegmentHostName()))
            gpArray.segmentPairs[i].mirrorDB.unreachable = True

    if not gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

    # We have phys-rep/filerep mirrors.

    if self.__options.newRecoverHosts is not None:
        try:
            uniqueHosts = []
            for h in self.__options.newRecoverHosts.split(','):
                if h.strip() not in uniqueHosts:
                    uniqueHosts.append(h.strip())
            self.__options.newRecoverHosts = uniqueHosts
        except Exception as ex:
            raise ProgramArgumentValidationException(
                "Invalid value for recover hosts: %s" % ex)

    # If it's a rebalance operation, make sure we are in an acceptable state to do that
    # Acceptable state is:
    #    - No segments down
    #    - No segments in change tracking or unsynchronized state
    if self.__options.rebalanceSegments:
        if len(gpArray.get_invalid_segdbs()) > 0:
            raise Exception("Down segments still exist. All segments must be up to rebalance.")
        if len(gpArray.get_synchronized_segdbs()) != len(gpArray.getSegDbList()):
            raise Exception(
                "Some segments are not yet synchronized. All segments must be synchronized to rebalance.")

    # retain list of hosts that were existing in the system prior to getRecoverActions...
    # this will be needed for later calculations that determine whether
    # new hosts were added into the system
    existing_hosts = set(gpArray.getHostList())

    # figure out what needs to be done
    mirrorBuilder = self.getRecoveryActionsBasedOnOptions(gpEnv, gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.outputToFile(mirrorBuilder, gpArray, self.__options.outputSampleConfigFile)
        self.logger.info('Configuration file output to %s successfully.' % self.__options.outputSampleConfigFile)
    elif self.__options.rebalanceSegments:
        assert (isinstance(mirrorBuilder, GpSegmentRebalanceOperation))

        # Make sure we have work to do
        if len(gpArray.get_unbalanced_segdbs()) == 0:
            self.logger.info("No segments are running in their non-preferred role and need to be rebalanced.")
        else:
            self.displayRecovery(mirrorBuilder, gpArray)

            if self.__options.interactive:
                self.logger.warn("This operation will cancel queries that are currently executing.")
                self.logger.warn("Connections to the database however will not be interrupted.")
                if not userinput.ask_yesno(None, "\nContinue with segment rebalance procedure", 'N'):
                    raise UserAbortedException()

            fullRebalanceDone = mirrorBuilder.rebalance()
            self.logger.info("******************************************************************")
            if fullRebalanceDone:
                self.logger.info("The rebalance operation has completed successfully.")
            else:
                self.logger.info("The rebalance operation has completed with WARNINGS."
                                 " Please review the output in the gprecoverseg log.")
            self.logger.info("There is a resynchronization running in the background to bring all")
            self.logger.info("segments in sync.")
            self.logger.info("Use gpstate -e to check the resynchronization progress.")
            self.logger.info("******************************************************************")
    elif len(mirrorBuilder.getMirrorsToBuild()) == 0:
        self.logger.info('No segments to recover')
    else:
        mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)
        self.validate_heap_checksum_consistency(gpArray, mirrorBuilder)

        self.displayRecovery(mirrorBuilder, gpArray)
        self.__displayRecoveryWarnings(mirrorBuilder)

        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with segment recovery procedure", 'N'):
                raise UserAbortedException()

        # sync packages
        current_hosts = set(gpArray.getHostList())
        new_hosts = current_hosts - existing_hosts
        if new_hosts:
            self.syncPackages(new_hosts)

        config_primaries_for_replication(gpArray, self.__options.hba_hostnames)
        if not mirrorBuilder.buildMirrors("recover", gpEnv, gpArray):
            sys.exit(1)

        self.trigger_fts_probe(port=gpEnv.getCoordinatorPort())

        self.logger.info("******************************************************************")
        self.logger.info("Updating segments for streaming is completed.")
        self.logger.info("For segments updated successfully, streaming will continue in the background.")
        self.logger.info("Use gpstate -s to check the streaming progress.")
        self.logger.info("******************************************************************")

    sys.exit(0)
def loadTargetSegments(self):
    targetHost = self.options.targetHost
    targetRole = self.options.targetRole
    targetRegistrationOrder = self.options.targetRegistrationOrder

    if targetHost is None and targetRegistrationOrder is None:
        raise ProgramArgumentValidationException(
            "neither --host nor --registration_order is specified. "
            "Exactly one should be specified.")
    if targetHost is not None and targetRegistrationOrder is not None:
        raise ProgramArgumentValidationException(
            "both --host and --registration_order are specified. "
            "Exactly one should be specified.")
    if targetHost is not None and targetRole is None:
        raise ProgramArgumentValidationException(
            "--role is not specified when --host is specified. "
            "Role is required when targeting a host.")
    if targetRegistrationOrder is not None and targetRole is not None:
        raise ProgramArgumentValidationException(
            "--role is specified when --registration_order is specified. "
            "Role should not be specified when targeting a single registration_order.")

    #
    # load from master db
    #
    masterPort = self.options.masterPort
    if masterPort is None:
        gpEnv = GpMasterEnvironment(self.options.masterDataDirectory, False)
        masterPort = gpEnv.getMasterPort()
    conf = configurationInterface.getConfigurationProvider().initializeProvider(masterPort)
    hawqArray = conf.loadSystemConfig(useUtilityMode=True)
    hawqdbs = hawqArray.getDbList()

    #
    # prune hawqArray according to filter settings
    #
    hawqdbs = [hdb for hdb in hawqdbs if hdb.isSegment()]

    if targetHost is not None and targetHost != "ALL":
        hawqdbs = [hdb for hdb in hawqdbs if hdb.getHostName() == targetHost]

    if targetRegistrationOrder is not None:
        hawqdbs = hawqArray.getDbList()
        regorder = int(targetRegistrationOrder)
        hawqdbs = [hdb for hdb in hawqdbs if hdb.getRegistrationOrder() == regorder]

    if targetRole is not None:
        hawqdbs = [hdb for hdb in hawqdbs if self.isMatchingRole(targetRole, hdb)]

    # only DOWN segments remaining? Error out
    downhawqdbs = [hdb for hdb in hawqdbs if hdb.getStatus() != 'u']
    if len(downhawqdbs) > 0:
        downhawqdbStr = "\n Down Segment: "
        raise ExceptionNoStackTraceNeeded(
            "Unable to inject fault. At least one segment is marked as down in the database.%s%s"
            % (downhawqdbStr, downhawqdbStr.join([str(downhdb) for downhdb in downhawqdbs])))

    print "### DEBUG: loadTargetSegments"
    print "### DEBUG: HAWQDBS "
    print hawqdbs
    return hawqdbs
def buildMirrors(self, actionName, gpEnv, gpArray):
    """
    Build the mirrors.

    gpArray must have already been altered to have updated directories -- that is, the failoverSegments
    from the mirrorsToBuild must be present in gpArray.
    """
    if len(self.__mirrorsToBuild) == 0:
        self.__logger.info("No segments to " + actionName)
        return True

    self.checkForPortAndDirectoryConflicts(gpArray)
    self.__logger.info("%s segment(s) to %s" % (len(self.__mirrorsToBuild), actionName))

    # make sure the target directories are up-to-date
    #  by cleaning them, if needed, and then copying a basic directory there
    #  the postgresql.conf in that basic directory will need updating (to change the port)
    toStopDirectives = []
    toEnsureMarkedDown = []
    cleanupDirectives = []
    copyDirectives = []
    for toRecover in self.__mirrorsToBuild:

        if toRecover.getFailedSegment() is not None:
            # will stop the failed segment. Note that we do this even if we are recovering to a different location!
            toStopDirectives.append(GpStopSegmentDirectoryDirective(toRecover.getFailedSegment()))
            if toRecover.getFailedSegment().getSegmentStatus() == gparray.STATUS_UP:
                toEnsureMarkedDown.append(toRecover.getFailedSegment())

        if toRecover.isFullSynchronization():

            isTargetReusedLocation = False
            if toRecover.getFailedSegment() is not None and \
                    toRecover.getFailoverSegment() is None:
                #
                # We are recovering a failed segment in-place
                #
                cleanupDirectives.append(GpCleanupSegmentDirectoryDirective(toRecover.getFailedSegment()))
                isTargetReusedLocation = True

            if toRecover.getFailoverSegment() is not None:
                targetSegment = toRecover.getFailoverSegment()
            else:
                targetSegment = toRecover.getFailedSegment()

            d = GpCopySegmentDirectoryDirective(toRecover.getLiveSegment(), targetSegment, isTargetReusedLocation)
            copyDirectives.append(d)

    self.__ensureStopped(gpEnv, toStopDirectives)
    self.__ensureSharedMemCleaned(gpEnv, toStopDirectives)
    self.__ensureMarkedDown(gpEnv, toEnsureMarkedDown)
    if not self.__forceoverwrite:
        self.__cleanUpSegmentDirectories(cleanupDirectives)
    self.__copySegmentDirectories(gpEnv, gpArray, copyDirectives)

    # update and save metadata in memory
    for toRecover in self.__mirrorsToBuild:

        if toRecover.getFailoverSegment() is None:
            # we are recovering the lost segment in place
            seg = toRecover.getFailedSegment()
        else:
            seg = toRecover.getFailedSegment()
            # no need to update the failed segment's information -- it is
            # being overwritten in the configuration with the failover segment
            for gpArraySegment in gpArray.getDbList():
                if gpArraySegment is seg:
                    raise Exception(
                        "failed segment should not be in the new configuration if failing over to new segment")

            seg = toRecover.getFailoverSegment()

        seg.setSegmentStatus(gparray.STATUS_DOWN)  # down initially, we haven't started it yet
        seg.setSegmentMode(gparray.MODE_NOT_SYNC)

    # figure out what needs to be started or transitioned
    mirrorsToStart = []
    rewindInfo = []
    primariesToConvert = []
    convertPrimaryUsingFullResync = []
    fullResyncMirrorDbIds = {}
    for toRecover in self.__mirrorsToBuild:
        seg = toRecover.getFailoverSegment()
        if seg is None:
            seg = toRecover.getFailedSegment()  # we are recovering in place
        mirrorsToStart.append(seg)
        primarySeg = toRecover.getLiveSegment()

        # Append to rewindInfo to execute pg_rewind later if we are not
        # using full recovery. We will run pg_rewind on incremental recovery
        # if the target mirror does not have recovery.conf file because
        # segment failover happened. The check for recovery.conf file will
        # happen in the same remote SegmentRewind Command call.
        if not toRecover.isFullSynchronization() \
                and seg.getSegmentRole() == gparray.ROLE_MIRROR:
            rewindInfo.append((seg, primarySeg.getSegmentHostName(), primarySeg.getSegmentPort()))

        # The change in configuration of the mirror to down requires that
        # the primary also be marked as unsynchronized.
        primarySeg.setSegmentMode(gparray.MODE_NOT_SYNC)
        primariesToConvert.append(primarySeg)
        convertPrimaryUsingFullResync.append(toRecover.isFullSynchronization())

        if toRecover.isFullSynchronization() and seg.getSegmentDbId() > 0:
            fullResyncMirrorDbIds[seg.getSegmentDbId()] = True

    # should use mainUtils.getProgramName but I can't make it work!
    programName = os.path.split(sys.argv[0])[-1]

    # Disable Ctrl-C, going to save metadata in database and transition segments
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    try:
        self.__logger.info("Updating configuration with new mirrors")
        configInterface.getConfigurationProvider().updateSystemConfig(
            gpArray,
            "%s: segment config for resync" % programName,
            dbIdToForceMirrorRemoveAdd=fullResyncMirrorDbIds,
            useUtilityMode=False,
            allowPrimary=False
        )

        self.__logger.info("Updating mirrors")

        if len(rewindInfo) != 0:
            self.__logger.info("Running pg_rewind on required mirrors")
            rewindFailedSegments = self.run_pg_rewind(rewindInfo)

            # Do not start mirrors that failed pg_rewind
            for failedSegment in rewindFailedSegments:
                mirrorsToStart.remove(failedSegment)

        self.__logger.info("Starting mirrors")
        start_all_successful = self.__startAll(gpEnv, gpArray, mirrorsToStart)
    finally:
        # Re-enable Ctrl-C
        signal.signal(signal.SIGINT, signal.default_int_handler)

    return start_all_successful