def test_pghbaconf_updated_successfully_one_failed_segment(self, mock_list_addrs, mock_cmd, mock_username):
    os.environ["GPHOME"] = "/usr/local/gpdb"
    config_primaries_for_replication(self.gparray, False, contents_to_update=[0])
    self.logger.info.assert_any_call(
        "Starting to modify pg_hba.conf on primary segments to allow replication connections")
    self.logger.info.assert_any_call(
        "Successfully modified pg_hba.conf on primary segments to allow replication connections")
    entries = self.entries_block.format(ip_primary1='192.168.1.1',
                                        ip_primary2='192.168.2.1',
                                        ip_mirror1='192.168.1.1',
                                        ip_mirror2='192.168.2.1')
    self.assertEqual(mock_cmd.call_count, 1)
    mock_cmd.assert_has_calls([
        call(name="append to pg_hba.conf",
             cmdStr=". /usr/local/gpdb/greenplum_path.sh; echo '%s' >> /data/primary0/pg_hba.conf; "
                    "pg_ctl -D /data/primary0 reload" % entries,
             ctxt=REMOTE,
             remoteHost="sdw1"),
    ])
def test_pghbaconf_updated_successfully(self, mock_cmd, mock_list_addrs):
    config_primaries_for_replication(self.gparray, False)
    self.logger.info.assert_any_call(
        "Starting to modify pg_hba.conf on primary segments to allow replication connections")
    self.logger.info.assert_any_call(
        "Successfully modified pg_hba.conf on primary segments to allow replication connections")
def test_pghbaconf_updated_fails(self, mock1, mock2):
    with self.assertRaisesRegex(Exception, "boom"):
        config_primaries_for_replication(self.gparray, False)
    self.logger.info.assert_any_call(
        "Starting to modify pg_hba.conf on primary segments to allow replication connections")
    self.logger.error.assert_any_call(
        "Failed while modifying pg_hba.conf on primary segments to allow replication connections: boom")
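# ---------------------------------------------------------------------------
# For context, a minimal sketch of the function exercised by the tests above,
# inferred only from their assertions; the real implementation lives in
# gppylib and differs in detail. build_pg_hba_entries() is a hypothetical
# stand-in for the logic (mocked via mock_list_addrs above) that renders the
# replication entries -- primary and mirror addresses, or hostnames when
# hba_hostnames is set -- for one primary/mirror pair. `logger` is assumed to
# be the module-level logger used elsewhere in this file.
# ---------------------------------------------------------------------------
import os

from gppylib.commands.base import Command, REMOTE


def config_primaries_for_replication(gpArray, hba_hostnames, contents_to_update=None):
    logger.info("Starting to modify pg_hba.conf on primary segments to allow replication connections")
    try:
        for segmentPair in gpArray.segmentPairs:
            primary = segmentPair.primaryDB
            # gprecoverseg passes the content ids being rebuilt; skip the rest.
            if contents_to_update is not None and primary.getSegmentContentId() not in contents_to_update:
                continue
            entries = build_pg_hba_entries(segmentPair, hba_hostnames)  # hypothetical helper
            datadir = primary.getSegmentDataDirectory()
            cmd = Command(name="append to pg_hba.conf",
                          cmdStr=". %s/greenplum_path.sh; echo '%s' >> %s/pg_hba.conf; pg_ctl -D %s reload"
                                 % (os.environ["GPHOME"], entries, datadir, datadir),
                          ctxt=REMOTE,
                          remoteHost=primary.getSegmentHostName())
            cmd.run(validateAfter=True)
    except Exception as e:
        logger.error("Failed while modifying pg_hba.conf on primary segments to allow "
                     "replication connections: %s" % str(e))
        raise
    logger.info("Successfully modified pg_hba.conf on primary segments to allow replication connections")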
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > gp.MAX_COORDINATOR_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree value provided with -B argument: %d" % self.__options.parallelDegree)
    if self.__options.parallelPerHost < 1 or self.__options.parallelPerHost > gp.MAX_SEGHOST_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid parallelPerHost value provided with -b argument: %d" % self.__options.parallelPerHost)

    self.__pool = WorkerPool(self.__options.parallelDegree)
    gpEnv = GpCoordinatorEnvironment(self.__options.coordinatorDataDirectory, True)

    # verify "where to recover" options
    optionCnt = 0
    if self.__options.newRecoverHosts is not None:
        optionCnt += 1
    if self.__options.recoveryConfigFile is not None:
        optionCnt += 1
    if self.__options.rebalanceSegments:
        optionCnt += 1
    if optionCnt > 1:
        raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getCoordinatorPort())

    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getCoordinatorPort())

    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    num_workers = min(len(gpArray.get_hostlist()), self.__options.parallelDegree)
    hosts = set(gpArray.get_hostlist(includeCoordinator=False))
    unreachable_hosts = get_unreachable_segment_hosts(hosts, num_workers)
    for i, segmentPair in enumerate(gpArray.segmentPairs):
        if segmentPair.primaryDB.getSegmentHostName() in unreachable_hosts:
            logger.warning("Not recovering segment %d because %s is unreachable"
                           % (segmentPair.primaryDB.dbid, segmentPair.primaryDB.getSegmentHostName()))
            gpArray.segmentPairs[i].primaryDB.unreachable = True
        if segmentPair.mirrorDB.getSegmentHostName() in unreachable_hosts:
            logger.warning("Not recovering segment %d because %s is unreachable"
                           % (segmentPair.mirrorDB.dbid, segmentPair.mirrorDB.getSegmentHostName()))
            gpArray.segmentPairs[i].mirrorDB.unreachable = True

    if not gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

    # We have phys-rep/filerep mirrors.
    if self.__options.newRecoverHosts is not None:
        try:
            uniqueHosts = []
            for h in self.__options.newRecoverHosts.split(','):
                if h.strip() not in uniqueHosts:
                    uniqueHosts.append(h.strip())
            self.__options.newRecoverHosts = uniqueHosts
        except Exception as ex:
            raise ProgramArgumentValidationException(
                "Invalid value for recover hosts: %s" % ex)

    # retain the list of hosts that existed in the system prior to getRecoverActions...
    # this will be needed for later calculations that determine whether
    # new hosts were added into the system
    existing_hosts = set(gpArray.getHostList())

    # figure out what needs to be done
    mirrorBuilder = self.getRecoveryActionsBasedOnOptions(gpEnv, gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.outputToFile(mirrorBuilder, gpArray, self.__options.outputSampleConfigFile)
        self.logger.info('Configuration file output to %s successfully.'
                         % self.__options.outputSampleConfigFile)
    elif self.__options.rebalanceSegments:
        assert (isinstance(mirrorBuilder, GpSegmentRebalanceOperation))

        # Make sure we have work to do
        if len(gpArray.get_unbalanced_segdbs()) == 0:
            self.logger.info("No segments are running in their non-preferred role and need to be rebalanced.")
        else:
            self.displayRecovery(mirrorBuilder, gpArray)

            if self.__options.interactive:
                self.logger.warn("This operation will cancel queries that are currently executing.")
                self.logger.warn("Connections to the database however will not be interrupted.")
                if not userinput.ask_yesno(None, "\nContinue with segment rebalance procedure", 'N'):
                    raise UserAbortedException()

            fullRebalanceDone = mirrorBuilder.rebalance()
            self.logger.info("******************************************************************")
            if fullRebalanceDone:
                self.logger.info("The rebalance operation has completed successfully.")
            else:
                self.logger.info("The rebalance operation has completed with WARNINGS."
                                 " Please review the output in the gprecoverseg log.")
            self.logger.info("******************************************************************")
    elif len(mirrorBuilder.getMirrorsToBuild()) == 0:
        self.logger.info('No segments to recover')
    else:
        mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)
        self.validate_heap_checksum_consistency(gpArray, mirrorBuilder)

        self.displayRecovery(mirrorBuilder, gpArray)
        self.__displayRecoveryWarnings(mirrorBuilder)

        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with segment recovery procedure", 'N'):
                raise UserAbortedException()

        # sync packages
        current_hosts = set(gpArray.getHostList())
        new_hosts = current_hosts - existing_hosts
        if new_hosts:
            self.syncPackages(new_hosts)

        contentsToUpdate = [seg.getLiveSegment().getSegmentContentId()
                            for seg in mirrorBuilder.getMirrorsToBuild()]
        config_primaries_for_replication(gpArray, self.__options.hba_hostnames, contentsToUpdate)
        if not mirrorBuilder.buildMirrors("recover", gpEnv, gpArray):
            sys.exit(1)

        self.trigger_fts_probe(port=gpEnv.getCoordinatorPort())

        self.logger.info("********************************")
        self.logger.info("Segments successfully recovered.")
        self.logger.info("********************************")

    sys.exit(0)
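# ---------------------------------------------------------------------------
# The reachability pass above depends on get_unreachable_segment_hosts(hosts,
# num_workers). A self-contained sketch of the idea, assuming it amounts to a
# parallel ping of every segment host (the real gppylib helper may use a
# different transport and timeout; -W is the Linux iputils timeout flag):
# ---------------------------------------------------------------------------
import subprocess
from concurrent.futures import ThreadPoolExecutor


def get_unreachable_segment_hosts_sketch(hosts, num_workers):
    def unreachable(host):
        # One ping with a short timeout; a nonzero exit status means the host
        # did not answer.
        rc = subprocess.call(['ping', '-c', '1', '-W', '2', host],
                             stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return host if rc != 0 else None

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        return set(h for h in pool.map(unreachable, hosts) if h is not None)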
def run(self):
    if self.__options.batch_size < 1 or self.__options.batch_size > gp.MAX_COORDINATOR_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid batch_size provided with -B argument: %d" % self.__options.batch_size)
    if self.__options.segment_batch_size < 1 or self.__options.segment_batch_size > gp.MAX_SEGHOST_NUM_WORKERS:
        raise ProgramArgumentValidationException(
            "Invalid segment_batch_size provided with -b argument: %d" % self.__options.segment_batch_size)

    self.__pool = base.WorkerPool(self.__options.batch_size)
    gpEnv = GpCoordinatorEnvironment(self.__options.coordinatorDataDirectory, True)

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getCoordinatorPort())
    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getCoordinatorPort())
    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    # check that heap_checksums is consistent across cluster, fail immediately if not
    self.validate_heap_checksums(gpArray)

    if self.__options.mirrorConfigFile is None:
        self.checkMirrorOffset(gpArray)

    # check that mirrors are not already configured
    if gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            "GPDB physical mirroring cannot be added. The cluster is already configured with Mirrors.")

    # figure out what needs to be done (AND update the gpArray!)
    mirrorBuilder = self.__getMirrorsToBuildBasedOnOptions(gpEnv, gpArray)
    mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.__outputToFile(mirrorBuilder, self.__options.outputSampleConfigFile, gpArray)
        logger.info('Configuration file output to %s successfully.'
                    % self.__options.outputSampleConfigFile)
    else:
        self.__displayAddMirrors(gpEnv, mirrorBuilder, gpArray)
        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with add mirrors procedure", 'N'):
                raise UserAbortedException()

        config_primaries_for_replication(gpArray, self.__options.hba_hostnames)
        if not mirrorBuilder.buildMirrors("add", gpEnv, gpArray):
            return 1

        logger.info("******************************************************************")
        logger.info("Mirror segments have been added; data synchronization is in progress.")
        logger.info("Data synchronization will continue in the background.")
        logger.info("Use gpstate -s to check the resynchronization progress.")
        logger.info("******************************************************************")

    return 0  # success -- exit code 0
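# Unlike the gprecoverseg run() above, which exits via sys.exit(), this run()
# hands its exit code back to the caller. A hypothetical wiring (the class and
# option-parser names are illustrative; the real entry point goes through
# gppylib's mainUtils):
#
#     sys.exit(GpAddMirrorsProgram(options).run())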
def run(self):
    if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
        raise ProgramArgumentValidationException(
            "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree)

    self.__pool = WorkerPool(self.__options.parallelDegree)
    gpEnv = GpMasterEnvironment(self.__options.masterDataDirectory, True)

    # verify "where to recover" options
    optionCnt = 0
    if self.__options.newRecoverHosts is not None:
        optionCnt += 1
    if self.__options.recoveryConfigFile is not None:
        optionCnt += 1
    if self.__options.rebalanceSegments:
        optionCnt += 1
    if optionCnt > 1:
        raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

    faultProberInterface.getFaultProber().initializeProber(gpEnv.getMasterPort())

    confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())

    gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

    if not gpArray.hasMirrors:
        raise ExceptionNoStackTraceNeeded(
            'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

    # We have phys-rep/filerep mirrors.
    if self.__options.newRecoverHosts is not None:
        try:
            uniqueHosts = []
            for h in self.__options.newRecoverHosts.split(','):
                if h.strip() not in uniqueHosts:
                    uniqueHosts.append(h.strip())
            self.__options.newRecoverHosts = uniqueHosts
        except Exception as ex:
            raise ProgramArgumentValidationException(
                "Invalid value for recover hosts: %s" % ex)

    # If it's a rebalance operation, make sure we are in an acceptable state to do that
    # Acceptable state is:
    #    - No segments down
    #    - No segments in change tracking or unsynchronized state
    if self.__options.rebalanceSegments:
        if len(gpArray.get_invalid_segdbs()) > 0:
            raise Exception("Down segments still exist. All segments must be up to rebalance.")
        if len(gpArray.get_synchronized_segdbs()) != len(gpArray.getSegDbList()):
            raise Exception("Some segments are not yet synchronized. All segments must be synchronized to rebalance.")

    # retain the list of hosts that existed in the system prior to getRecoverActions...
    # this will be needed for later calculations that determine whether
    # new hosts were added into the system
    existing_hosts = set(gpArray.getHostList())

    # figure out what needs to be done
    mirrorBuilder = self.getRecoveryActionsBasedOnOptions(gpEnv, gpArray)

    if self.__options.outputSampleConfigFile is not None:
        # just output config file and done
        self.outputToFile(mirrorBuilder, gpArray, self.__options.outputSampleConfigFile)
        self.logger.info('Configuration file output to %s successfully.'
                         % self.__options.outputSampleConfigFile)
    elif self.__options.rebalanceSegments:
        assert (isinstance(mirrorBuilder, GpSegmentRebalanceOperation))

        # Make sure we have work to do
        if len(gpArray.get_unbalanced_segdbs()) == 0:
            self.logger.info("No segments are running in their non-preferred role and need to be rebalanced.")
        else:
            self.displayRecovery(mirrorBuilder, gpArray)

            if self.__options.interactive:
                self.logger.warn("This operation will cancel queries that are currently executing.")
                self.logger.warn("Connections to the database however will not be interrupted.")
                if not userinput.ask_yesno(None, "\nContinue with segment rebalance procedure", 'N'):
                    raise UserAbortedException()

            fullRebalanceDone = mirrorBuilder.rebalance()
            self.logger.info("******************************************************************")
            if fullRebalanceDone:
                self.logger.info("The rebalance operation has completed successfully.")
            else:
                self.logger.info("The rebalance operation has completed with WARNINGS."
                                 " Please review the output in the gprecoverseg log.")
            self.logger.info("There is a resynchronization running in the background to bring all")
            self.logger.info("segments in sync.")
            self.logger.info("Use gpstate -e to check the resynchronization progress.")
            self.logger.info("******************************************************************")
    elif len(mirrorBuilder.getMirrorsToBuild()) == 0:
        self.logger.info('No segments to recover')
    else:
        mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)
        self.validate_heap_checksum_consistency(gpArray, mirrorBuilder)

        self.displayRecovery(mirrorBuilder, gpArray)
        self.__displayRecoveryWarnings(mirrorBuilder)

        if self.__options.interactive:
            if not userinput.ask_yesno(None, "\nContinue with segment recovery procedure", 'N'):
                raise UserAbortedException()

        # sync packages
        current_hosts = set(gpArray.getHostList())
        new_hosts = current_hosts - existing_hosts
        if new_hosts:
            self.syncPackages(new_hosts)

        config_primaries_for_replication(gpArray, self.__options.hba_hostnames)
        if not mirrorBuilder.buildMirrors("recover", gpEnv, gpArray):
            sys.exit(1)

        self.trigger_fts_probe(port=gpEnv.getMasterPort())

        self.logger.info("******************************************************************")
        self.logger.info("Updating segments for streaming is completed.")
        self.logger.info("For segments updated successfully, streaming will continue in the background.")
        self.logger.info("Use gpstate -s to check the streaming progress.")
        self.logger.info("******************************************************************")

    sys.exit(0)
class GpRecoverSegmentProgram:
    #
    # Constructor:
    #
    # @param options the options as returned by the options parser
    #
    def __init__(self, options):
        self.__options = options
        self.__pool = None
        self.logger = logger

        # If the user did not specify a value for showProgressInplace and
        # stdout is a tty then send escape sequences to gprecoverseg
        # output. Otherwise do not show progress inplace.
        if self.__options.showProgressInplace is None:
            self.__options.showProgressInplace = sys.stdout.isatty()

    def getProgressMode(self):
        if self.__options.showProgress:
            if self.__options.showProgressInplace:
                progressMode = GpMirrorListToBuild.Progress.INPLACE
            else:
                progressMode = GpMirrorListToBuild.Progress.SEQUENTIAL
        else:
            progressMode = GpMirrorListToBuild.Progress.NONE

        return progressMode

    def outputToFile(self, mirrorBuilder, gpArray, fileName):
        lines = []

        # one entry for each failure
        for mirror in mirrorBuilder.getMirrorsToBuild():
            output_str = ""
            seg = mirror.getFailedSegment()
            addr = canonicalize_address(seg.getSegmentAddress())
            output_str += ('%s|%d|%s' % (addr, seg.getSegmentPort(), seg.getSegmentDataDirectory()))

            seg = mirror.getFailoverSegment()
            if seg is not None:
                output_str += ' '
                addr = canonicalize_address(seg.getSegmentAddress())
                output_str += ('%s|%d|%s' % (addr, seg.getSegmentPort(), seg.getSegmentDataDirectory()))

            lines.append(output_str)
        writeLinesToFile(fileName, lines)

    def _getParsedRow(self, filename, lineno, line):
        groups = line.split()  # NOT line.split(' ') due to MPP-15675
        if len(groups) not in [1, 2]:
            msg = "line %d of file %s: expected 1 or 2 groups but found %d" % (lineno, filename, len(groups))
            raise ExceptionNoStackTraceNeeded(msg)
        parts = groups[0].split('|')
        if len(parts) != 3:
            msg = "line %d of file %s: expected 3 parts on failed segment group, obtained %d" % (
                lineno, filename, len(parts))
            raise ExceptionNoStackTraceNeeded(msg)
        address, port, datadir = parts
        check_values(lineno, address=address, port=port, datadir=datadir)
        row = {
            'failedAddress': address,
            'failedPort': port,
            'failedDataDirectory': datadir,
            'lineno': lineno
        }
        if len(groups) == 2:
            parts2 = groups[1].split('|')
            if len(parts2) != 3:
                msg = "line %d of file %s: expected 3 parts on new segment group, obtained %d" % (
                    lineno, filename, len(parts2))
                raise ExceptionNoStackTraceNeeded(msg)
            address2, port2, datadir2 = parts2
            check_values(lineno, address=address2, port=port2, datadir=datadir2)
            row.update({
                'newAddress': address2,
                'newPort': port2,
                'newDataDirectory': datadir2
            })

        return row

    def getRecoveryActionsFromConfigFile(self, gpArray):
        """
        getRecoveryActionsFromConfigFile

        returns: a GpMirrorListToBuild object containing information about the
                 segments that are able to recover
        """
        filename = self.__options.recoveryConfigFile
        rows = []
        with open(filename) as f:
            for lineno, line in line_reader(f):
                rows.append(self._getParsedRow(filename, lineno, line))

        allAddresses = [row["newAddress"] for row in rows if "newAddress" in row]
        interfaceLookup = GpInterfaceToHostNameCache(self.__pool, allAddresses, [None] * len(allAddresses))

        failedSegments = []
        failoverSegments = []
        for row in rows:
            # find the failed segment
            failedAddress = row['failedAddress']
            failedPort = row['failedPort']
            failedDataDirectory = normalizeAndValidateInputPath(row['failedDataDirectory'],
                                                                "config file", row['lineno'])
            failedSegment = None
            for segment in gpArray.getDbList():
                if (segment.getSegmentAddress() == failedAddress
                        and str(segment.getSegmentPort()) == failedPort
                        and segment.getSegmentDataDirectory() == failedDataDirectory):
                    if failedSegment is not None:
                        # this could be an assertion -- configuration should not allow multiple entries!
                        raise Exception(("A segment to recover was found twice in configuration. "
                                         "This segment is described by address|port|directory '%s|%s|%s' "
                                         "on the input line: %s") %
                                        (failedAddress, failedPort, failedDataDirectory, row['lineno']))
                    failedSegment = segment

            if failedSegment is None:
                raise Exception("A segment to recover was not found in configuration. "
                                "This segment is described by address|port|directory '%s|%s|%s' on the input line: %s" %
                                (failedAddress, failedPort, failedDataDirectory, row['lineno']))

            failoverSegment = None
            if "newAddress" in row:
                # When the second set was passed, the caller is going to tell us
                # to where we need to failover, so build a failover segment.
                #
                # these two lines make it so that failoverSegment points to the object that is registered in gparray
                failoverSegment = failedSegment
                failedSegment = failoverSegment.copy()

                address = row["newAddress"]
                try:
                    port = int(row["newPort"])
                except ValueError:
                    raise Exception('Config file format error, invalid number value in line: %s' % (row['lineno']))

                dataDirectory = normalizeAndValidateInputPath(row["newDataDirectory"], "config file", row['lineno'])

                hostName = interfaceLookup.getHostName(address)
                if hostName is None:
                    raise Exception('Unable to find host name for address %s from line:%s' % (address, row['lineno']))

                # now update values in failover segment
                failoverSegment.setSegmentAddress(address)
                failoverSegment.setSegmentHostName(hostName)
                failoverSegment.setSegmentPort(port)
                failoverSegment.setSegmentDataDirectory(dataDirectory)

            # this must come AFTER the if check above because failedSegment can be adjusted to
            # point to a different object
            failedSegments.append(failedSegment)
            failoverSegments.append(failoverSegment)

        peersForFailedSegments = self.findAndValidatePeersForFailedSegments(gpArray, failedSegments)

        segs = []
        segs_with_persistent_mirroring_disabled = []
        for index, failedSegment in enumerate(failedSegments):
            peerForFailedSegment = peersForFailedSegments[index]

            peerForFailedSegmentDbId = peerForFailedSegment.getSegmentDbId()
            if failedSegment.unreachable:
                continue

            segs.append(GpMirrorToBuild(failedSegment, peerForFailedSegment, failoverSegments[index],
                                        self.__options.forceFullResynchronization))

        self._output_segments_with_persistent_mirroring_disabled(segs_with_persistent_mirroring_disabled)

        return GpMirrorListToBuild(segs, self.__pool, self.__options.quiet,
                                   self.__options.parallelDegree,
                                   forceoverwrite=True,
                                   progressMode=self.getProgressMode())

    def findAndValidatePeersForFailedSegments(self, gpArray, failedSegments):
        dbIdToPeerMap = gpArray.getDbIdToPeerMap()
        peersForFailedSegments = [dbIdToPeerMap.get(seg.getSegmentDbId()) for seg in failedSegments]

        for i in range(len(failedSegments)):
            peer = peersForFailedSegments[i]
            if peer is None:
                raise Exception("No peer found for dbid %s" % failedSegments[i].getSegmentDbId())
            elif peer.isSegmentDown():
                raise Exception("Both segments for content %s are down; Try restarting Greenplum DB and running %s again."
                                % (peer.getSegmentContentId(), getProgramName()))

        return peersForFailedSegments
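    # For reference, a recovery config file as accepted by _getParsedRow()
    # above holds one or two space-separated address|port|datadir groups per
    # line (values here are illustrative):
    #
    #     sdw1|40000|/data/primary0
    #     sdw2|40001|/data/primary1 sdw3|40001|/data/mirror1
    #
    # The first line requests in-place recovery; the second recovers to a new
    # location and parses to roughly:
    #     {'failedAddress': 'sdw2', 'failedPort': '40001',
    #      'failedDataDirectory': '/data/primary1', 'newAddress': 'sdw3',
    #      'newPort': '40001', 'newDataDirectory': '/data/mirror1', 'lineno': 2}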
    def getRecoveryActionsFromConfiguration(self, gpEnv, gpArray):
        """
        getRecoveryActionsFromConfiguration

        returns: a GpMirrorListToBuild object containing information about the
                 segments that are able to recover
        """
        segments = gpArray.getSegDbList()

        failedSegments = [seg for seg in segments if seg.isSegmentDown()]
        peersForFailedSegments = self.findAndValidatePeersForFailedSegments(gpArray, failedSegments)

        # Dictionaries used for building mapping to new hosts
        recoverAddressMap = {}
        recoverHostMap = {}
        interfaceHostnameWarnings = []

        # Check if the array is a "standard" array
        (isStandardArray, _ignore) = gpArray.isStandardArray()

        recoverHostIdx = 0

        if self.__options.newRecoverHosts and len(self.__options.newRecoverHosts) > 0:
            for seg in failedSegments:
                segAddress = seg.getSegmentAddress()
                segHostname = seg.getSegmentHostName()

                # Haven't seen this hostname before so we put it on a new host
                if segHostname not in recoverHostMap:
                    try:
                        recoverHostMap[segHostname] = self.__options.newRecoverHosts[recoverHostIdx]
                    except:
                        # If we get here, not enough hosts were specified in the -p option. Need 1 new host
                        # per 1 failed host.
                        raise Exception('Not enough new recovery hosts given for recovery.')
                    recoverHostIdx += 1

                if isStandardArray:
                    # We have a standard array configuration, so we'll try to use the same
                    # interface naming convention. If this doesn't work, we'll correct it
                    # below on name lookup
                    segInterface = segAddress[segAddress.rfind('-'):]
                    destAddress = recoverHostMap[segHostname] + segInterface
                    destHostname = recoverHostMap[segHostname]
                else:
                    # Non standard configuration so we won't make assumptions on
                    # naming. Instead we'll use the hostname passed in for both
                    # hostname and address and flag for warning later.
                    destAddress = recoverHostMap[segHostname]
                    destHostname = recoverHostMap[segHostname]

                # Save off the new host/address for this address.
                recoverAddressMap[segAddress] = (destHostname, destAddress)

            # Now that we've generated the mapping, look up all the addresses to make
            # sure they are resolvable.
            interfaces = [address for (_ignore, address) in recoverAddressMap.values()]
            interfaceLookup = GpInterfaceToHostNameCache(self.__pool, interfaces, [None] * len(interfaces))

            for key in recoverAddressMap.keys():
                (newHostname, newAddress) = recoverAddressMap[key]
                try:
                    addressHostnameLookup = interfaceLookup.getHostName(newAddress)
                    # Lookup failed so use hostname passed in for everything.
                    if addressHostnameLookup is None:
                        interfaceHostnameWarnings.append(
                            "Lookup of %s failed. Using %s for both hostname and address." % (newAddress, newHostname))
                        newAddress = newHostname
                except:
                    # Catch all exceptions. We will use hostname instead of address
                    # that we generated.
                    interfaceHostnameWarnings.append(
                        "Lookup of %s failed. Using %s for both hostname and address." % (newAddress, newHostname))
                    newAddress = newHostname

                # if we've updated the address to use the hostname because of lookup failure
                # make sure the hostname is resolvable and up
                if newHostname == newAddress:
                    try:
                        unix.Ping.local("ping new hostname", newHostname)
                    except:
                        raise Exception("Ping of host %s failed." % newHostname)

                # Save changes in map
                recoverAddressMap[key] = (newHostname, newAddress)

            if len(self.__options.newRecoverHosts) != recoverHostIdx:
                interfaceHostnameWarnings.append("The following recovery hosts were not needed:")
                for h in self.__options.newRecoverHosts[recoverHostIdx:]:
                    interfaceHostnameWarnings.append("\t%s" % h)

        portAssigner = PortAssigner(gpArray)

        forceFull = self.__options.forceFullResynchronization

        segs = []
        segs_with_persistent_mirroring_disabled = []
        for i in range(len(failedSegments)):
            failoverSegment = None
            failedSegment = failedSegments[i]
            liveSegment = peersForFailedSegments[i]

            if self.__options.newRecoverHosts and len(self.__options.newRecoverHosts) > 0:
                (newRecoverHost, newRecoverAddress) = recoverAddressMap[failedSegment.getSegmentAddress()]
                # these two lines make it so that failoverSegment points to the object that is registered in gparray
                failoverSegment = failedSegment
                failedSegment = failoverSegment.copy()
                failoverSegment.setSegmentHostName(newRecoverHost)
                failoverSegment.setSegmentAddress(newRecoverAddress)
                port = portAssigner.findAndReservePort(newRecoverHost, newRecoverAddress)
                failoverSegment.setSegmentPort(port)

            if failedSegment.unreachable:
                continue

            segs.append(GpMirrorToBuild(failedSegment, liveSegment, failoverSegment, forceFull))

        self._output_segments_with_persistent_mirroring_disabled(segs_with_persistent_mirroring_disabled)

        return GpMirrorListToBuild(segs, self.__pool, self.__options.quiet,
                                   self.__options.parallelDegree,
                                   interfaceHostnameWarnings,
                                   forceoverwrite=True,
                                   progressMode=self.getProgressMode())

    def _output_segments_with_persistent_mirroring_disabled(self, segs_persistent_mirroring_disabled=None):
        if segs_persistent_mirroring_disabled:
            self.logger.warn('Segments with dbid %s not recovered; persistent mirroring state is disabled.' %
                             (', '.join(str(seg_id) for seg_id in segs_persistent_mirroring_disabled)))

    def getRecoveryActionsBasedOnOptions(self, gpEnv, gpArray):
        if self.__options.rebalanceSegments:
            return GpSegmentRebalanceOperation(gpEnv, gpArray)
        elif self.__options.recoveryConfigFile is not None:
            return self.getRecoveryActionsFromConfigFile(gpArray)
        else:
            return self.getRecoveryActionsFromConfiguration(gpEnv, gpArray)
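    # To illustrate the "standard array" mapping above with hypothetical
    # values: a failed segment at address 'sdw1-2' keeps its interface suffix
    # (everything from the last '-', here '-2') when mapped onto a new
    # recovery host 'sdw9', so destAddress becomes 'sdw9-2'. In a
    # non-standard array, 'sdw9' is used for both hostname and address, and a
    # warning is queued if the generated address later fails to resolve.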
    def syncPackages(self, new_hosts):
        # The design decision here is to squash any exceptions resulting from the
        # synchronization of packages. We should *not* disturb the user's attempts to recover.
        try:
            self.logger.info('Syncing Greenplum Database extensions')
            operations = [SyncPackages(host) for host in new_hosts]
            ParallelOperation(operations, self.__options.parallelDegree).run()
            # introspect outcomes
            for operation in operations:
                operation.get_ret()
        except:
            self.logger.exception('Syncing of Greenplum Database extensions has failed.')
            self.logger.warning('Please run gppkg --clean after successful segment recovery.')

    def displayRecovery(self, mirrorBuilder, gpArray):
        self.logger.info('Greenplum instance recovery parameters')
        self.logger.info('---------------------------------------------------------')

        if self.__options.recoveryConfigFile:
            self.logger.info('Recovery from configuration -i option supplied')
        elif self.__options.newRecoverHosts is not None:
            self.logger.info('Recovery type = Pool Host')
            for h in self.__options.newRecoverHosts:
                self.logger.info('Pool host for recovery = %s' % h)
        elif self.__options.rebalanceSegments:
            self.logger.info('Recovery type = Rebalance')
        else:
            self.logger.info('Recovery type = Standard')

        if self.__options.rebalanceSegments:
            i = 1
            total = len(gpArray.get_unbalanced_segdbs())
            for toRebalance in gpArray.get_unbalanced_segdbs():
                tabLog = TableLogger()
                self.logger.info('---------------------------------------------------------')
                self.logger.info('Unbalanced segment %d of %d' % (i, total))
                self.logger.info('---------------------------------------------------------')
                programIoUtils.appendSegmentInfoForOutput("Unbalanced", gpArray, toRebalance, tabLog)
                tabLog.info(["Balanced role", "= Primary" if toRebalance.preferred_role == 'p' else "= Mirror"])
                tabLog.info(["Current role", "= Primary" if toRebalance.role == 'p' else "= Mirror"])
                tabLog.outputTable()
                i += 1
        else:
            i = 0
            total = len(mirrorBuilder.getMirrorsToBuild())
            for toRecover in mirrorBuilder.getMirrorsToBuild():
                self.logger.info('---------------------------------------------------------')
                self.logger.info('Recovery %d of %d' % (i + 1, total))
                self.logger.info('---------------------------------------------------------')

                tabLog = TableLogger()

                syncMode = "Full" if toRecover.isFullSynchronization() else "Incremental"
                tabLog.info(["Synchronization mode", "= " + syncMode])
                programIoUtils.appendSegmentInfoForOutput("Failed", gpArray, toRecover.getFailedSegment(), tabLog)
                programIoUtils.appendSegmentInfoForOutput("Recovery Source", gpArray, toRecover.getLiveSegment(), tabLog)
                if toRecover.getFailoverSegment() is not None:
                    programIoUtils.appendSegmentInfoForOutput("Recovery Target", gpArray,
                                                              toRecover.getFailoverSegment(), tabLog)
                else:
                    tabLog.info(["Recovery Target", "= in-place"])
                tabLog.outputTable()
                i = i + 1

        self.logger.info('---------------------------------------------------------')

    def __getSimpleSegmentLabel(self, seg):
        addr = canonicalize_address(seg.getSegmentAddress())
        return "%s:%s" % (addr, seg.getSegmentDataDirectory())

    def __displayRecoveryWarnings(self, mirrorBuilder):
        for warning in self._getRecoveryWarnings(mirrorBuilder):
            self.logger.warn(warning)

    def _getRecoveryWarnings(self, mirrorBuilder):
        """
        return an array of string warnings regarding the recovery
        """
        res = []
        for toRecover in mirrorBuilder.getMirrorsToBuild():
            if toRecover.getFailoverSegment() is not None:
                #
                # user specified a failover location -- warn if it's the same host as its primary
                #
                src = toRecover.getLiveSegment()
                dest = toRecover.getFailoverSegment()

                if src.getSegmentHostName() == dest.getSegmentHostName():
                    res.append("Segment is being recovered to the same host as its primary: "
                               "primary %s failover target: %s" %
                               (self.__getSimpleSegmentLabel(src), self.__getSimpleSegmentLabel(dest)))

        for warning in mirrorBuilder.getAdditionalWarnings():
            res.append(warning)

        return res

    def _get_dblist(self):
        # template0 does not accept any connections so we exclude it
        with dbconn.connect(dbconn.DbURL()) as conn:
            res = dbconn.execSQL(conn, "SELECT datname FROM PG_DATABASE WHERE datname != 'template0'")
            return res.fetchall()

    def run(self):
        if self.__options.parallelDegree < 1 or self.__options.parallelDegree > 64:
            raise ProgramArgumentValidationException(
                "Invalid parallelDegree provided with -B argument: %d" % self.__options.parallelDegree)

        self.__pool = WorkerPool(self.__options.parallelDegree)
        gpEnv = GpMasterEnvironment(self.__options.masterDataDirectory, True)

        # verify "where to recover" options
        optionCnt = 0
        if self.__options.newRecoverHosts is not None:
            optionCnt += 1
        if self.__options.recoveryConfigFile is not None:
            optionCnt += 1
        if self.__options.rebalanceSegments:
            optionCnt += 1
        if optionCnt > 1:
            raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

        faultProberInterface.getFaultProber().initializeProber(gpEnv.getMasterPort())

        confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getMasterPort())

        gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

        num_workers = min(len(gpArray.get_hostlist()), self.__options.parallelDegree)
        hosts = set(gpArray.get_hostlist(includeMaster=False))
        unreachable_hosts = get_unreachable_segment_hosts(hosts, num_workers)
        for i, segmentPair in enumerate(gpArray.segmentPairs):
            if segmentPair.primaryDB.getSegmentHostName() in unreachable_hosts:
                logger.warning("Not recovering segment %d because %s is unreachable"
                               % (segmentPair.primaryDB.dbid, segmentPair.primaryDB.getSegmentHostName()))
                gpArray.segmentPairs[i].primaryDB.unreachable = True
            if segmentPair.mirrorDB.getSegmentHostName() in unreachable_hosts:
                logger.warning("Not recovering segment %d because %s is unreachable"
                               % (segmentPair.mirrorDB.dbid, segmentPair.mirrorDB.getSegmentHostName()))
                gpArray.segmentPairs[i].mirrorDB.unreachable = True

        if not gpArray.hasMirrors:
            raise ExceptionNoStackTraceNeeded(
                'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

        # We have phys-rep/filerep mirrors.
        if self.__options.newRecoverHosts is not None:
            try:
                uniqueHosts = []
                for h in self.__options.newRecoverHosts.split(','):
                    if h.strip() not in uniqueHosts:
                        uniqueHosts.append(h.strip())
                self.__options.newRecoverHosts = uniqueHosts
            except Exception as ex:
                raise ProgramArgumentValidationException(
                    "Invalid value for recover hosts: %s" % ex)

        # retain the list of hosts that existed in the system prior to getRecoverActions...
        # this will be needed for later calculations that determine whether
        # new hosts were added into the system
        existing_hosts = set(gpArray.getHostList())

        # figure out what needs to be done
        mirrorBuilder = self.getRecoveryActionsBasedOnOptions(gpEnv, gpArray)

        if self.__options.outputSampleConfigFile is not None:
            # just output config file and done
            self.outputToFile(mirrorBuilder, gpArray, self.__options.outputSampleConfigFile)
            self.logger.info('Configuration file output to %s successfully.'
                             % self.__options.outputSampleConfigFile)
        elif self.__options.rebalanceSegments:
            assert (isinstance(mirrorBuilder, GpSegmentRebalanceOperation))

            # Make sure we have work to do
            if len(gpArray.get_unbalanced_segdbs()) == 0:
                self.logger.info("No segments are running in their non-preferred role and need to be rebalanced.")
            else:
                self.displayRecovery(mirrorBuilder, gpArray)

                if self.__options.interactive:
                    self.logger.warn("This operation will cancel queries that are currently executing.")
                    self.logger.warn("Connections to the database however will not be interrupted.")
                    if not userinput.ask_yesno(None, "\nContinue with segment rebalance procedure", 'N'):
                        raise UserAbortedException()

                fullRebalanceDone = mirrorBuilder.rebalance()
                self.logger.info("******************************************************************")
                if fullRebalanceDone:
                    self.logger.info("The rebalance operation has completed successfully.")
                else:
                    self.logger.info("The rebalance operation has completed with WARNINGS."
                                     " Please review the output in the gprecoverseg log.")
                self.logger.info("******************************************************************")
        elif len(mirrorBuilder.getMirrorsToBuild()) == 0:
            self.logger.info('No segments to recover')
        else:
            mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)
            self.validate_heap_checksum_consistency(gpArray, mirrorBuilder)

            self.displayRecovery(mirrorBuilder, gpArray)
            self.__displayRecoveryWarnings(mirrorBuilder)

            if self.__options.interactive:
                if not userinput.ask_yesno(None, "\nContinue with segment recovery procedure", 'N'):
                    raise UserAbortedException()

            # sync packages
            current_hosts = set(gpArray.getHostList())
            new_hosts = current_hosts - existing_hosts
            if new_hosts:
                self.syncPackages(new_hosts)

            config_primaries_for_replication(gpArray, self.__options.hba_hostnames)
            if not mirrorBuilder.buildMirrors("recover", gpEnv, gpArray):
                sys.exit(1)

            confProvider.sendPgElogFromMaster("Recovery of %d segment(s) has been started." %
                                              len(mirrorBuilder.getMirrorsToBuild()), True)

            self.trigger_fts_probe(port=gpEnv.getMasterPort())

            self.logger.info("********************************")
            self.logger.info("Segments successfully recovered.")
            self.logger.info("********************************")

        sys.exit(0)