def rebalance(runner):
    # Exec @SystemInformation to find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Build the stop list.
    target_host_ids = []
    targets = []
    if runner.opts.stoplist:
        targets = runner.opts.stoplist.split(',')
    for host in targets:
        target = hosts.get_host(host)
        if target is None:
            runner.abort('Host not found in cluster: %s' % host)
        else:
            target_host_ids.append(str(target.id))
    # Connect to an arbitrary host that isn't being stopped.
    chost = hosts.get_connection_host(targets)
    if chost is None:
        runner.abort('Could not find a host other than the hosts to be stopped in cluster: %s'
                     % runner.opts.target_host)
    user_info = ''
    if runner.opts.username:
        user_info = ', user: %s' % runner.opts.username
    runner.info('Connecting to %s:%d%s (%s) to issue "rebalance" command' %
                (chost.get_admininterface(), chost.adminport, user_info, chost.hostname))
    # Set up the connection.
    runner.voltdb_connect(chost.get_admininterface(), chost.adminport,
                          runner.opts.username, runner.opts.password)
    json_opts = ['command:"rebalance"']
    if target_host_ids:
        stoplist = 'hosts:%s' % ("-".join(target_host_ids))
        json_opts.append(stoplist)
    if runner.opts.kfactor:
        json_opts.append('kfactor:%s' % (runner.opts.kfactor))
    if runner.opts.hostcount:
        json_opts.append('hostcount:%s' % (runner.opts.hostcount))
    if not runner.opts.dryrun:
        response = runner.call_proc('@Rebalance',
                                    [VOLT.FastSerializer.VOLTTYPE_STRING],
                                    ['{%s}' % (','.join(json_opts))])
        print response
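
# Every command in this module starts by folding the (HOST_ID, KEY, VALUE)
# rows of "@SystemInformation OVERVIEW" into a Hosts object and then reading
# fields such as host.version or host.fullclustersize off an arbitrary member.
# The real Hosts class lives elsewhere in voltcli; the sketch below only
# illustrates the shape the call sites here assume (it omits the dict-style
# access, e.g. "'license' not in host" and host.get(...), that some commands
# also rely on). It is an illustration, not the actual implementation.
class _HostSketch(object):
    """Attribute bag for one cluster node (hypothetical stand-in)."""
    pass

class _HostsSketch(object):
    """Illustrative stand-in for voltcli's Hosts helper."""
    def __init__(self, abort):
        self.abort = abort          # error callback, e.g. runner.abort
        self.hosts_by_id = {}       # host id -> attribute bag
    def update(self, host_id, key, value):
        # Each OVERVIEW row contributes one attribute to one host.
        host = self.hosts_by_id.setdefault(host_id, _HostSketch())
        setattr(host, key.lower(), value)
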
def resize(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(*tuple)
    # Get current version and root directory from an arbitrary node.
    host = next(iter(hosts.hosts_by_id.values()))
    # Check the version of the target cluster to make sure resize is supported.
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if "license" not in host:
        runner.abort('Elastic resize is only available in the enterprise edition.')
    if majorVersion < RELEASE_MAJOR_VERSION or (
            majorVersion == RELEASE_MAJOR_VERSION and minorVersion < RELEASE_MINOR_VERSION):
        runner.abort('The version of the target cluster is ' + version +
                     ', which is lower than version ' + str(RELEASE_MAJOR_VERSION) +
                     '.' + str(RELEASE_MINOR_VERSION) +
                     ', the minimum required for elastic resize.')
    # Convert the shutdown delay from minutes to milliseconds.
    shutdown_delay = runner.opts.shutdown_delay * 60000 if runner.opts.shutdown_delay > 0 else -1
    option = runner.opts.opt
    result = runner.call_proc('@ElasticRemoveNT',
                              [VOLT.FastSerializer.VOLTTYPE_TINYINT,
                               VOLT.FastSerializer.VOLTTYPE_STRING,
                               VOLT.FastSerializer.VOLTTYPE_STRING,
                               VOLT.FastSerializer.VOLTTYPE_BIGINT],
                              [option, '', ','.join(runner.opts.skip_requirements),
                               shutdown_delay]).table(0)
    status = result.tuple(0).column_integer(0)
    message = result.tuple(0).column_string(1)
    if option in (Option.TEST, Option.START) and "host ids:" in message:
        host_names = ', '.join([hostIdsToNames(id, hosts)
                                for id in re.search(r'host ids: \[(.+?)\]',
                                                    message).group(1).split(',')])
        if option == Option.TEST:
            message = "Hosts will be removed: [" + host_names + "], " + message
        elif option == Option.START:
            message = "Starting cluster resize: Removing hosts: [" + host_names + "], " + message
    if status == 0:
        runner.info(message)
    else:
        runner.error(message)
        sys.exit(1)
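
# The regex in resize() pulls the bracketed id list out of a server message
# containing a fragment like 'host ids: [2,3]' and maps each id to a hostname.
# A quick illustration of the extraction step (the message text here is
# hypothetical):
#
#   >>> import re
#   >>> m = re.search(r'host ids: \[(.+?)\]', 'removable host ids: [2,3]')
#   >>> m.group(1).split(',')
#   ['2', '3']
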
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Get current version and root directory from an arbitrary node.
    host = hosts.hosts_by_id.itervalues().next()
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        runner.abort("Current cluster needs %d more node(s) to achieve full K-safety. "
                     "In-service upgrade is not recommended in a partial K-safety cluster."
                     % (fullClusterSize - len(hosts.hosts_by_id)))
    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort("The cluster has an odd number of nodes; plan_upgrade needs an extra node to generate the instructions")
    currentVersion = host.version
    currentVoltDBRoot = host.voltdbroot
    currentDeployment = host.deployment
    xmlroot = ElementTree.parse(currentDeployment).getroot()
    cluster = xmlroot.find("./cluster")
    if cluster is None:
        runner.abort("Couldn't find cluster tag in current deployment file")
    kfactor_tag = cluster.get('kfactor')
    if kfactor_tag is None:
        kfactor = 0
    else:
        kfactor = int(kfactor_tag)
    # Sanity checks, in case the node count or K-factor is less than required.
    # K = 0: abort with an error message.
    if kfactor == 0:
        runner.abort("Current cluster doesn't have duplicate partitions to perform in-service upgrade. K-factor: %d"
                     % kfactor)
    # N = 1: abort with an error message.
    if fullClusterSize == 1:
        runner.abort("Current cluster doesn't have enough nodes to perform in-service upgrade; at least two nodes are required")
    return hosts, kfactor
def stop(runner):
    # Exec @SystemInformation to find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Connect to an arbitrary host that isn't being stopped.
    defaultport = 3021
    min_hosts = 1
    max_hosts = 1
    target_host = utility.parse_hosts(runner.opts.target_host, min_hosts,
                                      max_hosts, defaultport)[0]
    (thost, chost) = hosts.get_target_and_connection_host(target_host.host, target_host.port)
    if thost is None:
        runner.abort('Host not found in cluster: %s:%d' % (target_host.host, target_host.port))
    if chost is None:
        runner.abort('The entire cluster is being stopped, use "shutdown" instead.')
    if runner.opts.username:
        user_info = ', user: %s' % runner.opts.username
    else:
        user_info = ''
    runner.info('Connecting to %s:%d%s (%s) to issue "stop" command' %
                (chost.get_admininterface(), chost.adminport, user_info, chost.hostname))
    runner.voltdb_connect(chost.get_admininterface(), chost.adminport,
                          runner.opts.username, runner.opts.password,
                          runner.opts.ssl_config, runner.opts.kerberos)
    # Stop the requested host using @StopNode HOST_ID. check_status=False keeps
    # a failed call from aborting the CLI so the status can be mapped to an
    # exit code manually.
    runner.info('Stopping host %d: %s:%s' % (thost.id, thost.hostname, thost.internalport))
    if not runner.opts.dryrun:
        response = runner.call_proc('@StopNode',
                                    [VOLT.FastSerializer.VOLTTYPE_INTEGER],
                                    [thost.id],
                                    check_status=False)
        print response
        if response.status() != 1:  # not SUCCESS
            sys.exit(1)
def findTargetHsId(runner):
    # Exec @SystemInformation to find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Look up the requested target host; exactly one host may be specified.
    defaultport = 3021
    min_hosts = 1
    max_hosts = 1
    target_host = utility.parse_hosts(runner.opts.target_host, min_hosts,
                                      max_hosts, defaultport)[0]
    (thost, chost) = hosts.get_target_and_connection_host(target_host.host, target_host.port)
    if thost is None:
        runner.abort('Host not found in cluster: %s:%d' % (target_host.host, target_host.port))
    return thost.id
def getOwnClusterId(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Read the cluster id from an arbitrary node.
    host = hosts.hosts_by_id.itervalues().next()
    # ClusterId was added to @SystemInformation in v7.2, so check the version
    # of the target cluster before relying on it.
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if majorVersion < RELEASE_MAJOR_VERSION or (
            majorVersion == RELEASE_MAJOR_VERSION and minorVersion < RELEASE_MINOR_VERSION):
        return -1
    return int(host.clusterid)
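
# The (major, minor) gate above is repeated verbatim in several commands.
# Python compares tuples lexicographically, so the same check can be written
# in one expression; _version_lt below is an illustrative helper, not part of
# the original module.
def _version_lt(version, major, minor):
    """Return True if an 'X.Y[.Z...]' version string is below major.minor."""
    parts = version.split('.')
    return (int(parts[0]), int(parts[1])) < (major, minor)

# Example: _version_lt('7.1', 7, 2) is True; _version_lt('7.2', 7, 2) is False.
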
def procedureCaller(runner, type):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Get the current version from an arbitrary node and make sure the target
    # cluster is new enough to support elastic resize.
    host = hosts.hosts_by_id.itervalues().next()
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if majorVersion < RELEASE_MAJOR_VERSION or (
            majorVersion == RELEASE_MAJOR_VERSION and minorVersion < RELEASE_MINOR_VERSION):
        runner.abort('The version of the target cluster is ' + version +
                     ', which is lower than version ' + str(RELEASE_MAJOR_VERSION) +
                     '.' + str(RELEASE_MINOR_VERSION) +
                     ', the minimum required for elastic resize.')
    result = runner.call_proc('@ElasticRemoveNT',
                              [VOLT.FastSerializer.VOLTTYPE_TINYINT,
                               VOLT.FastSerializer.VOLTTYPE_STRING,
                               VOLT.FastSerializer.VOLTTYPE_STRING],
                              [type, '', ','.join(runner.opts.skip_requirements)]).table(0)
    status = result.tuple(0).column_integer(0)
    message = result.tuple(0).column_string(1)
    if status == 0:
        runner.info(message)
    else:
        runner.error(message)
        sys.exit(1)
def getClusterInfo(runner, available_hosts, clearHostCache):
    # Raise an exception when the connection fails.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'], True, None, True)
    if response.response.status != 1:
        return None
    if clearHostCache:
        available_hosts[:] = []
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    for hostId, hostInfo in hosts.hosts_by_id.items():
        if hostInfo.hostname not in available_hosts:
            available_hosts.append(hostInfo.hostname + ":" + str(hostInfo.clientport))
    # Get current version and cluster metadata from an arbitrary node.
    host = hosts.hosts_by_id.itervalues().next()
    # ClusterId was added to @SystemInformation in v7.2, so check the version
    # of the target cluster before relying on it.
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if majorVersion < RELEASE_MAJOR_VERSION or (
            majorVersion == RELEASE_MAJOR_VERSION and minorVersion < RELEASE_MINOR_VERSION):
        runner.abort("Only v7.2 or higher versions of VoltDB support this command. "
                     "The target cluster is running v" + version + ".")
    clusterId = host.clusterid
    fullClusterSize = int(host.fullclustersize)
    uptime = host.uptime
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'], True, None, True)
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = tuple[1]
            break
    cluster = Cluster(int(clusterId), version, int(kfactor), int(fullClusterSize), uptime)
    for hostId, hostInfo in hosts.hosts_by_id.items():
        cluster.add_member(hostId, hostInfo.hostname)
    # Count live clients connected to the cluster.
    try:
        response = checkstats.get_stats(runner, "LIVECLIENTS")
    except StatisticsProcedureException as e:
        runner.info(e.message)
        sys.exit(e.exitCode)
    liveclients = 0
    for tuple in response.table(0).tuples():
        isAdmin = tuple[5]
        # Exclude admin connections.
        if isAdmin != 1:
            liveclients += 1
    cluster.update_live_clients(liveclients)
    if runner.opts.dr:
        # Do we have any ongoing DR conversation?
        try:
            response = checkstats.get_stats(runner, "DRROLE")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)
        for tuple in response.table(0).tuples():
            role = tuple[0]
            status = tuple[1]
            remote_cluster_id = tuple[2]
            if remote_cluster_id != -1:
                cluster.add_remote_cluster(remote_cluster_id, status, role)
        try:
            response = checkstats.get_stats(runner, "DRPRODUCER")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)
        for tuple in response.table(0).tuples():
            host_name = tuple[2]
            remote_cluster_id = tuple[4]
            last_queued_drid = tuple[10]
            last_queued_ts = tuple[12]
            last_acked_ts = tuple[13]
            if last_queued_drid == -1:
                delay = 0
            else:
                delay = (last_queued_ts - last_acked_ts).total_seconds()
            cluster.get_remote_cluster(remote_cluster_id).update_producer_latency(
                host_name, remote_cluster_id, delay)
        # Find the remote topology through DRCONSUMER stats.
        try:
            response = checkstats.get_stats(runner, "DRCONSUMER")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)
        for tuple in response.table(1).tuples():
            remote_cluster_id = tuple[4]
            covering_host = tuple[7]
            last_applied_ts = tuple[9]
            if covering_host != '':
                cluster.get_remote_cluster(remote_cluster_id).add_remote_member(covering_host)
    return cluster
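
# getClusterInfo aggregates its findings into a Cluster object whose API
# (add_member, update_live_clients, add_remote_cluster, get_remote_cluster,
# update_producer_latency, add_remote_member) is taken from the call sites
# above. A minimal sketch of that assumed shape, for orientation only; the
# real classes are defined elsewhere in voltcli:
class _RemoteClusterSketch(object):
    def __init__(self, cluster_id, status, role):
        self.cluster_id, self.status, self.role = cluster_id, status, role
        self.producer_latency = {}   # producer host name -> seconds of DR delay
        self.members = []            # covering hosts seen in DRCONSUMER stats
    def update_producer_latency(self, host_name, cluster_id, delay):
        self.producer_latency[host_name] = delay
    def add_remote_member(self, covering_host):
        if covering_host not in self.members:
            self.members.append(covering_host)

class _ClusterSketch(object):
    def __init__(self, cluster_id, version, kfactor, size, uptime):
        self.cluster_id, self.version, self.kfactor = cluster_id, version, kfactor
        self.size, self.uptime = size, uptime
        self.members = {}            # host id -> hostname
        self.live_clients = 0
        self.remote_clusters = {}    # remote cluster id -> _RemoteClusterSketch
    def add_member(self, host_id, hostname):
        self.members[host_id] = hostname
    def update_live_clients(self, count):
        self.live_clients = count
    def add_remote_cluster(self, cluster_id, status, role):
        self.remote_clusters[cluster_id] = _RemoteClusterSketch(cluster_id, status, role)
    def get_remote_cluster(self, cluster_id):
        return self.remote_clusters[cluster_id]
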
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Get current version and root directory from an arbitrary node.
    host = next(iter(hosts.hosts_by_id.values()))
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        delta = fullClusterSize - len(hosts.hosts_by_id)
        runner.abort("Current cluster needs %d more node%s to achieve full K-safety. "
                     "Online upgrade is not supported in a partial K-safety cluster."
                     % (delta, "" if delta == 1 else "s"))
    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort("The cluster has an odd number of nodes; plan_upgrade needs an extra node to generate the instructions")
    if fullClusterSize % 2 == 0 and runner.opts.newNode is not None:
        runner.abort("For an even-numbered cluster, 2 parameters are expected (received 3).")
    if runner.opts.newNode is not None:
        result = checkNewNode(runner.opts.newNode)
        if result is not None:
            runner.abort("Failed to resolve host {0}: {1}.".format(runner.opts.newNode, result))
    currentVersion = host.version
    # Get the K-factor from @SystemInformation.
    kfactor = 0
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = int(tuple[1])
            break
    # Sanity checks, in case the node count or K-factor is less than required.
    # K = 0: abort with an error message.
    if kfactor == 0:
        runner.abort("Current cluster doesn't have duplicate partitions to perform online upgrade. K-factor: %d"
                     % kfactor)
    # N = 1: abort with an error message.
    if fullClusterSize == 1:
        runner.abort("Current cluster doesn't have enough nodes to perform online upgrade; at least two nodes are required")
    response = checkstats.get_stats(runner, "DRROLE")
    clusterIds = []
    clusterIds.append(int(host.clusterid))  # add the local cluster id first
    for tuple in response.table(0).tuples():
        remote_cluster_id = tuple[2]
        if remote_cluster_id != -1:
            clusterIds.append(remote_cluster_id)  # add remote cluster ids, if any
    if len(clusterIds) == 127:
        runner.abort("Failed to generate upgrade plan: the number of connected clusters reaches the maximum limit (127).")
    # Check the existence of the voltdb root path and the new kit on all existing nodes.
    response = runner.call_proc('@CheckUpgradePlanNT',
                                [VOLT.FastSerializer.VOLTTYPE_STRING,
                                 VOLT.FastSerializer.VOLTTYPE_STRING],
                                [runner.opts.newKit, runner.opts.newRoot])
    error = False
    warnings = ""
    for tuple in response.table(0).tuples():
        hostId = tuple[0]
        result = tuple[1]
        warning = tuple[2]
        if result != 'Success':
            error = True
            host = hosts.hosts_by_id.get(hostId)
            if host is None:
                runner.abort("@CheckUpgradePlanNT returned a host id %s that doesn't belong to the cluster." % hostId)
            runner.error('Check failed on host ' + getHostnameOrIp(host) +
                         ' with the cause: ' + result)
        if warning is not None:
            host = hosts.hosts_by_id.get(hostId)
            if host is None:
                runner.abort("@CheckUpgradePlanNT returned a host id %s that doesn't belong to the cluster." % hostId)
            warnings += 'On host ' + getHostnameOrIp(host) + ':\n' + warning + '\n'
    if error:
        runner.abort("Failed to pass pre-upgrade check. Abort.")
    if warnings != "":
        runner.warning(warnings[:-1])  # drop the trailing '\n'
    print('[1/4] Passed new VoltDB kit version check.')
    print('[2/4] Passed new VoltDB root path existence check.')
    return hosts, kfactor, clusterIds
def stop(runner):
    # Exec @SystemInformation to find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Connect to an arbitrary host that isn't being stopped.
    defaultport = 3021
    min_hosts = 1
    max_hosts = 1
    target_host = utility.parse_hosts(runner.opts.target_host, min_hosts,
                                      max_hosts, defaultport)[0]
    (thost, chost) = hosts.get_target_and_connection_host(target_host.host, target_host.port)
    if thost is None:
        runner.abort('Host not found in cluster: %s:%d' % (target_host.host, target_host.port))
    if chost is None:
        runner.abort('The entire cluster is being stopped, use "shutdown" instead.')
    if runner.opts.username:
        user_info = ', user: %s' % runner.opts.username
    else:
        user_info = ''
    runner.info('Connecting to %s:%d%s (%s) to issue "stop" command' %
                (chost.get_admininterface(), chost.adminport, user_info, chost.hostname))
    runner.voltdb_connect(chost.get_admininterface(), chost.adminport,
                          runner.opts.username, runner.opts.password,
                          runner.opts.ssl_config, runner.opts.kerberos)
    if not runner.opts.forcing:
        stateMessage = 'The node shutdown process has stopped.'
        actionMessage = 'You may shutdown the node with the "voltadmin stop --force" command.'
        try:
            runner.info('Preparing to stop node.')
            resp = runner.call_proc('@PrepareStopNode',
                                    [VOLT.FastSerializer.VOLTTYPE_INTEGER],
                                    [thost.id],
                                    check_status=False)
            if resp.status() != 1:
                runner.abort('The preparation for node shutdown failed with status: %s'
                             % resp.response.statusString)
            # Monitor partition leader migration.
            runner.info('Completing partition leader migration away from host %d: %s'
                        % (thost.id, thost.hostname))
            checkstats.check_partition_leaders_on_host(runner, thost.id)
            runner.info('All partition leaders have been migrated.')
            # Monitor export master transfer, but don't fail on timeout: the target
            # may have been disabled, preventing transfer. In that case it's ok to
            # proceed with the stop.
            try:
                runner.info('Completing export master transfer away from host %d: %s'
                            % (thost.id, thost.hostname))
                checkstats.check_export_mastership_on_host(runner, thost.id)
                runner.info('All export masters have been transferred.')
            except StatisticsProcedureException as proex:
                if not proex.isTimeout:
                    raise
                runner.info(proex.message)
                runner.info('This may be caused by an export target either disabled or '
                            'removed from the configuration. No action is required; the '
                            'stop node process will proceed.')
        except StatisticsProcedureException as proex:
            runner.info(stateMessage)
            runner.error(proex.message)
            if proex.isTimeout:
                runner.info(actionMessage)
            sys.exit(proex.exitCode)
        except (KeyboardInterrupt, SystemExit):
            runner.info(stateMessage)
            runner.abort(actionMessage)
    # Stop the requested host using @StopNode HOST_ID.
    runner.info('Stopping host %d: %s:%s' % (thost.id, thost.hostname, thost.internalport))
    if not runner.opts.dryrun:
        response = runner.call_proc('@StopNode',
                                    [VOLT.FastSerializer.VOLTTYPE_INTEGER],
                                    [thost.id],
                                    check_status=False)
        print response
        if response.status() != 1:  # not SUCCESS
            sys.exit(1)
def shutdown(runner):
    if runner.opts.forcing and runner.opts.save:
        runner.abort_with_help('You cannot specify both --force and --save options.')
    if runner.opts.cancel and runner.opts.save:
        runner.abort_with_help('You cannot specify both --cancel and --save options.')
    if runner.opts.cancel and runner.opts.forcing:
        runner.abort_with_help('You cannot specify both --cancel and --force options.')
    if runner.opts.timeout <= 0:
        runner.abort_with_help('The timeout value must be more than zero seconds.')
    shutdown_params = []
    columns = []
    zk_pause_txnid = 0
    if runner.opts.cancel:
        runner.info('Canceling cluster shutdown ...')
        response = runner.call_proc('@CancelShutdown', [], [])
        if response.status() != 1:
            runner.abort('Cancel shutdown failed with status: %s'
                         % response.response.statusString)
        else:
            runner.info('Shutdown canceled.')
    else:
        runner.info('Cluster shutdown in progress.')
        if not runner.opts.forcing:
            response = runner.call_proc('@SystemInformation',
                                        [VOLT.FastSerializer.VOLTTYPE_STRING],
                                        ['OVERVIEW'])
            # Convert @SystemInformation results to objects.
            hosts = Hosts(runner.abort)
            for tuple in response.table(0).tuples():
                hosts.update(*tuple)
            host = hosts.hosts_by_id.itervalues().next()
            if host.get('clustersafety') == "REDUCED":
                runner.info('Since the cluster is in reduced K-safety mode, taking a final snapshot before shutdown.')
                runner.opts.save = True
            stateMessage = 'The cluster shutdown process has stopped. The cluster is still in a paused state.'
            actionMessage = 'You may shutdown the cluster with the "voltadmin shutdown --force" command, '\
                            + 'continue to wait with "voltadmin shutdown",\n'\
                            + 'or cancel the shutdown with the "voltadmin shutdown --cancel" command'
            try:
                runner.info('Preparing for shutdown...')
                resp = runner.call_proc('@PrepareShutdown', [], [])
                if resp.status() != 1:
                    runner.abort('The preparation for shutdown failed with status: %s'
                                 % resp.response.statusString)
                zk_pause_txnid = resp.table(0).tuple(0).column_integer(0)
                runner.info('The cluster is paused prior to shutdown.')
                runner.info('Writing out all queued export data...')
                status = runner.call_proc('@Quiesce', [], []).table(0).tuple(0).column_integer(0)
                if status != 0:
                    runner.abort('The cluster failed to quiesce with status: %d' % status)
                checkstats.check_clients(runner)
                checkstats.check_importer(runner)
                # Check the command log regardless of whether we're community or enterprise.
                checkstats.check_command_log(runner)
                runner.info('If running Enterprise Edition, all transactions have been made durable.')
                if runner.opts.save:
                    actionMessage = 'You may shutdown the cluster with the "voltadmin shutdown --force" command, '\
                                    + 'continue to wait with "voltadmin shutdown --save",\n'\
                                    + 'or cancel the shutdown with the "voltadmin shutdown --cancel" command.'
                    columns = [VOLT.FastSerializer.VOLTTYPE_BIGINT]
                    shutdown_params = [zk_pause_txnid]
                    # For the save option, check more stats.
                    checkstats.check_dr_consumer(runner)
                    runner.info('Starting resolution of external commitments...')
                    checkstats.check_exporter(runner)
                    status = runner.call_proc('@Quiesce', [], []).table(0).tuple(0).column_integer(0)
                    if status != 0:
                        runner.abort('The cluster failed to quiesce with status: %d' % status)
                    checkstats.check_dr_producer(runner)
                    runner.info('Saving a final snapshot. The cluster will shut down after the snapshot is finished...')
                else:
                    checkstats.check_exporter(runner)
                    runner.info('Shutting down the cluster...')
            except StatisticsProcedureException as proex:
                runner.info(stateMessage)
                runner.error(proex.message)
                if proex.isTimeout:
                    runner.info(actionMessage)
                sys.exit(proex.exitCode)
            except (KeyboardInterrupt, SystemExit):
                runner.info(stateMessage)
                runner.abort(actionMessage)
        response = runner.call_proc('@Shutdown', columns, shutdown_params, check_status=False)
        print response
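
# The two @Shutdown variants in shutdown() differ only in their wire
# arguments (illustration of the flow above, not additional behavior):
#
#   plain or --force:  call_proc('@Shutdown', [], [])
#   with --save:       call_proc('@Shutdown', [VOLT.FastSerializer.VOLTTYPE_BIGINT],
#                                [zk_pause_txnid])
#
# The transaction id returned by @PrepareShutdown ties the final snapshot to
# the point at which the cluster was paused.
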
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Get current version and root directory from an arbitrary node.
    host = hosts.hosts_by_id.itervalues().next()
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        runner.abort("Current cluster needs %d more node(s) to achieve full K-safety. "
                     "In-service upgrade is not recommended in a partial K-safety cluster."
                     % (fullClusterSize - len(hosts.hosts_by_id)))
    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort("The cluster has an odd number of nodes; plan_upgrade needs an extra node to generate the instructions")
    currentVersion = host.version
    # Get the K-factor from @SystemInformation.
    kfactor = 0
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = int(tuple[1])
            break
    # Sanity checks, in case the node count or K-factor is less than required.
    # K = 0: abort with an error message.
    if kfactor == 0:
        runner.abort("Current cluster doesn't have duplicate partitions to perform online upgrade. K-factor: %d"
                     % kfactor)
    # N = 1: abort with an error message.
    if fullClusterSize == 1:
        runner.abort("Current cluster doesn't have enough nodes to perform online upgrade; at least two nodes are required")
    response = checkstats.get_stats(runner, "DRROLE")
    largestClusterId = -1
    for tuple in response.table(0).tuples():
        remote_cluster_id = tuple[2]
        if remote_cluster_id > largestClusterId:
            largestClusterId = remote_cluster_id
    # Check the existence of the voltdb root path and the new kit on all existing nodes.
    response = runner.call_proc('@CheckUpgradePlanNT',
                                [VOLT.FastSerializer.VOLTTYPE_STRING,
                                 VOLT.FastSerializer.VOLTTYPE_STRING],
                                [runner.opts.newKit, runner.opts.newRoot])
    error = False
    for tuple in response.table(0).tuples():
        hostId = tuple[0]
        result = tuple[1]
        if result != 'Success':
            error = True
            host = hosts.hosts_by_id.get(hostId)
            if host is None:
                runner.abort("@CheckUpgradePlanNT returned a host id %s that doesn't belong to the cluster." % hostId)
            print 'Pre-upgrade check fails on host ' + getHostnameOrIp(host) + " with the cause: " + result
    if error:
        runner.abort("Failed to pass pre-upgrade check. Abort.")
    print '[1/4] Passed new VoltDB kit version check.'
    print '[2/4] Passed new VoltDB root path existence check.'
    return hosts, kfactor, largestClusterId
def getClusterInfo(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Get cluster metadata from an arbitrary node.
    host = hosts.hosts_by_id.itervalues().next()
    clusterId = host.clusterid
    fullClusterSize = int(host.fullclustersize)
    version = host.version
    uptime = host.uptime
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = tuple[1]
            break
    cluster = Cluster(int(clusterId), version, int(kfactor), int(fullClusterSize), uptime)
    for hostId, hostInfo in hosts.hosts_by_id.items():
        cluster.add_member(hostId, hostInfo.hostname)
    # Count live clients connected to the cluster.
    response = checkstats.get_stats(runner, "LIVECLIENTS")
    liveclients = 0
    for tuple in response.table(0).tuples():
        isAdmin = tuple[5]
        # Exclude admin connections.
        if isAdmin != 1:
            liveclients += 1
    cluster.update_live_clients(liveclients)
    if runner.opts.dr:
        # Do we have any ongoing DR conversation?
        response = checkstats.get_stats(runner, "DRROLE")
        for tuple in response.table(0).tuples():
            role = tuple[0]
            status = tuple[1]
            remote_cluster_id = tuple[2]
            if remote_cluster_id != -1:
                cluster.add_remote_cluster(remote_cluster_id, status, role)
        response = checkstats.get_stats(runner, "DRPRODUCER")
        for tuple in response.table(0).tuples():
            host_name = tuple[2]
            remote_cluster_id = tuple[4]
            last_queued_drid = tuple[10]
            last_queued_ts = tuple[12]
            last_acked_ts = tuple[13]
            if last_queued_drid == -1:
                delay = 0
            else:
                delay = (last_queued_ts - last_acked_ts).total_seconds()
            cluster.get_remote_cluster(remote_cluster_id).update_producer_latency(
                host_name, remote_cluster_id, delay)
        # Find the remote topology through DRCONSUMER stats.
        response = checkstats.get_stats(runner, "DRCONSUMER")
        for tuple in response.table(1).tuples():
            remote_cluster_id = tuple[4]
            covering_host = tuple[7]
            last_applied_ts = tuple[9]
            if covering_host != '':
                cluster.get_remote_cluster(remote_cluster_id).add_remote_member(covering_host)
    return cluster
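
# DRPRODUCER latency in both getClusterInfo variants is derived from
# timestamps: if nothing is queued (last_queued_drid == -1) the delay is 0,
# otherwise it is the gap between the newest queued and the newest
# acknowledged binary-log timestamps. A worked example with datetime values
# (the stats columns are assumed to deserialize to datetime objects, as the
# .total_seconds() call implies):
from datetime import datetime

_queued = datetime(2024, 1, 1, 12, 0, 5)   # hypothetical LASTQUEUEDTIMESTAMP
_acked = datetime(2024, 1, 1, 12, 0, 2)    # hypothetical LASTACKTIMESTAMP
assert (_queued - _acked).total_seconds() == 3.0  # producer is 3 seconds behind
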