def onlinecpu():
    """Return the list of online CPU ids, ordered by NUMA node."""
    import numa
    o_cpus = []
    for node in range(0, numa.get_max_node() + 1):
        for cpu in sorted(numa.node_to_cpus(node)):
            o_cpus.append(cpu)
    return o_cpus
def check_numa():
    import sys
    try:
        import numa
    except ImportError:
        return
    if not numa.available():
        return
    if numa.get_max_node() > 0 and len(numa.get_run_on_node_mask()) > 1:
        print("Warning: NUMA settings may be suboptimal!", file=sys.stderr)
def setNuma():
    """Without subapAllocation... whole rmx on each numa node"""
    d = darc.Control()
    nthr = d.Get("ncamThreads").sum()
    nnodes = numa.get_max_node() + 1
    #specify which numa nodes the threads are closest to:
    threadToNuma = numpy.zeros(nthr, numpy.int32)  #all node 0 for now...!
    rmx = d.Get("gainReconmxT")
    d.Set("threadToNuma", threadToNuma)
    for i in range(nnodes):
        d.SetNuma("gainReconmxT%d" % i, rmx, i)
def load_numa():
    """ Load information about core numbers and numa patterns """
    if not numa.available():
        raise Exception('Numa detection not available')
    max_node = numa.get_max_node()
    nodes = {}
    for i in range(max_node + 1):
        nodes[i] = list(numa.node_to_cpus(i))
    return nodes
def coremap():
    from collections import deque
    import itertools
    try:
        import numa
    except ImportError:
        print('This script requires the libnuma python bindings')
        raise RuntimeError("Numa not available")
    if not numa.available():
        raise RuntimeError("Numa not available")
    node_to_core = {
        int(i): deque([int(k) for k in numa.node_to_cpus(i)])
        for i in range(numa.get_max_node() + 1)
    }
    total_core = max(itertools.chain(*node_to_core.values())) + 1
    return node_to_core, total_core
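# Usage sketch for coremap() above: hand out one core per worker, cycling
# round-robin over NUMA nodes. The worker count and the round-robin policy
# are illustrative assumptions, not part of the original script.
import itertools

def pick_cores(n_workers):
    node_to_core, total_core = coremap()
    nodes = itertools.cycle(sorted(node_to_core))
    picked = []
    while len(picked) < n_workers and any(node_to_core.values()):
        node = next(nodes)
        if node_to_core[node]:
            # popleft() hands out the lowest-numbered remaining core on that node.
            picked.append((node, node_to_core[node].popleft()))
    return picked

# pick_cores(4) might return e.g. [(0, 0), (1, 8), (0, 1), (1, 9)] on a
# two-node machine with cores 0-7 on node 0 and 8-15 on node 1.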
def main():
    import argparse
    import os
    import sys
    import numa

    parser = argparse.ArgumentParser()
    parser.add_argument("-membw", default="./membw/membw")
    parser.add_argument("-bandwidth", type=int, default=9999)
    parser.add_argument("-operation", default="nt-write")
    args = parser.parse_args()

    max_nid = numa.get_max_node()
    if max_nid < 1:
        print("This tool requires two sockets at least")
        sys.exit()

    # Launch one membw instance pinned to each CPU of the last NUMA node.
    cpus = numa.node_to_cpus(max_nid)
    for cpuid in cpus:
        cmd = "%s -c %d -b %d --%s &" % (args.membw, cpuid, args.bandwidth, args.operation)
        print(cmd)
        os.system(cmd)
def _configure_numa(self):
    self._numa_available = \
        numa.available() and which('numactl') is not None
    if not self._numa_available:
        return
    num_numa_nodes = numa.get_max_node() + 1
    self._numa_cpu_map = {}
    num_gpus = len(self._gpu_ids)

    # Calculate how many CPUs to allocate for each GPU. Ensure this number
    # is a power of 2.
    num_cpus = 0
    for i in range(num_numa_nodes):
        num_cpus += len(numa.node_to_cpus(i))
    num_cpus_per_gpu = min(MAX_CPUS_PER_GPU, max(num_cpus // num_gpus, 1))
    num_cpus_per_gpu = pow(2, round(math.log(num_cpus_per_gpu, 2)))

    # Find blocks of contiguous CPUs.
    contiguous_blocks = []
    for i in range(num_numa_nodes):
        cpus = sorted(numa.node_to_cpus(i))
        contiguous_block = [cpus[0]]
        for j in range(1, len(cpus)):
            if (cpus[j] - cpus[j - 1] == 1 and
                    len(contiguous_block) < num_cpus_per_gpu):
                contiguous_block.append(cpus[j])
            else:
                contiguous_blocks.append(
                    (contiguous_block, len(contiguous_block)))
                contiguous_block = [cpus[j]]
        if len(contiguous_block) > 0:
            contiguous_blocks.append(
                (contiguous_block, len(contiguous_block)))
    contiguous_blocks.sort(key=lambda x: x[-1], reverse=True)

    # Assign CPUs to GPUs.
    block_idx = 0
    for i in range(num_gpus):
        self._numa_cpu_map[i] = []
        while len(self._numa_cpu_map[i]) < num_cpus_per_gpu:
            self._numa_cpu_map[i] += contiguous_blocks[block_idx][0]
            block_idx = (block_idx + 1) % len(contiguous_blocks)
        self._logger.info('GPU {gpu} assigned CPUs {cpus}'.format(
            gpu=i, cpus=str(self._numa_cpu_map[i])))
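# _configure_numa() above requires numactl on PATH, so the computed
# _numa_cpu_map is presumably consumed by prefixing each per-GPU worker
# command with a numactl pinning. A hedged sketch of such a prefix builder;
# the method name and the command layout are assumptions, not the original code.
def _get_numactl_prefix(self, gpu_index):
    if not self._numa_available:
        return []
    cpus = self._numa_cpu_map[gpu_index]
    # --physcpubind restricts the worker to exactly these hardware threads;
    # --localalloc keeps its memory allocations local to the nodes those CPUs belong to.
    return ['numactl',
            '--physcpubind=%s' % ','.join(str(c) for c in cpus),
            '--localalloc']

# e.g. subprocess.Popen(self._get_numactl_prefix(0) + worker_cmd)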
def _lazy_cpu_and_mem_set_init(self):
    # Implicitly assume lock is already held
    if len(self._numa_nodes) != 0:
        # Init already happened
        return
    if (self._available_cpu_ids is None or self._cpus_per_job is None
            or self._use_memset_of_nearest_node is None):
        raise Exception('Cannot do init. One or more params were None')
    import numa
    if not numa.available():
        raise Exception('NUMA not available')
    numa_nodes = list(range(0, numa.get_max_node() + 1))
    cpu_count = 0
    for numa_node in numa_nodes:
        cpus = numa.node_to_cpus(numa_node)
        for cpu_id in cpus:
            if cpu_id in self._available_cpu_ids:
                try:
                    self._numa_nodes[numa_node].add(cpu_id)
                except KeyError:
                    self._numa_nodes[numa_node] = set()
                    self._numa_nodes[numa_node].add(cpu_id)
                try:
                    self._numa_node_pool[numa_node].add(cpu_id)
                except KeyError:
                    self._numa_node_pool[numa_node] = set()
                    self._numa_node_pool[numa_node].add(cpu_id)
                _logger.info(
                    'Putting CPU {} in NUMA node {} in resource pool'.format(
                        cpu_id, numa_node))
                cpu_count += 1
            else:
                _logger.info(
                    'CPU {} in NUMA node {} is NOT IN resource pool'.format(
                        cpu_id, numa_node))
    if cpu_count == 0:
        raise Exception('Found no available CPUs')
    if cpu_count != len(self._available_cpu_ids):
        raise Exception(
            'Mismatch between provided available CPU ids and what was found on system')
    assert len(self._numa_node_pool) == len(self._numa_nodes)
def setNuma2():
    """With subapAllocation: partial rmx on each numa node"""
    d = darc.Control()
    nthr = d.Get("ncamThreads").sum()
    nnodes = numa.get_max_node() + 1
    #specify which numa nodes the threads are closest to:
    threadToNuma = numpy.zeros(nthr, numpy.int32)  #all node 0 for now...!
    sf = d.Get("subapFlag")
    nsub = sf.size
    sa = numpy.zeros(nsub, numpy.int32)
    #divide all threads equally...
    sa[:] = nthr - 1
    start = 0
    for i in range(nthr):
        end = start + nsub // nthr
        sa[start:end] = i
        start = end
    print(numpy.reshape(sa, (7, 7)))
    print(numpy.reshape(sf, (7, 7)))
    d.Set("threadToNuma", threadToNuma, swap=1)
    rmx = d.Get("gainReconmxT")  #nslope,nact
    thrSubapCnt = numpy.zeros(nthr, numpy.int32)
    rmxPart = []
    for i in range(nthr):
        rmxPart.append([])
    indx = 0
    for i in range(nsub):
        if sf[i]:
            thrSubapCnt[sa[i]] += 1
            rmxPart[sa[i]].append(rmx[indx])
            rmxPart[sa[i]].append(rmx[indx + 1])
            indx += 2
    for i in range(nthr):
        r = numpy.array(rmxPart[i])
        print("Thread %d rmx shape %s, dtype %s" % (i, str(r.shape), r.dtype))
        d.SetNuma("gainReconmxT%d" % i, r, int(threadToNuma[i]), swap=1)
    d.Set("subapAllocation", sa, swap=1)
def __init__(self):
    gr.top_block.__init__(self, "ATA New SNAP X-Engine")

    ##################################################
    # Variables
    ##################################################
    self.starting_channel = starting_channel = clparam_starting_channel
    self.num_channels = num_channels = clparam_num_channels
    self.output_file = output_file = clparam_output_directory + '/casa_2021_jan_04_sync_v3_xeng'
    self.ending_channel = ending_channel = starting_channel + num_channels - 1

    ##################################################
    # Blocks
    ##################################################
    self.clenabled_clXEngine_0 = clenabled.clXEngine(
        1, 2, 0, 0, False, 6, 2, clparam_num_antennas, 1,
        starting_channel, num_channels, clparam_integration_frames,
        clparam_antenna_list, True, output_file, 0, True,
        clparam_snap_sync, clparam_object_name,
        clparam_starting_chan_freq, clparam_channel_width,
        clparam_no_output, clparam_cpu_integration)

    if clparam_enable_affinity:
        # So with affinity here, we're just trying to ensure NUMA doesn't move us off
        # where our memory was allocated. So we're going to try to be smart about
        # allocating here. We'll set affinity to all of the cores on each processor till
        # we've "recommended" a full set.
        num_nodes = numa.get_max_node() + 1

        # core_pairs = []
        cpu_core_list = []
        cores_per_cpu = 0
        cores_per_cpu_2 = 0

        for cur_node in range(0, num_nodes):
            cpu_to_node = list(numa.node_to_cpus(cur_node))
            cpu_core_list.append(cpu_to_node)

            if cores_per_cpu == 0:
                cores_per_cpu = len(cpu_to_node)
                cores_per_cpu_2 = cores_per_cpu // 2

            print("CPU" + str(cur_node) + " has " + str(len(cpu_to_node)) +
                  " cores: " + str(cpu_to_node))

            #i = 0
            #for cur_cpu in cpu_to_node:
            #    if i % 2 == 0:
            #        cpu_pair = [cur_cpu]
            #    else:
            #        cpu_pair.append(cur_cpu)
            #        core_pairs.append(cpu_pair)
            #
            #    i += 1

        #print("Setting xEngine affinity to cores " + str(core_pairs[0]))
        #self.clenabled_clXEngine_0.set_processor_affinity(core_pairs[0])
        #core_pairs = core_pairs[1:]

        # or to all cores on CPU0
        if num_nodes > 1:
            self.clenabled_clXEngine_0.set_processor_affinity(cpu_core_list[0])

    self.antenna_list = []

    for i in range(0, clparam_num_antennas):
        if i == 0:
            input_file = '/home/sonata/casa_pcap/snap_2_ant_1f.pcap'
            input_port = clparam_base_port + i
        elif i == 1:
            input_file = '/home/sonata/casa_pcap/snap_7_ant_3c.pcap'
            input_port = clparam_base_port + i
        else:
            input_file = '/home/sonata/casa_pcap/snap_8_ant_4g.pcap'
            input_port = 10002

        new_ant = ata.snap_source(input_port, 1, True, False, False,
                                  starting_channel, ending_channel, 3,
                                  input_file, False, True, '224.1.1.10',
                                  False)

        if clparam_enable_affinity and num_nodes > 1:
            cpu2 = cores_per_cpu_2 - 2 + cores_per_cpu_2
            if i < cores_per_cpu_2 - 2:  # Subtract 2 for the xengine on the first node.
                print("Setting affinity for PCAP " + input_file + " to CPU0")
                new_ant.set_processor_affinity(cpu_core_list[0])
            elif i < cpu2:
                print("Setting affinity for PCAP " + input_file + " to CPU1")
                new_ant.set_processor_affinity(cpu_core_list[1])
            else:
                # just balance
                index = i % 2
                print("Setting affinity for PCAP " + input_file + " to CPU" + str(index))
                new_ant.set_processor_affinity(cpu_core_list[index])

        ##################################################
        # Connections
        ##################################################
        self.msg_connect((self.clenabled_clXEngine_0, 'sync'), (new_ant, 'sync'))
        self.connect((new_ant, 0), (self.clenabled_clXEngine_0, i))
        self.antenna_list.append(new_ant)
def get_dst_numa_node_from_pcpu(self, pcpu_id):
    #module numa has not implemented numa_node_of_cpu() call of numa(3) library
    for i in range(0, numa.get_max_node() + 1):
        if pcpu_id in numa.node_to_cpus(i):
            return i
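# As the comment above notes, these bindings lack numa_node_of_cpu(); besides
# looping over node_to_cpus(), the mapping can also be read from sysfs on Linux.
# A hedged sketch under that assumption: the /sys/devices/system/cpu/cpuN/nodeM
# layout is Linux-specific, and the helper name here is ours, not the library's.
import glob
import os
import re

def numa_node_of_cpu(cpu_id):
    pattern = '/sys/devices/system/cpu/cpu%d/node*' % cpu_id
    for path in glob.glob(pattern):
        match = re.match(r'node(\d+)$', os.path.basename(path))
        if match:
            return int(match.group(1))
    return None  # unknown (non-Linux, or CPU offline)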
def onlinecpu():
    """Yield each online CPU id as a string, ordered by NUMA node."""
    import numa
    for node in range(0, numa.get_max_node() + 1):
        for cpu in sorted(numa.node_to_cpus(node)):
            yield str(cpu)
"figureOpen": 0, "figureName": "libfigureSL240.so", "figureParams": None, "reconName": "libreconmvm.so", "fluxThreshold": 0, "printUnused": 1, "useBrightest": 0, "figureGain": 1, "decayFactor": None, #used in libreconmvm.so "reconlibOpen": 1, "maxAdapOffset": 10, "noPrePostThread": 0, } nthr = control.get("ncamThreads").sum() nnodes = numa.get_max_node() + 1 threadToNuma = numpy.zeros(nthr, numpy.int32) sf = control.get("subapFlag") nsub = sf.size sa = numpy.zeros(nsub, numpy.int32) #divide all threads equally.... sa[:] = nthr - 1 start = 0 for i in range(nnodes): control["numa%d" % i] = {} for i in range(nthr): end = start + nsub / nthr sa[start:end] = i start = end control["threadToNuma"] = threadToNuma gainReconmxT = control["rmx"].transpose().astype(numpy.float32)
def test_node_size(self):
    for node in range(numa.get_max_node() + 1):
        print('Node: %d, size: %r' % (node, numa.get_node_size(node)))
def __init__(self, **kwargs):
    test_module = kwargs.pop('test_module')
    viewers = kwargs.pop('viewers', False)
    n_nodes = kwargs.pop('n_nodes')
    n_workers = kwargs.pop('n_workers')
    partition = kwargs.pop('partition')
    do_numa = kwargs.pop('numa')
    log_file = kwargs.pop('log_file')
    input_files = kwargs.pop('input_files')
    self.__logger = kwargs.pop('logger')

    from collections import defaultdict
    self.__results = defaultdict(list)

    import numa
    max_nodes = numa.get_max_node() + 1
    if do_numa:
        if n_nodes > max_nodes:
            print('WARNING: %d NUMA nodes have been requested to be used, '
                  'but only %d are available and will be used.'
                  % (n_nodes, max_nodes))
            n_nodes = max_nodes
        n_nodes = max_nodes if n_nodes == -1 else n_nodes
        CalibLauncher = NumaLauncher
    else:
        n_nodes = 1 if n_nodes == -1 else n_nodes
        CalibLauncher = Launcher

    if n_workers != -1:
        n_work = n_workers
    else:
        import multiprocessing
        n_work = int(float(multiprocessing.cpu_count()) / float(max_nodes) + 0.5)

    tms = __import__("CalibTests.TestModules", globals(), locals(), [test_module])
    tm = getattr(tms, test_module)

    import os
    directory = os.path.dirname(input_files[0])

    self.__logger.info("Will start %d workers on %d nodes." % (n_work, n_nodes))

    launcher = Launcher(partition, self.__logger)
    self.__components = [
        (LogServer, dict(launcher=launcher, priority=1)),
        (Iterator, dict(launcher=launcher, priority=3,
                        python=tm.Iterator(directory))),
        (Analyser, dict(n_nodes=n_nodes, n_workers=n_work,
                        python=tm.Analyzer(input_files),
                        launcher=CalibLauncher(partition, self.__logger),
                        priority=4))
    ]
    if log_file:
        self.__components.append(
            (LogViewer, dict(launcher=launcher, priority=2,
                             output_file=log_file, timing_width=90,
                             split_regex=r'CALIBWORK_(\d+)')))
    if viewers:
        self.__components.append(
            (LogViewer, dict(launcher=launcher, priority=2)))

    self.__calib_process = CalibrationSteering("CalibrationTest",
                                               self.__logger,
                                               self.__components)

    # To get an idea of overall CPU usage
    from OfflineOnline.BenchmarkProcess import CPUMeasurer
    self.__cpu_process = CPUMeasurer(self.__logger)