def test_min(self):
    """``sn.min`` over a deferrable must re-evaluate lazily."""
    data = [1, 2]
    deferred = make_deferrable(data)
    self.assertEqual(1, sn.min(deferred))
    # A smaller element appended later must be seen on re-evaluation.
    data.append(0)
    self.assertEqual(0, sn.min(deferred))
def test_min():
    """Deferred ``sn.min`` tracks later in-place mutations of the list."""
    values = [1, 2]
    deferred = sn.defer(values)
    assert sn.min(deferred) == 1
    # The deferrable wraps the live list, so the new minimum is picked up.
    values.append(0)
    assert sn.min(deferred) == 0
def test_min(self):
    """``sn.min`` over a deferred list reflects in-place updates."""
    seq = [1, 2]
    deferred_seq = sn.defer(seq)
    self.assertEqual(1, sn.min(deferred_seq))
    seq.append(0)
    # Re-evaluation must observe the freshly appended minimum.
    self.assertEqual(0, sn.min(deferred_seq))
def set_perf_patterns(self):
    '''Set the performance patterns.

    These include host-device (h2d), device-host (d2h) and
    device-device (d2d) transfers.
    '''
    # Report the slowest (minimum) sampled bandwidth per transfer kind;
    # capture group 1 of the regex built by self._xfer_pattern() holds the
    # numeric value.
    self.perf_patterns = {
        'h2d': sn.min(sn.extractall(self._xfer_pattern('h2d'),
                                    self.stdout, 1, float)),
        'd2h': sn.min(sn.extractall(self._xfer_pattern('d2h'),
                                    self.stdout, 1, float)),
        'd2d': sn.min(sn.extractall(self._xfer_pattern('d2d'),
                                    self.stdout, 1, float)),
    }
def gpu_usage_sanity(self): '''Verify that the jobreport output has sensible numbers. This function asserts that the nodes reported are at least a subset of all nodes used by the gpu burn app. Also, the GPU usage is verified by assuming that in the worst case scenario, the usage is near 100% during the burn, and 0% outside the burn period. Lastly, the GPU usage time for each node is also asserted to be greater or equal than the burn time. ''' # Get set with all nodes patt = r'^\s*\[([^\]]*)\]\s*GPU\s*\d+\(OK\)' full_node_set = set(sn.extractall(patt, self.stdout, 1)) # Parse job report data patt = r'^\s*(\w*)\s*(\d+)\s*%\s*\d+\s*MiB\s*\d+:\d+:(\d+)' self.nodes_reported = sn.extractall(patt, self.stdout, 1) usage = sn.extractall(patt, self.stdout, 2, int) time_reported = sn.extractall(patt, self.stdout, 3, int) return sn.all([ sn.assert_ge(sn.count(self.nodes_reported), 1), set(self.nodes_reported).issubset(full_node_set), sn.all( map(lambda x, y: self.burn_time / x <= y, time_reported, usage)), sn.assert_ge(sn.min(time_reported), self.burn_time) ])
def __init__(self):
    """Configure the GPU DGEMM benchmark: build, run and reporting."""
    self.valid_systems = [
        'cannon:local-gpu',
        'cannon:gpu_test',
        'fasse:fasse_gpu',
        'test:gpu',
    ]
    self.valid_prog_environs = ['gpu']
    self.build_system = 'Make'
    self.executable = './dgemm.x'
    # The slowest GPU on the node determines the reported figure.
    flops_regex = r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s'
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(flops_regex, self.stdout, 'fp', float))
    }
    self.reference = {
        'cannon:local-gpu': {'perf': (5.2, -0.1, None, 'TF/s per gpu')},
        'cannon:gpu_test': {'perf': (5.2, -0.1, None, 'TF/s per gpu')},
        '*': {'perf': (3.35, None, None, 'TF/s per gpu')},
    }
def __init__(self):
    """Configure the GPU shared-memory bandwidth benchmark."""
    self.valid_systems = [
        'cannon:local-gpu',
        'cannon:gpu_test',
        'fasse:fasse_gpu',
        'test:gpu',
    ]
    self.valid_prog_environs = ['gpu']
    self.build_system = 'Make'
    self.executable = './shmem.x'
    bw_regex = (r'^\s*\[[^\]]*\]\s*GPU\s*\d+: '
                r'Bandwidth\(double\) (?P<bw>\S+) GB/s')
    # Report the slowest device found on the node.
    self.perf_patterns = {
        'bandwidth': sn.min(sn.extractall(bw_regex, self.stdout,
                                          'bw', float))
    }
    self.reference = {
        # theoretical limit for P100:
        # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
        'cannon:local-gpu': {
            'bandwidth': (13000, -0.01, None, 'GB/s per gpu')
        },
        'cannon:gpu_test': {
            'bandwidth': (13000, -0.01, None, 'GB/s per gpu')
        },
        '*': {
            'bandwidth': (8850, None, None, 'GB/s per gpu')
        },
    }
def __init__(self):
    """Configure the GPU burn stress test.

    Bug fix: ``reference`` previously listed the ``'*'`` key twice; in a
    dict literal the later duplicate silently replaces the earlier one, so
    the generic ``perf`` reference was dropped.  Both metrics now live
    under a single ``'*'`` entry.
    """
    self.valid_systems = [
        'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu', 'test:gpu'
    ]
    self.descr = 'GPU burn test'
    self.valid_prog_environs = ['gpu']
    self.executable_opts = ['-d', '40']   # burn for 40 seconds
    self.build_system = 'Make'
    self.build_system.makefile = 'makefile.cuda'
    self.executable = './gpu_burn.x'
    # One line per GPU: performance (GF/s) and temperature (Celsius).
    patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
            r'\s+(?P<temp>\S*)\s+Celsius')
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
        'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
    }
    self.reference = {
        'cannon:local-gpu': {
            'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
        },
        'cannon:gpu_test': {
            'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
        },
        'test:gpu': {
            'perf': (4115, None, None, 'Gflop/s per gpu'),
        },
        # Single catch-all entry; a duplicate '*' key would overwrite the
        # earlier one and lose the 'perf' reference.
        '*': {
            'perf': (4115, None, None, 'Gflop/s per gpu'),
            'temp': (0, None, None, 'degC'),
        },
    }
def __init__(self):
    """Configure the likwid-bench memory-hierarchy bandwidth check."""
    self.modules = ['likwid']
    self.valid_prog_environs = ['PrgEnv-gnu']
    self.sourcesdir = None
    self.executable = 'likwid-bench'
    self.num_tasks = 1
    self.num_tasks_per_node = 1
    self.num_tasks_per_core = 2
    # Logical CPU counts per partition (with hyperthreading where > cores).
    self.system_num_cpus = {
        'daint:mc': 72,
        'daint:gpu': 24,
        'dom:mc': 72,
        'dom:gpu': 24,
    }
    self.system_numa_domains = {
        'daint:mc': ['S0', 'S1'],
        'daint:gpu': ['S0'],
        'dom:mc': ['S0', 'S1'],
        'dom:gpu': ['S0'],
    }
    # Test each level at half capacity times nthreads per domain
    self.system_cache_sizes = {
        'daint:mc': {'L1': '288kB', 'L2': '2304kB', 'L3': '23MB',
                     'memory': '1800MB'},
        'daint:gpu': {'L1': '192kB', 'L2': '1536kB', 'L3': '15MB',
                      'memory': '1200MB'},
        'dom:mc': {'L1': '288kB', 'L2': '2304kB', 'L3': '23MB',
                   'memory': '1800MB'},
        'dom:gpu': {'L1': '192kB', 'L2': '1536kB', 'L3': '15MB',
                    'memory': '1200MB'},
    }
    self.maintainers = ['SK', 'CB']
    self.tags = {'benchmark', 'diagnostic', 'health'}
    # One deferrable shared by both sanity and performance reporting.
    bw_pattern = sn.min(sn.extractall(r'MByte/s:\s*(?P<bw>\S+)',
                                      self.stdout, 'bw', float))
    self.sanity_patterns = sn.assert_ge(bw_pattern, 0.0)
    self.perf_patterns = {'bandwidth': bw_pattern}
def mpip_perf_patterns(obj, reg):
    '''Return one mpiP-derived performance metric, selected by ``reg``.

    The mpiP text report contains an ``MPI Time (seconds)`` table with one
    row per rank plus a ``*`` aggregate row:

    .. code-block::

      Task    AppTime    MPITime     MPI%
         0        8.6      0.121     1.40  <-- min
         1        8.6      0.157     1.82
         2        8.6       5.92    68.84  <-- max
         *       25.8        6.2    24.02  <--- => NonMPI= AppTime - MPITime

    Region ids: 1 = avg MPI time per rank, 2 = avg app time per rank,
    3 = overall MPI %, 4 = overall non-MPI %, 5 = max per-rank MPI %,
    6 = min per-rank MPI %.

    :raises ValueError: if ``reg`` is not one of the ids above.
    '''
    # The report path is announced on stdout by mpiP itself.
    rpt = sn.extractsingle(r'^mpiP: Storing mpiP output in \[(?P<rpt>.*)\]',
                           obj.stdout, 'rpt', str)
    regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$'
    regex_minmax = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+'
                    r'(?P<pct>\S+)$')
    if reg == 1:
        # mpip_avg_mpi_time
        return sn.round(
            sn.extractsingle(regex_star, rpt, 'mpit', float) /
            obj.num_tasks, 2)
    if reg == 2:
        # mpip_avg_app_time
        return sn.round(
            sn.extractsingle(regex_star, rpt, 'appt', float) /
            obj.num_tasks, 2)
    if reg == 3:
        # %mpip_avg_mpi_time
        return sn.extractsingle(regex_star, rpt, 'pct', float)
    if reg == 4:
        # %nonmpi
        return sn.round(
            100 - sn.extractsingle(regex_star, rpt, 'pct', float), 2)
    if reg == 5:
        # %mpip_avg_mpi_time_max
        return sn.max(sn.extractall(regex_minmax, rpt, 'pct', float))
    if reg == 6:
        # %mpip_avg_mpi_time_min
        return sn.min(sn.extractall(regex_minmax, rpt, 'pct', float))
    raise ValueError('unknown region id in mpip_perf_patterns')
def set_perf_patterns(self):
    '''Extract the TF/s achieved.'''
    flops_regex = r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s'
    # Report the slowest GPU found on the node.
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(flops_regex, self.stdout, 'fp', float))
    }
def __init__(self):
    """Configure the GPU memory-bandwidth test (h2d/d2h/d2d copies)."""
    self.valid_systems = [
        'cannon:local-gpu',
        'cannon:gpu_test',
        'fasse:fasse_gpu',
        'test:gpu',
    ]
    self.valid_prog_environs = ['gpu']
    # Perform a single bandwidth test with a buffer size of 1024MB
    self.copy_size = 1073741824
    self.build_system = 'Make'
    self.executable = './memory_bandwidth.x'
    self.build_system.cxxflags = [f'-DCOPY={self.copy_size}']
    # perf_patterns and reference will be set by the sanity check function
    self.perf_patterns = {
        'h2d': sn.min(sn.extractall(self._xfer_pattern('h2d'),
                                    self.stdout, 1, float)),
        'd2h': sn.min(sn.extractall(self._xfer_pattern('d2h'),
                                    self.stdout, 1, float)),
        'd2d': sn.min(sn.extractall(self._xfer_pattern('d2d'),
                                    self.stdout, 1, float)),
    }
    self.reference = {
        'cannon:local-gpu': {
            'h2d': (12000, -0.1, None, 'MB/s per gpu'),
            'd2h': (13000, -0.1, None, 'MB/s per gpu'),
            'd2d': (630000, -0.1, None, 'MB/s per gpu'),
        },
        'cannon:gpu_test': {
            'h2d': (12000, -0.1, None, 'MB/s per gpu'),
            'd2h': (13000, -0.1, None, 'MB/s per gpu'),
            'd2d': (780000, -0.1, None, 'MB/s per gpu'),
        },
        '*': {
            'h2d': (11881, None, None, 'MB/s per gpu'),
            'd2h': (12571, None, None, 'MB/s per gpu'),
            'd2d': (499000, None, None, 'MB/s per gpu'),
        },
    }
def __init__(self):
    """Configure the GPU burn test for kesch (K80) and daint/dom (P100)."""
    super().__init__()
    self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
    self.descr = 'GPU burn test'
    self.valid_prog_environs = ['PrgEnv-gnu']
    if self.current_system.name == 'kesch':
        self.exclusive_access = True
        self.modules = ['craype-accel-nvidia35']
        # NOTE: The first option indicates the precision (-d for double)
        # while the second is the time (in secs) to run the test.
        # For multi-gpu nodes, we run the gpu burn test for more
        # time to get reliable measurements.
        self.executable_opts = ['-d', '40']
        self.num_gpus_per_node = 16
        gpu_arch = '37'
    else:
        self.modules = ['craype-accel-nvidia60']
        self.executable_opts = ['-d', '20']
        self.num_gpus_per_node = 1
        gpu_arch = '60'
    self.sourcepath = 'gpu_burn.cu'
    self.build_system = 'SingleSource'
    self.build_system.cxxflags = [
        f'-arch=compute_{gpu_arch}', f'-code=sm_{gpu_arch}'
    ]
    self.build_system.ldflags = ['-lcuda', '-lcublas', '-lnvidia-ml']
    # Every task must print 'OK' exactly once.
    self.sanity_patterns = sn.assert_eq(
        sn.count(sn.findall('OK', self.stdout)), self.num_tasks_assigned)
    self.perf_patterns = {
        'perf': sn.min(
            sn.extractall(r'GPU\s+\d+\(\S*\): (?P<perf>\S*) GF\/s',
                          self.stdout, 'perf', float))
    }
    self.reference = {
        'dom:gpu': {'perf': (4115, -0.10, None)},
        'daint:gpu': {'perf': (4115, -0.10, None)},
        'kesch:cn': {'perf': (950, -0.10, None)},
    }
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.maintainers = ['AJ', 'VK', 'TM']
    self.tags = {'diagnostic', 'benchmark'}
def set_perf_patterns(self):
    '''Extract the bandwidth data from the stdout.'''
    bw_regex = (r'^\s*\[[^\]]*\]\s*GPU\s*\d+: '
                r'Bandwidth\(double\) (?P<bw>\S+) GB/s')
    # Report the slowest device found on the node.
    self.perf_patterns = {
        'bandwidth': sn.min(sn.extractall(bw_regex, self.stdout,
                                          'bw', float))
    }
def vtune_physical_core_utilization(self):
    '''Report the minimum ``Physical Core Utilization`` (%) measured by
    the tool

    .. code-block::

      Effective Physical Core Utilization: 96.3% (11.554 out of 12)
      Effective Physical Core Utilization: 96.1% (11.534 out of 12)
      Effective Physical Core Utilization: 95.9% (11.512 out of 12)
    '''
    regex = r'^Effective Physical Core Utilization: (?P<pct>\S+)%'
    utilizations = sn.extractall(regex, self.stdout, 'pct', float)
    # One line per report; the worst (minimum) value is reported.
    return sn.round(sn.min(utilizations), 4)
def vtune_logical_core_utilization(self):
    '''Reports the minimum ``Logical Core Utilization`` (%) measured by
    the tool

    .. code-block::

      Effective Logical Core Utilization: 96.0% (23.028 out of 24)
      Effective Logical Core Utilization: 95.9% (23.007 out of 24)
      Effective Logical Core Utilization: 95.5% (22.911 out of 24)
    '''
    # One line per report; the worst (minimum) value is reported.
    regex = r'^\s+Effective Logical Core Utilization: (?P<pct>\S+)%'
    return sn.round(sn.min(sn.extractall(regex, self.stdout, 'pct', float)),
                    4)
def __init__(self):
    """Configure the GPU burn test for the CSCS systems."""
    self.valid_systems = [
        'daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn', 'ault:amdv100',
        'ault:intelv100', 'ault:amda100', 'ault:amdvega'
    ]
    self.descr = 'GPU burn test'
    self.valid_prog_environs = ['PrgEnv-gnu']
    self.exclusive_access = True
    self.executable_opts = ['-d', '40']   # double precision, 40 s burn
    self.build_system = 'Make'
    self.executable = './gpu_burn.x'
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.sanity_patterns = self.assert_num_tasks()
    # One line per GPU: performance (GF/s) and temperature (Celsius).
    patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
            r'\s+(?P<temp>\S*)\s+Celsius')
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
        'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
    }
    self.reference = {
        'dom:gpu': {'perf': (4115, -0.10, None, 'Gflop/s')},
        'daint:gpu': {'perf': (4115, -0.10, None, 'Gflop/s')},
        'arolla:cn': {'perf': (5861, -0.10, None, 'Gflop/s')},
        'tsa:cn': {'perf': (5861, -0.10, None, 'Gflop/s')},
        'ault:amda100': {'perf': (15000, -0.10, None, 'Gflop/s')},
        'ault:amdv100': {'perf': (5500, -0.10, None, 'Gflop/s')},
        'ault:intelv100': {'perf': (5500, -0.10, None, 'Gflop/s')},
        'ault:amdvega': {'perf': (3450, -0.10, None, 'Gflop/s')},
        '*': {'temp': (0, None, None, 'degC')},
    }
    self.maintainers = ['AJ', 'TM']
    self.tags = {'diagnostic', 'benchmark', 'craype'}
def set_perf_patterns(self):
    '''Set the performance patterns.

    In addition to the individual transfer rates amongst devices, this
    test also reports the average bandwidth per device with all the other
    devices. Hence, the performance pattern will report the device with
    the lowest average copy bandwidth with all the other devices.
    '''
    # Fix: escape the decimal point (`\d+\.\d+`); the previous bare `.`
    # matched any character between the digit groups.
    self.perf_patterns = {
        'bw': sn.min(sn.extractall(
            r'^\[[^\]]*\]\s+GPU\s+\d+\s+(\s*\d+\.\d+\s)+',
            self.stdout, 1, float))
    }
def set_perf_patterns(self):
    '''Extract the minimum performance and maximum temperature recorded.

    The performance and temperature data are reported in Gflops/s and
    deg. Celsius respectively.
    '''
    burn_regex = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)'
                  r'\s+GF\/s'
                  r'\s+(?P<temp>\S*)\s+Celsius')
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(burn_regex, self.stdout,
                                     'perf', float)),
        'temp': sn.max(sn.extractall(burn_regex, self.stdout,
                                     'temp', float)),
    }
def __init__(self, peerAccess):
    """Configure the GPU p2p bandwidth test.

    :param peerAccess: the literal string ``'peerAccess'`` enables direct
        peer-to-peer copies (compiled with ``-DP2P``); any other value
        measures transfers without peer access.
    """
    self.valid_systems = ['cannon:local-gpu', 'cannon:gpu_test',
                          'fasse:fasse_gpu', 'test:gpu']
    self.valid_prog_environs = ['gpu']
    # Perform a single bandwidth test with a buffer size of 1024MB
    copy_size = 1073741824
    self.build_system = 'Make'
    self.executable = './p2p_bandwidth.x'
    self.build_system.cxxflags = [f'-DCOPY={copy_size}']
    p2p = peerAccess == 'peerAccess'
    if p2p:
        self.build_system.cxxflags += ['-DP2P']
    # Fix: escape the decimal point (`\d+\.\d+`); the previous bare `.`
    # matched any character between the digit groups.
    self.perf_patterns = {
        'bw': sn.min(sn.extractall(
            r'^[^,]*\[[^\]]*\]\s+GPU\s+\d+\s+(\s*\d+\.\d+\s)+',
            self.stdout, 1, float))
    }
    if p2p:
        self.reference = {
            'cannon:local-gpu': {'bw': (28, -0.05, None, 'GB/s')},
            'cannon:gpu_test': {'bw': (9, -0.05, None, 'GB/s')},
            '*': {'bw': (172.5, None, None, 'GB/s')},
        }
    else:
        self.reference = {
            'cannon:local-gpu': {'bw': (35, -0.05, None, 'GB/s')},
            'cannon:gpu_test': {'bw': (11, -0.05, None, 'GB/s')},
            '*': {'bw': (79.6, None, None, 'GB/s')},
        }
def set_mpip_perf_patterns(self):
    '''More perf_patterns for the tool

    .. code-block::

      -----------------------------------
      @--- MPI Time (seconds) -----------
      -----------------------------------
      Task    AppTime    MPITime     MPI%
         0        8.6      0.121     1.40  <-- min
         1        8.6      0.157     1.82
         2        8.6       5.92    68.84  <-- max
         *       25.8        6.2    24.02  <--- => NonMPI= AppTime - MPITime

    Typical performance reporting:

    .. code-block::

      * mpip_avg_app_time: 8.6 s  (= 25.8/3mpi)
      * mpip_avg_mpi_time: 2.07 s (= 6.2/3mpi)
      * %mpip_avg_mpi_time: 24.02 %
      * %mpip_avg_non_mpi_time: 75.98 %
    '''
    # Aggregate ('*') row of the MPI Time table.
    regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$'
    app_time = sn.extractsingle(regex_star, self.rpt, 'appt', float)
    mpi_time = sn.extractsingle(regex_star, self.rpt, 'mpit', float)
    mpi_pct = sn.extractsingle(regex_star, self.rpt, 'pct', float)
    nonmpi_pct = sn.round(100 - mpi_pct, 2)
    # Per-rank rows, for the min/max MPI percentage across ranks.
    regex_ranks = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+'
                   r'(?P<pct>\S+)$')
    rank_pcts = sn.extractall(regex_ranks, self.rpt, 'pct', float)
    new_patterns = {
        'mpip_avg_app_time': sn.round(app_time / self.num_tasks, 2),
        'mpip_avg_mpi_time': sn.round(mpi_time / self.num_tasks, 2),
        '%mpip_avg_mpi_time': mpi_pct,
        '%mpip_avg_mpi_time_max': sn.max(rank_pcts),
        '%mpip_avg_mpi_time_min': sn.min(rank_pcts),
        '%mpip_avg_non_mpi_time': nonmpi_pct,
    }
    # Merge into a fresh dict so any existing patterns are preserved.
    if self.perf_patterns:
        self.perf_patterns = {**self.perf_patterns, **new_patterns}
    else:
        self.perf_patterns = new_patterns
def __init__(self):
    """Configure the GPU shared-memory bandwidth test for CSCS systems."""
    self.valid_systems = [
        'daint:gpu', 'dom:gpu', 'ault:amdv100', 'ault:intelv100',
        'ault:amda100', 'ault:amdvega'
    ]
    self.valid_prog_environs = ['PrgEnv-gnu']
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.build_system = 'Make'
    self.executable = 'shmem.x'
    self.sanity_patterns = self.assert_count_gpus()
    bw_regex = (r'^\s*\[[^\]]*\]\s*GPU\s*\d+: '
                r'Bandwidth\(double\) (?P<bw>\S+) GB/s')
    # The slowest device on the node determines the reported bandwidth.
    self.perf_patterns = {
        'bandwidth': sn.min(sn.extractall(bw_regex, self.stdout,
                                          'bw', float))
    }
    self.reference = {
        # theoretical limit for P100:
        # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
        'dom:gpu': {'bandwidth': (8850, -0.01, 9520 / 8850 - 1, 'GB/s')},
        'daint:gpu': {'bandwidth': (8850, -0.01, 9520 / 8850 - 1, 'GB/s')},
        'ault:amdv100': {'bandwidth': (13020, -0.01, None, 'GB/s')},
        'ault:intelv100': {'bandwidth': (13020, -0.01, None, 'GB/s')},
        'ault:amda100': {'bandwidth': (18139, -0.01, None, 'GB/s')},
        'ault:amdvega': {'bandwidth': (9060, -0.01, None, 'GB/s')},
    }
    self.maintainers = ['SK']
    self.tags = {'benchmark', 'diagnostic', 'craype'}
def pw_perf_patterns(obj):
    '''Reports hardware counter values from the tool

    .. code-block::

      collector  time  time (%)  PAPI_REF_CYC  PAPI_L2_DCM
      --------------------------------------------------------------------
      computeMomentumAndEnergyIAD  0.6816  100.00  1770550470  2438527
                                                               ^^^^^^^
    '''
    regex = r'^computeMomentumAndEnergyIAD\s+\S+\s+\S+\s+\S+\s+(?P<hwc>\d+)$'
    # Single deferrable reused for all three statistics.
    counters = sn.extractall(regex, obj.stderr, 'hwc', int)
    return {
        'papiwrap_hwc_min': sn.min(counters),
        'papiwrap_hwc_avg': sn.round(sn.avg(counters), 1),
        'papiwrap_hwc_max': sn.max(counters),
    }
def __init__(self):
    """Configure the GPU DGEMM test for the CSCS systems."""
    self.valid_systems = [
        'daint:gpu', 'dom:gpu', 'ault:amdv100', 'ault:intelv100',
        'ault:amda100', 'ault:amdvega'
    ]
    self.valid_prog_environs = ['PrgEnv-gnu']
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.build_system = 'Make'
    self.executable = 'dgemm.x'
    self.sanity_patterns = self.assert_num_gpus()
    flops_regex = r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s'
    # The slowest GPU on the node determines the reported figure.
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(flops_regex, self.stdout,
                                     'fp', float))
    }
    self.reference = {
        'dom:gpu': {'perf': (3.35, -0.1, None, 'TF/s')},
        'daint:gpu': {'perf': (3.35, -0.1, None, 'TF/s')},
        'ault:amdv100': {'perf': (5.25, -0.1, None, 'TF/s')},
        'ault:intelv100': {'perf': (5.25, -0.1, None, 'TF/s')},
        'ault:amda100': {'perf': (10.5, -0.1, None, 'TF/s')},
        'ault:amdvega': {'perf': (3.45, -0.1, None, 'TF/s')},
    }
    self.maintainers = ['JO', 'SK']
    self.tags = {'benchmark'}
def speedup(self):
    """Ratio of the slowest to the fastest measured runtime (ns)."""
    regex = r'^\S+(f32|f64)\s+(\S+) ns\s+'
    timings = sn.extractall(regex, self.stdout, 2, float)
    return sn.round(sn.max(timings) / sn.min(timings), 3)
def vtune_time(self):
    '''Vtune creates 1 report per compute node. For example, a 48 mpi
    tasks job (= 2 compute nodes when running with 24 c/cn) will create
    2 directories:

    * rpt.nid00001/rpt.nid00001.vtune
    * rpt.nid00002/rpt.nid00002.vtune

    Typical output (for each compute node) is:

    .. code-block::

      Elapsed Time:	14.866s
          CPU Time:	319.177s            /24 = 13.3
              Effective Time:	308.218s    /24 = 12.8
                  Idle:	0s
                  Poor:	19.725s
                  Ok:	119.570s
                  Ideal:	168.922s
                  Over:	0s
              Spin Time:	10.959s     /24 = 0.4
                  MPI Busy Wait Time:	10.795s
                  Other:	0.164s
              Overhead Time:	0s
      Total Thread Count:	25
      Paused Time:	0s
    '''
    # Accumulates one entry per metric; CPU-time metrics are normalized to
    # per-rank seconds by dividing by the ranks per node.
    result_d = {}
    # --- ranks per node
    if self.num_tasks < self.num_tasks_per_node:
        vtune_tasks_per_node = self.num_tasks
    else:
        vtune_tasks_per_node = self.num_tasks_per_node
    # --- Elapsed Time (min, max)
    regex = r'.*Elapsed Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_min'] = sn.round(sn.min(result), 4)
    result_d['elapsed_max'] = sn.round(sn.max(result), 4)
    # --- CPU Time (max)
    regex = r'^\s+CPU Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Effective Time (max)
    regex = r'^\s+Effective Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_efft'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time (max)
    regex = r'^\s+Spin Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_spint'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time: MPI Busy Wait (max)
    # Only present for multi-rank runs; single-rank runs report 0.
    if self.num_tasks > 1:
        regex = r'\s+MPI Busy Wait Time: (?P<sec>\S+)s'
        result = sn.extractall(regex, self.stdout, 'sec', float)
        result_d['elapsed_cput_spint_mpit'] = sn.round(
            sn.max(result) / vtune_tasks_per_node, 4)
    else:
        result_d['elapsed_cput_spint_mpit'] = 0
    # TODO:
    # 'vtune_momentumAndEnergyIAD':
    #     sphsintel.vtune_momentumAndEnergyIAD(self),
    # '%vtune_srcf_lookupTables': self.vtune_pct_lookupTables,
    # '%vtune_srcf_Octree': self.vtune_pct_Octree,
    # '%vtune_srcf_momentumAndEnergyIAD':
    #     self.vtune_pct_momentumAndEnergyIAD,
    # '%vtune_srcf_IAD': self.vtune_pct_IAD,
    return result_d
def patrun_imbalance(self):
    # {{{
    '''Load imbalance from csv report

    .. code-block::

      Table 1:  load Balance with MPI Message Stats
    '''
    # }}}
    # Per-PE sample counts are read from the pat_report csv output,
    # split into USER / MPI / ETC groups.
    rpt = os.path.join(self.stagedir, self.csv_rpt)
    # Serial runs use a different table layout (table id 1, no /pe suffix).
    if self.num_tasks == 1:
        regex_use = r'^(?P<pe>1),\S+,\s?(?P<samples>\S+),USER$'
        regex_mpi = r'^(?P<pe>1),\S+,\s?(?P<samples>\S+),MPI$'
        regex_etc = r'^(?P<pe>1),\S+,\s?(?P<samples>\S+),ETC$'
    else:
        regex_use = r'^2,\S+,\s?(?P<samples>\S+),USER/pe.(?P<pe>\d+)$'
        regex_mpi = r'^2,\S+,\s?(?P<samples>\S+),MPI/pe.(?P<pe>\d+)$'
        regex_etc = r'^2,\S+,\s?(?P<samples>\S+),ETC/pe.(?P<pe>\d+)$'
    res_user_sm_l = sn.extractall(regex_use, rpt, 'samples', float)
    res_user_pe_l = sn.extractall(regex_use, rpt, 'pe', int)
    # MPI:
    res_mpi_sm_l = sn.extractall(regex_mpi, rpt, 'samples', float)
    res_mpi_pe_l = sn.extractall(regex_mpi, rpt, 'pe', int)
    # If no MPI rows exist, substitute zeros so the zip below still lines
    # up with the USER rows.
    if not sn.evaluate(res_mpi_sm_l):
        res_mpi_sm_l = [0 for i in sn.evaluate(res_user_sm_l)]
        res_mpi_pe_l = [i for i in sn.evaluate(res_user_pe_l)]
    # ETC:
    res_etc_sm_l = sn.extractall(regex_etc, rpt, 'samples', float)
    res_etc_pe_l = sn.extractall(regex_etc, rpt, 'pe', int)
    # DICT from LISTs: dict(zip(pe,usr))
    # TOTAL = USER+MPI+ETC
    res_total_sm_l = []
    # WARNING: this fails if data is not sorted by pe, use pat_report with:
    # -s sort_by_pe='yes' !!!
    res_total_sm_l = [
        sum(sam) for sam in zip(res_user_sm_l, res_mpi_sm_l, res_etc_sm_l)
    ]
    # Each block below finds the index (PE id) of the max/min sample
    # count; if no match is found the index falls back to 0.  When the
    # extreme value occurs on several PEs, the LAST matching index wins.
    # USER pes
    # {{{ slowest pe (USER)
    # slowest = max(max(res_user_sm_l),
    #               max(res_mpi_sm_l),
    #               max(res_etc_sm_l))
    slowest = max(res_user_sm_l)
    user_slowest_pe = -1
    index = -1
    if slowest in res_user_sm_l:
        for sam in res_user_sm_l:
            index += 1
            if sam == slowest:
                user_slowest_pe = index
    if user_slowest_pe == -1:
        user_slowest_pe = 0
    # }}}
    # {{{ fastest pe (USER)
    fastest = min(res_user_sm_l)
    user_fastest_pe = -1
    index = -1
    for sam in res_user_sm_l:
        index += 1
        if sam == fastest:
            user_fastest_pe = index
    if user_fastest_pe == -1:
        user_fastest_pe = 0
    # }}}
    # MPI pes
    # {{{ slowest pe (MPI)
    slowest = max(res_mpi_sm_l)
    # try:
    #     slowest = max(res_mpi_sm_l)
    # except ValueError:
    #     slowest = 0
    mpi_slowest_pe = -1
    index = -1
    if slowest in res_mpi_sm_l:
        for sam in res_mpi_sm_l:
            index += 1
            if sam == slowest:
                mpi_slowest_pe = index
    if mpi_slowest_pe == -1:
        mpi_slowest_pe = 0
    # }}}
    # {{{ fastest pe (MPI)
    fastest = min(res_mpi_sm_l)
    # try:
    #     fastest = min(res_mpi_sm_l)
    # except ValueError:
    #     fastest = 0
    mpi_fastest_pe = -1
    index = -1
    for sam in res_mpi_sm_l:
        index += 1
        if sam == fastest:
            mpi_fastest_pe = index
    if mpi_fastest_pe == -1:
        mpi_fastest_pe = 0
    # }}}
    # ETC pes
    # {{{ slowest pe (ETC)
    slowest = max(res_etc_sm_l)
    etc_slowest_pe = -1
    index = -1
    if slowest in res_etc_sm_l:
        for sam in res_etc_sm_l:
            index += 1
            if sam == slowest:
                etc_slowest_pe = index
    if etc_slowest_pe == -1:
        etc_slowest_pe = 0
    # }}}
    # {{{ fastest pe (ETC)
    fastest = min(res_etc_sm_l)
    etc_fastest_pe = -1
    index = -1
    for sam in res_etc_sm_l:
        index += 1
        if sam == fastest:
            etc_fastest_pe = index
    if etc_fastest_pe == -1:
        etc_fastest_pe = 0
    # }}}
    # TOTAL pes
    # {{{ slowest pe (TOTAL)
    slowest = max(res_total_sm_l)
    # try:
    #     slowest = max(res_total_sm_l)
    # except ValueError:
    #     slowest = 0
    total_slowest_pe = -1
    index = -1
    if slowest in res_total_sm_l:
        for sam in res_total_sm_l:
            index += 1
            if sam == slowest:
                total_slowest_pe = index
    if total_slowest_pe == -1:
        total_slowest_pe = 0
    # }}}
    # {{{ fastest pe (TOTAL)
    fastest = min(res_total_sm_l)
    # try:
    #     fastest = min(res_total_sm_l)
    # except ValueError:
    #     fastest = 0
    total_fastest_pe = -1
    index = -1
    for sam in res_total_sm_l:
        index += 1
        if sam == fastest:
            total_fastest_pe = index
    if total_fastest_pe == -1:
        total_fastest_pe = 0
    # }}}
    # {{{ res dict
    res = {}
    # min/(mean=average)/median/max
    res['user_samples_min'] = sn.round(sn.min(res_user_sm_l), 0)
    res['mpi_samples_min'] = sn.round(sn.min(res_mpi_sm_l), 0)
    res['etc_samples_min'] = sn.round(sn.min(res_etc_sm_l), 0)
    res['total_samples_min'] = sn.round(sn.min(res_total_sm_l), 0)
    #
    res['user_samples_mean'] = sn.round(sn.avg(res_user_sm_l), 1)
    res['mpi_samples_mean'] = sn.round(sn.avg(res_mpi_sm_l), 1)
    res['etc_samples_mean'] = sn.round(sn.avg(res_etc_sm_l), 1)
    res['total_samples_mean'] = sn.round(sn.avg(res_total_sm_l), 1)
    #
    # np.median is wrapped so it participates in deferred evaluation.
    res['user_samples_median'] = \
        sn.sanity_function(np.median)(res_user_sm_l)
    res['mpi_samples_median'] = sn.sanity_function(np.median)(res_mpi_sm_l)
    res['etc_samples_median'] = sn.sanity_function(np.median)(res_etc_sm_l)
    res['total_samples_median'] = \
        sn.sanity_function(np.median)(res_total_sm_l)
    #
    res['user_samples_max'] = sn.round(sn.max(res_user_sm_l), 0)
    res['mpi_samples_max'] = sn.round(sn.max(res_mpi_sm_l), 0)
    res['etc_samples_max'] = sn.round(sn.max(res_etc_sm_l), 0)
    res['total_samples_max'] = sn.round(sn.max(res_total_sm_l), 0)
    #
    res['%user_samples'] = sn.round(
        100 * res['user_samples_mean'] / res['total_samples_mean'], 1)
    res['%mpi_samples'] = sn.round(
        100 * res['mpi_samples_mean'] / res['total_samples_mean'], 1)
    res['%etc_samples'] = sn.round(
        100 * res['etc_samples_mean'] / res['total_samples_mean'], 1)
    # slowest pes
    res['user_slowest_pe'] = user_slowest_pe
    res['mpi_slowest_pe'] = mpi_slowest_pe
    res['etc_slowest_pe'] = etc_slowest_pe
    res['total_slowest_pe'] = total_slowest_pe
    # --- debug with:
    # print("> res_user_sm_l", sn.evaluate(res_user_sm_l))
    # print("> res_user_pe_l", sn.evaluate(res_user_pe_l))
    # print("> res_mpi_sm_l", sn.evaluate(res_mpi_sm_l))
    # print("> res_mpi_pe_l", sn.evaluate(res_mpi_pe_l))
    # print("> res_etc_sm_l", sn.evaluate(res_etc_sm_l))
    # print("> res_etc_pe_l", sn.evaluate(res_etc_pe_l))
    # NOTE(review): the three '%*_slowest' percentages below all index
    # with user_slowest_pe (not mpi/etc_slowest_pe) — presumably the
    # USER/MPI/ETC split *on the user-slowest PE* is intended; confirm.
    # NOTE(review): only ValueError is caught here; a zero total would
    # raise ZeroDivisionError — verify whether that can occur.
    try:
        res['%user_slowest'] = \
            sn.round(100 * res_user_sm_l[user_slowest_pe] /
                     res_total_sm_l[user_slowest_pe], 1)
    except ValueError:
        res['%user_slowest'] = 0
    try:
        res['%mpi_slowest'] = \
            sn.round(100 * res_mpi_sm_l[user_slowest_pe] /
                     res_total_sm_l[user_slowest_pe], 1)
    except ValueError:
        res['%mpi_slowest'] = 0
    try:
        res['%etc_slowest'] = \
            sn.round(100 * res_etc_sm_l[user_slowest_pe] /
                     res_total_sm_l[user_slowest_pe], 1)
    except ValueError:
        res['%etc_slowest'] = 0
    # fastest pes
    res['user_fastest_pe'] = user_fastest_pe
    res['mpi_fastest_pe'] = mpi_fastest_pe
    res['etc_fastest_pe'] = etc_fastest_pe
    res['total_fastest_pe'] = total_fastest_pe
    try:
        res['%user_fastest'] = \
            sn.round(100 * res_user_sm_l[user_fastest_pe] /
                     res_total_sm_l[user_fastest_pe], 1)
    except ValueError:
        res['%user_fastest'] = 0
    try:
        res['%mpi_fastest'] = \
            sn.round(100 * res_mpi_sm_l[user_fastest_pe] /
                     res_total_sm_l[user_fastest_pe], 1)
    except ValueError:
        res['%mpi_fastest'] = 0
    try:
        res['%etc_fastest'] = \
            sn.round(100 * res_etc_sm_l[user_fastest_pe] /
                     res_total_sm_l[user_fastest_pe], 1)
    except ValueError:
        res['%etc_fastest'] = 0
    # }}}
    self.patrun_stats_d = res
def __init__(self, peerAccess):
    """Configure the GPU p2p bandwidth test for the CSCS systems.

    :param peerAccess: the literal string ``'peerAccess'`` enables direct
        peer-to-peer copies (compiled with ``-DP2P``); any other value
        measures transfers without peer access.
    """
    self.valid_systems = [
        'tsa:cn', 'arola:cn', 'ault:amdv100', 'ault:intelv100',
        'ault:amda100', 'ault:amdvega'
    ]
    self.valid_prog_environs = ['PrgEnv-gnu']
    if self.current_system.name in ['arolla', 'tsa']:
        self.valid_prog_environs = ['PrgEnv-gnu-nompi']
    # Perform a single bandwidth test with a buffer size of 1024MB
    copy_size = 1073741824
    self.build_system = 'Make'
    self.executable = 'p2p_bandwidth.x'
    self.build_system.cxxflags = [f'-DCOPY={copy_size}']
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.exclusive_access = True
    p2p = peerAccess == 'peerAccess'
    if p2p:
        self.build_system.cxxflags += ['-DP2P']
    self.sanity_patterns = self.do_sanity_check()
    # Fix: escape the decimal point (`\d+\.\d+`); the previous bare `.`
    # matched any character between the digit groups.
    self.perf_patterns = {
        'bw': sn.min(sn.extractall(
            r'^[^,]*\[[^\]]*\]\s+GPU\s+\d+\s+(\s*\d+\.\d+\s)+',
            self.stdout, 1, float))
    }
    if p2p:
        self.reference = {
            'tsa:cn': {'bw': (172.5, -0.05, None, 'GB/s')},
            'arola:cn': {'bw': (172.5, -0.05, None, 'GB/s')},
            'ault:amda100': {'bw': (282.07, -0.1, None, 'GB/s')},
            'ault:amdv100': {'bw': (5.7, -0.1, None, 'GB/s')},
            'ault:intelv100': {'bw': (31.0, -0.1, None, 'GB/s')},
            'ault:amdvega': {'bw': (11.75, -0.1, None, 'GB/s')},
        }
    else:
        self.reference = {
            'tsa:cn': {'bw': (79.6, -0.05, None, 'GB/s')},
            'arola:cn': {'bw': (79.6, -0.05, None, 'GB/s')},
            'ault:amda100': {'bw': (54.13, -0.1, None, 'GB/s')},
            'ault:amdv100': {'bw': (7.5, -0.1, None, 'GB/s')},
            'ault:intelv100': {'bw': (33.6, -0.1, None, 'GB/s')},
            'ault:amdvega': {'bw': (11.75, -0.1, None, 'GB/s')},
        }
    self.tags = {'diagnostic', 'benchmark', 'mch'}
    self.maintainers = ['JO']
def __init__(self):
    """Configure the GPU memory-bandwidth test for the CSCS systems."""
    self.valid_systems = [
        'daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn', 'ault:amdv100',
        'ault:intelv100', 'ault:amda100', 'ault:amdvega'
    ]
    self.valid_prog_environs = ['PrgEnv-gnu']
    if self.current_system.name in ['arolla', 'tsa']:
        self.valid_prog_environs = ['PrgEnv-gnu-nompi']
    # Perform a single bandwidth test with a buffer size of 1024MB
    self.copy_size = 1073741824
    self.build_system = 'Make'
    self.executable = 'memory_bandwidth.x'
    self.build_system.cxxflags = [f'-DCOPY={self.copy_size}']
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.exclusive_access = True
    # perf_patterns and reference will be set by the sanity check function
    self.sanity_patterns = self.do_sanity_check()
    self.perf_patterns = {
        'h2d': sn.min(sn.extractall(self._xfer_pattern('h2d'),
                                    self.stdout, 1, float)),
        'd2h': sn.min(sn.extractall(self._xfer_pattern('d2h'),
                                    self.stdout, 1, float)),
        'd2d': sn.min(sn.extractall(self._xfer_pattern('d2d'),
                                    self.stdout, 1, float)),
    }
    self.reference = {
        'daint:gpu': {
            'h2d': (11881, -0.1, None, 'MB/s'),
            'd2h': (12571, -0.1, None, 'MB/s'),
            'd2d': (499000, -0.1, None, 'MB/s'),
        },
        'dom:gpu': {
            'h2d': (11881, -0.1, None, 'MB/s'),
            'd2h': (12571, -0.1, None, 'MB/s'),
            'd2d': (499000, -0.1, None, 'MB/s'),
        },
        'tsa:cn': {
            'h2d': (13000, -0.1, None, 'MB/s'),
            'd2h': (12416, -0.1, None, 'MB/s'),
            'd2d': (777000, -0.1, None, 'MB/s'),
        },
        'ault:amda100': {
            'h2d': (25500, -0.1, None, 'MB/s'),
            'd2h': (26170, -0.1, None, 'MB/s'),
            'd2d': (1322500, -0.1, None, 'MB/s'),
        },
        'ault:amdv100': {
            'h2d': (13189, -0.1, None, 'MB/s'),
            'd2h': (13141, -0.1, None, 'MB/s'),
            'd2d': (777788, -0.1, None, 'MB/s'),
        },
        'ault:intelv100': {
            'h2d': (13183, -0.1, None, 'MB/s'),
            'd2h': (13411, -0.1, None, 'MB/s'),
            'd2d': (778200, -0.1, None, 'MB/s'),
        },
        'ault:amdvega': {
            'h2d': (14000, -0.1, None, 'MB/s'),
            'd2h': (14000, -0.1, None, 'MB/s'),
            'd2d': (575700, -0.1, None, 'MB/s'),
        },
    }
    self.tags = {
        'diagnostic', 'benchmark', 'mch', 'craype', 'external-resources'
    }
    self.maintainers = ['AJ', 'SK']
def min_perf(self, nid=None):
    '''Lowest performance recorded.'''
    samples = self._extract_perf_metric('perf', nid)
    return sn.min(samples)