def seconds_elaps(self):
    '''Reports elapsed time in seconds using the internal timer from the code

    .. code-block::

      === Total time for iteration(0) 3.61153s

    reports:

    * Elapsed: 3.6115 s

    Every ``Total time for iteration(N)`` value found in ``self.stdout`` is
    converted to ``float``, the values are summed and the sum is rounded to
    4 digits.

    :returns: the rounded sum when it is greater than zero, otherwise ``1``.
    '''
    regex = r'^=== Total time for iteration\(\d+\)\s+(?P<sec>\d+\D\d+)s'
    res = sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
    # NOTE(review): `res` is built from sanity functions; the comparison in
    # the `if` presumably forces its evaluation right here -- confirm that
    # `self.stdout` is already available when this function is called.
    if res > 0:
        # Hand back the expression built above instead of constructing an
        # identical second one (the original re-ran the whole extraction).
        return res

    return 1
def scorep_inclusivepct_energy(obj):
    '''Reports % of elapsed time (inclusive) for MomentumAndEnergy function
    (small scale job)

    .. code-block::

      > sqpatch_048mpi_001omp_125n_10steps_1000000cycles/rpt.exclusive
      0.0193958 (0.0009252%) sqpatch.exe
      1.39647 (0.06661%) + main
      ...
      714.135 (34.063%) | + ... *******
        _ZN6sphexa3sph31computeMomentumAndEnergyIADImplIdNS_13 ...
        ParticlesDataIdEEEEvRKNS_4TaskERT0_
      0.205453 (0.0098%) | +
        _ZN6sphexa3sph15computeTimestepIdNS0_21TimestepPress2ndOrderIdNS_13
        ... ParticlesDataIdEEEES4_EEvRKSt6vectorINS_4TaskESaIS7_EERT1_
      201.685 (9.62%) | | + MPI_Allreduce

    The percentages of all lines of ``obj.rpt_inclusive`` that match the
    pattern below are summed and the sum is rounded to 2 digits. If building
    that result raises, the error is logged through ``printer.error`` and
    ``0`` is returned instead.
    '''
    # NOTE(review): the pattern is case sensitive and looks for
    # `momentumAndEnergyIAD`; confirm that this is the spelling used in the
    # report (the excerpt above shows `computeMomentumAndEnergyIADImpl`).
    pattern = r'^\d+.\d+\s+\((?P<pct>\d+.\d+).*momentumAndEnergyIAD'
    try:
        percentages = sn.extractall(pattern, obj.rpt_inclusive, 'pct', float)
        return sn.round(sn.sum(percentages), 2)
    except Exception as e:
        printer.error(f'scorep_inclusivepct_energy failed: {e}')
        return 0
def stress_diff(filename, data_ref):
    '''Return the difference between obtained and reference stress tensor
    components

    The output is loaded from ``filename`` with ``load_json``. When both the
    output and ``data_ref`` have a ``'stress'`` entry under
    ``'ground_state'``, the result is the sum of the absolute differences of
    the 3x3 components; otherwise it is ``sn.abs(0)``.
    '''
    parsed_output = load_json(filename)
    ground_state = parsed_output['ground_state']
    # Same short-circuit order as `A and B`: the reference is only looked at
    # when the parsed output does contain a stress entry.
    if ('stress' not in ground_state
            or 'stress' not in data_ref['ground_state']):
        return sn.abs(0)

    obtained = ground_state['stress']
    expected = data_ref['ground_state']['stress']
    axes = range(3)
    return sn.sum(
        sn.abs(obtained[row][col] - expected[row][col])
        for row in axes
        for col in axes
    )
def forces_diff(filename, data_ref):
    '''Return the difference between obtained and reference atomic forces

    The output is loaded from ``filename`` with ``load_json``. When both the
    output and ``data_ref`` have a ``'forces'`` entry under
    ``'ground_state'``, the result is the sum of the absolute differences of
    the 3 force components of each of the ``num_atoms`` atoms; otherwise it
    is ``sn.abs(0)``.
    '''
    parsed_output = load_json(filename)
    ground_state = parsed_output['ground_state']
    # Same short-circuit order as `A and B`: the reference is only looked at
    # when the parsed output does contain a forces entry.
    if ('forces' not in ground_state
            or 'forces' not in data_ref['ground_state']):
        return sn.abs(0)

    # The atom count is needed immediately to size the loop below.
    num_atoms = ground_state['num_atoms'].evaluate()
    obtained = ground_state['forces']
    expected = data_ref['ground_state']['forces']
    return sn.sum(
        sn.abs(obtained[atom][axis] - expected[atom][axis])
        for atom in range(num_atoms)
        for axis in range(3)
    )
def seconds_iad(self):
    '''Reports `IAD` time in seconds using the internal timer from the code

    .. code-block::

      # IAD: 0.626564s

    reports:

    * IAD: 0.6284 s

    All ``# IAD:`` timings found in ``self.stdout`` are converted to
    ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# IAD:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_denst(self):
    '''Reports `Density` time in seconds using the internal timer from the
    code

    .. code-block::

      # Density: 0.296224s

    reports:

    * Density: 0.296 s

    All ``# Density:`` timings found in ``self.stdout`` are converted to
    ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# Density:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_neigh(self):
    '''Reports `FindNeighbors` time in seconds using the internal timer from
    the code

    .. code-block::

      # FindNeighbors: 0.354712s

    reports:

    * FindNeighbors: 0.3547 s

    All ``# FindNeighbors:`` timings found in ``self.stdout`` are converted
    to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# FindNeighbors:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_domaindistrib(self):
    '''Reports `domain::distribute` time in seconds using the internal timer
    from the code

    .. code-block::

      # domain::distribute: 0.0983208s

    reports:

    * domain_distribute: 0.0983 s

    All ``# domain::distribute:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# domain::distribute:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_smoothinglength(self):
    '''Reports `UpdateSmoothingLength` time in seconds using the internal
    timer from the code

    .. code-block::

      # UpdateSmoothingLength: 0.00321161s

    reports:

    * SmoothingLength: 0.0032 s

    All ``# UpdateSmoothingLength:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# UpdateSmoothingLength:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_consv(self):
    '''Reports `EnergyConservation` time in seconds using the internal timer
    from the code

    .. code-block::

      # EnergyConservation: 0.00137127s

    reports:

    * EnergyConservation: 0.0013 s

    All ``# EnergyConservation:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# EnergyConservation:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_updat(self):
    '''Reports `UpdateQuantities` time in seconds using the internal timer
    from the code

    .. code-block::

      # UpdateQuantities: 0.00498222s

    reports:

    * UpdateQuantities: 0.0049 s

    All ``# UpdateQuantities:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# UpdateQuantities:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_step(self):
    '''Reports `Timestep` time in seconds using the internal timer from the
    code

    .. code-block::

      # Timestep: 0.621583s

    reports:

    * Timestep: 0.6215 s

    All ``# Timestep:`` timings found in ``self.stdout`` are converted to
    ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# Timestep:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_state(self):
    '''Reports `EquationOfState` time in seconds using the internal timer
    from the code

    .. code-block::

      # EquationOfState: 0.00244751s

    reports:

    * EquationOfState: 0.0024 s

    All ``# EquationOfState:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# EquationOfState:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_tree(self):
    '''Reports `domain:BuildTree` time in seconds using the internal timer
    from the code

    .. code-block::

      # domain::buildTree: 0.084004s

    reports:

    * BuildTree: 0 s

    All ``# domain::buildTree:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# domain::buildTree:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_updateTasks(self):
    '''Reports `updateTasks` time in seconds using the internal timer from
    the code

    .. code-block::

      # updateTasks: 0.000900428s

    reports: ...

    All ``# updateTasks:`` timings found in ``self.stdout`` are converted to
    ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# updateTasks:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_energ(self):
    '''Reports `MomentumEnergyIAD` time in seconds using the internal timer
    from the code

    .. code-block::

      # MomentumEnergyIAD: 1.05951s

    reports:

    * MomentumEnergyIAD: 1.0595 s

    All ``# MomentumEnergyIAD:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# MomentumEnergyIAD:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def seconds_halos(self):
    '''Reports `mpi::synchronizeHalos` time in seconds using the internal
    timer from the code

    .. code-block::

      # mpi::synchronizeHalos: 0.0341479s
      # mpi::synchronizeHalos: 0.0770191s
      # mpi::synchronizeHalos: 0.344856s

    reports:

    * mpi_synchronizeHalos: 0.4560 s

    All ``# mpi::synchronizeHalos:`` timings found in ``self.stdout`` are
    converted to ``float`` and summed; the sum is rounded to 4 digits.
    '''
    pattern = r'^# mpi::synchronizeHalos:\s+(?P<sec>.*)s'
    timings = sn.extractall(pattern, self.stdout, 'sec', float)
    total = sn.sum(timings)
    return sn.round(total, 4)
def set_vtune_perf_patterns_rpt(self):
    '''More perf_patterns for the tool

    Typical performance reporting:

    .. literalinclude:: ../../reframechecks/intel/intel_vtune.res
      :lines: 117-127

    The result paths are extracted from ``self.dir_rpt``. For each index
    ``ii`` in ``range(self.vtune_paths)`` the report ``<path>.csv`` is parsed
    and three entries are added to ``self.perf_patterns``:

    * ``vtune_cput_cn<ii>``: sum of the ``cput`` column (2 digits),
    * ``%vtune_cput_cn<ii>_efft``: ``cput_efft`` sum as % of the ``cput`` sum,
    * ``%vtune_cput_cn<ii>_spint``: ``cput_spint`` sum as % of the ``cput``
      sum.

    Entries already present in ``self.perf_patterns`` are kept, except where
    a new entry uses the same key.
    '''
    regex_l = r'^vtune: Using result path .(?P<paths>\S+).$'
    paths_l = sn.extractall(regex_l, self.dir_rpt, 'paths')
    regex = (r'(?P<funcname>.*);(?P<cput>\d+.\d+);(?P<cput_efft>\S+);'
             r'(?P<cput_spint>\S+);(?P<cput_overhead>\S+)')

    def column_total(report, column):
        # Sum of one csv column of `report`, rounded to 2 digits.
        return sn.round(
            sn.sum(sn.extractall(regex, report, column, float)), 2)

    res = {}
    for ii in range(self.vtune_paths):
        # rpt.nid00034.csv
        rpt = paths_l[ii] + '.csv'
        cput = column_total(rpt, 'cput')
        cput_efft = column_total(rpt, 'cput_efft')
        cput_spint = column_total(rpt, 'cput_spint')

        res['vtune_cput_cn%s' % ii] = cput
        res['%svtune_cput_cn%s_efft' % ('%', ii)] = sn.round(
            cput_efft / cput * 100, 1)
        res['%svtune_cput_cn%s_spint' % ('%', ii)] = sn.round(
            cput_spint / cput * 100, 1)

    current = self.perf_patterns
    self.perf_patterns = {**current, **res} if current else res
def seconds_timers(self, region):
    '''Reports timings (in seconds) using the internal timer from the code

    .. code-block::

      # domain::sync: 0.118225s
      # updateTasks: 0.00561256s
      # FindNeighbors: 0.266282s
      # Density: 0.120372s
      # EquationOfState: 0.00255166s
      # mpi::synchronizeHalos: 0.116917s
      # IAD: 0.185804s
      # mpi::synchronizeHalos: 0.0850162s
      # MomentumEnergyIAD: 0.423282s
      # Timestep: 0.0405346s
      # UpdateQuantities: 0.0140938s
      # EnergyConservation: 0.0224118s
      # UpdateSmoothingLength: 0.00413466s

    :param region: id (1 to 12) of the timer to report, in the order of the
        table below.
    :returns: the sum of all timings of that region found in ``self.stdout``,
        rounded to 4 digits.
    :raises ValueError: if ``region`` is not one of the known ids.
    '''
    # Position in this table (counted from 1) is the region id.
    region_patterns = (
        r'^# domain::sync:\s+(?P<sec>.*)s',
        r'^# updateTasks:\s+(?P<sec>.*)s',
        r'^# FindNeighbors:\s+(?P<sec>.*)s',
        r'^# Density:\s+(?P<sec>.*)s',
        r'^# EquationOfState:\s+(?P<sec>.*)s',
        r'^# mpi::synchronizeHalos:\s+(?P<sec>.*)s',
        r'^# IAD:\s+(?P<sec>.*)s',
        r'^# MomentumEnergyIAD:\s+(?P<sec>.*)s',
        r'^# Timestep:\s+(?P<sec>.*)s',
        r'^# UpdateQuantities:\s+(?P<sec>.*)s',
        r'^# EnergyConservation:\s+(?P<sec>.*)s',
        r'^# UpdateSmoothingLength:\s+(?P<sec>.*)s',
    )
    for region_id, candidate in enumerate(region_patterns, start=1):
        if region == region_id:
            regex = candidate
            break
    else:
        raise ValueError('unknown region id in sanity_function')

    timings = sn.extractall(regex, self.stdout, 'sec', float)
    return sn.round(sn.sum(timings), 4)
def advisor_elapsed(obj):
    '''Reports the elapsed time (sum of ``Self Time`` in seconds) measured by
    the tool

    .. code-block::

      > summary.rpt
      ID / Function Call Sites and Loops / Total Time / Self Time / Type
      71 [loop in sphexa::sph::computeMomentumAndEnergyIADImpl<double,
        ... sphexa::ParticlesData<double>> at momentumAndEnergyIAD.hpp:94]
        ... 1.092s 0.736s Scalar momentumAndEnergyIAD.hpp:94
      34 [loop in MPIDI_Cray_shared_mem_coll_bcast]
        ... 0.596s 0.472s Scalar libmpich_gnu_82.so.3
      etc.

    returns:

    * advisor_elapsed: 2.13 s

    The report is ``obj.summary_rpt`` inside ``obj.stagedir``; the
    ``exclusive`` group of every matching line is converted to ``float``,
    the values are summed and the sum is rounded to 4 digits.
    '''
    pattern = r'\s+\d+\s+\[.*\]\s+(?P<inclusive>\S+)s\s+(?P<exclusive>\S+)s'
    report = os.path.join(obj.stagedir, obj.summary_rpt)
    self_times = sn.extractall(pattern, report, 'exclusive', float)
    total = sn.sum(self_times)
    return sn.round(total, 4)
def scorep_exclusivepct_energy(obj):
    '''Reports % of elapsed time (exclusive) for MomentumAndEnergy function
    (small scale job)

    .. code-block::

      > sqpatch_048mpi_001omp_125n_10steps_1000000cycles/rpt.exclusive
      0.0193958 (0.0009252%) sqpatch.exe
      1.39647 (0.06661%) + main
      ...
      714.135 (34.063%) | + ... *******
        _ZN6sphexa3sph31computeMomentumAndEnergyIADImplIdNS_13 ...
        ParticlesDataIdEEEEvRKNS_4TaskERT0_
      0.205453 (0.0098%) | +
        _ZN6sphexa3sph15computeTimestepIdNS0_21TimestepPress2ndOrderIdNS_13
        ... ParticlesDataIdEEEES4_EEvRKSt6vectorINS_4TaskESaIS7_EERT1_
      201.685 (9.62%) | | + MPI_Allreduce

      type max_buf[B] visits hits time[s] time[%] time/visit[us] region
      OMP 1,925,120 81,920 0 63.84 2.5 779.29
        !$omp parallel @momentumAndEnergyIAD.hpp:87 ***
      OMP 920,500 81,920 48,000 125.41 5.0 1530.93
        !$omp for @momentumAndEnergyIAD.hpp:87 ***
      OMP 675,860 81,920 1 30.95 1.2 377.85
        !$omp implicit barrier @momentumAndEnergyIAD.hpp:93 ***

    The percentages of all lines of ``obj.rpt_exclusive`` that match the
    pattern below are summed and the sum is rounded to 2 digits. If building
    that result raises, the error is logged through ``printer.error`` and
    ``0`` is returned instead.
    '''
    pattern = r'^\d+.\d+\s+\((?P<pct>\d+.\d+).*momentumAndEnergyIAD'
    try:
        percentages = sn.extractall(pattern, obj.rpt_exclusive, 'pct', float)
        return sn.round(sn.sum(percentages), 2)
    except Exception as e:
        printer.error(f'scorep_exclusivepct_energy failed: {e}')
        return 0
def set_sanity_gpu(self):
    # {{{
    '''
    This method runs sanity checks on the following logs:

    - info cuda devices

    .. literalinclude:: ../../reframechecks/debug/res/cuda-gdb/info_devices.log
      :lines: 1-3

    - info cuda kernels

    .. literalinclude:: ../../reframechecks/debug/res/cuda-gdb/info_kernels.log
      :lines: 5-7

    - info cuda threads

    .. literalinclude:: ../../reframechecks/debug/res/cuda-gdb/info_threads.log
      :lines: 1-5, 458-459

    - navigate between cuda kernels/blocks/threads/

    .. literalinclude:: ../../reframechecks/debug/res/cuda-gdb/info_navigate.log
      :lines: 5-6, 17-18, 33-34
      :emphasize-lines: 1, 3, 5

    - inspect variables (std::vector)

    .. literalinclude:: ../../reframechecks/debug/res/cuda-gdb/info_std_vector.log
      :lines: 1-25
      :emphasize-lines: 4

    - inspect variables (int*)

    .. literalinclude:: ../../reframechecks/debug/res/cuda-gdb/info_const_int.log
      :lines: 6-37
      :emphasize-lines: 17

    Attributes set by this method:

    * ``gpu_specs``: values extracted from (or derived from) the logs,
    * ``gpu_specs_bool``: for each checked value, whether it equals its
      reference (reference table for the device checks, ``cubesize**3`` for
      the vector lengths),
    * ``kernel_grid``, ``kernel_block``, ``kernel_np``: products of the grid
      dimensions, of the block dimensions, and the ``n=`` kernel argument,
    * ``threads_np``: sum of the ``Count`` column of the threads log,
    * ``thids``: all ``thid`` values of the navigate log,
    * ``sanity_patterns``: all entries of ``gpu_specs_bool`` must be true,
    * ``rpt`` and ``log``: path of the last report/log that was parsed.

    The files are read from ``self.stagedir``. The gpu name is evaluated
    immediately; a ``KeyError`` is raised if it is not one of the names in
    the reference table (``V100``, ``P100``).
    '''
    # }}}
    import math

    self.gpu_specs = {}
    self.gpu_specs_bool = {}
    # {{{ info_devices.log:
    # Dev PCI Bus/Dev ID Name Description SM Type SMs Warps/SM Lanes/Warp
    #   Max Regs/Lane Active SMs Mask
    # * 0 88:00.0 Tesla V100-SXM2-16GB GV100GL-A sm_70 80 64 ...
    #                   ^^^^                     ^^^^^ ^^ ^^
    #   32 256 0x000000000000ffffffffffffffffffff
    #   ^^
    self.rpt = os.path.join(self.stagedir, self.log_devices)
    ref_gpu_specs = {
        'V100': {
            'capability': 'sm_70',
            'sms': 80,
            'WarpsPerSM': 64,
            'LanesPerWarp': 32,  # = warpSize
            'max_threads_per_sm': 2048,
            'max_threads_per_device': 163840,
        },
        'P100': {
            'capability': 'sm_60',
            'sms': 56,
            'WarpsPerSM': 64,
            'LanesPerWarp': 32,  # = warpSize
            'max_threads_per_sm': 2048,
            'max_threads_per_device': 114688,
        },
    }
    regex = (r'Tesla (?P<gpu_name>\S+)-\S+-\S+\s+\S+\s+(?P<cap>sm_\d+)\s+'
             r'(?P<sms>\d+)\s+(?P<WarpsPerSM>\d+)\s+(?P<LanesPerWarp>\d+)')
    # --- get gpu_name (V100 or P100); needed right away to select the
    #     reference values, hence the immediate evaluation:
    gpu_name = sn.evaluate(sn.extractsingle(regex, self.rpt, 'gpu_name'))
    ref = ref_gpu_specs[gpu_name]
    # --- get capability (True means that extracted value matches ref):
    res = sn.extractsingle(regex, self.rpt, 'cap')
    self.gpu_specs['capability'] = res
    self.gpu_specs_bool['capability'] = (res == ref['capability'])
    # --- get sms, WarpsPerSM and LanesPerWarp|warpSize (the regex group
    #     names are the same as the keys of the reference table):
    for spec in ('sms', 'WarpsPerSM', 'LanesPerWarp'):
        res = sn.extractsingle(regex, self.rpt, spec, int)
        self.gpu_specs[spec] = res
        self.gpu_specs_bool[spec] = (res == ref[spec])

    # --- threads_per_sm <= LanesPerWarp * WarpsPerSM
    res = self.gpu_specs['LanesPerWarp'] * self.gpu_specs['WarpsPerSM']
    self.gpu_specs['max_threads_per_sm'] = res
    self.gpu_specs_bool['max_threads_per_sm'] = \
        (res == ref['max_threads_per_sm'])
    # --- threads_per_device <= threads_per_sm * sms
    res = self.gpu_specs['sms'] * self.gpu_specs['max_threads_per_sm']
    self.gpu_specs['max_threads_per_device'] = res
    self.gpu_specs_bool['max_threads_per_device'] = \
        (res == ref['max_threads_per_device'])
    # --- max_np of 1gpu = f(max_threads_per_device) where np = cube_size^3
    # NOTE(review): the cube root is computed in floating point before
    # `ceil`; confirm that exact cubes cannot be rounded up by one.
    self.gpu_specs['max_cubesz'] = sn.defer(
        math.ceil(pow(sn.evaluate(res), 1 / 3)))
    # }}}

    # {{{ info_kernels.log:
    # Kernel Parent Dev Grid Status SMs Mask GridDim BlockDim Invocation
    # * 0 - 0 3 Active 0x (106,1,1) (256,1,1) ...::density<double>(n=27000,
    #                      ^^^^^^^   ^^^^^^^                       ^^^^^
    # ---------------------------------------------------------------------
    self.log = os.path.join(self.stagedir, self.log_kernels)
    regex = (r'\*.*Active \S+ \((?P<grid_x>\d+),(?P<grid_y>\d+),'
             r'(?P<grid_z>\d+)\)\s+\((?P<block_x>\d+),(?P<block_y>\d+),'
             r'(?P<block_z>\d+)\).*\(n=(?P<np>\d+), ')
    kernel_tags = ('grid_x', 'grid_y', 'grid_z',
                   'block_x', 'block_y', 'block_z', 'np')
    grid_x, grid_y, grid_z, block_x, block_y, block_z, n_particles = [
        sn.extractsingle(regex, self.log, tag, int) for tag in kernel_tags
    ]
    self.kernel_grid = grid_x * grid_y * grid_z
    self.kernel_block = block_x * block_y * block_z
    self.kernel_np = n_particles
    self.gpu_specs['cubesz'] = \
        sn.defer(math.ceil(pow(sn.evaluate(self.kernel_np), 1/3)))
    # {{{ TODO:tuple
    # https://github.com/eth-cscs/reframe/blob/master/cscs-checks/
    # prgenv/affinity_check.py#L38
    # regex=(r'\*.*Active \S+ (?P<griddim>\(\d+,\d+,\d+\))\s+(?P<blockdim>'
    #        r'\(\d+,\d+,\d+\)).*\(n=(?P<np>\d+), ')
    # from functools import reduce
    # self.res = reduce(lambda x, y: x*y, list(res))
    # sn.extractsingle(regex, self.stdout, 'nrgy',
    #                  conv=lambda x: int(x.replace(',', '')))
    # res: ('(', '1', '0', '6', ',', '1', ',', '1', ')')
    # }}}
    # }}}

    # {{{ info_threads.log:
    # BlockIdx ThreadIdx To BlockIdx ThreadIdx Count Virtual PC Filename L
    # Kernel 0
    # * (0,0,0) (0,0,0) (1,0,0) (63,0,0) 320 0x0... ../cudaDensity.cu 27
    #   (1,0,0) (64,0,0) (1,0,0) (95,0,0) 32 0x0... ../cudaDensity.cu 26
    #   etc...                       sum(^^^)
    # ---------------------------------------------------------------------
    self.log = os.path.join(self.stagedir, self.log_threads)
    regex = r'(\(\S+\)\s+){4}(?P<nth>\d+)\s+0x'
    self.threads_np = sn.sum(sn.extractall(regex, self.log, 'nth', int))
    # }}}

    # {{{ info_navigate.log:
    # gridDim=(106,1,1) blockDim=(256,1,1) blockIdx=(0,0,0) \
    #   threadIdx=(0,0,0) warpSize=32 thid=0
    # kernel 0 grid 3 block (0,0,0) thread (0,0,0) device 0 sm 0 warp 0 ...
    # --
    # gridDim=(106,1,1) blockDim=(256,1,1) blockIdx=(105,0,0)
    #   threadIdx=(255,0,0) warpSize=32 thid=27135
    # kernel 0 grid 3 block (105,0,0) thread (255,0,0) device 0 sm 43 ...
    # --
    # gridDim=(106,1,1) blockDim=(256,1,1) blockIdx=(55,0,0)
    #   threadIdx=(255,0,0) warpSize=32 thid=14335
    # kernel 0 grid 3 block (55,0,0) thread (255,0,0) device 0 sm 55 ...
    # ---------------------------------------------------------------------
    self.log = os.path.join(self.stagedir, self.log_navigate)
    regex = r'^gridDim.*warpSize=\d+ thid=(?P<th>\d+)$'
    self.thids = sn.extractall(regex, self.log, 'th', int)
    # }}}

    # {{{ info_std_vector.log:
    # --- get vector length(True means that extracted value matches ref):
    self.rpt = os.path.join(self.stagedir, self.log_stdvector)
    # std::vector of length 27000, capacity 27000
    regex = r'std::vector of length (?P<vec_len1>\d+),'
    res = sn.extractsingle(regex, self.rpt, 'vec_len1', int)
    self.gpu_specs['vec_len1'] = res
    self.gpu_specs_bool['vec_len1'] = (res == self.cubesize**3)
    # Vector size = 27000 (pvector)
    regex = r'^Vector size = (?P<vec_len2>\d+)$'
    res = sn.extractsingle(regex, self.rpt, 'vec_len2', int)
    self.gpu_specs['vec_len2'] = res
    self.gpu_specs_bool['vec_len2'] = (res == self.cubesize**3)
    # }}}

    # {{{ --- sanity_patterns:
    # One assertion per checked spec, in the same order as before.
    # NO: sn.assert_true(self.gpu_specs_bool),
    checked_specs = (
        'capability',
        'sms',
        'WarpsPerSM',
        'LanesPerWarp',
        'max_threads_per_sm',
        'max_threads_per_device',
        'vec_len1',
        'vec_len2',
    )
    self.sanity_patterns = sn.all([
        sn.assert_true(self.gpu_specs_bool[spec]) for spec in checked_specs
    ])
    # }}}
def test_sum(self):
    '''``sn.sum`` gives 3 for three ones, passed as a plain list or wrapped
    with ``make_deferrable``.'''
    ones = [1, 1, 1]
    plain_total = sn.sum(ones)
    self.assertEqual(3, plain_total)

    deferred_total = sn.sum(make_deferrable(ones))
    self.assertEqual(3, deferred_total)
def test_sum():
    '''``sn.sum`` gives 3 for three ones, passed as a plain list or wrapped
    with ``sn.defer``.'''
    ones = [1, 1, 1]
    plain_total = sn.sum(ones)
    assert 3 == plain_total

    deferred_total = sn.sum(sn.defer(ones))
    assert 3 == deferred_total
def test_sum(self):
    '''``sn.sum`` gives 3 for three ones, passed as a plain list or wrapped
    with ``sn.defer``.'''
    ones = [1, 1, 1]
    plain_total = sn.sum(ones)
    self.assertEqual(3, plain_total)

    deferred_total = sn.sum(sn.defer(ones))
    self.assertEqual(3, deferred_total)