예제 #1
0
    def test_min(self):
        """``sn.min`` must re-evaluate the deferred list after mutation."""
        data = [1, 2]
        deferred = make_deferrable(data)
        self.assertEqual(1, sn.min(deferred))

        # Mutating the underlying list changes the result on re-evaluation.
        data.append(0)
        self.assertEqual(0, sn.min(deferred))
예제 #2
0
def test_min():
    """``sn.min`` must re-evaluate the deferred list after mutation."""
    values = [1, 2]
    deferred = sn.defer(values)
    assert sn.min(deferred) == 1

    # Appending a smaller element is visible on the next evaluation.
    values.append(0)
    assert sn.min(deferred) == 0
예제 #3
0
    def test_min(self):
        """``sn.min`` tracks later mutations of the deferred list."""
        items = [1, 2]
        deferred = sn.defer(items)
        self.assertEqual(1, sn.min(deferred))

        # A newly appended minimum must be picked up on re-evaluation.
        items.append(0)
        self.assertEqual(0, sn.min(deferred))
예제 #4
0
    def set_perf_patterns(self):
        '''Set the performance patterns.

        These include host-device (h2d), device-host (d2h) and device-device
        (d2d) transfers; each reports the minimum bandwidth seen.
        '''

        # One minimum-bandwidth figure per transfer direction.
        self.perf_patterns = {
            xfer: sn.min(sn.extractall(self._xfer_pattern(xfer),
                                       self.stdout, 1, float))
            for xfer in ('h2d', 'd2h', 'd2d')
        }
예제 #5
0
    def gpu_usage_sanity(self):
        '''Verify that the jobreport output has sensible numbers.

        This function asserts that the nodes reported are at least a subset of
        all nodes used by the gpu burn app. Also, the GPU usage is verified by
        assuming that in the worst case scenario, the usage is near 100% during
        the burn, and 0% outside the burn period. Lastly, the GPU usage time
        for each node is also asserted to be greater or equal than the burn
        time.
        '''

        # Get set with all nodes
        patt = r'^\s*\[([^\]]*)\]\s*GPU\s*\d+\(OK\)'
        full_node_set = set(sn.extractall(patt, self.stdout, 1))

        # Parse job report data
        # Group 1: node name, group 2: usage %, group 3: seconds field of the
        # HH:MM:SS usage-time column.
        patt = r'^\s*(\w*)\s*(\d+)\s*%\s*\d+\s*MiB\s*\d+:\d+:(\d+)'
        self.nodes_reported = sn.extractall(patt, self.stdout, 1)
        usage = sn.extractall(patt, self.stdout, 2, int)
        time_reported = sn.extractall(patt, self.stdout, 3, int)
        return sn.all([
            # At least one node must appear in the job report.
            sn.assert_ge(sn.count(self.nodes_reported), 1),
            # Every reported node must be one the burn app actually used.
            set(self.nodes_reported).issubset(full_node_set),
            # Worst-case usage model: burn_time / reported_time <= usage%.
            # NOTE(review): assumes every time_reported entry is non-zero;
            # a zero would raise ZeroDivisionError — confirm upstream.
            sn.all(
                map(lambda x, y: self.burn_time / x <= y, time_reported,
                    usage)),
            # Usage time on each node must cover at least the burn time.
            sn.assert_ge(sn.min(time_reported), self.burn_time)
        ])
예제 #6
0
 def __init__(self):
     # Target systems/partitions and programming environment for this
     # GPU DGEMM check.
     self.valid_systems = [
         'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu',
         'test:gpu'
     ]
     self.valid_prog_environs = ['gpu']
     self.build_system = 'Make'
     self.executable = './dgemm.x'
     # Performance figure: the slowest GPU's TF/s, extracted from lines
     # like '[host] GPU 0: 5.20 TF/s'.
     self.perf_patterns = {
         'perf':
         sn.min(
             sn.extractall(r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s',
                           self.stdout, 'fp', float))
     }
     # Reference tuples: (value, lower threshold, upper threshold, unit).
     self.reference = {
         'cannon:local-gpu': {
             'perf': (5.2, -0.1, None, 'TF/s per gpu')
         },
         'cannon:gpu_test': {
             'perf': (5.2, -0.1, None, 'TF/s per gpu')
         },
         # Wildcard fallback for any other system: no thresholds enforced.
         '*': {
             'perf': (3.35, None, None, 'TF/s per gpu')
         },
     }
예제 #7
0
 def __init__(self):
     # Target systems/partitions and programming environment for this
     # GPU shared-memory bandwidth check.
     self.valid_systems = [
         'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu',
         'test:gpu'
     ]
     self.valid_prog_environs = ['gpu']
     self.build_system = 'Make'
     self.executable = './shmem.x'
     # Performance figure: the minimum per-GPU shared-memory bandwidth,
     # parsed from lines like '[host] GPU 0: Bandwidth(double) 9000 GB/s'.
     self.perf_patterns = {
         'bandwidth':
         sn.min(
             sn.extractall(
                 r'^\s*\[[^\]]*\]\s*GPU\s*\d+: '
                 r'Bandwidth\(double\) (?P<bw>\S+) GB/s', self.stdout, 'bw',
                 float))
     }
     # Reference tuples: (value, lower threshold, upper threshold, unit).
     self.reference = {
         # theoretical limit for P100:
         # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
         'cannon:local-gpu': {
             'bandwidth': (13000, -0.01, None, 'GB/s per gpu')
         },
         'cannon:gpu_test': {
             'bandwidth': (13000, -0.01, None, 'GB/s per gpu')
         },
         # Wildcard fallback for any other system: no thresholds enforced.
         '*': {
             'bandwidth': (8850, None, None, 'GB/s per gpu')
         },
     }
예제 #8
0
 def __init__(self):
     """GPU burn test: report the slowest GPU (GF/s) and hottest GPU (degC)."""
     self.valid_systems = [
         'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu', 'test:gpu'
     ]
     self.descr = 'GPU burn test'
     self.valid_prog_environs = ['gpu']
     # '-d': double precision; '40': burn duration in seconds.
     self.executable_opts = ['-d', '40']
     self.build_system = 'Make'
     self.build_system.makefile = 'makefile.cuda'
     self.executable = './gpu_burn.x'
     # Each output line carries both performance (GF/s) and temperature.
     patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
             r'\s+(?P<temp>\S*)\s+Celsius')
     self.perf_patterns = {
         'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
         'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
     }
     # Reference tuples: (value, lower threshold, upper threshold, unit).
     self.reference = {
         'cannon:local-gpu': {
             'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
         },
         'cannon:gpu_test': {
             'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
         },
         'test:gpu': {
             'perf': (4115, None, None, 'Gflop/s per gpu'),
         },
         # BUG FIX: the original dict literal repeated the '*' key, so the
         # second entry silently replaced the first and dropped the wildcard
         # 'perf' reference. Both metrics now live under a single '*' key.
         '*': {
             'perf': (4115, None, None, 'Gflop/s per gpu'),
             'temp': (0, None, None, 'degC'),
         },
     }
예제 #9
0
    def __init__(self):
        # likwid-bench micro-benchmark configuration (Cray systems).
        self.modules = ['likwid']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.sourcesdir = None

        self.executable = 'likwid-bench'

        # Single task; two tasks per core to use both hyperthreads.
        self.num_tasks = 1
        self.num_tasks_per_node = 1
        self.num_tasks_per_core = 2
        # Logical CPU count per partition (with hyperthreading).
        self.system_num_cpus = {
            'daint:mc': 72,
            'daint:gpu': 24,
            'dom:mc': 72,
            'dom:gpu': 24,
        }
        # NUMA domains per partition, as named by likwid (S<socket>).
        self.system_numa_domains = {
            'daint:mc': ['S0', 'S1'],
            'daint:gpu': ['S0'],
            'dom:mc': ['S0', 'S1'],
            'dom:gpu': ['S0'],
        }

        # Test each level at half capacity times nthreads per domain
        self.system_cache_sizes = {
            'daint:mc': {
                'L1': '288kB',
                'L2': '2304kB',
                'L3': '23MB',
                'memory': '1800MB'
            },
            'daint:gpu': {
                'L1': '192kB',
                'L2': '1536kB',
                'L3': '15MB',
                'memory': '1200MB'
            },
            'dom:mc': {
                'L1': '288kB',
                'L2': '2304kB',
                'L3': '23MB',
                'memory': '1800MB'
            },
            'dom:gpu': {
                'L1': '192kB',
                'L2': '1536kB',
                'L3': '15MB',
                'memory': '1200MB'
            },
        }

        self.maintainers = ['SK', 'CB']
        self.tags = {'benchmark', 'diagnostic', 'health'}

        # Minimum measured bandwidth; shared by sanity (must be >= 0) and
        # performance reporting.
        bw_pattern = sn.min(
            sn.extractall(r'MByte/s:\s*(?P<bw>\S+)', self.stdout, 'bw', float))

        self.sanity_patterns = sn.assert_ge(bw_pattern, 0.0)
        self.perf_patterns = {'bandwidth': bw_pattern}
예제 #10
0
def mpip_perf_patterns(obj, reg):
    '''Return one mpiP-derived performance metric, selected by ``reg``.

    .. code-block::

      -----------------------------------
      @--- MPI Time (seconds) -----------
      -----------------------------------
      Task    AppTime    MPITime     MPI%
         0        8.6      0.121     1.40 <-- min
         1        8.6      0.157     1.82
         2        8.6       5.92    68.84 <-- max
         *       25.8        6.2    24.02 <---

      => NonMPI= AppTime - MPITime

    Typical performance reporting:

    .. code-block::

      * mpip_avg_app_time: 8.6 s  (= 25.8/3mpi)
      * mpip_avg_mpi_time: 2.07 s (=  6.2/3mpi)
      * %mpip_avg_mpi_time: 24.02 %
      * %max/%min
      * %mpip_avg_non_mpi_time: 75.98 %
    '''
    # The tool prints the path of its own report file on stdout.
    rpt = sn.extractsingle(r'^mpiP: Storing mpiP output in \[(?P<rpt>.*)\]',
                           obj.stdout, 'rpt', str)
    # Aggregate ('*') row and per-rank rows of the MPI-time table.
    regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$'
    regex_minmax = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+'
                    r'(?P<pct>\S+)$')

    if reg == 1:  # mpip_avg_mpi_time
        return sn.round(
            sn.extractsingle(regex_star, rpt, 'mpit', float) / obj.num_tasks,
            2)

    if reg == 2:  # mpip_avg_app_time
        return sn.round(
            sn.extractsingle(regex_star, rpt, 'appt', float) / obj.num_tasks,
            2)

    if reg == 3:  # %mpip_avg_mpi_time
        return sn.extractsingle(regex_star, rpt, 'pct', float)

    if reg == 4:  # %nonmpi
        return sn.round(
            100 - sn.extractsingle(regex_star, rpt, 'pct', float), 2)

    if reg == 5:  # %mpip_avg_mpi_time_max
        return sn.max(sn.extractall(regex_minmax, rpt, 'pct', float))

    if reg == 6:  # %mpip_avg_mpi_time_min
        return sn.min(sn.extractall(regex_minmax, rpt, 'pct', float))

    raise ValueError('unknown region id in mpip_perf_patterns')
예제 #11
0
    def set_perf_patterns(self):
        '''Extract the TF/s achieved.'''

        regex = r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s'
        # The slowest GPU defines the reported performance figure.
        self.perf_patterns = {
            'perf': sn.min(sn.extractall(regex, self.stdout, 'fp', float))
        }
예제 #12
0
    def __init__(self):
        # Target systems/partitions and programming environment for this
        # GPU memory-bandwidth check.
        self.valid_systems = [
            'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu',
            'test:gpu'
        ]
        self.valid_prog_environs = ['gpu']

        # Perform a single bandwidth test with a buffer size of 1024MB
        self.copy_size = 1073741824

        self.build_system = 'Make'
        self.executable = './memory_bandwidth.x'
        # Buffer size is baked into the binary at compile time.
        self.build_system.cxxflags = [f'-DCOPY={self.copy_size}']

        # perf_patterns and reference will be set by the sanity check function
        # Minimum bandwidth per direction: host->device, device->host,
        # device->device (capture group 1 of the per-direction pattern).
        self.perf_patterns = {
            'h2d':
            sn.min(
                sn.extractall(self._xfer_pattern('h2d'), self.stdout, 1,
                              float)),
            'd2h':
            sn.min(
                sn.extractall(self._xfer_pattern('d2h'), self.stdout, 1,
                              float)),
            'd2d':
            sn.min(
                sn.extractall(self._xfer_pattern('d2d'), self.stdout, 1,
                              float)),
        }
        # Reference tuples: (value, lower threshold, upper threshold, unit);
        # '*' is the no-threshold fallback for any other system.
        self.reference = {
            'cannon:local-gpu': {
                'h2d': (12000, -0.1, None, 'MB/s per gpu'),
                'd2h': (13000, -0.1, None, 'MB/s per gpu'),
                'd2d': (630000, -0.1, None, 'MB/s per gpu')
            },
            'cannon:gpu_test': {
                'h2d': (12000, -0.1, None, 'MB/s per gpu'),
                'd2h': (13000, -0.1, None, 'MB/s per gpu'),
                'd2d': (780000, -0.1, None, 'MB/s per gpu')
            },
            '*': {
                'h2d': (11881, None, None, 'MB/s per gpu'),
                'd2h': (12571, None, None, 'MB/s per gpu'),
                'd2d': (499000, None, None, 'MB/s per gpu')
            },
        }
예제 #13
0
    def __init__(self):
        # GPU burn test with per-system build/run configuration.
        super().__init__()
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
        self.descr = 'GPU burn test'
        self.valid_prog_environs = ['PrgEnv-gnu']

        if self.current_system.name == 'kesch':
            self.exclusive_access = True
            self.modules = ['craype-accel-nvidia35']
            # NOTE: The first option indicates the precision (-d for double)
            #       while the seconds is the time (in secs) to run the test.
            #       For multi-gpu nodes, we run the gpu burn test for more
            #       time to get reliable measurements.
            self.executable_opts = ['-d', '40']
            self.num_gpus_per_node = 16
            gpu_arch = '37'
        else:
            self.modules = ['craype-accel-nvidia60']
            self.executable_opts = ['-d', '20']
            self.num_gpus_per_node = 1
            gpu_arch = '60'

        # Build from a single CUDA source, targeting the selected GPU arch.
        self.sourcepath = 'gpu_burn.cu'
        self.build_system = 'SingleSource'
        self.build_system.cxxflags = [
            '-arch=compute_%s' % gpu_arch,
            '-code=sm_%s' % gpu_arch
        ]
        self.build_system.ldflags = ['-lcuda', '-lcublas', '-lnvidia-ml']

        # Every assigned task must print 'OK'.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall('OK', self.stdout)), self.num_tasks_assigned)

        # Performance figure: the slowest GPU's GF/s value.
        self.perf_patterns = {
            'perf':
            sn.min(
                sn.extractall(r'GPU\s+\d+\(\S*\): (?P<perf>\S*) GF\/s',
                              self.stdout, 'perf', float))
        }

        # Reference tuples: (value, lower threshold, upper threshold).
        self.reference = {
            'dom:gpu': {
                'perf': (4115, -0.10, None)
            },
            'daint:gpu': {
                'perf': (4115, -0.10, None)
            },
            'kesch:cn': {
                'perf': (950, -0.10, None)
            }
        }

        # num_tasks == 0: let the scheduler use all available nodes.
        self.num_tasks = 0
        self.num_tasks_per_node = 1

        self.maintainers = ['AJ', 'VK', 'TM']
        self.tags = {'diagnostic', 'benchmark'}
예제 #14
0
    def set_perf_patterns(self):
        '''Extract the bandwidth data from the stdout.'''

        regex = (r'^\s*\[[^\]]*\]\s*GPU\s*\d+: '
                 r'Bandwidth\(double\) (?P<bw>\S+) GB/s')
        # Report the bandwidth of the slowest device.
        self.perf_patterns = {
            'bandwidth': sn.min(
                sn.extractall(regex, self.stdout, 'bw', float))
        }
예제 #15
0
def vtune_physical_core_utilization(self):
    '''Reports the minimum ``Physical Core Utilization`` (%) measured by the
    tool

    .. code-block::

      Effective Physical Core Utilization: 96.3% (11.554 out of 12)
      Effective Physical Core Utilization: 96.1% (11.534 out of 12)
      Effective Physical Core Utilization: 95.9% (11.512 out of 12)
    '''
    regex = r'^Effective Physical Core Utilization: (?P<pct>\S+)%'
    percentages = sn.extractall(regex, self.stdout, 'pct', float)
    # Report the worst (minimum) utilization across all reports.
    return sn.round(sn.min(percentages), 4)
예제 #16
0
def vtune_logical_core_utilization(self):
    '''Reports the minimum ``Logical Core Utilization`` (%) measured by the
    tool

    .. code-block::

      Effective Logical Core Utilization: 96.0% (23.028 out of 24)
      Effective Logical Core Utilization: 95.9% (23.007 out of 24)
      Effective Logical Core Utilization: 95.5% (22.911 out of 24)
    '''
    # Minimum across all per-node reports, rounded to 4 decimals.
    regex = r'^\s+Effective Logical Core Utilization: (?P<pct>\S+)%'
    return sn.round(sn.min(sn.extractall(regex, self.stdout, 'pct', float)), 4)
예제 #17
0
    def __init__(self):
        # GPU burn test across CSCS systems and GPU types.
        self.valid_systems = [
            'daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn', 'ault:amdv100',
            'ault:intelv100', 'ault:amda100', 'ault:amdvega'
        ]
        self.descr = 'GPU burn test'
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.exclusive_access = True
        # '-d': double precision; '40': burn duration in seconds.
        self.executable_opts = ['-d', '40']
        self.build_system = 'Make'
        self.executable = './gpu_burn.x'
        # num_tasks == 0: let the scheduler use all available nodes.
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.sanity_patterns = self.assert_num_tasks()
        # Each output line carries both performance (GF/s) and temperature.
        patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
                r'\s+(?P<temp>\S*)\s+Celsius')
        self.perf_patterns = {
            'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
            'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
        }

        # Reference tuples: (value, lower threshold, upper threshold, unit).
        self.reference = {
            'dom:gpu': {
                'perf': (4115, -0.10, None, 'Gflop/s'),
            },
            'daint:gpu': {
                'perf': (4115, -0.10, None, 'Gflop/s'),
            },
            'arolla:cn': {
                'perf': (5861, -0.10, None, 'Gflop/s'),
            },
            'tsa:cn': {
                'perf': (5861, -0.10, None, 'Gflop/s'),
            },
            'ault:amda100': {
                'perf': (15000, -0.10, None, 'Gflop/s'),
            },
            'ault:amdv100': {
                'perf': (5500, -0.10, None, 'Gflop/s'),
            },
            'ault:intelv100': {
                'perf': (5500, -0.10, None, 'Gflop/s'),
            },
            'ault:amdvega': {
                'perf': (3450, -0.10, None, 'Gflop/s'),
            },
            # Temperature is recorded everywhere but never thresholded.
            '*': {
                'temp': (0, None, None, 'degC')
            }
        }

        self.maintainers = ['AJ', 'TM']
        self.tags = {'diagnostic', 'benchmark', 'craype'}
예제 #18
0
    def set_perf_patterns(self):
        '''Set the performance patterns.

        In addition to the individual transfer rates amongst devices, this test
        also reports the average bandwidth per device with all the other
        devices. Hence, the performance pattern will report the device with the
        lowest average copy bandwidth with all the other devices.
        '''
        # FIX: escape the decimal point — the bare '.' previously matched any
        # character between the integer and fractional digits.
        # NOTE: group 1 of a repeated group captures its last repetition,
        # i.e. the last numeric column of each GPU row.
        self.perf_patterns = {
            'bw': sn.min(sn.extractall(
                r'^\[[^\]]*\]\s+GPU\s+\d+\s+(\s*\d+\.\d+\s)+',
                self.stdout, 1, float))
        }
예제 #19
0
    def set_perf_patterns(self):
        '''Extract the minimum performance and maximum temperature recorded.

        The performance and temperature data are reported in Gflops/s and
        deg. Celsius respectively.
        '''

        regex = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
                 r'\s+(?P<temp>\S*)\s+Celsius')
        perf_values = sn.extractall(regex, self.stdout, 'perf', float)
        temp_values = sn.extractall(regex, self.stdout, 'temp', float)
        # Slowest GPU and hottest GPU define the reported figures.
        self.perf_patterns = {
            'perf': sn.min(perf_values),
            'temp': sn.max(temp_values),
        }
예제 #20
0
    def __init__(self, peerAccess):
        """Configure the p2p bandwidth test, optionally enabling peer access."""
        self.valid_systems = [
            'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu',
            'test:gpu'
        ]
        self.valid_prog_environs = ['gpu']

        # Single bandwidth test with a 1024 MB buffer, baked in at build time.
        copy_size = 1073741824

        self.build_system = 'Make'
        self.executable = './p2p_bandwidth.x'
        self.build_system.cxxflags = [f'-DCOPY={copy_size}']

        p2p = peerAccess == 'peerAccess'
        if p2p:
            self.build_system.cxxflags += ['-DP2P']

        # Minimum over the per-GPU bandwidth rows (group 1: last column).
        self.perf_patterns = {
            'bw': sn.min(sn.extractall(
                r'^[^,]*\[[^\]]*\]\s+GPU\s+\d+\s+(\s*\d+.\d+\s)+',
                self.stdout, 1, float))
        }

        # References differ depending on whether peer access is enabled.
        if p2p:
            self.reference = {
                'cannon:local-gpu': {
                    'bw': (28, -0.05, None, 'GB/s'),
                },
                'cannon:gpu_test': {
                    'bw': (9, -0.05, None, 'GB/s'),
                },
                '*': {
                    'bw': (172.5, None, None, 'GB/s'),
                },
            }
        else:
            self.reference = {
                'cannon:local-gpu': {
                    'bw': (35, -0.05, None, 'GB/s'),
                },
                'cannon:gpu_test': {
                    'bw': (11, -0.05, None, 'GB/s'),
                },
                '*': {
                    'bw': (79.6, None, None, 'GB/s'),
                },
            }
예제 #21
0
    def set_mpip_perf_patterns(self):
        '''More perf_patterns for the tool

    .. code-block::

      -----------------------------------
      @--- MPI Time (seconds) -----------
      -----------------------------------
      Task    AppTime    MPITime     MPI%
         0        8.6      0.121     1.40 <-- min
         1        8.6      0.157     1.82
         2        8.6       5.92    68.84 <-- max
         *       25.8        6.2    24.02 <---

      => NonMPI= AppTime - MPITime

    Typical performance reporting:

    .. code-block::

      * mpip_avg_app_time: 8.6 s  (= 25.8/3mpi)
      * mpip_avg_mpi_time: 2.07 s (=  6.2/3mpi)
      * %mpip_avg_mpi_time: 24.02 %
      * %mpip_avg_non_mpi_time: 75.98 %
        '''
        # Aggregate ('*') row of the MPI-time table: app time, MPI time, MPI%.
        regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$'
        app_t = sn.extractsingle(regex_star, self.rpt, 'appt', float)
        mpi_t = sn.extractsingle(regex_star, self.rpt, 'mpit', float)
        mpi_pct = sn.extractsingle(regex_star, self.rpt, 'pct', float)
        nonmpi_pct = sn.round(100 - mpi_pct, 2)
        # min/max
        # Per-rank rows: best/worst MPI% across all ranks.
        regex = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+'
                 r'(?P<pct>\S+)$')
        mpi_pct_max = sn.max(sn.extractall(regex, self.rpt, 'pct', float))
        mpi_pct_min = sn.min(sn.extractall(regex, self.rpt, 'pct', float))
        perf_pattern = {
            'mpip_avg_app_time': sn.round(app_t / self.num_tasks, 2),
            'mpip_avg_mpi_time': sn.round(mpi_t / self.num_tasks, 2),
            '%mpip_avg_mpi_time': mpi_pct,
            '%mpip_avg_mpi_time_max': mpi_pct_max,
            '%mpip_avg_mpi_time_min': mpi_pct_min,
            '%mpip_avg_non_mpi_time': nonmpi_pct,
        }
        # Merge with any pre-existing perf_patterns rather than replacing.
        if self.perf_patterns:
            self.perf_patterns = {**self.perf_patterns, **perf_pattern}
        else:
            self.perf_patterns = perf_pattern
예제 #22
0
    def __init__(self):
        # GPU shared-memory bandwidth check across CSCS systems.
        self.valid_systems = [
            'daint:gpu', 'dom:gpu', 'ault:amdv100', 'ault:intelv100',
            'ault:amda100', 'ault:amdvega'
        ]
        self.valid_prog_environs = ['PrgEnv-gnu']
        # num_tasks == 0: let the scheduler use all available nodes.
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.build_system = 'Make'
        self.executable = 'shmem.x'
        self.sanity_patterns = self.assert_count_gpus()
        # Performance figure: minimum per-GPU shared-memory bandwidth.
        self.perf_patterns = {
            'bandwidth':
            sn.min(
                sn.extractall(
                    r'^\s*\[[^\]]*\]\s*GPU\s*\d+: '
                    r'Bandwidth\(double\) (?P<bw>\S+) GB/s', self.stdout, 'bw',
                    float))
        }
        # Reference tuples: (value, lower threshold, upper threshold, unit).
        self.reference = {
            # theoretical limit for P100:
            # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
            'dom:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850 - 1, 'GB/s')
            },
            'daint:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850 - 1, 'GB/s')
            },
            'ault:amdv100': {
                'bandwidth': (13020, -0.01, None, 'GB/s')
            },
            'ault:intelv100': {
                'bandwidth': (13020, -0.01, None, 'GB/s')
            },
            'ault:amda100': {
                'bandwidth': (18139, -0.01, None, 'GB/s')
            },
            'ault:amdvega': {
                'bandwidth': (9060, -0.01, None, 'GB/s')
            }
        }

        self.maintainers = ['SK']
        self.tags = {'benchmark', 'diagnostic', 'craype'}
예제 #23
0
def pw_perf_patterns(obj):
    '''Reports hardware counter values from the tool

    .. code-block::

     collector                       time time (%)   PAPI_REF_CYC   PAPI_L2_DCM
     --------------------------------------------------------------------------
     computeMomentumAndEnergyIAD   0.6816   100.00     1770550470       2438527
                                                                        ^^^^^^^

    Returns a dict with the min/avg/max of the last hardware-counter
    column across all matching report lines.
    '''
    regex = r'^computeMomentumAndEnergyIAD\s+\S+\s+\S+\s+\S+\s+(?P<hwc>\d+)$'
    # Hoist the extraction: a single deferred expression backs all three
    # aggregates instead of re-parsing stderr once per statistic.
    hwc = sn.extractall(regex, obj.stderr, 'hwc', int)
    return {
        'papiwrap_hwc_min': sn.min(hwc),
        'papiwrap_hwc_avg': sn.round(sn.avg(hwc), 1),
        'papiwrap_hwc_max': sn.max(hwc),
    }
예제 #24
0
File: dgmemm.py  Project: samcom12/reframe
    def __init__(self):
        # GPU DGEMM check across CSCS systems and GPU types.
        self.valid_systems = ['daint:gpu', 'dom:gpu',
                              'ault:amdv100', 'ault:intelv100',
                              'ault:amda100', 'ault:amdvega']
        self.valid_prog_environs = ['PrgEnv-gnu']
        # num_tasks == 0: let the scheduler use all available nodes.
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.build_system = 'Make'
        self.executable = 'dgemm.x'
        self.sanity_patterns = self.assert_num_gpus()
        # Performance figure: the slowest GPU's TF/s value.
        self.perf_patterns = {
            'perf': sn.min(sn.extractall(
                r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s',
                self.stdout, 'fp', float))
        }
        # Reference tuples: (value, lower threshold, upper threshold, unit).
        self.reference = {
            'dom:gpu': {
                'perf': (3.35, -0.1, None, 'TF/s')
            },
            'daint:gpu': {
                'perf': (3.35, -0.1, None, 'TF/s')
            },
            'ault:amdv100': {
                'perf': (5.25, -0.1, None, 'TF/s')
            },
            'ault:intelv100': {
                'perf': (5.25, -0.1, None, 'TF/s')
            },
            'ault:amda100': {
                'perf': (10.5, -0.1, None, 'TF/s')
            },
            'ault:amdvega': {
                'perf': (3.45, -0.1, None, 'TF/s')
            }
        }

        self.maintainers = ['JO', 'SK']
        self.tags = {'benchmark'}
예제 #25
0
 def speedup(self):
     """Return the slowest-to-fastest ratio of the measured times (ns)."""
     regex = r'^\S+(f32|f64)\s+(\S+) ns\s+'
     times = sn.extractall(regex, self.stdout, 2, float)
     return sn.round(sn.max(times) / sn.min(times), 3)
예제 #26
0
def vtune_time(self):
    '''Vtune creates 1 report per compute node. For example, a 48 mpi tasks job
    (= 2 compute nodes when running with 24 c/cn) will create 2 directories:
    * rpt.nid00001/rpt.nid00001.vtune
    * rpt.nid00002/rpt.nid00002.vtune

    Typical output (for each compute node) is:

    .. code-block::

      Elapsed Time:	14.866s
          CPU Time:	319.177s            /24 = 13.3
              Effective Time:	308.218s    /24 = 12.8
                  Idle:	0s
                  Poor:	19.725s
                  Ok:	119.570s
                  Ideal:	168.922s
                  Over:	0s
              Spin Time:	10.959s             /24 =  0.4
                  MPI Busy Wait Time:	10.795s
                  Other:	0.164s
              Overhead Time:	0s
      Total Thread Count:	25
      Paused Time:	0s

    Returns a dict of rounded timing metrics; per-node CPU figures are
    normalized by the number of tasks on a node.
    '''
    result_d = {}
    # --- ranks per node
    # A job smaller than one full node has all its tasks on a single node.
    if self.num_tasks < self.num_tasks_per_node:
        vtune_tasks_per_node = self.num_tasks
    else:
        vtune_tasks_per_node = self.num_tasks_per_node
    # --- Elapsed Time (min, max)
    regex = r'.*Elapsed Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_min'] = sn.round(sn.min(result), 4)
    result_d['elapsed_max'] = sn.round(sn.max(result), 4)
    # --- CPU Time (max)
    regex = r'^\s+CPU Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Effective Time (max)
    regex = r'^\s+Effective Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_efft'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time (max)
    regex = r'^\s+Spin Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_spint'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time: MPI Busy Wait (max)
    # MPI wait time only makes sense for a multi-rank run.
    if self.num_tasks > 1:
        regex = r'\s+MPI Busy Wait Time: (?P<sec>\S+)s'
        result = sn.extractall(regex, self.stdout, 'sec', float)
        result_d['elapsed_cput_spint_mpit'] = sn.round(
            sn.max(result) / vtune_tasks_per_node, 4)
    else:
        result_d['elapsed_cput_spint_mpit'] = 0

# TODO:
# 'vtune_momentumAndEnergyIAD':
# sphsintel.vtune_momentumAndEnergyIAD(self),
# '%vtune_srcf_lookupTables': self.vtune_pct_lookupTables,
# '%vtune_srcf_Octree': self.vtune_pct_Octree,
# '%vtune_srcf_momentumAndEnergyIAD':
# self.vtune_pct_momentumAndEnergyIAD,
# '%vtune_srcf_IAD': self.vtune_pct_IAD,
    return result_d
예제 #27
0
    def patrun_imbalance(self):
        # {{{
        '''Load imbalance from csv report

        .. code-block::

          Table 1:  load Balance with MPI Message Stats

        '''
        # }}}
        rpt = os.path.join(self.stagedir, self.csv_rpt)
        if self.num_tasks == 1:
            regex_use = r'^(?P<pe>1),\S+,\s?(?P<samples>\S+),USER$'
            regex_mpi = r'^(?P<pe>1),\S+,\s?(?P<samples>\S+),MPI$'
            regex_etc = r'^(?P<pe>1),\S+,\s?(?P<samples>\S+),ETC$'
        else:
            regex_use = r'^2,\S+,\s?(?P<samples>\S+),USER/pe.(?P<pe>\d+)$'
            regex_mpi = r'^2,\S+,\s?(?P<samples>\S+),MPI/pe.(?P<pe>\d+)$'
            regex_etc = r'^2,\S+,\s?(?P<samples>\S+),ETC/pe.(?P<pe>\d+)$'

        res_user_sm_l = sn.extractall(regex_use, rpt, 'samples', float)
        res_user_pe_l = sn.extractall(regex_use, rpt, 'pe', int)
        # MPI:
        res_mpi_sm_l = sn.extractall(regex_mpi, rpt, 'samples', float)
        res_mpi_pe_l = sn.extractall(regex_mpi, rpt, 'pe', int)
        if not sn.evaluate(res_mpi_sm_l):
            res_mpi_sm_l = [0 for i in sn.evaluate(res_user_sm_l)]
            res_mpi_pe_l = [i for i in sn.evaluate(res_user_pe_l)]
        # ETC:
        res_etc_sm_l = sn.extractall(regex_etc, rpt, 'samples', float)
        res_etc_pe_l = sn.extractall(regex_etc, rpt, 'pe', int)
        # DICT from LISTs: dict(zip(pe,usr))
        # TOTAL = USER+MPI+ETC
        res_total_sm_l = []
        # WARNING: this fails if data is not sorted by pe, use pat_report with:
        # -s sort_by_pe='yes' !!!
        res_total_sm_l = [
            sum(sam) for sam in zip(res_user_sm_l, res_mpi_sm_l, res_etc_sm_l)
        ]
        # USER pes
        # {{{ slowest pe (USER)
        # slowest = max(max(res_user_sm_l),
        #               max(res_mpi_sm_l),
        #               max(res_etc_sm_l))
        slowest = max(res_user_sm_l)
        user_slowest_pe = -1
        index = -1
        if slowest in res_user_sm_l:
            for sam in res_user_sm_l:
                index += 1
                if sam == slowest:
                    user_slowest_pe = index

        if user_slowest_pe == -1:
            user_slowest_pe = 0
        # }}}
        # {{{ fastest pe (USER)
        fastest = min(res_user_sm_l)
        user_fastest_pe = -1
        index = -1
        for sam in res_user_sm_l:
            index += 1
            if sam == fastest:
                user_fastest_pe = index

        if user_fastest_pe == -1:
            user_fastest_pe = 0
        # }}}

        # MPI pes
        # {{{ slowest pe (MPI)
        slowest = max(res_mpi_sm_l)
        #         try:
        #             slowest = max(res_mpi_sm_l)
        #         except ValueError:
        #             slowest = 0

        mpi_slowest_pe = -1
        index = -1
        if slowest in res_mpi_sm_l:
            for sam in res_mpi_sm_l:
                index += 1
                if sam == slowest:
                    mpi_slowest_pe = index

        if mpi_slowest_pe == -1:
            mpi_slowest_pe = 0
        # }}}
        # {{{ fastest pe (MPI)
        fastest = min(res_mpi_sm_l)
        #         try:
        #             fastest = min(res_mpi_sm_l)
        #         except ValueError:
        #             fastest = 0

        mpi_fastest_pe = -1
        index = -1
        for sam in res_mpi_sm_l:
            index += 1
            if sam == fastest:
                mpi_fastest_pe = index

        if mpi_fastest_pe == -1:
            mpi_fastest_pe = 0
        # }}}

        # ETC pes
        # {{{ slowest pe (ETC)
        slowest = max(res_etc_sm_l)
        etc_slowest_pe = -1
        index = -1
        if slowest in res_etc_sm_l:
            for sam in res_etc_sm_l:
                index += 1
                if sam == slowest:
                    etc_slowest_pe = index

        if etc_slowest_pe == -1:
            etc_slowest_pe = 0
        # }}}
        # {{{ fastest pe (ETC)
        fastest = min(res_etc_sm_l)
        etc_fastest_pe = -1
        index = -1
        for sam in res_etc_sm_l:
            index += 1
            if sam == fastest:
                etc_fastest_pe = index

        if etc_fastest_pe == -1:
            etc_fastest_pe = 0
        # }}}

        # TOTAL pes
        # {{{ slowest pe (TOTAL)
        slowest = max(res_total_sm_l)
        #         try:
        #             slowest = max(res_total_sm_l)
        #         except ValueError:
        #             slowest = 0

        total_slowest_pe = -1
        index = -1
        if slowest in res_total_sm_l:
            for sam in res_total_sm_l:
                index += 1
                if sam == slowest:
                    total_slowest_pe = index

        if total_slowest_pe == -1:
            total_slowest_pe = 0
        # }}}
        # {{{ fastest pe (TOTAL)
        fastest = min(res_total_sm_l)
        #         try:
        #             fastest = min(res_total_sm_l)
        #         except ValueError:
        #             fastest = 0

        total_fastest_pe = -1
        index = -1
        for sam in res_total_sm_l:
            index += 1
            if sam == fastest:
                total_fastest_pe = index

        if total_fastest_pe == -1:
            total_fastest_pe = 0
        # }}}

        # {{{ res dict
        res = {}
        # min/(mean=average)/median/max
        res['user_samples_min'] = sn.round(sn.min(res_user_sm_l), 0)
        res['mpi_samples_min'] = sn.round(sn.min(res_mpi_sm_l), 0)
        res['etc_samples_min'] = sn.round(sn.min(res_etc_sm_l), 0)
        res['total_samples_min'] = sn.round(sn.min(res_total_sm_l), 0)
        #
        res['user_samples_mean'] = sn.round(sn.avg(res_user_sm_l), 1)
        res['mpi_samples_mean'] = sn.round(sn.avg(res_mpi_sm_l), 1)
        res['etc_samples_mean'] = sn.round(sn.avg(res_etc_sm_l), 1)
        res['total_samples_mean'] = sn.round(sn.avg(res_total_sm_l), 1)
        #
        res['user_samples_median'] = \
            sn.sanity_function(np.median)(res_user_sm_l)
        res['mpi_samples_median'] = sn.sanity_function(np.median)(res_mpi_sm_l)
        res['etc_samples_median'] = sn.sanity_function(np.median)(res_etc_sm_l)
        res['total_samples_median'] = \
            sn.sanity_function(np.median)(res_total_sm_l)
        #
        res['user_samples_max'] = sn.round(sn.max(res_user_sm_l), 0)
        res['mpi_samples_max'] = sn.round(sn.max(res_mpi_sm_l), 0)
        res['etc_samples_max'] = sn.round(sn.max(res_etc_sm_l), 0)
        res['total_samples_max'] = sn.round(sn.max(res_total_sm_l), 0)
        #
        res['%user_samples'] = sn.round(
            100 * res['user_samples_mean'] / res['total_samples_mean'], 1)
        res['%mpi_samples'] = sn.round(
            100 * res['mpi_samples_mean'] / res['total_samples_mean'], 1)
        res['%etc_samples'] = sn.round(
            100 * res['etc_samples_mean'] / res['total_samples_mean'], 1)
        # slowest pes
        res['user_slowest_pe'] = user_slowest_pe
        res['mpi_slowest_pe'] = mpi_slowest_pe
        res['etc_slowest_pe'] = etc_slowest_pe
        res['total_slowest_pe'] = total_slowest_pe
        # --- debug with:
        # print("> res_user_sm_l", sn.evaluate(res_user_sm_l))
        # print("> res_user_pe_l", sn.evaluate(res_user_pe_l))
        # print("> res_mpi_sm_l", sn.evaluate(res_mpi_sm_l))
        # print("> res_mpi_pe_l", sn.evaluate(res_mpi_pe_l))
        # print("> res_etc_sm_l", sn.evaluate(res_etc_sm_l))
        # print("> res_etc_pe_l", sn.evaluate(res_etc_pe_l))
        try:
            res['%user_slowest'] = \
                sn.round(100 * res_user_sm_l[user_slowest_pe] /
                         res_total_sm_l[user_slowest_pe], 1)
        except ValueError:
            res['%user_slowest'] = 0

        try:
            res['%mpi_slowest'] = \
                sn.round(100 * res_mpi_sm_l[user_slowest_pe] /
                         res_total_sm_l[user_slowest_pe], 1)
        except ValueError:
            res['%mpi_slowest'] = 0

        try:
            res['%etc_slowest'] = \
                sn.round(100 * res_etc_sm_l[user_slowest_pe] /
                         res_total_sm_l[user_slowest_pe], 1)
        except ValueError:
            res['%etc_slowest'] = 0

        # fastest pes
        res['user_fastest_pe'] = user_fastest_pe
        res['mpi_fastest_pe'] = mpi_fastest_pe
        res['etc_fastest_pe'] = etc_fastest_pe
        res['total_fastest_pe'] = total_fastest_pe
        try:
            res['%user_fastest'] = \
                sn.round(100 * res_user_sm_l[user_fastest_pe] /
                         res_total_sm_l[user_fastest_pe], 1)
        except ValueError:
            res['%user_fastest'] = 0

        try:
            res['%mpi_fastest'] = \
                sn.round(100 * res_mpi_sm_l[user_fastest_pe] /
                         res_total_sm_l[user_fastest_pe], 1)
        except ValueError:
            res['%mpi_fastest'] = 0

        try:
            res['%etc_fastest'] = \
                sn.round(100 * res_etc_sm_l[user_fastest_pe] /
                         res_total_sm_l[user_fastest_pe], 1)
        except ValueError:
            res['%etc_fastest'] = 0
        # }}}
        self.patrun_stats_d = res
예제 #28
0
    def __init__(self, peerAccess):
        '''Set up the GPU peer-to-peer bandwidth test.

        :param peerAccess: pass the literal string ``'peerAccess'`` to
            build the benchmark with direct GPU peer access enabled
            (``-DP2P``); any other value builds it without peer access.
            The reference bandwidths differ between the two modes.
        '''
        # NOTE(review): was 'arola:cn', which could never match the
        # 'arolla' system-name check below nor the sibling
        # memory-bandwidth test's spelling; normalized to 'arolla:cn'.
        self.valid_systems = [
            'tsa:cn', 'arolla:cn', 'ault:amdv100', 'ault:intelv100',
            'ault:amda100', 'ault:amdvega'
        ]
        self.valid_prog_environs = ['PrgEnv-gnu']
        if self.current_system.name in ['arolla', 'tsa']:
            # These systems only provide the no-MPI flavour of gnu
            self.valid_prog_environs = ['PrgEnv-gnu-nompi']

        # Perform a single bandwidth test with a buffer size of 1024MB
        copy_size = 1073741824

        self.build_system = 'Make'
        self.executable = 'p2p_bandwidth.x'
        self.build_system.cxxflags = [f'-DCOPY={copy_size}']
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.exclusive_access = True

        p2p = peerAccess == 'peerAccess'
        if p2p:
            self.build_system.cxxflags += ['-DP2P']

        self.sanity_patterns = self.do_sanity_check()
        # Worst (minimum) bandwidth over all GPU pairs.  The decimal
        # point is escaped: the original bare '.' matched any character.
        self.perf_patterns = {
            'bw':
            sn.min(
                sn.extractall(
                    r'^[^,]*\[[^\]]*\]\s+GPU\s+\d+\s+(\s*\d+\.\d+\s)+',
                    self.stdout, 1, float))
        }

        if p2p:
            self.reference = {
                'tsa:cn': {
                    'bw': (172.5, -0.05, None, 'GB/s'),
                },
                'arolla:cn': {
                    'bw': (172.5, -0.05, None, 'GB/s'),
                },
                'ault:amda100': {
                    'bw': (282.07, -0.1, None, 'GB/s'),
                },
                'ault:amdv100': {
                    'bw': (5.7, -0.1, None, 'GB/s'),
                },
                'ault:intelv100': {
                    'bw': (31.0, -0.1, None, 'GB/s'),
                },
                'ault:amdvega': {
                    'bw': (11.75, -0.1, None, 'GB/s'),
                },
            }
        else:
            self.reference = {
                'tsa:cn': {
                    'bw': (79.6, -0.05, None, 'GB/s'),
                },
                'arolla:cn': {
                    'bw': (79.6, -0.05, None, 'GB/s'),
                },
                'ault:amda100': {
                    'bw': (54.13, -0.1, None, 'GB/s'),
                },
                'ault:amdv100': {
                    'bw': (7.5, -0.1, None, 'GB/s'),
                },
                'ault:intelv100': {
                    'bw': (33.6, -0.1, None, 'GB/s'),
                },
                'ault:amdvega': {
                    'bw': (11.75, -0.1, None, 'GB/s'),
                },
            }

        self.tags = {'diagnostic', 'benchmark', 'mch'}
        self.maintainers = ['JO']
예제 #29
0
    def __init__(self):
        '''Configure the CUDA host/device memory bandwidth check.'''
        self.valid_systems = [
            'daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn', 'ault:amdv100',
            'ault:intelv100', 'ault:amda100', 'ault:amdvega'
        ]
        # arolla/tsa only provide the no-MPI flavour of the gnu environment
        if self.current_system.name in ['arolla', 'tsa']:
            self.valid_prog_environs = ['PrgEnv-gnu-nompi']
        else:
            self.valid_prog_environs = ['PrgEnv-gnu']

        # Single bandwidth measurement with a 1 GiB (1024 MB) buffer
        self.copy_size = 1073741824

        self.build_system = 'Make'
        self.build_system.cxxflags = [f'-DCOPY={self.copy_size}']
        self.executable = 'memory_bandwidth.x'
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.exclusive_access = True

        # perf_patterns and reference are completed by the sanity check
        self.sanity_patterns = self.do_sanity_check()

        # Worst (minimum) bandwidth for each transfer direction:
        # host->device, device->host and device->device.
        self.perf_patterns = {
            xfer: sn.min(sn.extractall(self._xfer_pattern(xfer),
                                       self.stdout, 1, float))
            for xfer in ('h2d', 'd2h', 'd2d')
        }

        # Per-system (h2d, d2h, d2d) reference bandwidths in MB/s,
        # each with a 10% lower tolerance and no upper bound.
        bw_refs = {
            'daint:gpu': (11881, 12571, 499000),
            'dom:gpu': (11881, 12571, 499000),
            'tsa:cn': (13000, 12416, 777000),
            'ault:amda100': (25500, 26170, 1322500),
            'ault:amdv100': (13189, 13141, 777788),
            'ault:intelv100': (13183, 13411, 778200),
            'ault:amdvega': (14000, 14000, 575700),
        }
        self.reference = {
            sys_part: {
                xfer: (ref_bw, -0.1, None, 'MB/s')
                for xfer, ref_bw in zip(('h2d', 'd2h', 'd2d'), refs)
            }
            for sys_part, refs in bw_refs.items()
        }

        self.tags = {
            'diagnostic', 'benchmark', 'mch', 'craype', 'external-resources'
        }
        self.maintainers = ['AJ', 'SK']
예제 #30
0
 def min_perf(self, nid=None):
     '''Return the lowest performance value recorded.

     :param nid: optional node id restricting which node's
         measurements are considered (``None`` means all nodes).
     '''
     perf_values = self._extract_perf_metric('perf', nid)
     return sn.min(perf_values)