예제 #1
0
    def test_max(self):
        """sn.max over a deferrable must track later mutations of the list."""
        values = [1, 2]
        deferred = make_deferrable(values)
        self.assertEqual(2, sn.max(deferred))

        # The deferred expression re-evaluates against the live list, so an
        # append changes the result of the next evaluation.
        values.append(3)
        self.assertEqual(3, sn.max(deferred))
예제 #2
0
    def test_max(self):
        """sn.max over a deferred list must see subsequent list updates."""
        values = [1, 2]
        deferred = sn.defer(values)
        self.assertEqual(2, sn.max(deferred))

        # Deferred expressions are lazily evaluated: mutating the backing
        # list is visible on the next evaluation.
        values.append(3)
        self.assertEqual(3, sn.max(deferred))
예제 #3
0
 def __init__(self):
     """Configure the Cray LibSci-ACC dgemm example check."""
     super().__init__()
     self.descr = 'Test Cray LibSci on the GPU (dgemm with libsci alloc)'
     self.build_system = 'SingleSource'
     # The source file ships with the cray-libsci_acc installation.
     self.sourcesdir = None
     self.sourcepath = ('$CRAY_LIBSCI_ACC_DIR/examples/examples/c_simple/'
                        'dgemm_simple.c')
     self.sanity_patterns = sn.assert_found(r'(4096\s+){3}', self.stdout)
     flops_regex = r'(\s+\d+){3}\s+(?P<gpu_flops>\S+)\s+(?P<cpu_flops>\S+)\s+'
     self.perf_patterns = {
         'dgemm_gpu': sn.max(
             sn.extractall(flops_regex, self.stdout, 'gpu_flops', float)
         ),
         'dgemm_cpu': sn.max(
             sn.extractall(flops_regex, self.stdout, 'cpu_flops', float)
         ),
     }
     # daint and dom share the same performance expectations.
     self.reference = {
         'daint:gpu': {'dgemm_gpu': (2264.0, -0.05, None, 'GFLop/s'),
                       'dgemm_cpu': (45.0, -0.05, None, 'GFLop/s')},
         'dom:gpu': {'dgemm_gpu': (2264.0, -0.05, None, 'GFLop/s'),
                     'dgemm_cpu': (45.0, -0.05, None, 'GFLop/s')},
     }
예제 #4
0
def test_max():
    """sn.max over a deferred list must track later list mutations."""
    data = [1, 2]
    deferred = sn.defer(data)
    assert 2 == sn.max(deferred)

    # The deferred expression is evaluated lazily against the live list.
    data.append(3)
    assert 3 == sn.max(deferred)
예제 #5
0
 def set_perf_patterns(self):
     '''Set performance patterns.

     Reports the worst (largest) kernel launch latency in us found across
     all GPUs in the job output.
     '''
     latency_regex = (r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
                      r'(?P<latency>\S+) us')
     latencies = sn.extractall(latency_regex, self.stdout, 'latency', float)
     self.perf_patterns = {'latency': sn.max(latencies)}
예제 #6
0
 def __init__(self):
     """Configure the MPI memory-eater check."""
     super().__init__()
     self.maintainers = ['JG']
     self.valid_systems += ['eiger:mc', 'pilatus:mc']
     self.time_limit = '5m'
     self.sourcepath = 'eatmemory_mpi.c'
     self.tags.add('mem')
     self.executable_opts = ['100%']
     # Success means the job was OOM-killed after exhausting node memory.
     self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)',
                                            self.stderr)
     # {{{ perf
     using_regex = (
         r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/'
         r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:'
         r' (\d+) GB'
     )
     self.perf_patterns = {
         'max_cn_memory': sn.getattr(self, 'reference_meminfo'),
         'max_allocated_memory': sn.max(
             sn.extractall(using_regex, self.stdout, 1, int)
         ),
     }
     no_limit = (0, None, None, 'GB')
     self.reference = {
         '*': {
             'max_cn_memory': no_limit,
             'max_allocated_memory': (
                 sn.getattr(self, 'reference_meminfo'), -0.05, None, 'GB'
             ),
         }
     }
예제 #7
0
 def __init__(self, **kwargs):
     """Set up the Monch acceptance OpenBLAS performance check."""
     super().__init__('Monch', **kwargs)
     self.tags = {'monch_acceptance'}
     self.valid_systems = ['monch:compute']
     self.valid_prog_environs = ['PrgEnv-gnu']
     # One rank driving 20 OpenMP threads, SMT disabled.
     self.num_tasks = 1
     self.num_tasks_per_node = 1
     self.num_tasks_per_core = 1
     self.num_cpus_per_task = 20
     self.num_tasks_per_socket = 10
     self.use_multithreading = False
     self.cflags = '-O3 -I$EBROOTOPENBLAS/include'
     self.ldflags = '-L$EBROOTOPENBLAS/lib -lopenblas -lpthread -lgfortran'
     self.variables = {
         'OMP_NUM_THREADS': str(self.num_cpus_per_task),
         'MV2_ENABLE_AFFINITY': '0',
     }
     # Best Gflop/s value observed over the repeated runs.
     gflops = sn.extractall(r'Run\s\d\s+:\s+(?P<gflops>\S+)\s\S+',
                            self.stdout, 'gflops', float)
     self.perf_patterns = {'perf': sn.max(gflops)}
     self.reference = {'monch:compute': {'perf': (350, -0.1, None)}}
예제 #8
0
 def __init__(self):
     """GPU burn test: report the slowest GPU and the hottest GPU."""
     self.valid_systems = ['cannon:local-gpu', 'cannon:gpu_test',
                           'fasse:fasse_gpu', 'test:gpu']
     self.descr = 'GPU burn test'
     self.valid_prog_environs = ['gpu']
     # '-d': double precision; '40': burn duration -- presumably seconds,
     # confirm against the gpu_burn usage message.
     self.executable_opts = ['-d', '40']
     self.build_system = 'Make'
     self.build_system.makefile = 'makefile.cuda'
     self.executable = './gpu_burn.x'
     # One line per GPU: '[<host>] GPU <id>(<name>): <perf> GF/s <temp> Celsius'
     patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
             r'\s+(?P<temp>\S*)\s+Celsius')
     self.perf_patterns = {
         'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
         'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
     }
     # BUG FIX: the original dict literal repeated the '*' key; the second
     # entry silently overwrote the first, dropping the fallback 'perf'
     # reference (e.g. for fasse:fasse_gpu).  Both metrics now live under a
     # single '*' entry.
     self.reference = {
         'cannon:local-gpu': {
             'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
         },
         'cannon:gpu_test': {
             'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
         },
         'test:gpu': {
             'perf': (4115, None, None, 'Gflop/s per gpu'),
         },
         '*': {
             'perf': (4115, None, None, 'Gflop/s per gpu'),
             'temp': (0, None, None, 'degC'),
         },
     }
예제 #9
0
    def __init__(self, linkage, **kwargs):
        """ScaLAPACK performance check for the Monch acceptance."""
        super().__init__('scalapack_performance_compile_run_', linkage,
                         **kwargs)

        # FIXME: currently this test case is only aimed at the monch
        # acceptance, yet it could be interesting to extend it to other
        # systems.  NB: the test case is very small, but larger cases did
        # not succeed!
        self.tags |= {'monch_acceptance'}
        self.sourcepath = 'scalapack_performance_compile_run.f'
        self.valid_systems = ['monch:compute']
        self.valid_prog_environs = ['PrgEnv-gnu']
        # 64 ranks spread over 4 nodes.
        self.num_tasks = 64
        self.num_tasks_per_node = 16

        self.sanity_patterns = sn.assert_found(r'Run', self.stdout)
        gflops = sn.extractall(r'GFLOPS/s:\s+(?P<gflops>\S+)', self.stdout,
                               'gflops', float)
        self.perf_patterns = {'perf': sn.max(gflops)}
        self.reference = {'monch:compute': {'perf': (24., -0.1, None)}}
예제 #10
0
def mpip_perf_patterns(obj, reg):
    '''More perf_patterns for the tool

    .. code-block::

      -----------------------------------
      @--- MPI Time (seconds) -----------
      -----------------------------------
      Task    AppTime    MPITime     MPI%
         0        8.6      0.121     1.40 <-- min
         1        8.6      0.157     1.82
         2        8.6       5.92    68.84 <-- max
         *       25.8        6.2    24.02 <---

      => NonMPI= AppTime - MPITime

    Typical performance reporting:

    .. code-block::

      * mpip_avg_app_time: 8.6 s  (= 25.8/3mpi)
      * mpip_avg_mpi_time: 2.07 s (=  6.2/3mpi)
      * %mpip_avg_mpi_time: 24.02 %
      * %max/%min
      * %mpip_avg_non_mpi_time: 75.98 %

    :param obj: test object whose stdout announces the mpiP report path
    :param reg: integer selector (1-6) choosing the metric to build
    :returns: a deferred expression computing the selected metric
    :raises ValueError: if ``reg`` is outside 1..6
    '''
    # The path of the mpiP text report is itself extracted (lazily) from
    # the job's stdout.
    # rpt = os.path.join(obj.stagedir, obj.rpt_file_txt)
    rpt = sn.extractsingle(r'^mpiP: Storing mpiP output in \[(?P<rpt>.*)\]',
                           obj.stdout, 'rpt', str)
    # Matches only the '*' (aggregate) row of the MPI Time table.
    regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$'
    # NOTE(review): this pattern also matches the '*' aggregate row
    # (mpirk = '*'), so the min/max below include the aggregate
    # percentage -- confirm that per-rank extremes always dominate it.
    regex_minmax = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+'
                    r'(?P<pct>\S+)$')
    if reg == 1:
        # mpip_avg_mpi_time: aggregate MPITime divided by the rank count
        result = sn.round(
            sn.extractsingle(regex_star, rpt, 'mpit', float) / obj.num_tasks,
            2)
    elif reg == 2:
        # mpip_avg_app_time: aggregate AppTime divided by the rank count
        result = sn.round(
            sn.extractsingle(regex_star, rpt, 'appt', float) / obj.num_tasks,
            2)
    elif reg == 3:
        # %mpip_avg_mpi_time: aggregate MPI%
        result = sn.extractsingle(regex_star, rpt, 'pct', float)
    elif reg == 4:
        # %nonmpi: 100 - aggregate MPI%
        mpi_pct = sn.extractsingle(regex_star, rpt, 'pct', float)
        result = sn.round(100 - mpi_pct, 2)
    elif reg == 5:
        # %mpip_avg_mpi_time_max: largest MPI% across the matched rows
        result = sn.max(sn.extractall(regex_minmax, rpt, 'pct', float))
    elif reg == 6:
        # %mpip_avg_mpi_time_min: smallest MPI% across the matched rows
        result = sn.min(sn.extractall(regex_minmax, rpt, 'pct', float))
    else:
        raise ValueError('unknown region id in mpip_perf_patterns')

    return result
예제 #11
0
 def set_performance_patterns(self):
     """Report the slowest per-device average pointer-chase latency."""
     chase_regex = (r'^\s*\[[^\]]*\]\s* On device \d+, '
                    r'the chase took on average (\d+) '
                    r'cycles per node jump.')
     cycle_counts = sn.extractall(chase_regex, self.stdout, 1, int)
     self.perf_patterns = {'average_latency': sn.max(cycle_counts)}
예제 #12
0
def ru_maxrss_rk0(obj):
    '''Reports the ``maximum resident set size`` (METRIC 0 rows of the
    OTF2 report).
    '''
    rss_regex = (r'^METRIC\s+0\s+.*ru_maxrss\" <2>; UINT64; '
                 r'(?P<rss>\d+)\)')
    rss_values = sn.extractall(rss_regex, obj.rpt_otf2, 'rss', int)
    return sn.max(rss_values)
def stress_diff(ostream, ostream_ref):
    ''' Return the difference between obtained and reference stress tensor components'''

    # NOTE(review): only components [0..1][0..1] are compared (range(2));
    # confirm whether the full tensor should be included.
    stress = get_stress(ostream)
    stress_ref = get_stress(ostream_ref)
    # Largest absolute element-wise deviation over the compared components.
    return sn.max(
        sn.abs(stress_ref[i][j] - stress[i][j]) for i in range(2)
        for j in range(2))
예제 #14
0
    def average_D2D_latency(self):
        '''Extract the average D2D latency.

        The pChase code returns a table with the cumulative latency for all
        D2D list traversals, and the last column of this table has the max
        values for each device.
        '''
        # NOTE(review): the '.' after '\d+' is unescaped, so it matches any
        # character (e.g. the last digit of the number rather than a literal
        # dot); confirm against actual pChase output before tightening.
        return sn.max(
            sn.extractall(r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+',
                          self.stdout, 1, int))
예제 #15
0
    def __init__(self):
        # GPU burn stress test: runs on every GPU of every node and reports
        # the worst sustained performance and the hottest temperature seen.
        self.valid_systems = [
            'daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn', 'ault:amdv100',
            'ault:intelv100', 'ault:amda100', 'ault:amdvega'
        ]
        self.descr = 'GPU burn test'
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.exclusive_access = True
        # '-d': double precision; '40': burn duration -- presumably seconds,
        # confirm against the gpu_burn usage message.
        self.executable_opts = ['-d', '40']
        self.build_system = 'Make'
        self.executable = './gpu_burn.x'
        # num_tasks == 0: let ReFrame span all nodes of the partition with
        # one task per node.
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.sanity_patterns = self.assert_num_tasks()
        # One line per GPU:
        # '[<host>] GPU <id>(<name>): <perf> GF/s <temp> Celsius'
        patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
                r'\s+(?P<temp>\S*)\s+Celsius')
        self.perf_patterns = {
            # Worst-performing GPU and hottest GPU across all nodes.
            'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
            'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
        }

        self.reference = {
            'dom:gpu': {
                'perf': (4115, -0.10, None, 'Gflop/s'),
            },
            'daint:gpu': {
                'perf': (4115, -0.10, None, 'Gflop/s'),
            },
            'arolla:cn': {
                'perf': (5861, -0.10, None, 'Gflop/s'),
            },
            'tsa:cn': {
                'perf': (5861, -0.10, None, 'Gflop/s'),
            },
            'ault:amda100': {
                'perf': (15000, -0.10, None, 'Gflop/s'),
            },
            'ault:amdv100': {
                'perf': (5500, -0.10, None, 'Gflop/s'),
            },
            'ault:intelv100': {
                'perf': (5500, -0.10, None, 'Gflop/s'),
            },
            'ault:amdvega': {
                'perf': (3450, -0.10, None, 'Gflop/s'),
            },
            # Temperature has no pass/fail bound; it is recorded only.
            '*': {
                'temp': (0, None, None, 'degC')
            }
        }

        self.maintainers = ['AJ', 'TM']
        self.tags = {'diagnostic', 'benchmark', 'craype'}
예제 #16
0
 def __init__(self):
     """Run the pointer chase on both single- and multi-device systems."""
     super().__init__()
     self.valid_systems = (self.single_device_systems
                           + self.multi_device_systems)
     chase_regex = (r'^\s*\[[^\]]*\]\s* On device \d+, '
                    r'the chase took on average (\d+) '
                    r'cycles per node jump.')
     cycle_counts = sn.extractall(chase_regex, self.stdout, 1, int)
     self.perf_patterns = {'average_latency': sn.max(cycle_counts)}
예제 #17
0
    def set_perf_patterns(self):
        '''Extract the minimum performance and maximum temperature recorded.

        The performance and temperature data are reported in Gflops/s and
        deg. Celsius respectively.
        '''
        burn_regex = (
            r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
            r'\s+(?P<temp>\S*)\s+Celsius'
        )
        perf_values = sn.extractall(burn_regex, self.stdout, 'perf', float)
        temp_values = sn.extractall(burn_regex, self.stdout, 'temp', float)
        self.perf_patterns = {
            'perf': sn.min(perf_values),
            'temp': sn.max(temp_values),
        }
예제 #18
0
def vtune_momentumAndEnergyIAD(self):
    '''Debug helper: prints the max 'CPU Time' and the max per-function time
    of computeMomentumAndEnergyIADImpl found in the Vtune report.  Sample
    report lines:

    sphexa::sph::computeMomentumAndEnergyIADImpl<...>  sqpatch.exe   40.919s
    sphexa::sph::computeMomentumAndEnergyIADImpl<...>  sqpatch.exe   38.994s
    sphexa::sph::computeMomentumAndEnergyIADImpl<...>  sqpatch.exe   40.245s
    sphexa::sph::computeMomentumAndEnergyIADImpl<...>  sqpatch.exe   39.487s
    '''
    # ^[sphexa::|MPI|[Others].*\s+(?P<sec>\S+)s$'
    regex1 = r'^\s+CPU Time: (?P<sec>\S+)s'
    result1 = sn.max(sn.extractall(regex1, self.stdout, 'sec', float))
    regex2 = r'^sphexa::sph::computeMomentumAndEnergyIADImpl.*\s+(?P<x>\S+)s$'
    result2 = sn.max(sn.extractall(regex2, self.stdout, 'x', float))
    # Debug output only; the /24 divisor presumably reflects 24 ranks per
    # node -- TODO confirm and derive from the test configuration instead.
    print("vtune_cput=", result1)
    print("vtune_energ=", result2)
    print("vtune_cput/24=", result1 / 24)
    print("vtune_energ/24=", result2 / 24)
    # print("t=", result1/result2)
    # print("c=", self.num_tasks)
    # print("t=", (result1/result2) / self.num_tasks)
    # t= 5.208910219363269 / 24 = 0.2170379258068029
    # vtune_momentumAndEnergyIAD: 5.2089 %
    # NOTE(review): always returns 0 -- this looks like a placeholder while
    # the computation above is printed for inspection.
    return 0
예제 #19
0
    def set_mpip_perf_patterns(self):
        '''More perf_patterns for the tool

    .. code-block::

      -----------------------------------
      @--- MPI Time (seconds) -----------
      -----------------------------------
      Task    AppTime    MPITime     MPI%
         0        8.6      0.121     1.40 <-- min
         1        8.6      0.157     1.82
         2        8.6       5.92    68.84 <-- max
         *       25.8        6.2    24.02 <---

      => NonMPI= AppTime - MPITime

    Typical performance reporting:

    .. code-block::

      * mpip_avg_app_time: 8.6 s  (= 25.8/3mpi)
      * mpip_avg_mpi_time: 2.07 s (=  6.2/3mpi)
      * %mpip_avg_mpi_time: 24.02 %
      * %mpip_avg_non_mpi_time: 75.98 %
        '''
        # '*' aggregate row of the MPI Time table.
        regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$'
        app_t = sn.extractsingle(regex_star, self.rpt, 'appt', float)
        mpi_t = sn.extractsingle(regex_star, self.rpt, 'mpit', float)
        mpi_pct = sn.extractsingle(regex_star, self.rpt, 'pct', float)
        nonmpi_pct = sn.round(100 - mpi_pct, 2)
        # min/max
        # NOTE(review): this pattern also matches the '*' aggregate row, so
        # the min/max include the aggregate percentage -- confirm that
        # per-rank extremes always dominate it.
        regex = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+'
                 r'(?P<pct>\S+)$')
        mpi_pct_max = sn.max(sn.extractall(regex, self.rpt, 'pct', float))
        mpi_pct_min = sn.min(sn.extractall(regex, self.rpt, 'pct', float))
        perf_pattern = {
            'mpip_avg_app_time': sn.round(app_t / self.num_tasks, 2),
            'mpip_avg_mpi_time': sn.round(mpi_t / self.num_tasks, 2),
            '%mpip_avg_mpi_time': mpi_pct,
            '%mpip_avg_mpi_time_max': mpi_pct_max,
            '%mpip_avg_mpi_time_min': mpi_pct_min,
            '%mpip_avg_non_mpi_time': nonmpi_pct,
        }
        # Merge with any patterns set earlier instead of clobbering them.
        if self.perf_patterns:
            self.perf_patterns = {**self.perf_patterns, **perf_pattern}
        else:
            self.perf_patterns = perf_pattern
예제 #20
0
    def __init__(self, kernel_version):
        # Kernel-launch latency microbenchmark.  ``kernel_version`` selects
        # at compile time whether the launch is synchronized before timing
        # ('sync') or not (anything else builds the async variant; the
        # reference lookup below expects 'sync' or 'async').
        self.valid_systems = [
            'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu',
            'test:gpu'
        ]
        self.valid_prog_environs = ['gpu']

        self.build_system = 'Make'
        self.executable = './kernel_latency.x'
        if kernel_version == 'sync':
            self.build_system.cppflags = ['-D SYNCKERNEL=1']
        else:
            self.build_system.cppflags = ['-D SYNCKERNEL=0']

        # Report the worst (largest) launch latency across all GPUs.
        self.perf_patterns = {
            'latency':
            sn.max(
                sn.extractall(
                    r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
                    r'(?P<latency>\S+) us', self.stdout, 'latency', float))
        }
        # Per-kernel-version references; '*' is the fallback for partitions
        # without an explicit entry.
        self.sys_reference = {
            'sync': {
                'cannon:local-gpu': {
                    'latency': (6.0, None, 0.10, 'us')
                },
                'cannon:gpu_test': {
                    'latency': (4.0, None, 0.10, 'us')
                },
                '*': {
                    'latency': (15.1, None, None, 'us')
                },
            },
            'async': {
                'cannon:local-gpu': {
                    'latency': (6.0, None, 0.10, 'us')
                },
                'cannon:gpu_test': {
                    'latency': (4.0, None, 0.10, 'us')
                },
                '*': {
                    'latency': (2.2, None, None, 'us')
                },
            },
        }
        self.reference = self.sys_reference[kernel_version]
def forces_diff(ostream, ostream_ref):
    ''' Return the difference between obtained and reference atomic forces'''

    forces = get_forces(ostream)
    forces_ref = get_forces(ostream_ref)

    # Count atoms by iterating; the force containers presumably do not
    # support len() directly -- TODO confirm.
    na = 0
    for e in forces:
        na += 1
    na_ref = 0
    for e in forces_ref:
        na_ref += 1

    # evaluate() forces the deferred assertion immediately, raising if the
    # two arrays differ in length.
    sn.assert_eq(na, na_ref,
                 msg='Wrong length of forces array: {0} != {1}').evaluate()

    # NOTE(review): only components j in {0, 1} are compared (range(2));
    # confirm whether a third (z) component should be included.
    return sn.max(
        sn.abs(forces[i][j] - forces_ref[i][j]) for i in range(na)
        for j in range(2))
예제 #22
0
def pw_perf_patterns(obj):
    '''Reports hardware counter values from the tool

    .. code-block::

     collector                       time time (%)   PAPI_REF_CYC   PAPI_L2_DCM
     --------------------------------------------------------------------------
     computeMomentumAndEnergyIAD   0.6816   100.00     1770550470       2438527
                                                                        ^^^^^^^

    :returns: dict of deferred min/avg/max expressions over the counter
        column extracted from stderr.
    '''
    hwc_regex = (r'^computeMomentumAndEnergyIAD\s+\S+\s+\S+\s+\S+\s+'
                 r'(?P<hwc>\d+)$')
    counters = sn.extractall(hwc_regex, obj.stderr, 'hwc', int)
    return {
        'papiwrap_hwc_min': sn.min(counters),
        'papiwrap_hwc_avg': sn.round(sn.avg(counters), 1),
        'papiwrap_hwc_max': sn.max(counters),
    }
예제 #23
0
    def __init__(self, linkage):
        """ScaLAPACK performance check for the Monch acceptance."""
        super().__init__(linkage)

        self.tags |= {'monch_acceptance'}
        self.sourcepath = 'scalapack_performance_compile_run.f'
        self.valid_systems = ['monch:compute']
        self.valid_prog_environs = ['PrgEnv-gnu']
        # 64 ranks spread over 4 nodes.
        self.num_tasks = 64
        self.num_tasks_per_node = 16

        self.sanity_patterns = sn.assert_found(r'Run', self.stdout)
        gflops = sn.extractall(r'GFLOPS/s:\s+(?P<gflops>\S+)', self.stdout,
                               'gflops', float)
        self.perf_patterns = {'perf': sn.max(gflops)}
        self.reference = {'monch:compute': {'perf': (24., -0.1, None)}}
 def density_SQ_INSTS_SALU(self):
     """Largest 'm4' counter value found in the csv metrics report."""
     density_regex = self.set_regex('density')
     csv_rpt = os.path.join(self.stagedir,
                            self.metric_file.replace(".txt", ".csv"))
     values = sn.extractall(density_regex, csv_rpt, 'm4', int)
     return sn.round(sn.max(values), 0)
예제 #25
0
    def __init__(self, kernel_version):
        # Kernel-launch latency microbenchmark.  ``kernel_version`` selects
        # at compile time whether the launch is synchronized before timing
        # ('sync') or not; the reference lookup at the end expects 'sync'
        # or 'async'.
        super().__init__()
        # List known partitions here so as to avoid specifying them every time
        # with --system
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
        # num_tasks == 0: let ReFrame span all nodes, one task per node.
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.sourcepath = 'kernel_latency.cu'
        self.build_system = 'SingleSource'
        self.build_system.cxxflags = ['-std=c++11']
        # Per-system GPU count, programming environments and CUDA arch.
        if self.current_system.name in {'dom', 'daint'}:
            self.num_gpus_per_node = 1
            gpu_arch = '60'
            self.modules = ['craype-accel-nvidia60']
            self.valid_prog_environs = [
                'PrgEnv-cray', 'PrgEnv-pgi', 'PrgEnv-gnu'
            ]
        elif self.current_system.name == 'kesch':
            self.num_gpus_per_node = 16
            self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
            self.modules = ['craype-accel-nvidia35']
            gpu_arch = '37'
        else:
            # Enable test when running on an unknown system
            self.num_gpus_per_node = 1
            self.valid_systems = ['*']
            self.valid_prog_environs = ['*']
            gpu_arch = None

        if gpu_arch:
            self.build_system.cxxflags += [
                '-arch=compute_%s' % gpu_arch,
                '-code=sm_%s' % gpu_arch
            ]

        # SYNCKERNEL selects the (a)synchronous launch variant at build time.
        if kernel_version == 'sync':
            self.build_system.cppflags = ['-D SYNCKERNEL=1']
        else:
            self.build_system.cppflags = ['-D SYNCKERNEL=0']

        # Sanity: every task found its GPUs and every GPU reported a latency.
        self.sanity_patterns = sn.all([
            sn.assert_eq(
                sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
                                    self.stdout)), self.num_tasks_assigned),
            sn.assert_eq(
                sn.count(
                    sn.findall(
                        r'\[\S+\] \[gpu \d+\] Kernel launch '
                        r'latency: \S+ us', self.stdout)),
                self.num_tasks_assigned * self.num_gpus_per_node)
        ])

        # Report the worst (largest) launch latency across all GPUs.
        self.perf_patterns = {
            'latency':
            sn.max(
                sn.extractall(
                    r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
                    r'(?P<latency>\S+) us', self.stdout, 'latency', float))
        }
        # Per-kernel-version references; '*' is the unknown-system fallback.
        self.sys_reference = {
            'sync': {
                'dom:gpu': {
                    'latency': (6.6, None, 0.10, 'us')
                },
                'daint:gpu': {
                    'latency': (6.6, None, 0.10, 'us')
                },
                'kesch:cn': {
                    'latency': (12.0, None, 0.10, 'us')
                },
                '*': {
                    'latency': (0.0, None, None, 'us')
                }
            },
            'async': {
                'dom:gpu': {
                    'latency': (2.2, None, 0.10, 'us')
                },
                'daint:gpu': {
                    'latency': (2.2, None, 0.10, 'us')
                },
                'kesch:cn': {
                    'latency': (5.7, None, 0.10, 'us')
                },
                '*': {
                    'latency': (0.0, None, None, 'us')
                }
            },
        }

        self.reference = self.sys_reference[kernel_version]

        self.maintainers = ['TM']
        self.tags = {'benchmark', 'diagnostic'}
 def max_gpu_memory(self):
     """Largest 'Max mem' value (MiB) reported across compute nodes."""
     #    Node name       Usage      Max mem Execution time
     # ------------ ----------- ------------ --------------
     #     nid06681        38 %     2749 MiB       00:00:06
     mem_regex = r'^\s+nid\S+\s+\d+\s+%\s+(\d+)\s+MiB.*:'
     mem_values = sn.extractall(mem_regex, self.stdout, 1, int)
     return sn.max(mem_values)
예제 #27
0
def vtune_time(self):
    '''Vtune creates 1 report per compute node. For example, a 48 mpi tasks job
    (= 2 compute nodes when running with 24 c/cn) will create 2 directories:
    * rpt.nid00001/rpt.nid00001.vtune
    * rpt.nid00002/rpt.nid00002.vtune

    Typical output (for each compute node) is:

    .. code-block::

      Elapsed Time:	14.866s
          CPU Time:	319.177s            /24 = 13.3
              Effective Time:	308.218s    /24 = 12.8
                  Idle:	0s
                  Poor:	19.725s
                  Ok:	119.570s
                  Ideal:	168.922s
                  Over:	0s
              Spin Time:	10.959s             /24 =  0.4
                  MPI Busy Wait Time:	10.795s
                  Other:	0.164s
              Overhead Time:	0s
      Total Thread Count:	25
      Paused Time:	0s

    :returns: dict of deferred expressions, one per extracted timing,
        each normalized by the number of ranks per node where applicable.
    '''
    result_d = {}
    # --- ranks per node
    if self.num_tasks < self.num_tasks_per_node:
        vtune_tasks_per_node = self.num_tasks
    else:
        vtune_tasks_per_node = self.num_tasks_per_node
    # --- Elapsed Time (min, max)
    regex = r'.*Elapsed Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_min'] = sn.round(sn.min(result), 4)
    result_d['elapsed_max'] = sn.round(sn.max(result), 4)
    # --- CPU Time (max)
    regex = r'^\s+CPU Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Effective Time (max)
    regex = r'^\s+Effective Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_efft'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time (max)
    regex = r'^\s+Spin Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_spint'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time: MPI Busy Wait (max)
    # Single-task runs produce no MPI wait line, hence the constant 0.
    if self.num_tasks > 1:
        regex = r'\s+MPI Busy Wait Time: (?P<sec>\S+)s'
        result = sn.extractall(regex, self.stdout, 'sec', float)
        result_d['elapsed_cput_spint_mpit'] = sn.round(
            sn.max(result) / vtune_tasks_per_node, 4)
    else:
        result_d['elapsed_cput_spint_mpit'] = 0

    # TODO:
    # 'vtune_momentumAndEnergyIAD':
    # sphsintel.vtune_momentumAndEnergyIAD(self),
    # '%vtune_srcf_lookupTables': self.vtune_pct_lookupTables,
    # '%vtune_srcf_Octree': self.vtune_pct_Octree,
    # '%vtune_srcf_momentumAndEnergyIAD':
    # self.vtune_pct_momentumAndEnergyIAD,
    # '%vtune_srcf_IAD': self.vtune_pct_IAD,
    return result_d
예제 #28
0
    def __init__(self):
        # GPU burn stress test: reports the slowest GPU and the hottest GPU
        # across all nodes of the partition.
        super().__init__()
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn', 'tiger:gpu']
        self.descr = 'GPU burn test'
        self.valid_prog_environs = ['PrgEnv-gnu']

        # Per-system GPU count, burn duration and CUDA architecture.
        if self.current_system.name == 'kesch':
            self.exclusive_access = True
            self.modules = ['craype-accel-nvidia35']
            # NOTE: The first option indicates the precision (-d for double)
            #       while the seconds is the time (in secs) to run the test.
            #       For multi-gpu nodes, we run the gpu burn test for more
            #       time to get reliable measurements.
            self.executable_opts = ['-d', '40']
            self.num_gpus_per_node = 16
            gpu_arch = '37'
        elif self.current_system.name in {'daint', 'dom', 'tiger'}:
            self.modules = ['craype-accel-nvidia60']
            self.executable_opts = ['-d', '20']
            self.num_gpus_per_node = 1
            gpu_arch = '60'
        else:
            # Unknown system: build without an explicit architecture.
            self.num_gpus_per_node = 1
            gpu_arch = None

        self.sourcepath = 'gpu_burn.cu'
        self.build_system = 'SingleSource'
        if gpu_arch:
            self.build_system.cxxflags = [
                '-arch=compute_%s' % gpu_arch,
                '-code=sm_%s' % gpu_arch
            ]

        self.build_system.ldflags = ['-lcuda', '-lcublas', '-lnvidia-ml']
        # Sanity: one 'OK' per task assigned to the job.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall('OK', self.stdout)), self.num_tasks_assigned)

        # One line per GPU: 'GPU <id>(<name>): <perf> GF/s  <temp> Celsius'
        patt = r'GPU\s+\d+\(\S*\): (?P<perf>\S*) GF\/s  (?P<temp>\S*) Celsius'
        self.perf_patterns = {
            # Worst-performing GPU and hottest GPU across all nodes.
            'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
            'max_temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float))
        }

        self.reference = {
            'dom:gpu': {
                'perf': (4115, -0.10, None, 'Gflop/s'),
                'max_temp': (0, None, None, 'Celsius')
            },
            'daint:gpu': {
                'perf': (4115, -0.10, None, 'Gflop/s'),
                'max_temp': (0, None, None, 'Celsius')
            },
            'kesch:cn': {
                'perf': (950, -0.10, None, 'Gflop/s'),
                'max_temp': (0, None, None, 'Celsius')
            },
            # Fallback for unknown systems: record values, no bounds.
            '*': {
                'perf': (0, None, None, 'Gflop/s'),
                'max_temp': (0, None, None, 'Celsius')
            }
        }

        # Span all nodes of the partition, one task per node.
        self.num_tasks = 0
        self.num_tasks_per_node = 1

        self.maintainers = ['AJ', 'TM']
        self.tags = {'diagnostic', 'benchmark', 'craype'}
예제 #29
0
 def max_temp(self, nid=None):
     '''Maximum temperature recorded.

     :param nid: optionally restrict the query to a single node.
     '''
     temps = self._extract_perf_metric('temp', nid)
     return sn.max(temps)
예제 #30
0
 def speedup(self):
     """Ratio of the slowest to the fastest timing of the f32/f64 rows."""
     timing_regex = r'^\S+(f32|f64)\s+(\S+) ns\s+'
     timings = sn.extractall(timing_regex, self.stdout, 2, float)
     return sn.round(sn.max(timings) / sn.min(timings), 3)