Example #1
    def cdt2008_pgi_workaround(self):
        cdt = os_ext.cray_cdt_version()
        if not cdt:
            return

        if (self.current_environ.name == 'PrgEnv-pgi' and cdt == '20.08'):
            self.variables.update({'CUDA_HOME': '$CUDATOOLKIT_HOME'})
Example #2
def test_cray_cdt_version_unknown_fmt(tmp_path, monkeypatch):
    # Mock up a CDT file
    rcfile = tmp_path / 'rcfile'
    with open(rcfile, 'w') as fp:
        fp.write('random stuff')

    monkeypatch.setenv('MODULERCFILE', str(rcfile))
    assert os_ext.cray_cdt_version() is None
Example #3
def test_cray_cdt_version(tmp_path, monkeypatch):
    # Mock up a CDT file
    rcfile = tmp_path / 'rcfile'
    with open(rcfile, 'w') as fp:
        fp.write('#%Module CDT 20.06\nblah blah\n')

    monkeypatch.setenv('MODULERCFILE', str(rcfile))
    assert os_ext.cray_cdt_version() == '20.06'
Example #4
    def cdt2006_workaround_intel(self):
        if (self.current_environ.name == 'PrgEnv-intel'
                and os_ext.cray_cdt_version() == '20.06'):
            self.modules += ['cray-netcdf-hdf5parallel']
            self.prebuild_cmds = [
                'ln -s $CRAY_NETCDF_HDF5PARALLEL_PREFIX/lib/pkgconfig/'
                'netcdf-cxx4_parallel.pc netcdf_c++4_parallel.pc'
            ]
            self.variables['PKG_CONFIG_PATH'] = '.:$PKG_CONFIG_PATH'
Example #5
    def cdt2006_workaround_dynamic(self):
        if (os_ext.cray_cdt_version() == '20.06' and self.linkage == 'dynamic'
                and self.current_environ.name == 'PrgEnv-gnu'):
            self.variables['PATH'] = (
                '/opt/cray/pe/cce/10.0.1/cce-clang/x86_64/bin:$PATH')
            self.prgenv_flags[self.current_environ.name] += ['-fuse-ld=lld']

            # GCC >= 9 is required for the above option; our CUDA-friendly CDT
            # uses GCC 8 as default.
            self.modules += ['gcc/9.3.0']
Example #6
    def __init__(self):
        self.descr = 'Distributed training with TensorFlow using ipyparallel'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        # FIXME: The following will not be needed after the Daint upgrade
        cray_cdt_version = os_ext.cray_cdt_version() or '19.10'
        self.modules = [
            'ipcmagic', f'Horovod/0.19.1-CrayGNU-{cray_cdt_version}-tf-2.2.0'
        ]
        self.num_tasks = 2
        self.num_tasks_per_node = 1
        self.executable = 'ipython'
        self.executable_opts = ['tf-hvd-sgd-ipc-tf2.py']
        nids = sn.extractall(r'nid(?P<nid>\d+)', self.stdout, 'nid', str)
        self.sanity_patterns = sn.all(
            [sn.assert_ne(nids, []),
             sn.assert_ne(nids[0], nids[1])])
        self.reference = {
            'daint:gpu': {
                'slope': (2.0, -0.1, 0.1, None),
                'offset': (0.0, -0.1, 0.1, None),
                'retries': (0, None, None, None),
                'time': (10, None, None, 's'),
            },
            'dom:gpu': {
                'slope': (2.0, -0.1, 0.1, None),
                'offset': (0.0, -0.1, 0.1, None),
                'retries': (0, None, None, None),
                'time': (10, None, None, 's'),
            }
        }
        self.perf_patterns = {
            'slope':
            sn.extractsingle(r'slope=(?P<slope>\S+)', self.stdout, 'slope',
                             float),
            'offset':
            sn.extractsingle(r'offset=(?P<offset>\S+)', self.stdout, 'offset',
                             float),
            'retries':
            4 -
            sn.count(sn.findall(r'IPCluster is already running', self.stdout)),
            'time':
            sn.extractsingle(
                r'IPCluster is ready\!\s+'
                r'\((?P<time>\d+) seconds\)', self.stdout, 'time', float)
        }
        self.maintainers = ['RS', 'TR']
        self.tags = {'production'}
Example #7
    def __init__(self, boostver):
        self.descr = f'Test for Boost {boostver} with Python bindings'
        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
        self.valid_prog_environs = ['builtin']
        cdt_version = os_ext.cray_cdt_version()
        self.modules = [f'Boost/{boostver}-CrayGNU-{cdt_version}-python3']
        self.executable = f'python3 hello.py'
        self.sanity_patterns = sn.assert_found('hello, world', self.stdout)
        version_cmd = ('python3 -c \'import sys; '
                       'ver=sys.version_info; '
                       'print(f"{ver.major}{ver.minor}")\'')
        self.variables = {
            'PYTHON_INCLUDE': '$(python3-config --includes)',
            'PYTHON_BOOST_LIB': f'boost_python$({version_cmd})'
        }
        self.maintainers = ['JB', 'AJ']
        self.tags = {'scs', 'production'}
Example #8
    def cdt2006_cpp_workaround(self):
        if (os_ext.cray_cdt_version() == '20.06' and self.lang == 'cpp'):
            self.modules += ['cray-hdf5/1.10.6.1']
Example #9
    def cdt2008_pgi_workaround(self):
        if (self.current_environ.name == 'PrgEnv-pgi'
                and osx.cray_cdt_version() == '20.08'
                and self.current_system.name in ['daint', 'dom']):
            self.variables['CUDA_HOME'] = '$CUDATOOLKIT_HOME'
Example #10
def test_cray_cdt_version_no_such_file(tmp_path, monkeypatch):
    # Mock up a CDT file
    rcfile = tmp_path / 'rcfile'
    monkeypatch.setenv('MODULERCFILE', str(rcfile))
    assert os_ext.cray_cdt_version() is None
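
Taken together, Examples #2, #3 and #10 pin down the behaviour of os_ext.cray_cdt_version(): it reads the file pointed to by MODULERCFILE, extracts the version from a '#%Module CDT <version>' header, and returns None if the file is missing or does not match that format. A minimal sketch with that behaviour could look as follows; this is hypothetical illustration code, not the actual implementation in reframe.utility.os_ext, which may differ in detail:

import os
import re


def cray_cdt_version():
    '''Return the Cray CDT version found in $MODULERCFILE, or None.'''
    rcfile = os.getenv('MODULERCFILE')
    if not rcfile:
        return None

    try:
        with open(rcfile) as fp:
            first_line = fp.readline()
    except OSError:
        # Missing or unreadable file (cf. Example #10)
        return None

    match = re.search(r'#%Module CDT (?P<version>\S+)', first_line)
    if not match:
        # Unrecognised file format (cf. Example #2)
        return None

    return match.group('version')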
Example #11
    def __init__(self, variant):
        self.descr = 'Distributed training with TensorFlow2 and Horovod'
        self.valid_systems = ['daint:gpu']
        self.valid_prog_environs = ['builtin']

        # FIXME: The following will not be needed after the Daint upgrade
        cray_cdt_version = os_ext.cray_cdt_version() or '19.10'
        self.modules = [f'Horovod/0.19.1-CrayGNU-{cray_cdt_version}-tf-2.2.0']
        self.sourcesdir = None
        self.num_tasks_per_node = 1
        self.num_cpus_per_task = 12
        if variant == 'small':
            self.valid_systems += ['dom:gpu']
            self.num_tasks = 8
            self.reference = {
                'dom:gpu': {
                    'throughput': (1712, -0.05, None, 'images/s'),
                    'throughput_per_gpu': (214, -0.05, None, 'images/s'),
                },
                'daint:gpu': {
                    'throughput': (1712, -0.05, None, 'images/s'),
                    'throughput_per_gpu': (214, -0.05, None, 'images/s')
                },
            }
        else:
            self.num_tasks = 32
            self.reference = {
                'daint:gpu': {
                    'throughput': (6848, -0.05, None, 'images/s'),
                    'throughput_per_gpu': (214, -0.05, None, 'images/s')
                },
            }
        self.perf_patterns = {
            'throughput':
            sn.extractsingle(
                rf'Total img/sec on {self.num_tasks} GPU\(s\): '
                rf'(?P<throughput>\S+) \S+', self.stdout, 'throughput', float),
            'throughput_per_gpu':
            sn.extractsingle(
                r'Img/sec per GPU: (?P<throughput_per_gpu>\S+) \S+',
                self.stdout, 'throughput_per_gpu', float)
        }
        model = 'InceptionV3'
        batch_size = 64
        self.sanity_patterns = sn.all([
            sn.assert_found(rf'Model: {model}', self.stdout),
            sn.assert_found(rf'Batch size: {batch_size}', self.stdout)
        ])
        self.variables = {
            'NCCL_DEBUG': 'INFO',
            'NCCL_IB_HCA': 'ipogif0',
            'NCCL_IB_CUDA_SUPPORT': '1',
            'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
        }
        self.prerun_cmds = [
            'wget https://raw.githubusercontent.com/horovod/'
            'horovod/842d1075e8440f15e84364f494645c28bf20c3ae/'
            'examples/tensorflow2_synthetic_benchmark.py'
        ]
        self.executable = 'python'
        self.executable_opts = [
            'tensorflow2_synthetic_benchmark.py',
            f'--model {model}',
            f'--batch-size {batch_size}',
            '--num-iters 5',
            '--num-batches-per-iter 5',
            '--num-warmup-batches 5',
        ]
        self.tags = {'production'}
        self.maintainers = ['RS', 'TR']