def skeleton_jag_reconstruction_loss(cluster, dir_name, weekly, data_reader_percent):
    """Train the JAG single-layer autoencoder and check its reconstruction loss.

    Builds an lbann invocation via tools.get_command, runs it through the
    shell, and asserts a zero exit status.

    NOTE(review): the data_reader_percent argument is accepted but not
    forwarded; the literal 'prototext' is passed instead — presumably a
    sentinel telling tools.get_command to use the percent from the
    prototext file. Confirm against tools.get_command.
    """
    out_file = f'{dir_name}/ci_test/unit_tests/output/jag_reconstruction_loss_output.txt'
    err_file = f'{dir_name}/ci_test/unit_tests/error/jag_reconstruction_loss_error.txt'
    command = tools.get_command(
        cluster=cluster,
        num_nodes=2,
        num_processes=32,
        disable_cuda=1,
        dir_name=dir_name,
        sample_list_train_default=(
            '/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K4trainers/100Kindex.txt'),
        sample_list_test_default=(
            '/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K16trainers/t1_sample_list.txt'),
        data_reader_name='jag',
        data_reader_percent='prototext',
        metadata='applications/physics/data/jag_100M_metadata.prototext',
        model_folder='tests',
        model_name='jag_single_layer_ae',
        optimizer_name='adam',
        output_file_name=out_file,
        error_file_name=err_file,
        weekly=weekly)
    tools.assert_success(os.system(command), err_file)
def build_skeleton(dir_name, compiler, debug):
    """Run `make` inside a previously configured LBANN build directory.

    Args:
        dir_name: Root of the LBANN source tree.
        compiler: Compiler spec, e.g. 'gcc@7.1.0'.
        debug: Truthy for a debug build, falsy for a release ('rel') build.
    """
    # Raw string fixes the invalid '\.' escape (SyntaxWarning on modern
    # CPython); '.' is literal inside a character class, so the regex
    # matches exactly as before: '@' or '.' -> '_'.
    compiler_underscored = re.sub(r'[@.]', '_', compiler)
    build_type = 'debug' if debug else 'rel'
    output_file_name = '%s/bamboo/compiler_tests/output/%s_%s_build_output.txt' % (
        dir_name, compiler_underscored, build_type)
    error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_build_error.txt' % (
        dir_name, compiler_underscored, build_type)
    # Build directories use '-' where spack specs use '@'.
    compiler = compiler.replace('@', '-')
    # Strip the trailing node number from the hostname, e.g. 'pascal83' -> 'pascal'.
    cluster = re.sub(
        '[0-9]+', '',
        subprocess.check_output('hostname'.split()).decode('utf-8').strip())
    # NOTE: architecture-specific path suffixes (x86_64 <=> catalyst/pascal,
    # ppc64le <=> ray, plus CUDA/cuDNN versions) were removed from this path;
    # see repository history if they need to return.
    os.chdir('%s/bamboo/compiler_tests/builds/%s_%s_%s/build' % (
        dir_name, cluster, compiler, build_type))
    command = 'make -j all > %s 2> %s' % (output_file_name, error_file_name)
    return_code = os.system(command)
    # Back out of builds/<...>/build before asserting, matching the original
    # working-directory discipline.
    os.chdir('../..')
    tools.assert_success(return_code, error_file_name)
def test_compiler_build_script(cluster, dirname):
    """Build LBANN via scripts/build_lbann.sh and archive the spack logs."""
    test_base_dir = os.path.join(dirname, 'ci_test', 'compiler_tests')
    output_file_name = os.path.join(test_base_dir, 'output', 'build_script_output.txt')
    error_file_name = os.path.join(test_base_dir, 'error', 'build_script_error.txt')
    # Spack environment name is injected by the CI environment.
    ENV_NAME = os.getenv('SPACK_ENV_NAME')
    common_cmd = ('%s/scripts/build_lbann.sh -d -l %s --test --clean-build '
                  '-j $(($(nproc)+2)) -- +deterministic +vision +numpy'
                  % (dirname, ENV_NAME))
    # Per-cluster accelerator variants.
    extra_variants = {
        'lassen': '+cuda +half +fft',
        'pascal': '+cuda +half +fft',
        'ray': '+cuda +half +fft',
        'corona': '+rocm',
        'catalyst': '+onednn +half +fft',
    }
    if cluster not in extra_variants:
        e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster
        print('Skip - ' + e)
        pytest.skip(e)
    command = '%s %s > %s 2> %s' % (
        common_cmd, extra_variants[cluster], output_file_name, error_file_name)
    return_code = os.system(command)
    # Copy any spack-*.txt logs into the artifact directory, renamed so they
    # sit alongside the other *_output files.
    artifact_dir = os.path.join(test_base_dir, 'output')
    with os.scandir(dirname) as it:
        for entry in it:
            if entry.is_file() and re.match(r'spack-.*txt', entry.name):
                base, ext = os.path.splitext(entry.name)
                shutil.copyfile(
                    entry.path,
                    os.path.join(artifact_dir, base + '_output' + ext))
    tools.assert_success(return_code, error_file_name)
def test_compiler_build_script(cluster, dirname):
    """Run bamboo/compiler_tests/build_script.sh on a supported cluster.

    NOTE(review): another test of the same name exists in this source; they
    appear to be chunks from different test suites (ci_test vs bamboo).
    """
    supported = ('catalyst', 'corona', 'lassen', 'pascal', 'ray')
    if cluster not in supported:
        e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster
        print('Skip - ' + e)
        pytest.skip(e)
    out_log = '%s/bamboo/compiler_tests/output/build_script_output.txt' % dirname
    err_log = '%s/bamboo/compiler_tests/error/build_script_error.txt' % dirname
    script = '%s/bamboo/compiler_tests/build_script.sh' % dirname
    command = '%s > %s 2> %s' % (script, out_log, err_log)
    tools.assert_success(os.system(command), err_log)
def build_script(cluster, dirname, compiler, debug):
    """Invoke build_script_specific.sh for one (cluster, compiler, build) combo."""
    print(f'Running build_script for cluster={cluster},'
          f' compiler={compiler}, debug={debug}.')
    if debug:
        build, debug_flag = 'debug', '--debug'
    else:
        build, debug_flag = 'release', ''
    prefix = '%s/bamboo/compiler_tests' % dirname
    output_file_name = '%s/output/%s_%s_%s_build_script_output.txt' % (
        prefix, cluster, compiler, build)
    error_file_name = '%s/error/%s_%s_%s_build_script_error.txt' % (
        prefix, cluster, compiler, build)
    # Note: with an empty debug_flag the command reads '... <compiler> > out';
    # with '--debug' it reads '... --debug> out' — both tokenize fine in sh.
    command = '%s/build_script_specific.sh --compiler %s %s> %s 2> %s' % (
        prefix, compiler, debug_flag, output_file_name, error_file_name)
    tools.assert_success(os.system(command), error_file_name)
def test_run_parallel_filesystem_catch_tests(cluster, dirname):
    """Launch the '[filesystem]' MPI Catch2 tests and assert a zero exit code."""
    output_dir = os.path.join(dirname, 'ci_test', 'unit_tests')
    exe = os.path.join(hack_find_spack_build_dir(dirname),
                       'unit_test', 'mpi-catch-tests')
    if not os.path.exists(exe):
        print('Skip - executable not found')
        pytest.skip('executable not found')
    launcher = get_system_mpi_launch(cluster)
    # Catch2 expands %r/%s to MPI rank/size, so each rank writes its own report.
    xml_name = 'mpi_filesystem_catch_tests_output-%s-rank=%%r-size=%%s.xml' % (cluster)
    report_path = os.path.join(output_dir, xml_name)
    err_log = os.path.join(output_dir, 'error', 'mpi-filesystem-catch-test-error.log')
    catch_args = [exe, '"[filesystem]"', '-r', 'junit', '-o', report_path]
    result = sp.run(launcher + catch_args)
    tools.assert_success(result.returncode, err_log)
def test_run_sequential_catch_tests(cluster, dirname):
    """Launch the sequential Catch2 tests and assert a zero exit code."""
    output_dir = os.path.join(dirname, 'ci_test', 'unit_tests')
    exe = os.path.join(hack_find_spack_build_dir(dirname),
                       'unit_test', 'seq-catch-tests')
    if not os.path.exists(exe):
        print('Skip - executable not found')
        pytest.skip('executable not found')
    launcher = get_system_seq_launch(cluster)
    report_path = os.path.join(output_dir,
                               'seq_catch_tests_output-%s.xml' % (cluster))
    err_log = os.path.join(output_dir, 'error', 'seq-catch-test-error.log')
    result = sp.run(launcher + [exe, '-r', 'junit', '-o', report_path])
    tools.assert_success(result.returncode, err_log)
def spack_skeleton(dir_name, compiler, mpi_lib, debug):
    """Run the spack recipe build_lbann.sh for one compiler/MPI combination.

    Args:
        dir_name: Root of the LBANN source tree.
        compiler: Compiler spec, e.g. 'gcc@7.1.0'.
        mpi_lib: MPI library spec to pass to the build script.
        debug: Truthy for a debug build, falsy for a release ('rel') build.
    """
    # Raw string fixes the invalid '\.' escape (SyntaxWarning on modern
    # CPython); '.' is literal inside a character class, so the regex
    # matches exactly as before: '@' or '.' -> '_'.
    compiler_underscored = re.sub(r'[@.]', '_', compiler)
    build_type = 'debug' if debug else 'rel'
    output_file_name = '%s/bamboo/compiler_tests/output/%s_%s_spack_output.txt' % (
        dir_name, compiler_underscored, build_type)
    error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_spack_error.txt' % (
        dir_name, compiler_underscored, build_type)
    os.chdir('%s/bamboo/compiler_tests/builds' % dir_name)
    debug_flag = ' -d' if debug else ''
    command = '%s/scripts/spack_recipes/build_lbann.sh -c %s -m %s%s > %s 2> %s' % (
        dir_name, compiler, mpi_lib, debug_flag, output_file_name, error_file_name)
    return_code = os.system(command)
    # Return to the parent directory before asserting, matching the original
    # working-directory discipline.
    os.chdir('..')
    tools.assert_success(return_code, error_file_name)
def test_unit_should_work(cluster, dirname, exes):
    """lbann should set up two models, a reader, and an optimizer without error."""
    # `exes` may be a per-compiler mapping or a single executable path.
    exe = exes['gcc7'] if isinstance(exes, dict) else exes
    print('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n')
    data_reader_path, model_path, optimizer_path = get_default_parameters(dirname)
    output_file_name, error_file_name = get_file_names(dirname, 'should_work')
    command = tools.get_command(
        cluster=cluster,
        executable=exe,
        data_reader_path=data_reader_path,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        exit_after_setup=True,
        model_path=model_path,
        optimizer_path=optimizer_path,
        num_processes=1,
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    tools.assert_success(os.system(command), error_file_name)
def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_name,
                                     weekly, data_reader_percent):
    """Train the JAG single-layer autoencoder on 2 nodes and check its loss.

    Skips when no executable exists for compiler_name, or on ray where the
    dataset is unavailable.
    """
    if compiler_name not in executables:
        e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    if cluster == 'ray':
        e = 'skeleton_jag_reconstruction_loss: dataset does not exist on %s' % cluster
        print('Skip - ' + e)
        pytest.skip(e)
    # NOTE: a skip for lassen ("FIXME dataset consistency issues on Lassen")
    # was present here but disabled.
    out_file = ('%s/bamboo/unit_tests/output/jag_reconstruction_loss_%s_output.txt'
                % (dir_name, compiler_name))
    err_file = ('%s/bamboo/unit_tests/error/jag_reconstruction_loss_%s_error.txt'
                % (dir_name, compiler_name))
    # NOTE(review): data_reader_percent is accepted but not forwarded; the
    # literal 'prototext' is passed instead — presumably a sentinel meaning
    # "use the percent from the prototext file". Confirm in tools.get_command.
    command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=2,
        num_processes=32,
        disable_cuda=1,
        dir_name=dir_name,
        sample_list_train_default=(
            '/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/100Kindex.txt'),
        sample_list_test_default=(
            '/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/t1_sample_list.txt'),
        data_reader_name='jag',
        data_reader_percent='prototext',
        metadata='applications/physics/data/jag_100M_metadata.prototext',
        model_folder='tests',
        model_name='jag_single_layer_ae',
        optimizer_name='adam',
        output_file_name=out_file,
        error_file_name=err_file,
        weekly=weekly)
    tools.assert_success(os.system(command), err_file)
def _run_lenet_shared_ckpt_phase(cluster, exe, dir_name, compiler_name,
                                 data_reader_percent, weekly, phase,
                                 ckpt_dir, num_epochs):
    """Run one LeNet/MNIST training phase and assert a zero exit status.

    Helper for skeleton_checkpoint_lenet_shared; `phase` names the log files
    ('no_checkpoint', 'checkpoint', or 'restart'), `ckpt_dir` receives the
    checkpoint tree, and `num_epochs` controls how far training runs.
    """
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_%s_%s_output.txt' % (
        dir_name, phase, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_%s_%s_error.txt' % (
        dir_name, phase, compiler_name)
    command = tools.get_command(
        cluster=cluster,
        executable=exe,
        num_nodes=1,
        num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        data_reader_percent=data_reader_percent,
        ckpt_dir=ckpt_dir,
        model_folder='tests',
        model_name='lenet_mnist_ckpt',
        num_epochs=num_epochs,
        optimizer_name='sgd',
        output_file_name=output_file_name,
        error_file_name=error_file_name,
        weekly=weekly)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)


def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name,
                                     compiler_name, weekly, data_reader_percent):
    """Verify that shared checkpoint/restart reproduces an uninterrupted run.

    Trains LeNet on MNIST three ways: (1) two epochs straight through as the
    baseline, (2) one epoch that writes a checkpoint, (3) a restart from that
    checkpoint completing epoch two. The resulting checkpoint trees must be
    identical; any file difference raises AssertionError.

    The three invocations previously duplicated the same command block; they
    now share _run_lenet_shared_ckpt_phase.
    """
    if compiler_name not in executables:
        e = 'skeleton_checkpoint_lenet_shared: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    exe = executables[compiler_name]
    # Default to a tiny slice of the data so the test stays fast.
    if data_reader_percent is None:
        data_reader_percent = 0.01
    # Fresh checkpoint workspace (relative to the current working directory).
    os.system('rm -rf ckpt_lenet_shared && mkdir ckpt_lenet_shared')
    no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name)
    ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name)
    # Baseline: two epochs with no restart, printing weights to files.
    _run_lenet_shared_ckpt_phase(cluster, exe, dir_name, compiler_name,
                                 data_reader_percent, weekly,
                                 'no_checkpoint', no_ckpt_dir, 2)
    # Run one epoch and write a checkpoint.
    _run_lenet_shared_ckpt_phase(cluster, exe, dir_name, compiler_name,
                                 data_reader_percent, weekly,
                                 'checkpoint', ckpt_dir, 1)
    # Restart from the checkpoint and finish epoch two.
    _run_lenet_shared_ckpt_phase(cluster, exe, dir_name, compiler_name,
                                 data_reader_percent, weekly,
                                 'restart', ckpt_dir, 2)
    # The restarted run's checkpoint tree must match the baseline's exactly.
    dcmp = dircmp(ckpt_dir, no_ckpt_dir)
    fail, diffs, warns = tools.print_diff_files(dcmp)
    for w in warns:
        print(w)
    if fail:
        print()
        for d in diffs:
            print(d)
        path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name)
        raise AssertionError('Compare {ncd} and {cd} in {p}'.format(
            ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix))