def test_bad_data_filedir_11():
    """data_filedir_default without a data reader must raise in get_command."""
    try:
        tools.get_command('ray', 'exe', data_filedir_default='filedir',
                          check_executable_existance=False)
        # Fixed: Python 2 `except Exception, e` syntax replaced with `as e`,
        # and assert False added so the test fails if no exception is raised.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: data_filedir_default set but neither data_reader_name or data_reader_path are.'
    assert actual == expected
def test_bad_optimizer():
    """Setting both optimizer_name and optimizer_path must raise."""
    try:
        tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name',
                          optimizer_path='path',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: optimizer_path is set but so is optimizer_name'
    assert actual == expected
def test_bad_data_filedir_15():
    """A train/test filedir/filename default without a data reader must raise."""
    try:
        tools.get_command('ray', 'exe', data_filename_test_default='e',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.'
    assert actual == expected
def test_bad_dir_name_5():
    """optimizer_name without dir_name must raise."""
    try:
        tools.get_command('ray', 'exe', optimizer_name='name',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.'
    assert actual == expected
def test_unsupported_cluster():
    """An unknown cluster name must raise."""
    try:
        tools.get_command('quartz', 'exe', check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Unsupported Cluster: quartz'
    assert actual == expected
def test_blacklisted_substrings():
    """Blacklisted substrings (';', '--') in argument values must raise."""
    try:
        tools.get_command('ray', 'exe', partition=';',
                          optimizer_path='--model=new_model',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid character(s): ; contains ; , --model=new_model contains --'
    assert actual == expected
def test_bad_model_3():
    """model_path together with model_name must raise."""
    try:
        tools.get_command('ray', 'exe', dir_name='dir', model_name='name',
                          model_path='path', check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name'
    assert actual == expected
def test_bad_model_5():
    """model_name without model_folder must raise."""
    try:
        tools.get_command('ray', 'exe', dir_name='dir', model_name='name',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: model_name set but not model_folder.'
    assert actual == expected
def test_bad_data_reader():
    """data_reader_path together with data_reader_name (and no filedir) must raise.

    NOTE(review): a second function with this same name appears later in the
    file; under pytest only the last definition is collected - confirm which
    expected message is the intended one and rename one of the two.
    """
    try:
        tools.get_command('catalyst', 'exe', dir_name='dir',
                          data_reader_name='name', data_reader_path='path',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.'
    assert actual == expected
def test_bad_data_filedir_8():
    """data_filedir_default combined with a specific test filename must raise."""
    try:
        tools.get_command('ray', 'exe', data_reader_path='path',
                          data_filedir_default='filedir',
                          data_filename_test_default='h',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]'
    assert actual == expected
def test_bad_dir_name_4():
    """data_reader_name without dir_name (or a filedir) must raise both errors."""
    expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is. , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.'
    try:
        tools.get_command('catalyst', 'exe', data_reader_name='name',
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
    assert actual == expected
def test_bad_dir_name_1():
    """dir_name alone (no model/reader/optimizer) must raise."""
    expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.'
    try:
        tools.get_command('ray', 'exe', dir_name='dir',
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
    assert actual == expected
def test_bad_data_filedir_10():
    """data_reader_path without a Ray-specific filedir must raise."""
    try:
        tools.get_command('ray', 'exe', data_reader_path='path',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_ray. If a data reader is provided, an alternative filedir must be available for Ray. Alternatively, all of [data_filedir_train_ray, data_filename_train_ray, data_filedir_test_ray, data_filename_test_ray] can be set.'
    assert actual == expected
def test_blacklisted_substrings_3():
    """A blacklisted '--' inside an extra_lbann_flags value must raise."""
    expected = 'Invalid character(s): --bad_value contains --'
    try:
        tools.get_command('ray', 'exe', partition='pdebug',
                          extra_lbann_flags={'key': '--bad_value'},
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
    assert actual == expected
def test_bad_data_reader():
    """data_reader_path together with data_reader_name must raise.

    NOTE(review): duplicates the name of an earlier test in this file with a
    different expected message; pytest only runs this later definition -
    rename one of the two.
    """
    try:
        tools.get_command('catalyst', 'exe', dir_name='dir',
                          data_reader_name='name', data_reader_path='path',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name'
    assert actual == expected
def test_bad_data_filedir_4():
    """data_filedir_ray combined with a specific Ray test filename must raise."""
    try:
        tools.get_command('ray', 'exe', dir_name='dir',
                          data_reader_name='name', data_filedir_ray='filedir',
                          data_filename_test_ray='d',
                          check_executable_existance=False)
        # Fixed: Py3 except syntax; fail if get_command did not raise.
        assert False
    except Exception as e:
        actual = str(e)
    expected = 'Invalid Usage: data_fildir_ray set but so is at least one of [data_filedir_train_ray, data_filename_train_ray, data_filedir_test_ray, data_filename_test_ray]'
    assert actual == expected
def test_bad_extra_lbann_flags_not_a_dict():
    """extra_lbann_flags that is not a dict must raise."""
    expected = (
        'Invalid Usage: extra_lbann_flags must be a dict e.g. `{flag :'
        ' None, flag: 4}`. Use `None` if a flag has no value attached '
        'to it.')
    try:
        tools.get_command('ray', 'exe', partition='pdebug',
                          extra_lbann_flags='invalid_flag',
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
    assert actual == expected
def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_name):
    """Run LeNet three ways and verify a restarted run reproduces the baseline checkpoint."""
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    exe = executables[compiler_name]
    # Run 1: two epochs straight through, producing the baseline checkpoint.
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_nockpt = os.system(command)
    if return_code_nockpt != 0:
        sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error')
        sys.exit(1)
    os.system('mv ckpt ckpt_baseline')
    # Run 2: one epoch, leaving a checkpoint to restart from.
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_ckpt_1 = os.system(command)
    if return_code_ckpt_1 != 0:
        sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error')
        sys.exit(1)
    # Run 3: restart from the checkpoint and finish epoch two.
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_restart_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_restart_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_ckpt_2 = os.system(command)
    if return_code_ckpt_2 != 0:
        sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error')
        sys.exit(1)
    # The restarted checkpoint must be byte-identical to the baseline.
    diff_test = os.system('diff -rq ckpt ckpt_baseline')
    os.system('rm -rf ckpt*')
    assert diff_test == 0
def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name):
    """Run the mnist_conv_graph model and assert it exits cleanly."""
    if compiler_name not in executables:
        e = 'skeleton_mnist_conv_graph: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    output_file_name = '%s/bamboo/unit_tests/output/mnist_conv_graph_%s_output.txt' % (
        dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/mnist_conv_graph_%s_error.txt' % (
        dir_name, compiler_name)
    # gcc7 builds are slower here, so give them an explicit time limit.
    tl = 240 if compiler_name == 'gcc7' else None
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        time_limit=tl, num_processes=1, dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='mnist_conv_graph', optimizer_name='adam',
        output_file_name=output_file_name, error_file_name=error_file_name)
    assert os.system(command) == 0
def test_unit_no_params_bad(cluster, exes, dirname):
    """Running lbann with no parameters should exit nonzero."""
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n')
    command = tools.get_command(cluster=cluster, executable=exe,
                                exit_after_setup=True)
    assert os.system(command) != 0
def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly,
                         debug_build, should_log=False):
    """Run the cifar10 conv autoencoder in weekly/debug-build testing only."""
    # Only run during weekly or debug-build testing.
    if not (weekly or debug_build):
        e = 'skeleton_cifar_debug: Not doing weekly or debug_build testing'
        print('Skip - ' + e)
        pytest.skip(e)
    if cluster == 'ray':
        e = 'skeleton_cifar_debug: cifar not operational on Ray'
        print('Skip - ' + e)
        pytest.skip(e)
    if compiler_name not in executables:
        e = 'skeleton_cifar_debug: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    model_name = 'autoencoder_cifar10'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        partition='pbatch', time_limit=100, dir_name=dir_name,
        data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin',
        data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin',
        data_reader_name='cifar10', data_reader_percent=0.01,
        model_folder='models/' + model_name, model_name='conv_' + model_name,
        num_epochs=5, optimizer_name='adagrad',
        output_file_name=output_file_name, error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name,
                                         error_file_name, should_log)
    assert output_value == 0
def skeleton_jag_reconstruction_loss(cluster, dir_name, weekly,
                                     data_reader_percent):
    """Run the JAG single-layer autoencoder and assert success.

    Fixed: the `data_reader_percent` parameter was being ignored and the
    literal string 'prototext' was passed instead; now the parameter is
    forwarded to tools.get_command.
    """
    output_file_name = '%s/ci_test/unit_tests/output/jag_reconstruction_loss_output.txt' % (
        dir_name)
    error_file_name = '%s/ci_test/unit_tests/error/jag_reconstruction_loss_error.txt' % (
        dir_name)
    command = tools.get_command(
        cluster=cluster, num_nodes=2, num_processes=32, disable_cuda=1,
        dir_name=dir_name,
        sample_list_train_default='/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K4trainers/100Kindex.txt',
        sample_list_test_default='/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K16trainers/t1_sample_list.txt',
        data_reader_name='jag',
        data_reader_percent=data_reader_percent,
        metadata='applications/physics/data/jag_100M_metadata.prototext',
        model_folder='tests', model_name='jag_single_layer_ae',
        optimizer_name='adam', output_file_name=output_file_name,
        error_file_name=error_file_name, weekly=weekly)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)
def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly,
                         debug, should_log=False):
    """Run LeNet on MNIST during weekly/debug testing and assert success.

    NOTE(review): a second definition of this name appears later in the file
    (using data_filedir_default instead of data_filedir_ray); only the later
    one is effective at import time - confirm which is intended.
    """
    # Only run during weekly or debug testing.
    if (not weekly) and (not debug):
        pytest.skip('Not doing weekly or debug testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    model_name = 'lenet_mnist'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        partition='pbatch', time_limit=100, dir_name=dir_name,
        data_filedir_ray='/p/gscratchr/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='models/' + model_name,
        model_name=model_name, num_epochs=5, optimizer_name='adagrad',
        output_file_name=output_file_name, error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name,
                                         output_file_name, error_file_name)
    assert output_value == 0
def test_unit_two_models_bad2(cluster, exes, dirname):
    """Two models with a missing opening brace should make lbann exit nonzero."""
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n')
    model_path = 'prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}'
    command = tools.get_command(cluster=cluster, executable=exe,
                                exit_after_setup=True, model_path=model_path)
    assert os.system(command) != 0
def test_unit_one_model_bad(cluster, exes, dirname):
    """A model without optimizer or reader should make lbann exit nonzero."""
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n')
    command = tools.get_command(
        cluster=cluster, executable=exe, exit_after_setup=True,
        model_path='prototext/model_mnist_simple_1.prototext')
    assert os.system(command) != 0
def skeleton_gradient_check_resnet(cluster, executables, dir_name, compiler_name):
    """Run the mnist_resnet gradient-check model and assert success."""
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        num_processes=1, dir_name=dir_name,
        data_filedir_ray='/p/gscratchr/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='mnist_resnet', optimizer_name='adam')
    assert os.system(command) == 0
def test_unit_missing_reader(cluster, exes, dirname):
    """Two models plus an optimizer but no reader should make lbann exit nonzero.

    Fixed: the log message said "reader, but no reader"; this test sets
    model_path and optimizer_path and omits the reader, so the message now
    says "optimizer, but no reader".
    """
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with two models, optimizer, but no reader; lbann should throw exception\n')
    model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}'
    optimizer_path = 'prototext/opt_sgd.prototext'
    command = tools.get_command(
        cluster=cluster, executable=exe, exit_after_setup=True,
        model_path=model_path, optimizer_path=optimizer_path)
    return_code = os.system(command)
    assert return_code != 0
def get_command(cluster, dir_name, model_folder, model_name, executable,
                output_file_name, error_file_name, compiler_name, weekly=False):
    """Build the LBANN launch command for the known integration-test models.

    Raises an Exception for any unrecognized model_name.

    NOTE(review): a second definition of this name appears later in the file
    with a different compiler list; only the later one is effective.
    """
    if model_name in ['alexnet', 'conv_autoencoder_imagenet']:
        # Weekly testing gets a larger slice of the data.
        data_reader_percent = 0.10 if weekly else 0.01
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=16,
            partition='pbatch', time_limit=600, num_processes=32,
            dir_name=dir_name,
            data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/',
            data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt',
            data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/',
            data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt',
            data_reader_name='imagenet',
            data_reader_percent=data_reader_percent,
            model_folder=model_folder, model_name=model_name, num_epochs=20,
            optimizer_name='adagrad', output_file_name=output_file_name,
            error_file_name=error_file_name)
    elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']:
        if (model_name == 'lenet_mnist') and \
                (compiler_name in ['clang6', 'intel19']):
            partition = 'pbatch'
            time_limit = 600
        else:
            partition = 'pdebug'
            time_limit = 30
        if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'):
            num_processes = 20
        else:
            num_processes = 2
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=1,
            partition=partition, time_limit=time_limit,
            num_processes=num_processes, dir_name=dir_name,
            data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
            data_reader_name='mnist', model_folder=model_folder,
            model_name=model_name, num_epochs=5, optimizer_name='adagrad',
            output_file_name=output_file_name,
            error_file_name=error_file_name)
    else:
        raise Exception('Invalid model: %s' % model_name)
    return command
def test_unit_bad_params(cluster, exes):
    """An ill-formed flag (missing a leading dash) should make lbann exit nonzero.

    NOTE(review): a second definition of this name appears later in the file;
    only the later one is collected by pytest.
    """
    exe = exes['gcc4']
    sys.stderr.write(
        'TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n'
    )
    (command_allocate, command_run, _, _) = tools.get_command(
        cluster=cluster, executable=exe, return_tuple=True)
    full_command = '%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (
        command_allocate, command_run, exe)
    assert os.system(full_command) != 0
def test_unit_missing_optimizer(cluster, exes, dirname):
    """Two models plus a reader but no optimizer should make lbann exit nonzero."""
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n')
    command = tools.get_command(
        cluster=cluster, executable=exe,
        data_reader_path='prototext/data_reader_mnist.prototext',
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        exit_after_setup=True,
        model_path='{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}')
    assert os.system(command) != 0
def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name):
    """Run the log_sigmoid layer test model and assert it exits cleanly."""
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/layer_log_sigmoid_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/layer_log_sigmoid_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        num_processes=2, dir_name=dir_name, data_filedir_default='',
        data_reader_name='synthetic', model_folder='tests/layer_tests',
        model_name='log_sigmoid', optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    assert os.system(command) == 0
def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_name):
    """Run the mnist_ridge_regression model and assert it exits cleanly."""
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/mnist_ridge_regression_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/mnist_ridge_regression_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        num_processes=1, dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='mnist_ridge_regression', optimizer_name='adam',
        output_file_name=output_file_name, error_file_name=error_file_name)
    assert os.system(command) == 0
def skeleton_layer_identity(cluster, executables, dir_name, compiler_name):
    """Run the identity layer test model and assert it exits cleanly."""
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/layer_identity_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        num_processes=2, dir_name=dir_name, data_filedir_default='',
        data_reader_name='synthetic', model_folder='tests/layer_tests',
        model_name='identity', optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    assert os.system(command) == 0
def get_command(cluster, dir_name, model_folder, model_name, executable,
                output_file_name, error_file_name, compiler_name, weekly=False):
    """Build the LBANN launch command for the known integration-test models.

    Raises an Exception for any unrecognized model_name. This is the second
    definition of get_command in the file (clang4/intel18 variant).
    """
    if model_name in ['alexnet', 'conv_autoencoder_imagenet']:
        # Weekly testing gets a larger slice of the data.
        data_reader_percent = 0.10 if weekly else 0.01
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=16,
            partition='pbatch', time_limit=600, num_processes=32,
            dir_name=dir_name,
            data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/',
            data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt',
            data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/',
            data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt',
            data_reader_name='imagenet',
            data_reader_percent=data_reader_percent,
            model_folder=model_folder, model_name=model_name, num_epochs=20,
            optimizer_name='adagrad', output_file_name=output_file_name,
            error_file_name=error_file_name)
    elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']:
        if (model_name == 'lenet_mnist') and \
                (compiler_name in ['clang4', 'intel18']):
            partition = 'pbatch'
            time_limit = 600
        else:
            partition = 'pdebug'
            time_limit = 30
        if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'):
            num_processes = 20
        else:
            num_processes = 2
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=1,
            partition=partition, time_limit=time_limit,
            num_processes=num_processes, dir_name=dir_name,
            data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
            data_reader_name='mnist', model_folder=model_folder,
            model_name=model_name, num_epochs=5, optimizer_name='adagrad',
            output_file_name=output_file_name,
            error_file_name=error_file_name)
    else:
        raise Exception('Invalid model: %s' % model_name)
    return command
def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly,
                         debug, should_log=False):
    """Run LeNet on MNIST during weekly/debug testing and assert success.

    Second definition of this name in the file (data_filedir_default variant);
    this one shadows the earlier definition.
    """
    # Only run during weekly or debug testing.
    if (not weekly) and (not debug):
        pytest.skip('Not doing weekly or debug testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    model_name = 'lenet_mnist'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        partition='pbatch', time_limit=100, dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='models/' + model_name,
        model_name=model_name, num_epochs=5, optimizer_name='adagrad',
        output_file_name=output_file_name, error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name,
                                         output_file_name, error_file_name)
    assert output_value == 0
def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name):
    """Run lbann2 with and without checkpointing and verify reloaded weights match."""
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    lbann2 = executables[compiler_name] + '2'
    # Run 1: two models back-to-back, no checkpoint restart.
    model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}'
    output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/lbann2_no_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
        data_reader_name='mnist',
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        dir_name=dir_name, model_path=model_path, optimizer_name='sgd',
        num_epochs=2, output_file_name=output_file_name,
        error_file_name=error_file_name)
    os.mkdir('lbann2_ckpt')
    return_code = os.system(command)
    if return_code != 0:
        sys.stderr.write('LBANN2 LeNet execution failed, exiting with error')
        sys.exit(1)
    os.system('mv lbann2_ckpt lbann2_nockpt')
    # Run 2: train with checkpointing enabled.
    output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/lbann2_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_ckpt_1 = os.system(command)
    if return_code_ckpt_1 != 0:
        sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error')
        sys.exit(1)
    # Run 3: restart from the checkpoint and reload the weights.
    output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/lbann2_restart_%s_error.txt' % (dir_name, compiler_name)
    os.mkdir('lbann2_ckpt')
    command = tools.get_command(
        cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext',
        num_epochs=2, optimizer_name='sgd', ckpt_dir='ckpt/',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_ckpt_2 = os.system(command)
    if return_code_ckpt_2 != 0:
        sys.stderr.write('LBANN2 LeNet weight reload failed, exiting with error')
        sys.exit(1)
    # Epoch files differ by construction; drop them before comparing.
    os.system('rm lbann2_ckpt/model0-epoch*')
    os.system('rm lbann2_nockpt/model0-epoch*')
    diff_test = os.system('diff -rq lbann2_ckpt/ lbann2_nockpt/')
    os.system('rm -rf ckpt')
    os.system('rm -rf lbann2_*')
    assert diff_test == 0
def test_unit_bad_params(cluster, exes, dirname):
    """An ill-formed flag (missing a leading dash) should make lbann exit nonzero.

    Second definition of this name in the file; this one shadows the earlier
    definition under pytest collection.
    """
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n')
    (command_allocate, command_run, _, _) = tools.get_command(
        cluster=cluster, executable=exe, return_tuple=True)
    full_command = '%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (
        command_allocate, command_run, exe)
    assert os.system(full_command) != 0
def test_command_ray():
    """get_command for Ray must produce the exact bsub/mpirun command string."""
    expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file'
    actual = tools.get_command(
        cluster='ray', executable='exe', num_nodes=20, partition='pdebug',
        time_limit=30, num_processes=40, dir_name='dir',
        data_filedir_default='lscratchh/filedir', data_reader_name='mnist',
        data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15,
        model_folder='models/folder', model_name='lenet', num_epochs=7,
        optimizer_name='adagrad', processes_per_model=10,
        output_file_name='output_file', error_file_name='error_file',
        check_executable_existance=False)
    assert actual == expected
def skeleton_io_buffers(cluster, dir_name, executables, compiler_name, weekly):
    """Compare partitioned vs distributed MNIST I/O across batch sizes and
    process counts; accuracies must agree within 5% and nothing may crash.

    Fixed: `max_mb / mini_batch_size` used true division, which under
    Python 3 makes num_models (and hence num_ranks) a float; now uses
    floor division `//` to keep the rank count integral.
    """
    if not weekly:
        pytest.skip('Not doing weekly testing')
    if cluster == 'surface':
        pytest.skip('skeleton_io_buffers does not run on surface')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    max_mb = 300
    # Printing output from 6*6*2=72 runs of LBANN makes the logs too slow.
    # Output from run_lbann is still printed - if there is a failure.
    should_log = False
    partitioned = 'mnist_partitioned_io'
    distributed = 'mnist_distributed_io'
    model_names = [partitioned, distributed]
    accuracies = {}
    errors = []
    all_values = []
    fatal_errors = []
    overall_min_partitioned_accuracy = float('inf')
    overall_min_distributed_accuracy = float('inf')
    for mini_batch_size in [300, 150, 100, 75, 60, 50]:
        # Integer division: every divisor divides 300 exactly.
        num_models = max_mb // mini_batch_size
        for procs_per_model in [1, 2, 3, 4, 5, 6]:
            num_ranks = procs_per_model * num_models
            for model_name in model_names:
                output_file_name = '%s/bamboo/integration_tests/output/%s_%d_%d_output.txt' % (dir_name, model_name, mini_batch_size, procs_per_model)
                error_file_name = '%s/bamboo/integration_tests/error/%s_%d_%d_error.txt' % (dir_name, model_name, mini_batch_size, procs_per_model)
                command = tools.get_command(
                    cluster=cluster, executable=executables[compiler_name],
                    num_nodes=2, num_processes=num_ranks, dir_name=dir_name,
                    data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
                    data_reader_name='mnist', mini_batch_size=mini_batch_size,
                    model_folder='tests', model_name=model_name, num_epochs=5,
                    optimizer_name='adagrad',
                    processes_per_model=procs_per_model,
                    output_file_name=output_file_name,
                    error_file_name=error_file_name)
                try:
                    common_code.run_lbann(command, model_name,
                                          output_file_name, error_file_name,
                                          should_log)  # Don't need return value
                    accuracy_dict = common_code.extract_data(
                        output_file_name, ['test_accuracy'], should_log)
                    accuracies[model_name] = accuracy_dict['test_accuracy']
                except Exception:
                    # We want to keep running to see if any other
                    # mini_batch_size & procs_per_model combination crashes.
                    # However, it is now pointless to compare accuracies.
                    fatal_errors.append(
                        'Crashed running %s with mini_batch_size=%d, procs_per_model=%d' % (
                            model_name, mini_batch_size, procs_per_model))
            # End model name loop
            if fatal_errors == []:
                partitioned_num_models = len(accuracies[partitioned].keys())
                distributed_num_models = len(accuracies[distributed].keys())
                assert partitioned_num_models == distributed_num_models
                min_partitioned_accuracy = float('inf')
                min_distributed_accuracy = float('inf')
                for model_num in sorted(accuracies[partitioned].keys()):
                    partitioned_accuracy = accuracies[partitioned][model_num]['overall']
                    distributed_accuracy = accuracies[distributed][model_num]['overall']
                    if partitioned_accuracy < min_partitioned_accuracy:
                        min_partitioned_accuracy = partitioned_accuracy
                    if distributed_accuracy < min_distributed_accuracy:
                        min_distributed_accuracy = distributed_accuracy
                    tolerance = 0.05
                    # Are we within tolerance * expected_value?
                    if abs(partitioned_accuracy - distributed_accuracy) > abs(tolerance * min(partitioned_accuracy, distributed_accuracy)):
                        errors.append('partitioned = %f != %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model))
                    all_values.append('partitioned = %f, %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model))
                # End model_num loop
                if min_partitioned_accuracy < overall_min_partitioned_accuracy:
                    overall_min_partitioned_accuracy = min_partitioned_accuracy
                if min_distributed_accuracy < overall_min_distributed_accuracy:
                    overall_min_distributed_accuracy = min_distributed_accuracy
            # End fatal_errors == [] block
        # End procs_per_model loop
    # End mini_batch_size loop
    for fatal_error in fatal_errors:
        print(fatal_error)
    assert fatal_errors == []
    # If there were no fatal errors, archive the accuracies.
    if os.environ['LOGNAME'] == 'lbannusr':
        key = 'bamboo_planKey'
        if key in os.environ:
            plan = os.environ[key]
            if plan in ['LBANN-NIGHTD', 'LBANN-WD']:
                archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/io_buffers.txt' % (plan, cluster, compiler_name)
                with open(archive_file, 'a') as archive:
                    archive.write('%s, %f, %f\n' % (
                        os.environ['bamboo_buildNumber'],
                        overall_min_partitioned_accuracy,
                        overall_min_distributed_accuracy))
            else:
                print('The plan %s does not have archiving activated' % plan)
        else:
            print('%s is not in os.environ' % key)
    else:
        print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME'])
    print('Errors for: partitioned_and_distributed %s (%d)' % (compiler_name, len(errors)))
    for error in errors:
        print(error)
    if should_log:
        print('All values for: partitioned_and_distributed %s (%d)' % (compiler_name, len(all_values)))
        for value in all_values:
            print(value)
    assert errors == []
def test_command_catalyst():
    """Verify the exact command string built for the catalyst cluster.

    Exercises tools.get_command with a full set of salloc/srun and LBANN
    options and compares against the known-good command line.
    """
    actual = tools.get_command(
        cluster='catalyst', executable='exe', num_nodes=20,
        partition='pdebug', time_limit=30, num_processes=40,
        dir_name='dir', data_filedir_default='lscratchh/filedir',
        data_reader_name='mnist', data_reader_percent=0.10,
        exit_after_setup=True, mini_batch_size=15,
        model_folder='models/folder', model_name='lenet', num_epochs=7,
        optimizer_name='adagrad', processes_per_model=10,
        output_file_name='output_file', error_file_name='error_file',
        check_executable_existance=False)
    # Expected command assembled from its space-separated pieces; the
    # joined result is byte-identical to the full command line.
    expected = ' '.join([
        'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40',
        'exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext',
        '--data_reader_percent=0.100000 --exit_after_setup',
        '--mini_batch_size=15',
        '--model=dir/model_zoo/models/folder/model_lenet.prototext',
        '--num_epochs=7',
        '--optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext',
        '--procs_per_model=10 > output_file 2> error_file'])
    assert actual == expected
def skeleton_models(cluster, dir_name, executables, compiler_name):
    """Run setup for every model prototext under model_zoo/models.

    Walks dir_name/model_zoo/models/, configures the appropriate dataset
    and data reader for each recognized model, runs the LBANN executable
    with --exit_after_setup, and asserts that no model exited with an
    error.

    Parameters:
        cluster       -- cluster name ('ray', 'catalyst', 'pascal', ...).
        dir_name      -- root of the LBANN source tree.
        executables   -- map of compiler name -> executable path.
        compiler_name -- key into `executables` selecting the build to run.

    Raises:
        AssertionError -- if any model's setup run returned nonzero.
    """
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    opt = 'sgd'
    node_count = 1
    time_limit = 1
    defective_models = []
    working_models = []
    for subdir, dirs, files in os.walk(dir_name + '/model_zoo/models/'):
        for file_name in files:
            if file_name.endswith('.prototext') and "model" in file_name:
                model_path = subdir + '/' + file_name
                print('Attempting model setup for: ' + file_name)
                # Per-model data configuration. Exactly one of
                # data_reader_name / data_reader_path is set by the
                # branches below; the rest stay None.
                data_filedir_default = None
                data_filedir_train_default = None
                data_filename_train_default = None
                data_filedir_test_default = None
                data_filename_test_default = None
                data_reader_path = None
                if 'motif' in file_name:
                    print('Skipping %s because motifs are deprecated' % model_path)
                    continue
                elif 'mnist' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
                    data_reader_name = 'mnist'
                elif 'adversarial' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
                    data_reader_path = '%s/model_zoo/models/gan/mnist/adversarial_data.prototext' % (dir_name)
                    data_reader_name = None
                elif 'discriminator' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
                    data_reader_path = '%s/model_zoo/models/gan/mnist/discriminator_data.prototext' % (dir_name)
                    data_reader_name = None
                elif 'triplet' in file_name:
                    data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/'
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/train/train_list_8h.nfl.npz'
                    data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/val/val_list_8h.nfl.npz'
                    data_reader_path = '%s/model_zoo/models/siamese/triplet/data_reader_triplet.prototext' % (dir_name)
                    data_reader_name = None
                elif 'siamese_alexnet' in file_name:
                    data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/'
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt'
                    data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt'
                    data_reader_path = '%s/model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext' % (dir_name)
                    data_reader_name = None
                elif 'net' in file_name:
                    data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/'
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt'
                    data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt'
                    data_reader_name = 'imagenet'
                    # NOTE(review): node_count/time_limit are never reset
                    # per-iteration, so these values carry over to later
                    # models as well -- confirm whether that is intended.
                    node_count = 2
                    if cluster == 'ray':
                        time_limit = 3
                    if 'resnet50' in file_name:
                        node_count = 8
                elif 'cifar' in file_name:
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin'
                    data_reader_name = 'cifar10'
                elif 'char' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/tinyshakespeare/'
                    data_reader_name = 'ascii'
                else:
                    print("Shared lbannusr account doesn't have access to dataset this model requires")
                    continue
                if (cluster == 'ray') and (data_reader_name in ['cifar10', 'ascii']):
                    print('Skipping %s because data is not available on ray' % model_path)
                # BUG FIX: the original condition was
                #   (cluster == 'ray') or (cluster == 'pascal') and (...) or ('gan' in subdir)
                # which, due to `and` binding tighter than `or`, skipped
                # EVERY model on ray. Intended grouping: GPU clusters
                # (ray/pascal) skip only unpooling/noise models.
                elif (cluster in ('ray', 'pascal')) and \
                        (('conv_autoencoder' in file_name) or ('gan' in subdir)):
                    print('Skipping %s because unpooling/noise is not implemented on gpu' % model_path)
                else:
                    output_file_name = '%s/bamboo/unit_tests/output/check_proto_models_%s_%s_output.txt' % (dir_name, file_name, compiler_name)
                    error_file_name = '%s/bamboo/unit_tests/error/check_proto_models_%s_%s_error.txt' % (dir_name, file_name, compiler_name)
                    cmd = tools.get_command(
                        cluster=cluster,
                        executable=executables[compiler_name],
                        num_nodes=node_count,
                        partition='pbatch',
                        time_limit=time_limit,
                        dir_name=dir_name,
                        data_filedir_default=data_filedir_default,
                        data_filedir_train_default=data_filedir_train_default,
                        data_filename_train_default=data_filename_train_default,
                        data_filedir_test_default=data_filedir_test_default,
                        data_filename_test_default=data_filename_test_default,
                        data_reader_name=data_reader_name,
                        data_reader_path=data_reader_path,
                        exit_after_setup=True,
                        model_path=model_path,
                        optimizer_name=opt,
                        output_file_name=output_file_name,
                        error_file_name=error_file_name)
                    if os.system(cmd) != 0:
                        print("Error detected in " + model_path)
                        defective_models.append(cmd)
                    else:
                        working_models.append(cmd)
    num_defective = len(defective_models)
    if num_defective != 0:
        # BUG FIX: the original passed the format string and its values as
        # separate print arguments, so the %d placeholders were never
        # substituted; apply the % operator instead.
        print('Working models: %d. Defective models: %d'
              % (len(working_models), num_defective))
        print('Errors for: The following models exited with errors %s' % compiler_name)
        for model in defective_models:
            print(model)
    assert num_defective == 0