Example #1
def test_bad_data_filedir_11():
    try:
        tools.get_command('ray', 'exe', data_filedir_default='filedir', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: data_filedir_default set but neither data_reader_name or data_reader_path are.'
        assert actual == expected
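Note: these snippets are excerpted test functions and omit their module headers. A minimal header, assuming LBANN's bamboo test helper modules `tools` and `common_code` are importable from the test directory, would be:

# Assumed module header for the excerpts below; the scraped snippets omit
# their imports. `tools` and `common_code` are LBANN's test helper modules.
import os
import sys

import pytest

import common_code
import tools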
Example #2
def test_bad_optimizer():
    try:
        tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', optimizer_path='path', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: optimizer_path is set but so is optimizer_name'
        assert actual == expected
Example #3
def test_bad_data_filedir_15():
    try:
        tools.get_command('ray', 'exe', data_filename_test_default='e', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.'
        assert actual == expected
Example #4
def test_bad_dir_name_5():
    try:
        tools.get_command('ray', 'exe', optimizer_name='name', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.'
        assert actual == expected
Example #5
File: test_tools.py | Project: LLNL/lbann
def test_unsupported_cluster():
    try:
        tools.get_command('quartz', 'exe', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Unsupported Cluster: quartz'
        assert actual == expected
Example #6
def test_blacklisted_substrings():
    try:
        tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid character(s): ; contains ; , --model=new_model contains --'
        assert actual == expected
Example #7
def test_bad_model_3():
    try:
        tools.get_command('ray', 'exe', dir_name='dir', model_name='name', model_path='path', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name'
        assert actual == expected
Example #8
def test_bad_model_5():
    try:
        tools.get_command('ray', 'exe', dir_name='dir', model_name='name', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: model_name set but not model_folder.'
        assert actual == expected
Example #9
def test_bad_data_reader():
    try:
        tools.get_command('catalyst', 'exe', dir_name='dir', data_reader_name='name', data_reader_path='path', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.'
        assert actual == expected
Example #10
def test_bad_data_filedir_8():
    try:
        tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_test_default='h', check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]'
        assert actual == expected
Example #11
def test_bad_dir_name_4():
    try:
        tools.get_command('catalyst', 'exe', data_reader_name='name',
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is. , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.'
        assert actual == expected
Example #12
def test_bad_dir_name_1():
    try:
        tools.get_command('ray', 'exe', dir_name='dir',
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.'
        assert actual == expected
Example #13
def test_bad_data_filedir_10():
    try:
        tools.get_command('ray',
                          'exe',
                          data_reader_path='path',
                          check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_ray. If a data reader is provided, an alternative filedir must be available for Ray. Alternatively, all of [data_filedir_train_ray, data_filename_train_ray, data_filedir_test_ray, data_filename_test_ray] can be set.'
        assert actual == expected
Example #14
def test_blacklisted_substrings_3():
    try:
        tools.get_command('ray', 'exe', partition='pdebug',
                          extra_lbann_flags={'key': '--bad_value'},
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid character(s): --bad_value contains --'
        assert actual == expected
Example #15
def test_bad_data_reader():
    try:
        tools.get_command('catalyst',
                          'exe',
                          dir_name='dir',
                          data_reader_name='name',
                          data_reader_path='path',
                          check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name'
        assert actual == expected
Example #16
def test_bad_data_filedir_4():
    try:
        tools.get_command('ray',
                          'exe',
                          dir_name='dir',
                          data_reader_name='name',
                          data_filedir_ray='filedir',
                          data_filename_test_ray='d',
                          check_executable_existance=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = 'Invalid Usage: data_fildir_ray set but so is at least one of [data_filedir_train_ray, data_filename_train_ray, data_filedir_test_ray, data_filename_test_ray]'
        assert actual == expected
Example #17
def test_bad_extra_lbann_flags_not_a_dict():
    try:
        tools.get_command('ray', 'exe', partition='pdebug',
                          extra_lbann_flags='invalid_flag',
                          check_executable_existence=False)
        assert False
    except Exception as e:
        actual = str(e)
        expected = (
            'Invalid Usage: extra_lbann_flags must be a dict e.g. `{flag :'
            ' None, flag: 4}`. Use `None` if a flag has no value attached '
            'to it.')
        assert actual == expected
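All of these error-message tests share the same try/except/assert pattern. The same check can be written more compactly with pytest.raises; a sketch (not from the repo), reusing the expected message from test_bad_optimizer above:

# Hypothetical pytest.raises variant of the try/except/assert pattern.
# `match` is treated as a regular expression, hence the re.escape call.
import re

import pytest

def test_bad_optimizer_with_raises():
    expected = 'Invalid Usage: optimizer_path is set but so is optimizer_name'
    with pytest.raises(Exception, match=re.escape(expected)):
        tools.get_command('ray', 'exe', dir_name='dir',
                          optimizer_name='name', optimizer_path='path',
                          check_executable_existance=False)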
Example #18
def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_name):
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    exe = executables[compiler_name]
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_nockpt = os.system(command)
    if return_code_nockpt != 0:
        sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error')
        sys.exit(1)
    os.system('mv ckpt ckpt_baseline')

    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_ckpt_1 = os.system(command)
    if return_code_ckpt_1 != 0:
        sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error')
        sys.exit(1)

    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_restart_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_restart_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code_ckpt_2 = os.system(command)
    if return_code_ckpt_2 != 0:
        sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error')
        sys.exit(1)

    diff_test = os.system('diff -rq ckpt ckpt_baseline')
    os.system('rm -rf ckpt*')
    assert diff_test == 0
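The skeleton helpers above treat os.system(command) == 0 as success. On POSIX, os.system actually returns the shell's encoded wait status rather than the raw exit code, so the == 0 comparison still detects success correctly. A hypothetical alternative (not from the repo) that surfaces the exit code directly:

import subprocess

# Hypothetical helper: run a shell command string (such as one produced by
# tools.get_command) and return the child process's exit code directly.
def run_shell(command):
    result = subprocess.run(command, shell=True)
    return result.returncode  # 0 on success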
Example #19
def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name):
    if compiler_name not in executables:
        e = 'skeleton_mnist_conv_graph: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    output_file_name = '%s/bamboo/unit_tests/output/mnist_conv_graph_%s_output.txt' % (
        dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/mnist_conv_graph_%s_error.txt' % (
        dir_name, compiler_name)
    if compiler_name == 'gcc7':
        tl = 240
    else:
        tl = None
    command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=1,
        time_limit=tl,
        num_processes=1,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        model_folder='tests',
        model_name='mnist_conv_graph',
        optimizer_name='adam',
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0
Example #20
def test_unit_no_params_bad(cluster, exes, dirname):
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n')
    command = tools.get_command(
        cluster=cluster, executable=exe, exit_after_setup=True)
    return_code = os.system(command)
    assert return_code != 0
Example #21
def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly,
                         debug_build, should_log=False):
    # If weekly or debug_build are true, then run the test.
    if not (weekly or debug_build):
        e = 'skeleton_cifar_debug: Not doing weekly or debug_build testing'
        print('Skip - ' + e)
        pytest.skip(e)
    if cluster == 'ray':
        e = 'skeleton_cifar_debug: cifar not operational on Ray'
        print('Skip - ' + e)
        pytest.skip(e)
    if compiler_name not in executables:
        e = 'skeleton_cifar_debug: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    model_name = 'autoencoder_cifar10'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        partition='pbatch', time_limit=100, dir_name=dir_name,
        data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin',
        data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin',
        data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name,
        model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad',
        output_file_name=output_file_name, error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log)
    assert output_value == 0
Example #22
def skeleton_jag_reconstruction_loss(cluster, dir_name, weekly,
                                     data_reader_percent):
    output_file_name = '%s/ci_test/unit_tests/output/jag_reconstruction_loss_output.txt' % (
        dir_name)
    error_file_name = '%s/ci_test/unit_tests/error/jag_reconstruction_loss_error.txt' % (
        dir_name)
    command = tools.get_command(
        cluster=cluster,
        num_nodes=2,
        num_processes=32,
        disable_cuda=1,
        dir_name=dir_name,
        sample_list_train_default=
        '/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K4trainers/100Kindex.txt',
        sample_list_test_default=
        '/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K16trainers/t1_sample_list.txt',
        data_reader_name='jag',
        data_reader_percent=data_reader_percent,
        metadata='applications/physics/data/jag_100M_metadata.prototext',
        model_folder='tests',
        model_name='jag_single_layer_ae',
        optimizer_name='adam',
        output_file_name=output_file_name,
        error_file_name=error_file_name,
        weekly=weekly)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)
Example #23
def skeleton_mnist_debug(cluster,
                         dir_name,
                         executables,
                         compiler_name,
                         weekly,
                         debug,
                         should_log=False):
    # If weekly or debug are true, then run the test.
    if (not weekly) and (not debug):
        pytest.skip('Not doing weekly or debug testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    model_name = 'lenet_mnist'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=1,
        partition='pbatch',
        time_limit=100,
        dir_name=dir_name,
        data_filedir_ray='/p/gscratchr/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        model_folder='models/' + model_name,
        model_name=model_name,
        num_epochs=5,
        optimizer_name='adagrad',
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name,
                                         error_file_name)
    assert output_value == 0
Example #24
def test_unit_two_models_bad2(cluster, exes, dirname):
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n')
    model_path = 'prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}'
    command = tools.get_command(
        cluster=cluster, executable=exe, exit_after_setup=True,
        model_path=model_path)
    return_code = os.system(command)
    assert return_code != 0
Example #25
def test_unit_one_model_bad(cluster, exes, dirname):
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n')
    model_path = 'prototext/model_mnist_simple_1.prototext'
    command = tools.get_command(
        cluster=cluster, executable=exe, exit_after_setup=True,
        model_path=model_path)
    return_code = os.system(command)
    assert return_code != 0
Example #26
def skeleton_gradient_check_resnet(cluster, executables, dir_name, compiler_name):
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1,
        dir_name=dir_name, data_filedir_ray='/p/gscratchr/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests', model_name='mnist_resnet',
        optimizer_name='adam')
    return_code = os.system(command)
    assert return_code == 0
Example #27
def test_unit_missing_reader(cluster, exes, dirname):
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with two models, optimizer, but no reader; lbann should throw exception\n')
    model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}'
    optimizer_path = 'prototext/opt_sgd.prototext'
    command = tools.get_command(
        cluster=cluster, executable=exe, exit_after_setup=True,
        model_path=model_path, optimizer_path=optimizer_path)
    return_code = os.system(command)
    assert return_code != 0
Example #28
def get_command(cluster, dir_name, model_folder, model_name, executable,
                output_file_name, error_file_name, compiler_name, weekly=False):
    if model_name in ['alexnet', 'conv_autoencoder_imagenet']:
        data_reader_percent = 0.01
        # If doing weekly testing, increase data_reader_percent
        if weekly:
            data_reader_percent = 0.10
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=16,
            partition='pbatch', time_limit=600, num_processes=32,
            dir_name=dir_name,
            data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/',
            data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt',
            data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/',
            data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt',
            data_reader_name='imagenet', data_reader_percent=data_reader_percent,
            model_folder=model_folder, model_name=model_name, num_epochs=20,
            optimizer_name='adagrad', output_file_name=output_file_name,
            error_file_name=error_file_name)
    elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']:
        if (model_name == 'lenet_mnist') and \
                (compiler_name in ['clang6', 'intel19']):
            partition = 'pbatch'
            time_limit = 600
        else:
            partition = 'pdebug'
            time_limit = 30
        if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'):
            num_processes = 20
        else:
            num_processes = 2
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=1,
            partition=partition, time_limit=time_limit,
            num_processes=num_processes, dir_name=dir_name,
            data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
            data_reader_name='mnist', model_folder=model_folder,
            model_name=model_name, num_epochs=5, optimizer_name='adagrad',
            output_file_name=output_file_name, error_file_name=error_file_name)
    else:
        raise Exception('Invalid model: %s' % model_name)
    return command
Example #29
def test_unit_bad_params(cluster, exes):
    exe = exes['gcc4']
    sys.stderr.write(
        'TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n'
    )
    (command_allocate, command_run, _,
     _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True)
    return_code = os.system(
        '%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext'
        % (command_allocate, command_run, exe))
    assert return_code != 0
Example #30
def test_unit_missing_optimizer(cluster, exes, dirname):
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n')
    model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}'
    data_reader_path = 'prototext/data_reader_mnist.prototext'
    command = tools.get_command(
        cluster=cluster, executable=exe, data_reader_path=data_reader_path,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        exit_after_setup=True, model_path=model_path)
    return_code = os.system(command)
    assert return_code != 0
Example #31
def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name):
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/layer_log_sigmoid_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/layer_log_sigmoid_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name,
        data_filedir_default='', data_reader_name='synthetic',
        model_folder='tests/layer_tests', model_name='log_sigmoid', optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0
Example #32
def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_name):
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/mnist_ridge_regression_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/mnist_ridge_regression_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', data_reader_name='mnist',
        model_folder='tests', model_name='mnist_ridge_regression', optimizer_name='adam',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0
Example #33
def skeleton_layer_identity(cluster, executables, dir_name, compiler_name):
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/layer_identity_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name,
        data_filedir_default='', data_reader_name='synthetic',
        model_folder='tests/layer_tests', model_name='identity', optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0
Example #34
File: common_code.py | Project: LLNL/lbann
def get_command(cluster, dir_name, model_folder, model_name, executable,
                output_file_name, error_file_name, compiler_name, weekly=False):
    if model_name in ['alexnet', 'conv_autoencoder_imagenet']:
        data_reader_percent = 0.01
        if weekly:
            data_reader_percent = 0.10
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=16,
            partition='pbatch', time_limit=600, num_processes=32,
            dir_name=dir_name,
            data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/',
            data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt',
            data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/',
            data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt',
            data_reader_name='imagenet', data_reader_percent=data_reader_percent,
            model_folder=model_folder, model_name=model_name, num_epochs=20,
            optimizer_name='adagrad', output_file_name=output_file_name,
            error_file_name=error_file_name)
    elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']:
        if (model_name == 'lenet_mnist') and (compiler_name in ['clang4', 'intel18']):
            partition = 'pbatch'
            time_limit = 600
        else:
            partition = 'pdebug'
            time_limit = 30
        if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'):
            num_processes = 20
        else:
            num_processes = 2
        command = tools.get_command(
            cluster=cluster, executable=executable, num_nodes=1,
            partition=partition, time_limit=time_limit, num_processes=num_processes,
            dir_name=dir_name,
            data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
            data_reader_name='mnist', model_folder=model_folder,
            model_name=model_name, num_epochs=5, optimizer_name='adagrad',
            output_file_name=output_file_name, error_file_name=error_file_name)
    else:
        raise Exception('Invalid model: %s' % model_name)
    return command
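Both versions of this get_command wrapper dispatch on model_name and raise for anything unrecognized. A hypothetical call showing how the integration tests drive it; every path and the compiler name below are placeholders, not values from the repo:

# Hypothetical usage of the get_command wrapper above; paths and the
# compiler name are illustrative only.
command = get_command(
    cluster='catalyst', dir_name='/path/to/lbann',
    model_folder='models/lenet_mnist', model_name='lenet_mnist',
    executable='/path/to/lbann_exe', output_file_name='out.txt',
    error_file_name='err.txt', compiler_name='gcc7', weekly=False)
return_code = os.system(command)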
Example #35
def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, debug, should_log=False):
    # If weekly or debug are true, then run the test.
    if (not weekly) and (not debug):
        pytest.skip('Not doing weekly or debug testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    model_name = 'lenet_mnist'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        partition='pbatch', time_limit=100, dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='models/' + model_name,
        model_name=model_name, num_epochs=5, optimizer_name='adagrad',
        output_file_name=output_file_name, error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name)
    assert output_value == 0
Example #36
def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name):
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)

    lbann2 = executables[compiler_name] + '2'
    model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}'
    output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/lbann2_no_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
        data_reader_name='mnist',
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        dir_name=dir_name,
        model_path=model_path,
        optimizer_name='sgd',
        num_epochs=2,
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    os.mkdir('lbann2_ckpt')
    return_code = os.system(command)
    if return_code != 0:
        sys.stderr.write('LBANN2 LeNet execution failed, exiting with error')
        sys.exit(1)

    os.system('mv lbann2_ckpt lbann2_nockpt')

    output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/lbann2_checkpoint_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='tests',
        model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    return_code_ckpt_1 = os.system(command)
    if return_code_ckpt_1 != 0:
        sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error')
        sys.exit(1)

    output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name)
    error_file_name  = '%s/bamboo/unit_tests/error/lbann2_restart_%s_error.txt' % (dir_name, compiler_name)
    os.mkdir('lbann2_ckpt')
    command = tools.get_command(
        cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext',
        num_epochs=2, optimizer_name='sgd', ckpt_dir='ckpt/',
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    return_code_ckpt_2 = os.system(command)
    if return_code_ckpt_2 != 0:
        sys.stderr.write('LBANN2 LeNet weight reload failed, exiting with error')
        sys.exit(1)
    os.system('rm lbann2_ckpt/model0-epoch*')
    os.system('rm lbann2_nockpt/model0-epoch*')
    diff_test = os.system('diff -rq lbann2_ckpt/ lbann2_nockpt/')
    os.system('rm -rf ckpt')
    os.system('rm -rf lbann2_*')
    assert diff_test == 0
Example #37
def test_unit_bad_params(cluster, exes, dirname):
    exe = exes['gcc4']
    sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n')
    (command_allocate, command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True)
    return_code = os.system('%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe))
    assert return_code != 0
Example #38
File: test_tools.py | Project: LLNL/lbann
def test_command_ray():
    actual = tools.get_command(cluster='ray', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existance=False)
    expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file'
    assert actual == expected
Example #39
def skeleton_io_buffers(cluster, dir_name, executables, compiler_name, weekly):
    if not weekly:
        pytest.skip('Not doing weekly testing')
    if cluster == 'surface':
        pytest.skip('skeleton_io_buffers does not run on surface')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    max_mb = 300
    # Printing output from 6*6*2=72 runs of LBANN makes the logs too slow to load.
    # Output from run_lbann is still printed if there is a failure.
    should_log = False
    partitioned = 'mnist_partitioned_io'
    distributed = 'mnist_distributed_io'
    model_names = [partitioned, distributed]
    accuracies = {}
    errors = []
    all_values = []
    fatal_errors = []
    overall_min_partitioned_accuracy = float('inf')
    overall_min_distributed_accuracy = float('inf')
    for mini_batch_size in [300, 150, 100, 75, 60, 50]:
        num_models = max_mb // mini_batch_size  # integer division; num_ranks must be an int
        for procs_per_model in [1, 2, 3, 4, 5, 6]:
            num_ranks = procs_per_model * num_models
            for model_name in model_names:
                output_file_name = '%s/bamboo/integration_tests/output/%s_%d_%d_output.txt' % (dir_name, model_name, mini_batch_size, procs_per_model)
                error_file_name = '%s/bamboo/integration_tests/error/%s_%d_%d_error.txt' % (dir_name, model_name, mini_batch_size, procs_per_model)
                command = tools.get_command(
                    cluster=cluster, executable=executables[compiler_name], num_nodes=2,
                    num_processes=num_ranks, dir_name=dir_name,
                    data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
                    data_reader_name='mnist', mini_batch_size=mini_batch_size,
                    model_folder='tests', model_name=model_name, num_epochs=5,
                    optimizer_name='adagrad',
                    processes_per_model=procs_per_model,
                    output_file_name=output_file_name, error_file_name=error_file_name)
                try:
                    common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) # Don't need return value
                    accuracy_dict = common_code.extract_data(output_file_name, ['test_accuracy'], should_log)
                    accuracies[model_name] = accuracy_dict['test_accuracy']
                except Exception:
                    # We want to keep running to see if any other mini_batch_size & procs_per_model combination crashes.
                    # However, it is now pointless to compare accuracies.
                    fatal_errors.append('Crashed running %s with mini_batch_size=%d, procs_per_model=%d' % (model_name, mini_batch_size, procs_per_model))
            # End model name loop
            if fatal_errors == []:
                partitioned_num_models = len(accuracies[partitioned].keys())
                distributed_num_models = len(accuracies[distributed].keys())
                assert partitioned_num_models == distributed_num_models

                min_partitioned_accuracy = float('inf')
                min_distributed_accuracy = float('inf')
                for model_num in sorted(accuracies[partitioned].keys()):
                    partitioned_accuracy = accuracies[partitioned][model_num]['overall']
                    distributed_accuracy = accuracies[distributed][model_num]['overall']
                    if partitioned_accuracy < min_partitioned_accuracy:
                        min_partitioned_accuracy = partitioned_accuracy
                    if distributed_accuracy < min_distributed_accuracy:
                        min_distributed_accuracy = distributed_accuracy
                    tolerance = 0.05
                    # Are we within tolerance * expected_value?
                    if abs(partitioned_accuracy - distributed_accuracy) > abs(tolerance * min(partitioned_accuracy, distributed_accuracy)):
                        errors.append('partitioned = %f != %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model))
                        all_values.append('partitioned = %f, %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model))
                # End model_num loop
                if min_partitioned_accuracy < overall_min_partitioned_accuracy:
                    overall_min_partitioned_accuracy = min_partitioned_accuracy
                if min_distributed_accuracy < overall_min_distributed_accuracy:
                    overall_min_distributed_accuracy = min_distributed_accuracy
            # End fatal_errors == [] block
        # End procs_per_model loop
    # End mini_batch_size loop
    for fatal_error in fatal_errors:
        print(fatal_error)
    assert fatal_errors == []
    # If there were no fatal errors, archive the accuracies.
    if os.environ['LOGNAME'] == 'lbannusr':
        key = 'bamboo_planKey'
        if key in os.environ:
            plan = os.environ[key]
            if plan in ['LBANN-NIGHTD', 'LBANN-WD']:
                archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/io_buffers.txt' % (plan, cluster, compiler_name)
                with open(archive_file, 'a') as archive:
                    archive.write('%s, %f, %f\n' % (os.environ['bamboo_buildNumber'], overall_min_partitioned_accuracy, overall_min_distributed_accuracy))
            else:
                print('The plan %s does not have archiving activated' % plan)
        else:
            print('%s is not in os.environ' % key)
    else:
        print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME'])

    print('Errors for: partitioned_and_distributed %s (%d)' % (compiler_name, len(errors)))
    for error in errors:
        print(error)
    if should_log:
        print('All values for: partitioned_and_distributed %s (%d)' % (compiler_name, len(all_values)))
        for value in all_values:
            print(value)
    assert errors == []
Example #40
File: test_tools.py | Project: LLNL/lbann
def test_command_catalyst():
    actual = tools.get_command(cluster='catalyst', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existance=False)
    expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file'
    assert actual == expected
Example #41
def skeleton_models(cluster, dir_name, executables, compiler_name):
    if compiler_name not in executables:
      pytest.skip('default_exes[%s] does not exist' % compiler_name)
    opt = 'sgd'
    node_count = 1
    time_limit = 1
    defective_models = []
    working_models = []
    for subdir, dirs, files in os.walk(dir_name + '/model_zoo/models/'):
        for file_name in files:
            if file_name.endswith('.prototext') and "model" in file_name:
                model_path = subdir + '/' + file_name
                print('Attempting model setup for: ' + file_name)
                data_filedir_default = None
                data_filedir_train_default = None
                data_filename_train_default = None
                data_filedir_test_default = None
                data_filename_test_default = None
                data_reader_path = None
                if 'motif' in file_name:
                    print('Skipping %s because motifs are deprecated' % model_path)
                    continue
                elif 'mnist' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
                    data_reader_name = 'mnist'
                elif 'adversarial' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
                    data_reader_path = '%s/model_zoo/models/gan/mnist/adversarial_data.prototext' % (dir_name)
                    data_reader_name = None
                elif 'discriminator' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
                    data_reader_path = '%s/model_zoo/models/gan/mnist/discriminator_data.prototext' % (dir_name)
                    data_reader_name = None
                elif 'triplet' in file_name:
                    data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/'
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/train/train_list_8h.nfl.npz'
                    data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/val/val_list_8h.nfl.npz'
                    data_reader_path = '%s/model_zoo/models/siamese/triplet/data_reader_triplet.prototext' % (dir_name)
                    data_reader_name = None
                elif 'siamese_alexnet' in file_name:
                    data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/'
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt'
                    data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt'
                    data_reader_path = '%s/model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext' % (dir_name)
                    data_reader_name = None
                elif 'net' in file_name:
                    data_filedir_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/'
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt'
                    data_filedir_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt'
                    data_reader_name = 'imagenet'
                    node_count = 2
                    if cluster == 'ray':
                        time_limit = 3
                    if 'resnet50' in file_name:
                        node_count = 8
                elif 'cifar' in file_name:
                    data_filename_train_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin'
                    data_filename_test_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin'
                    data_reader_name = 'cifar10'
                elif 'char' in file_name:
                    data_filedir_default = '/p/lscratchh/brainusr/datasets/tinyshakespeare/'
                    data_reader_name = 'ascii'
                else:
                    print("Shared lbannusr account doesn't have access to dataset this model requires")
                    continue
                if (cluster == 'ray') and (data_reader_name in ['cifar10', 'ascii']):
                    print('Skipping %s because data is not available on ray' % model_path)
                elif ((cluster == 'ray') or (cluster == 'pascal')) and (('conv_autoencoder' in file_name) or ('gan' in subdir)):
                    print('Skipping %s because unpooling/noise is not implemented on gpu' % model_path)
                else:
                    output_file_name = '%s/bamboo/unit_tests/output/check_proto_models_%s_%s_output.txt' % (dir_name, file_name, compiler_name)
                    error_file_name = '%s/bamboo/unit_tests/error/check_proto_models_%s_%s_error.txt' % (dir_name, file_name, compiler_name)
                    cmd = tools.get_command(
                        cluster=cluster, executable=executables[compiler_name], num_nodes=node_count,
                        partition='pbatch', time_limit=time_limit, dir_name=dir_name,
                        data_filedir_default=data_filedir_default,
                        data_filedir_train_default=data_filedir_train_default,
                        data_filename_train_default=data_filename_train_default,
                        data_filedir_test_default=data_filedir_test_default,
                        data_filename_test_default=data_filename_test_default,
                        data_reader_name=data_reader_name, data_reader_path=data_reader_path,
                        exit_after_setup=True, model_path=model_path, optimizer_name=opt,
                        output_file_name=output_file_name, error_file_name=error_file_name)
                    if os.system(cmd) != 0:
                        print("Error detected in " + model_path)
                        #defective_models.append(file_name)
                        defective_models.append(cmd)
                    else:
                        working_models.append(cmd)
    num_defective = len(defective_models)
    if num_defective != 0:
        print('Working models: %d. Defective models: %d' % (len(working_models), num_defective))
        print('Errors for: The following models exited with errors %s' % compiler_name)
        for model in defective_models:
            print(model)
    assert num_defective == 0