Example #1
        # LTFB is evaluating on the correct models.
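        # Index layout assumed by the checks below: tournament_metrics holds,
        # for each trainer, the local and partner evaluations of each LTFB
        # round interleaved (entries 2*step and 2*step + 1), while the
        # validation metric recorded for the following epoch,
        # validation_metrics[trainer][step + 1], should match the model that
        # won the round.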
        tol = 1e-4
        for step in range(_num_epochs - 1):
            for trainer in range(num_trainers):
                partner = ltfb_partners[trainer][step]
                winner = ltfb_winners[trainer][step]
                local_val = tournament_metrics[trainer][2 * step]
                partner_val = tournament_metrics[trainer][2 * step + 1]
                winner_val = validation_metrics[trainer][step + 1]
                true_local_val = validation_metrics[trainer][step]
                true_partner_val = validation_metrics[partner][step]
                true_winner_val = validation_metrics[winner][step]
                assert true_local_val-tol < local_val < true_local_val+tol, \
                    'Incorrect metric value for LTFB local model'
                assert true_partner_val-tol < partner_val < true_partner_val+tol, \
                    'Incorrect metric value for LTFB partner model'
                assert true_winner_val-tol < winner_val < true_winner_val+tol, \
                    'Incorrect metric value for LTFB winner model'

    # Return test function from factory function
    func.__name__ = test_name
    return func


# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     nodes=2,
                                     lbann_args='--procs_per_trainer=2'):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
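
# Note: assigning the generated test functions into globals() is what makes
# them visible to PyTest, which by default collects module-level functions
# whose names start with "test".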
Example #2
    The Python data reader will import the current Python file to
    access the sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message
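

# Illustrative sketch (not from the original test file; sizes are placeholder
# assumptions): the Python data reader constructed above imports this file and
# looks up the functions named in tools.create_python_data_reader, i.e.
# 'get_sample', 'num_samples', and 'sample_dims'. A minimal set of such sample
# access functions could look like this:
import numpy as np

_sketch_num_samples = 16
_sketch_sample_size = 4


def get_sample(index):
    """Return sample `index` as a flat sequence of values."""
    np.random.seed(index)
    return np.random.normal(size=_sketch_sample_size).tolist()


def num_samples():
    """Total number of samples in the data set."""
    return _sketch_num_samples


def sample_dims():
    """Dimensions of a single sample."""
    return (_sketch_sample_size,)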


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for test in tools.create_tests(setup_experiment, __file__):
    globals()[test.__name__] = test
Example #3
        err = 0
        err_dirs = ''
        fileList = glob.glob('{base}/trainer0/*'.format(base=checkpoint_ckpt))
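        # Note: tools.multidir_diff appears to compare the listed checkpoint
        # entries between the two directories, returning the entries it could
        # not match along with an error count and a description of the
        # mismatches (inferred from how its return values are used below).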
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            checkpoint_ckpt, restart_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str

        err_msg = "\nUnmatched checkpoints:\n"
        for f in fileList:
            err_msg += f + "\n"
        assert len(fileList) == 0, \
            'Extra checkpoint data in baseline directory: ' + err_msg
        assert err == 0, err_dirs

    # Return test function from factory function
    func.__name__ = test_name
    return func


# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(
        setup_experiment,
        __file__,
        test_name_base=test_name_base,
        nodes=num_nodes,
        work_subdir='baseline',
        lbann_args=['--disable_cuda',
                    '--num_epochs=' + str(num_ckpt_epochs)]):
    globals()[_test_func.__name__] = create_test_func(_test_func)
Example #4
    def func(cluster, dirname, weekly):
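        # Summary of the steps below: run the model partway as a baseline,
        # rerun it with weights reloaded from the checkpoint directory, rerun
        # it again with weights reloaded from the save_model callback output,
        # then compare the resulting metrics and checkpoint contents.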

        # Run LBANN experiment baseline
        print(
            '\n################################################################################'
        )
        print('Running model halfway ')
        print(
            '################################################################################\n'
        )
        baseline_test_output = test_func(cluster, dirname)
        baseline_training_metrics = tools.collect_metrics_from_log_func(
            baseline_test_output['stdout_log_file'],
            'training epoch [0-9]+ objective function')
        baseline_validation_metrics = tools.collect_metrics_from_log_func(
            baseline_test_output['stdout_log_file'],
            'validation objective function')
        baseline_test_metrics = tools.collect_metrics_from_log_func(
            baseline_test_output['stdout_log_file'], 'test objective function')

        # Run LBANN model to checkpoint
        print(
            '\n################################################################################'
        )
        print('Running model to checkpointed weights')
        print(
            '################################################################################\n'
        )
        test_func_checkpoint = tools.create_tests(
            setup_experiment,
            __file__,
            test_name_base=test_name_base,
            nodes=num_nodes,
            work_subdir='reload_weights_from_checkpoint',
            lbann_args=[
                '--disable_cuda', '--num_epochs=' + str(num_restart_epochs),
                '--load_model_weights_dir=' +
                os.path.join(baseline_test_output['work_dir'], checkpoint_dir,
                             'trainer0')
            ],
        )

        checkpoint_test_output = test_func_checkpoint[0](cluster, dirname)
        checkpoint_training_metrics = tools.collect_metrics_from_log_func(
            checkpoint_test_output['stdout_log_file'],
            'training epoch [0-9]+ objective function')
        checkpoint_validation_metrics = tools.collect_metrics_from_log_func(
            checkpoint_test_output['stdout_log_file'],
            'validation objective function')
        checkpoint_test_metrics = tools.collect_metrics_from_log_func(
            checkpoint_test_output['stdout_log_file'],
            'test objective function')

        print(
            '\n################################################################################'
        )
        print('Running model from save_model weights')
        print(
            '################################################################################\n'
        )
        test_func_restart = tools.create_tests(
            setup_experiment,
            __file__,
            test_name_base=test_name_base,
            nodes=num_nodes,
            work_subdir='reload_weights_from_save_model_cb',
            lbann_args=[
                '--disable_cuda', '--num_epochs=' + str(num_restart_epochs),
                '--load_model_weights_dir=' +
                os.path.join(baseline_test_output['work_dir'], save_model_dir,
                             'trainer0', 'model0/'),
                '--load_model_weights_dir_is_complete'
            ],
        )

        # Restart LBANN model and run to completion
        restart_test_output = test_func_restart[0](cluster, dirname)
        restart_training_metrics = tools.collect_metrics_from_log_func(
            restart_test_output['stdout_log_file'],
            'training epoch [0-9]+ objective function')
        restart_validation_metrics = tools.collect_metrics_from_log_func(
            restart_test_output['stdout_log_file'],
            'validation objective function')
        restart_test_metrics = tools.collect_metrics_from_log_func(
            restart_test_output['stdout_log_file'], 'test objective function')

        print(
            '\n################################################################################'
        )
        print('Comparing results of models')
        print(
            '################################################################################\n'
        )

        # Check if metrics are same in baseline and test experiments
        # Note: "Print statistics" callback will print up to 6 digits
        # of metric values.

        # Comparing training objective functions
        tools.compare_metrics(checkpoint_training_metrics,
                              restart_training_metrics)
        # Comparing validation objective functions
        tools.compare_metrics(checkpoint_validation_metrics,
                              restart_validation_metrics)
        # Comparing test objective functions
        tools.compare_metrics(checkpoint_test_metrics, restart_test_metrics)

        baseline_ckpt = os.path.join(baseline_test_output['work_dir'],
                                     checkpoint_dir)
        checkpoint_ckpt = os.path.join(checkpoint_test_output['work_dir'],
                                       checkpoint_dir)
        restart_ckpt = os.path.join(restart_test_output['work_dir'],
                                    checkpoint_dir)

        err = 0
        err_dirs = ''
        fileList = glob.glob('{base}/trainer0/*'.format(base=checkpoint_ckpt))
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            checkpoint_ckpt, restart_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str

        err_msg = "\nUnmatched checkpoints:\n"
        for f in fileList:
            err_msg += f + "\n"
        assert len(fileList) == 0, \
            'Extra checkpoint data in checkpoint directory: ' + err_msg
        assert err == 0, err_dirs
Example #5
        # Only tested on Ray; skip the mini-batch check on other clusters.
        # Update this when expected mini-batch values are available for them.

        # Check if mini-batch time is within expected range
        # Note: Skip first epoch since its runtime is usually an outlier
        mini_batch_times = mini_batch_times[1:]
        mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
        assert (0.75 * expected_mini_batch_times[cluster]
                < mini_batch_time
                < 1.25 * expected_mini_batch_times[cluster]), \
                'average mini-batch time is outside expected range'
        # Check for GPU usage and memory leaks
        # Note: Skip first epoch
        gpu_usages = gpu_usages[1:]
        gpu_usage = sum(gpu_usages) / len(gpu_usages)

        assert (0.75 * expected_gpu_usage[cluster]
                < gpu_usage
                < 1.25 * expected_gpu_usage[cluster]), \
                'average gpu usage is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func


# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     lbann_args=['--num_io_threads=1'],
                                     nodes=compute_nodes):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
Example #6
    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for test in tools.create_tests(setup_experiment,
                               _test_name,
                               environment=tools.get_distconv_environment()):
    globals()[test.__name__] = test
Example #7
    The Python data reader will import the current Python file to
    access the sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for test in tools.create_tests(setup_experiment, __file__, procs_per_node=4):
    globals()[test.__name__] = test
Example #8
    The Python data reader will import the current Python file to
    access the sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment, __file__):
    globals()[_test_func.__name__] = _test_func
Example #9
        fileList = glob.glob('{base}/trainer0/*'.format(base=baseline_ckpt))
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            baseline_ckpt, restart_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str
        fileList, tmp_err, tmp_err_str = tools.multidir_diff(
            baseline_ckpt, checkpoint_ckpt, fileList)
        err += tmp_err
        err_dirs += tmp_err_str

        err_msg = "\nUnmatched checkpoints:\n"
        for f in fileList:
            err_msg += f + "\n"
        assert len(fileList) == 0, \
            'Extra checkpoint data in baseline directory: ' + err_msg
        assert err == 0, err_dirs

    # Return test function from factory function
    func.__name__ = test_name
    return func


# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     test_name_base=test_name_base,
                                     nodes=num_nodes,
                                     work_subdir='baseline',
                                     lbann_args=['--disable_cuda=True']):
    globals()[_test_func.__name__] = create_test_func(_test_func)
Example #10
                'train reconstruction error is outside expected range'

        # Check if test reconstruction error is within expected range
        assert (expected_test_pc_range[0]
                < test_pc
                < expected_test_pc_range[1]), \
                'test reconstruction error is outside expected range'

        # Check if mini-batch time is within expected range
        # Note: Skip first epoch since its runtime is usually an outlier
        mini_batch_times = mini_batch_times[1:]
        mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
        assert (0.75 * expected_mini_batch_times[cluster]
                < mini_batch_time
                < 1.25 * expected_mini_batch_times[cluster]), \
                'average mini-batch time is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func


m_lbann_args = f"--use_data_store --preload_data_store --metadata={metadata_prototext}"
# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     lbann_args=[m_lbann_args],
                                     procs_per_node=procs_per_node,
                                     nodes=num_nodes):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
Example #11
    access the sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for _test_func in tools.create_tests(setup_experiment, _test_name):
    globals()[_test_func.__name__] = _test_func
Example #12
    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
### @todo Run on >1 proc when https://github.com/LLNL/lbann/issues/1548 is resolved
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     procs_per_node=1,
                                     nodes=1):
    globals()[_test_func.__name__] = _test_func
Example #13
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'train'
        )
    ])
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'test'
        )
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for test in tools.create_tests(setup_experiment, _test_name, procs_per_node=4):
    globals()[test.__name__] = test
Example #14
        if cluster == 'ray':
            # Check if mini-batch time is within expected range
            # Note: Skip first epoch since its runtime is usually an outlier
            mini_batch_times = mini_batch_times[1:]
            mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
            assert (0.75 * expected_mini_batch_times[cluster]
                    < mini_batch_time
                    < 1.25 * expected_mini_batch_times[cluster]), \
                    'average mini-batch time is outside expected range'
            # Check for GPU usage and memory leaks
            # Note: Skip first epoch
            gpu_usages = gpu_usages[1:]
            gpu_usage = sum(gpu_usages) / len(gpu_usages)

            assert (0.75 * expected_gpu_usage[cluster]
                    < gpu_usage
                    < 1.25 * expected_gpu_usage[cluster]),\
                    'average gpu usage is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func


# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment,
                                     __file__,
                                     nodes=num_nodes):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
Example #15
    access the sample access functions.

    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
# Note: Create test name by removing ".py" from file name
_test_name = os.path.splitext(os.path.basename(current_file))[0]
for test in tools.create_tests(setup_experiment, _test_name):
    globals()[test.__name__] = test
Example #16
                < expected_train_accuracy_range[1]), \
                'train accuracy is outside expected range'

        # Check if testing accuracy is within expected range
        assert (expected_test_accuracy_range[0]
                < test_accuracy
                < expected_test_accuracy_range[1]), \
                'test accuracy is outside expected range'

        # Check if mini-batch time is within expected range
        # Note: Skip first epoch since its runtime is usually an outlier
        mini_batch_times = mini_batch_times[1:]
        mini_batch_time = sum(mini_batch_times) / len(mini_batch_times)
        assert (0.75 * expected_mini_batch_times[cluster]
                < mini_batch_time
                < 1.25 * expected_mini_batch_times[cluster]), \
                'average mini-batch time is outside expected range'

    # Return test function from factory function
    func.__name__ = test_name
    return func


# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(
        setup_experiment,
        __file__,
        nodes=num_nodes,
        lbann_args=['--load_full_sample_list_once']):
    globals()[_test_func.__name__] = augment_test_func(_test_func)
Example #17
    Args:
        lbann (module): Module for LBANN Python frontend

    """

    # Note: The training data reader should be removed when
    # https://github.com/LLNL/lbann/issues/1098 is resolved.
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'train')
    ])
    message.reader.extend([
        tools.create_python_data_reader(lbann, current_file, 'get_sample',
                                        'num_samples', 'sample_dims', 'test')
    ])
    return message


# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(
        setup_experiment,
        __file__,
        environment={"LBANN_KEEP_ERROR_SIGNALS": 1},
):
    globals()[_test_func.__name__] = _test_func
Example #18
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'train'
        )
    ])
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'test'
        )
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment, __file__,
                               environment=tools.get_distconv_environment()):
    globals()[_test_func.__name__] = _test_func
Example #19
    message = lbann.reader_pb2.DataReader()
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'train'
        )
    ])
    message.reader.extend([
        tools.create_python_data_reader(
            lbann,
            current_file,
            'get_sample',
            'num_samples',
            'sample_dims',
            'test'
        )
    ])
    return message

# ==============================================
# Setup PyTest
# ==============================================

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment, __file__, skip_clusters=["corona"]):
    globals()[_test_func.__name__] = _test_func