def get_model(dataset_args: Dict[str, Any],
              network_args: Dict[str, Any]) -> ProjectModel:
    """Returns the model.
    :param dataset_args: the dataset arguments; see DEFAULT_DATASET_ARGS for
    available arguments.
    :param network_args: the network arguments; see DEFAULT_NETWORK_ARGS for
    available arguments.
    :return: the model.
    """
    dataset_args = {**DEFAULT_DATASET_ARGS, **dataset_args}
    network_args = {**DEFAULT_NETWORK_ARGS, **network_args}
    print('Dataset args: {0}'.format(dataset_args))
    print('Network args: {0}'.format(network_args))
    print('Loading dataset from {0}'.format(DEFAULT_DATASET_PATH))
    dataset = ILSVRCDataset(DEFAULT_DATASET_PATH)
    if dataset_args['dataset_fraction'] < 1.0:
        dataset.trim_dataset(dataset_args['dataset_fraction'])
    print('Num training examples: {0}'.format(
        dataset.partition[TRAIN_KEY].shape[0]))
    print('Num validation examples: {0}'.format(
        dataset.partition[VAL_KEY].shape[0]))
    print('Num test examples: {0}'.format(
        dataset.partition[TEST_KEY].shape[0]))
    if network_args['architecture'] == ARCHITECTURE_MLP:
        network = MLP(network_args)
    elif network_args['architecture'] == ARCHITECTURE_LENET:
        network = LeNet(network_args)
    else:
        raise ValueError('Unrecognized architecture: {0}'.format(
            network_args['architecture']))
    return ImageModel(dataset, network)
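

# A minimal usage sketch of get_model (argument values here are
# illustrative, not the project defaults):
#
#     model = get_model({'dataset_fraction': 0.1},
#                       {'architecture': ARCHITECTURE_MLP})
#
# Keys omitted from either dict fall back to DEFAULT_DATASET_ARGS and
# DEFAULT_NETWORK_ARGS via the merges at the top of get_model.
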
def test_shuffle(dataset: ILSVRCDataset) -> None:
    """Tests that the shuffling flag works as expected. Also tests that
    filenames and labels are still properly mapped.
    :param dataset: the dataset.
    """
    dataset.trim_dataset(DATASET_FRACTION)
    x_train_filenames = dataset.partition[TRAIN_KEY]
    y_train = dataset.get_labels(x_train_filenames, True, NUM_CLASSES)
    train_sequence = ImageDatasetSequence(x_train_filenames,
                                          y=y_train,
                                          batch_size=BATCH_SIZE,
                                          image_target_size=IMAGE_TARGET_SIZE,
                                          batch_augment_fn=None,
                                          batch_format_fn=None,
                                          overfit_single_batch=False,
                                          shuffle_on_epoch_end=True)
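    # Record the image -> label mapping over one full epoch so that, after
    # shuffling, we can verify each image still pairs with its label.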
    img_to_label_before = {}
    for batch in train_sequence:
        x_batch, y_batch = batch
        for i in range(x_batch.shape[0]):
            img_data = tuple(x_batch[i].flatten())
            label = tuple(y_batch[i])
            img_to_label_before[img_data] = label
    # Test shuffle: the first batch should change after on_epoch_end().
    first_batch_before = train_sequence[0]
    train_sequence.on_epoch_end()
    first_batch_after = train_sequence[0]
    assert (first_batch_before[0] != first_batch_after[0]).any()
    # Test filename/label mappings.
    for batch in train_sequence:
        x_batch, y_batch = batch
        for i in range(x_batch.shape[0]):
            img_data = tuple(x_batch[i].flatten())
            label = tuple(y_batch[i])
            assert img_to_label_before[img_data] == label
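

# The mapping invariant above holds when on_epoch_end() permutes the
# filenames and labels with one shared index array, e.g. (a sketch,
# assuming numpy arrays; not the actual ImageDatasetSequence code):
#
#     perm = np.random.permutation(len(self.x_filenames))
#     self.x_filenames = self.x_filenames[perm]
#     self.y = self.y[perm]
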
def test_trim_dataset(dataset: ILSVRCDataset) -> None:
    """Tests that the dataset is being trimmed properly. The trimmed
    dataset should be shuffled so that the classes retain the same
    approximate distribution.
    :param dataset: the dataset.
    """
    train_size_before = dataset.partition[TRAIN_KEY].shape[0]
    val_size_before = dataset.partition[VAL_KEY].shape[0]
    test_size_before = dataset.partition[TEST_KEY].shape[0]
    train_subset_before = dataset.partition[TRAIN_KEY][:5]
    val_subset_before = dataset.partition[VAL_KEY][:5]
    test_subset_before = dataset.partition[TEST_KEY][:5]
    dataset.trim_dataset(DATASET_FRACTION, trim_val=True, trim_test=False)
    train_size_after = dataset.partition[TRAIN_KEY].shape[0]
    val_size_after = dataset.partition[VAL_KEY].shape[0]
    test_size_after = dataset.partition[TEST_KEY].shape[0]
    train_subset_after = dataset.partition[TRAIN_KEY][:5]
    val_subset_after = dataset.partition[VAL_KEY][:5]
    test_subset_after = dataset.partition[TEST_KEY][:5]
    # Check that trimming occurred (or didn't).
    assert (train_size_before * (DATASET_FRACTION - DELTA)
            < train_size_after
            < train_size_before * (DATASET_FRACTION + DELTA))
    assert (val_size_before * (DATASET_FRACTION - DELTA)
            < val_size_after
            < val_size_before * (DATASET_FRACTION + DELTA))
    assert test_size_before == test_size_after
    # Check that the datasets were shuffled (or weren't). Comparing the
    # first 5 filenames is enough: it is extremely unlikely that all of
    # them stay in place after a shuffle.
    assert (train_subset_before != train_subset_after).any()
    assert (val_subset_before != val_subset_after).any()
    assert (test_subset_before == test_subset_after).all()
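

# The size and shuffle assertions above are consistent with a trim that
# permutes each split before slicing it, e.g. (a sketch; not the actual
# trim_dataset code):
#
#     perm = np.random.permutation(split.shape[0])
#     split = split[perm][:int(split.shape[0] * fraction)]
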
def test_training_reproducible() -> None:
    """Tests that training results are reproducible."""
    set_random_seed(SEED)
    dataset_args = {'dataset_fraction': 0.001}
    network_args = {'input_shape': (128, 128, 3), 'num_classes': 1000}
    train_args = {'epochs': 10, 'batch_size': 32, 'early_stopping': True}
    dataset = ILSVRCDataset(DEFAULT_DATASET_PATH)
    dataset.trim_dataset(dataset_args['dataset_fraction'])
    network = MLP(network_args)
    model = ProjectModel(dataset, network)
    history = train_model.train_model(model, train_args)
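    # With a fixed seed, the stringified training history should match the
    # stored reference run exactly.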
    assert str(history.history) == SEED_HISTORY
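

# Reproducibility presumes that set_random_seed seeds every RNG in play,
# e.g. (a sketch, assuming a TensorFlow backend; not necessarily the
# project's implementation):
#
#     import random
#     import numpy as np
#     import tensorflow as tf
#
#     def set_random_seed(seed: int) -> None:
#         random.seed(seed)
#         np.random.seed(seed)
#         tf.random.set_seed(seed)
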
def test_images(dataset: ILSVRCDataset) -> None:
    """Tests that the sequence output images meet expected standards.
    :param dataset: the dataset.
    """
    dataset.trim_dataset(DATASET_FRACTION)
    x_train_filenames = dataset.partition[TRAIN_KEY]
    y_train = dataset.get_labels(x_train_filenames, True, NUM_CLASSES)
    train_sequence = ImageDatasetSequence(x_train_filenames,
                                          y=y_train,
                                          batch_size=BATCH_SIZE,
                                          image_target_size=IMAGE_TARGET_SIZE,
                                          batch_augment_fn=None,
                                          batch_format_fn=None,
                                          overfit_single_batch=False,
                                          shuffle_on_epoch_end=True)
    # Test that only the last batch is not of length BATCH_SIZE.
    # Also test that there are the correct number of batches.
    on_last_batch = False
    num_batches_seen = 0
    for batch in train_sequence:
        assert not on_last_batch
        x_batch, y_batch = batch
        # Take the first image/label pair and check that it meets standards.
        # Check that the image is of the right size.
        assert x_batch[0].shape == IMAGE_TARGET_SIZE + (3,)
        # Check that the image is of the right datatype.
        assert x_batch.dtype == np.float32
        # Check that the image is normalized.
        assert (0.0 <= x_batch.flatten()).all()
        assert (x_batch.flatten() <= 1.0).all()
        # Check that the label is categorical and of the right dimension.
        assert y_batch.shape[1] == NUM_CLASSES
        # Check that the label is of the right datatype.
        assert y_batch.dtype == np.float32
        # Check that the label is one-hot: exactly one entry equal to 1 and
        # all others equal to 0.
        for label in y_batch:
            assert (label == 1.0).sum() == 1
            assert np.count_nonzero(label) == 1
        on_last_batch = not (x_batch.shape[0] == BATCH_SIZE
                             and y_batch.shape[0] == BATCH_SIZE)
        num_batches_seen += 1
    assert num_batches_seen == len(train_sequence)
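

# The image checks above match a conventional load path, e.g. (a sketch,
# assuming Pillow; not the actual ImageDatasetSequence loader):
#
#     img = Image.open(filename).convert('RGB').resize(image_target_size)
#     x = np.asarray(img, dtype=np.float32) / 255.0
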
def test_overfit_single_batch(dataset: ILSVRCDataset) -> None:
    """Tests that the same batch of images is always presented to the
    model if overfitting on a single batch.
    :param dataset: the dataset.
    """
    dataset.trim_dataset(DATASET_FRACTION)
    x_train_filenames = dataset.partition[TRAIN_KEY]
    y_train = dataset.get_labels(x_train_filenames, True, NUM_CLASSES)
    # Test that setting the overfit and shuffle flags together raises a
    # ValueError once the sequence is iterated.
    train_sequence = ImageDatasetSequence(x_train_filenames,
                                          y=y_train,
                                          batch_size=BATCH_SIZE,
                                          image_target_size=IMAGE_TARGET_SIZE,
                                          batch_augment_fn=None,
                                          batch_format_fn=None,
                                          overfit_single_batch=True,
                                          shuffle_on_epoch_end=True)
    with pytest.raises(ValueError):
        for _ in train_sequence:
            pass
    # Test that you always get the same batch, even after multiple epochs.
    train_sequence = ImageDatasetSequence(x_train_filenames,
                                          y=y_train,
                                          batch_size=BATCH_SIZE,
                                          image_target_size=IMAGE_TARGET_SIZE,
                                          batch_augment_fn=None,
                                          batch_format_fn=None,
                                          overfit_single_batch=True,
                                          shuffle_on_epoch_end=False)
    num_batches_epoch_1 = 0
    for batch in train_sequence:
        assert (batch[0] == train_sequence[0][0]).all()
        num_batches_epoch_1 += 1
    train_sequence.on_epoch_end()
    num_batches_epoch_2 = 0
    for batch in train_sequence:
        assert (batch[0] == train_sequence[0][0]).all()
        num_batches_epoch_2 += 1
    assert num_batches_epoch_1 == num_batches_epoch_2
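

# The invariant above holds when __getitem__ ignores its index while
# overfitting, e.g. (a sketch; not the actual sequence code):
#
#     def __getitem__(self, idx):
#         if self.overfit_single_batch:
#             idx = 0
#         ...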