def test_resume_training(optimizer, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat"
        },
        "training": {
            "epochs": 2,
            "early_stop": 1000,
            "batch_size": 16,
            "optimizer": {
                "type": optimizer
            }
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    _, _, _, _, output_dir1 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    config["training"]["epochs"] = 4

    experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        model_resume_path=output_dir1,
    )
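    # the resumed run continues in output_dir1, so its training statistics should now cover all 4 epochs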

    _, _, _, _, output_dir2 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    # compare learning curves with and without resuming
    ts1 = load_json(os.path.join(output_dir1, "training_statistics.json"))
    ts2 = load_json(os.path.join(output_dir2, "training_statistics.json"))
    print("ts1", ts1)
    print("ts2", ts2)
    assert ts1["training"]["combined"]["loss"] == ts2["training"]["combined"][
        "loss"]

    # compare predictions with and without resuming
    y_pred1 = np.load(os.path.join(output_dir1, "y_predictions.npy"))
    y_pred2 = np.load(os.path.join(output_dir2, "y_predictions.npy"))
    print("y_pred1", y_pred1)
    print("y_pred2", y_pred2)
    assert np.all(np.isclose(y_pred1, y_pred2))
Example #2
def test_experiment_model_resume(csv_filename):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    _, _, _, _, output_dir = experiment_cli(config, dataset=rel_path)
    logger.info('Experiment Directory: {0}'.format(output_dir))

    experiment_cli(config, dataset=rel_path, model_resume_path=output_dir)

    predict_cli(os.path.join(output_dir, 'model'), dataset=rel_path)
    shutil.rmtree(output_dir, ignore_errors=True)
Example #3
def test_sequence_generator(
    enc_encoder,
    enc_cell_type,
    dec_cell_type,
    dec_attention,
    dec_beam_width,
    dec_num_layers,
    loss_sampler,
    generate_deterministic_sequence,
):
    # Define input and output features
    input_features = [
        {
            "name": "in_seq",
            "type": "sequence",
            "encoder": enc_encoder,
            "cell_type": enc_cell_type,
            "reduce_output": None,
        }
    ]
    output_features = [
        {
            "name": "out_seq",
            "type": "sequence",
            "cell_type": dec_cell_type,
            "num_layers": dec_num_layers,
            "beam_width": dec_beam_width,
            "decoder": "generator",
            "attention": dec_attention,
            "reduce_input": None,
            "loss": {"type": "sampled_softmax_cross_entropy", "negative_samples": 10, "sampler": loss_sampler},
        }
    ]
    model_definition = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},  # 'concat'
        "training": {
            "" "epochs": 2,
            "early_stop": 5,
            "batch_size": 80,
            "optimizer": {"type": "adam"},
            "learning_rate": 0.001,
        },
    }
    args = {
        "config": model_definition,
        "skip_save_processed_input": True,
        "skip_save_progress": True,
        "skip_save_unprocessed_output": True,
        "skip_save_model": True,
        "skip_save_log": True,
        "debug": False,
    }
    # Generate test data
    np.random.seed(42)  # 13
    df = generate_deterministic_sequence

    # run the experiment
    experiment_cli(dataset=df, **args)
Example #4
def test_resume_training(optimizer, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 2,
            'early_stop': 1000,
            'batch_size': 16,
            'optimizer': {
                'type': optimizer
            }
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    _, _, _, _, output_dir1 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    config['training']['epochs'] = 4

    experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        model_resume_path=output_dir1,
    )

    _, _, _, _, output_dir2 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    # compare learning curves with and without resuming
    ts1 = load_json(os.path.join(output_dir1, 'training_statistics.json'))
    ts2 = load_json(os.path.join(output_dir2, 'training_statistics.json'))
    print('ts1', ts1)
    print('ts2', ts2)
    assert ts1['training']['combined']['loss'] == ts2['training']['combined']['loss']

    # compare predictions with and without resuming
    y_pred1 = np.load(os.path.join(output_dir1, 'y_predictions.npy'))
    y_pred2 = np.load(os.path.join(output_dir2, 'y_predictions.npy'))
    print('y_pred1', y_pred1)
    print('y_pred2', y_pred2)
    assert np.all(np.isclose(y_pred1, y_pred2))
Example #5
def test_experiment_model_resume(tmpdir):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder="rnn", reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]
    # Generate test data
    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        TRAINER: {
            "epochs": 2
        },
    }

    _, _, _, _, output_dir = experiment_cli(config,
                                            dataset=rel_path,
                                            output_directory=tmpdir)

    experiment_cli(config, dataset=rel_path, model_resume_path=output_dir)

    predict_cli(os.path.join(output_dir, "model"), dataset=rel_path)
    shutil.rmtree(output_dir, ignore_errors=True)
Example #6
def test_numeric_transformer(transformer_key, tmpdir):
    Transformer = get_from_registry(transformer_key,
                                    numeric_transformation_registry)
    transformer_name = Transformer().__class__.__name__
    if transformer_name == 'Log1pTransformer':
        raw_values = np.random.lognormal(5, 2, size=100)
    else:
        raw_values = np.random.normal(5, 2, size=100)

    backend = LOCAL_BACKEND
    parameters = Transformer.fit_transform_params(raw_values, backend)
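    # transformers that need fitted statistics (e.g. z-score, min-max) return parameters; Log1p/Identity need none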
    if transformer_name in {'Log1pTransformer', 'IdentityTransformer'}:
        # should be empty
        assert not bool(parameters)
    else:
        # should not be empty
        assert bool(parameters)

    # instantiate numeric transformer
    numeric_transformer = Transformer(**parameters)

    # transform values
    transformed_values = numeric_transformer.transform(raw_values)

    # inverse transform the prior transformed values
    reconstructed_values = \
        numeric_transformer.inverse_transform(transformed_values)

    # should now match
    assert np.allclose(raw_values, reconstructed_values)

    # now test numeric transformer with output feature
    df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=['x', 'y'])
    config = {
        'input_features': [
            {'name': 'x', 'type': 'numerical'}
        ],
        'output_features': [
            {'name': 'y', 'type': 'numerical',
             'preprocessing': {'normalization': transformer_key}}
        ],
        'combiner': {
            'type': 'concat',
        },
        'training': {
            'epochs': 2,
            'batch_size': 16,
        }
    }

    args = {
        'config': config,
        'skip_save_processed_input': True,
        'output_directory': os.path.join(tmpdir, 'results'),
        'logging_level': logging.WARN
    }

    # ensure no exceptions are raised
    experiment_cli(dataset=df, **args)
Example #7
def train_model():
    train_df = pd.read_csv("../hand_dataset_training.csv")
    # print(train_df.shape)
    # print(train_df.head())
    test_df = pd.read_csv("../hand_dataset_testing.csv")
    experiment_cli(config="../model.yaml",
                   training_set=train_df,
                   test_set=test_df,
                   output_directory="results",
                   experiment_name="exp",
                   random_seed=100)
Example #8
def run_experiment(input_features=None,
                   output_features=None,
                   config=None,
                   skip_save_processed_input=True,
                   backend=None,
                   **kwargs):
    """Helper method to avoid code repetition in running an experiment. Deletes the data saved to disk related to
    running an experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    :param kwargs: additional keyword arguments passed through to the experiment
    :return: None
    """
    if input_features is None and output_features is None and config is None:
        raise ValueError(
            "Cannot run test experiment without features nor config.")

    if config is None:
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

    with tempfile.TemporaryDirectory() as tmpdir:
        args = {
            "config": config,
            "backend": backend or LocalTestBackend(),
            "skip_save_training_description": True,
            "skip_save_training_statistics": True,
            "skip_save_processed_input": skip_save_processed_input,
            "skip_save_progress": True,
            "skip_save_unprocessed_output": True,
            "skip_save_model": True,
            "skip_save_predictions": True,
            "skip_save_eval_stats": True,
            "skip_collect_predictions": True,
            "skip_collect_overall_stats": True,
            "skip_save_log": True,
            "output_directory": tmpdir,
        }
        args.update(kwargs)

        experiment_cli(**args)
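
A hedged usage sketch for the helper above (the feature helpers, generate_data, and the csv_filename fixture mirror the other examples on this page and are assumptions, not part of the original snippet):

def test_run_experiment_sketch(csv_filename):
    # illustrative only: build a tiny config with the shared feature helpers
    input_features = [sequence_feature(encoder="rnn", reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]
    rel_path = generate_data(input_features, output_features, csv_filename)
    # extra keyword arguments (here, dataset) are forwarded to experiment_cli
    run_experiment(input_features, output_features, dataset=rel_path)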
Example #9
def test_numeric_transformer(transformer_key, tmpdir):
    Transformer = get_from_registry(transformer_key, numeric_transformation_registry)
    transformer_name = Transformer().__class__.__name__
    if transformer_name == "Log1pTransformer":
        raw_values = np.random.lognormal(5, 2, size=100)
    else:
        raw_values = np.random.normal(5, 2, size=100)

    backend = LOCAL_BACKEND
    parameters = Transformer.fit_transform_params(raw_values, backend)
    if transformer_name in {"Log1pTransformer", "IdentityTransformer"}:
        # should be empty
        assert not bool(parameters)
    else:
        # should not be empty
        assert bool(parameters)

    # instantiate numeric transformer
    numeric_transformer = Transformer(**parameters)

    # transform values
    transformed_values = numeric_transformer.transform(raw_values)

    # inverse transform the prior transformed values
    reconstructed_values = numeric_transformer.inverse_transform(transformed_values)

    # should now match
    assert np.allclose(raw_values, reconstructed_values)

    # now test numeric transformer with output feature
    df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=["x", "y"])
    config = {
        "input_features": [{"name": "x", "type": "number"}],
        "output_features": [{"name": "y", "type": "number", "preprocessing": {"normalization": transformer_key}}],
        "combiner": {
            "type": "concat",
        },
        TRAINER: {
            "epochs": 2,
            "batch_size": 16,
        },
    }

    args = {
        "config": config,
        "skip_save_processed_input": True,
        "output_directory": os.path.join(tmpdir, "results"),
        "logging_level": logging.WARN,
    }

    # ensure no exceptions are raised
    experiment_cli(dataset=df, **args)
Example #10
def run_experiment(input_features, output_features, **kwargs):
    """Helper method to avoid code repetition in running an experiment. Deletes the data saved to disk after
    running the experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    :param kwargs: additional keyword arguments passed through to the experiment
    :return: None
    """
    config = None
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            "backend": "local",
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 64, "num_fc_layers": 5},
            "training": {"epochs": 2},
        }

    args = {
        "config": config,
        "skip_save_processed_input": True,
        "skip_save_progress": True,
        "skip_save_unprocessed_output": True,
        "skip_save_model": True,
        "skip_save_log": True,
    }
    args.update(kwargs)

    exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #11
def test_experiment_sequence_combiner(sequence_combiner_encoder, csv_filename):
    # Sequence combiner
    input_features = [
        sequence_feature(
            name='seq1',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        sequence_feature(
            name='seq2',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        category_feature(vocab_size=5)
    ]
    output_features = [
        category_feature(reduce_input='sum', vocab_size=5)
    ]

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        },
        'combiner': {
            'type': 'sequence',
            'encoder': 'rnn',
            'main_sequence_feature': 'seq1',
            'reduce_output': None,
        }
    }

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in ENCODERS[:-2]:
        logger.error('sequence combiner. encoders: {0}, {1}'.format(
            encoder,
            encoder
        ))
        input_features[0]['encoder'] = encoder
        input_features[1]['encoder'] = encoder

        model_definition['input_features'] = input_features

        exp_dir_name = experiment_cli(
            model_definition,
            skip_save_processed_input=False,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            dataset=rel_path
        )
        shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #12
def test_experiment_sequence_combiner(sequence_encoder, csv_filename):
    config = {
        "input_features": [
            sequence_feature(
                name="seq1", min_len=5, max_len=5, encoder=sequence_encoder, cell_type="lstm", reduce_output=None
            ),
            sequence_feature(
                name="seq2", min_len=5, max_len=5, encoder=sequence_encoder, cell_type="lstm", reduce_output=None
            ),
            category_feature(vocab_size=5),
        ],
        "output_features": [category_feature(reduce_input="sum", vocab_size=5)],
        "training": {"epochs": 2},
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    exp_dir_name = experiment_cli(
        config,
        skip_save_processed_input=False,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        dataset=rel_path,
    )
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #13
def test_resume_training_mlflow(optimizer, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat"
        },
        TRAINER: {
            "epochs": 2,
            "batch_size": 16,
            "optimizer": {
                "type": optimizer
            }
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()
    mlflow_uri = f"file://{tmp_path}/mlruns"
    experiment_name = optimizer + "_experiment"

    _, _, _, _, output_dir1 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        callbacks=[MlflowCallback(mlflow_uri)],
        experiment_name=experiment_name,
    )
    # Can't change any artifact spec on a run once it has been logged to mlflow, so skipping changing epochs

    _, _, _, _, output_dir2 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        model_resume_path=output_dir1,
        callbacks=[MlflowCallback(mlflow_uri)],
        experiment_name=experiment_name,
    )
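    # resuming should attach to the existing MLflow run instead of starting a new one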

    # make sure there is only one mlflow run id
    experiment = mlflow.get_experiment_by_name(experiment_name)
    previous_runs = mlflow.search_runs([experiment.experiment_id])
    assert len(previous_runs) == 1
Example #14
def test_sequence_generator(enc_encoder, enc_cell_type, dec_cell_type,
                            dec_attention, dec_beam_width, dec_num_layers,
                            loss_sampler, generate_deterministic_sequence):
    # Define input and output features
    input_features = [{
        'name': 'in_seq',
        'type': 'sequence',
        'encoder': enc_encoder,
        'cell_type': enc_cell_type,
        'reduce_output': None
    }]
    output_features = [{
        'name': 'out_seq',
        'type': 'sequence',
        'cell_type': dec_cell_type,
        'num_layers': dec_num_layers,
        'beam_width': dec_beam_width,
        'decoder': 'generator',
        'attention': dec_attention,
        'reduce_input': None,
        'loss': {
            'type': 'sampled_softmax_cross_entropy',
            'negative_samples': 10,
            'sampler': loss_sampler
        }
    }]
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',  # 'concat'
            'fc_size': 14
        },
        'training': {
            'epochs': 2,
            'early_stop': 5,
            'batch_size': 80,
            'optimizer': {
                'type': 'adam'
            },
            'learning_rate': 0.001,
        }
    }
    args = {
        'config': model_definition,
        'skip_save_processed_input': True,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_log': True,
        'debug': False
    }
    # Generate test data
    np.random.seed(42)  # 13
    df = generate_deterministic_sequence

    # run the experiment
    results = experiment_cli(dataset=df, **args)
Example #15
def test_early_stopping(early_stop, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 30,
            'early_stop': early_stop,
            'batch_size': 16
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True
    )

    # test existence of required files
    train_stats_fp = os.path.join(output_dir, 'training_statistics.json')
    metadata_fp = os.path.join(output_dir, 'description.json')
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate early stopping
    with open(train_stats_fp, 'r') as f:
        train_stats = json.load(f)
    with open(metadata_fp, 'r') as f:
        metadata = json.load(f)

    # get early stopping value
    early_stop_value = metadata['config']['training']['early_stop']

    # retrieve validation losses
    vald_losses = np.array(train_stats['validation']['combined']['loss'])
    last_epoch = vald_losses.shape[0]
    best_epoch = np.argmin(vald_losses)

    # confirm early stopping
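    # best_epoch is a 0-based index while last_epoch is a count, so the -1 gives the number of epochs run past the best one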
    assert (last_epoch - best_epoch - 1) == early_stop_value
Example #16
def test_model_progress_save(
        skip_save_progress,
        skip_save_model,
        generated_data,
        tmp_path
):
    input_features, output_features = get_feature_configs()

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {'epochs': 5}
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=skip_save_progress,
        skip_save_unprocessed_output=True,
        skip_save_model=skip_save_model,
        skip_save_log=True
    )

    # ========== Check for required result data sets =============
    if skip_save_model:
        model_dir = os.path.join(output_dir, 'model')
        files = [f for f in os.listdir(model_dir) if
                 re.match(r'model_weights', f)]
        assert len(files) == 0
    else:
        model_dir = os.path.join(output_dir, 'model')
        files = [f for f in os.listdir(model_dir) if
                 re.match(r'model_weights', f)]
        # at least one .index and one .data file, but .data may be more
        assert len(files) >= 2
        assert os.path.isfile(
            os.path.join(output_dir, 'model', 'checkpoint'))

    if skip_save_progress:
        assert not os.path.isdir(
            os.path.join(output_dir, 'model', 'training_checkpoints')
        )
    else:
        assert os.path.isdir(
            os.path.join(output_dir, 'model', 'training_checkpoints')
        )
Example #17
def test_early_stopping(early_stop, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat"
        },
        TRAINER: {
            "epochs": 30,
            "early_stop": early_stop,
            "batch_size": 16
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True,
    )

    # test existence of required files
    train_stats_fp = os.path.join(output_dir, "training_statistics.json")
    metadata_fp = os.path.join(output_dir, DESCRIPTION_FILE_NAME)
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate early stopping
    with open(train_stats_fp) as f:
        train_stats = json.load(f)
    with open(metadata_fp) as f:
        metadata = json.load(f)

    # get early stopping value
    early_stop_value = metadata["config"][TRAINER]["early_stop"]

    # retrieve validation losses
    vald_losses_data = train_stats["validation"]["combined"]["loss"]

    last_evaluation = len(vald_losses_data) - 1
    best_evaluation = np.argmin(vald_losses_data)
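    # both are 0-based indices, so their difference is the number of evaluations run past the best one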

    assert last_evaluation - best_evaluation == early_stop_value
Example #18
def run_experiment(
    input_features,
    output_features,
    skip_save_processed_input=True,
    config=None,
    backend=None,
    **kwargs,
):
    """
    Helper method to avoid code repetition in running an experiment. Deletes
    the data saved to disk after running the experiment
    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    :param kwargs: additional keyword arguments passed through to the experiment
    :return: None
    """
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
            "training": {
                "epochs": 2
            },
        }

    args = {
        "config": config,
        "backend": backend or LocalTestBackend(),
        "skip_save_training_description": True,
        "skip_save_training_statistics": True,
        "skip_save_processed_input": skip_save_processed_input,
        "skip_save_progress": True,
        "skip_save_unprocessed_output": True,
        "skip_save_model": True,
        "skip_save_predictions": True,
        "skip_save_eval_stats": True,
        "skip_collect_predictions": True,
        "skip_collect_overall_stats": True,
        "skip_save_log": True,
    }
    args.update(kwargs)

    _, _, _, _, exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #19
def run_experiment(input_features,
                   output_features,
                   skip_save_processed_input=True,
                   config=None,
                   backend=None,
                   **kwargs):
    """
    Helper method to avoid code repetition in running an experiment. Deletes
    the data saved to disk after running the experiment
    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    :param kwargs: additional keyword arguments passed through to the experiment
    :return: None
    """
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
            'training': {
                'epochs': 2
            }
        }

    args = {
        'config': config,
        'backend': backend or LocalTestBackend(),
        'skip_save_training_description': True,
        'skip_save_training_statistics': True,
        'skip_save_processed_input': skip_save_processed_input,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_predictions': True,
        'skip_save_eval_stats': True,
        'skip_collect_predictions': True,
        'skip_collect_overall_stats': True,
        'skip_save_log': True
    }
    args.update(kwargs)

    _, _, _, _, exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #20
def test_model_progress_save(skip_save_progress, skip_save_model,
                             generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat"
        },
        TRAINER: {
            "epochs": 5
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=skip_save_progress,
        skip_save_unprocessed_output=True,
        skip_save_model=skip_save_model,
        skip_save_log=True,
    )

    # ========== Check for required result data sets =============
    model_dir = os.path.join(output_dir, "model")
    files = [f for f in os.listdir(model_dir) if re.match(r"model_weights", f)]
    if skip_save_model:
        assert len(files) == 0
    else:
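        # the torch-based trainer saves its weights as a single model_weights file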
        assert len(files) == 1

    training_checkpoints_dir = os.path.join(output_dir, "model",
                                            "training_checkpoints")
    training_checkpoints = os.listdir(training_checkpoints_dir)
    if skip_save_progress:
        assert len(training_checkpoints) == 0
    else:
        assert len(training_checkpoints) > 0
Example #21
def test_experiment_sequence_combiner_with_embed_encoder_fails(csv_filename):
    config = {
        "input_features": [
            sequence_feature(
                name="seq1",
                min_len=5,
                max_len=5,
                encoder="embed",
                cell_type="lstm",
                reduce_output=None,
            ),
            sequence_feature(name="seq2",
                             min_len=5,
                             max_len=5,
                             encoder="embed",
                             cell_type="lstm",
                             reduce_output=None),
            category_feature(vocab_size=5),
        ],
        "output_features":
        [category_feature(reduce_input="sum", vocab_size=5)],
        "training": {
            "epochs": 2
        },
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"],
                             config["output_features"], csv_filename)

    # Encoding sequence features with 'embed' should fail with SequenceConcatCombiner, since at least one sequence feature should be rank 3.
    with pytest.raises(ValueError):
        exp_dir_name = experiment_cli(
            config,
            skip_save_processed_input=False,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            dataset=rel_path,
        )
        shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #22
def run_experiment(input_features, output_features, **kwargs):
    """
    Helper method to avoid code repetition in running an experiment. Deletes
    the data saved to disk after running the experiment
    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    :param kwargs: additional keyword arguments passed through to the experiment
    :return: None
    """
    config = None
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 64,
                'num_fc_layers': 5
            },
            'training': {
                'epochs': 2
            }
        }

    args = {
        'config': config,
        'skip_save_processed_input': True,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_log': True
    }
    args.update(kwargs)

    exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #23
def test_regularization(generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 1,
            'batch_size': 16,
            'regularization_lambda': 1
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    regularization_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # setup regularization parameters
        config['output_features'][0][
            'weights_regularizer'] = regularizer
        config['output_features'][0][
            'bias_regularizer'] = regularizer
        config['output_features'][0][
            'activity_regularizer'] = regularizer

        # run experiment
        _, _, _, _, output_dir = experiment_cli(
            training_set=generated_data.train_df,
            validation_set=generated_data.validation_df,
            test_set=generated_data.test_df,
            output_directory=str(results_dir),
            config=config,
            experiment_name='regularization',
            model_name=str(regularizer),
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_model=True,
            skip_save_log=True
        )

        # test existence of required files
        train_stats_fp = os.path.join(output_dir, 'training_statistics.json')
        metadata_fp = os.path.join(output_dir, 'description.json')
        assert os.path.isfile(train_stats_fp)
        assert os.path.isfile(metadata_fp)

        # retrieve results so we can compare training loss with regularization
        with open(train_stats_fp, 'r') as f:
            train_stats = json.load(f)

        # retrieve training losses for all epochs
        train_losses = np.array(train_stats['training']['combined']['loss'])
        regularization_losses.append(train_losses[0])

    # create a set of losses
    regularization_losses_set = set(regularization_losses)

    # ensure all losses obtained with the different methods are different
    assert len(regularization_losses) == len(regularization_losses_set)
Example #24
    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    # Encoding sequence features with 'embed' should fail with SequenceConcatCombiner, since at least one sequence
    # feature should be rank 3.
    with pytest.raises(ValueError):
        exp_dir_name = experiment_cli(
Example #25
def test_regularization(generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat"},
        TRAINER: {
            "epochs": 1,
            "batch_size": 16,
            "regularization_lambda": 1,
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    regularization_losses = []
    for regularizer in [None, "l1", "l2", "l1_l2"]:
        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)

        # setup regularization parameters
        config[TRAINER]["regularization_type"] = regularizer

        # run experiment
        _, _, _, _, output_dir = experiment_cli(
            training_set=generated_data.train_df,
            validation_set=generated_data.validation_df,
            test_set=generated_data.test_df,
            output_directory=str(results_dir),
            config=config,
            experiment_name="regularization",
            model_name=str(regularizer),
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_model=True,
            skip_save_log=True,
        )

        # test existence of required files
        train_stats_fp = os.path.join(output_dir, "training_statistics.json")
        metadata_fp = os.path.join(output_dir, "description.json")
        assert os.path.isfile(train_stats_fp)
        assert os.path.isfile(metadata_fp)

        # retrieve results so we can compare training loss with regularization
        with open(train_stats_fp) as f:
            train_stats = json.load(f)

        # retrieve training losses for all epochs
        train_losses = np.array(train_stats[TRAINING]["combined"]["loss"])
        regularization_losses.append(train_losses[0])

    # create a set of losses
    regularization_losses_set = set(regularization_losses)

    # ensure all losses obtained with the different methods are different
    assert len(regularization_losses) == len(regularization_losses_set)