Example #1
def test_sequence_tagger(
        enc_cell_type,
        attention,
        csv_filename
):
    # Define input and output features
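    # reduce_output=None keeps one encoder output per timestep, which the
    # tagger decoder needs in order to emit a label per input position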
    input_features = [
        sequence_feature(
            max_len=10,
            encoder='rnn',
            cell_type=enc_cell_type,
            reduce_output=None
        )
    ]
    output_features = [
        sequence_feature(
            max_len=10,
            decoder='tagger',
            attention=attention,
            reduce_input=None
        )
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #2
def test_sequence_tagger(enc_cell_type, attention, csv_filename):
    # Define input and output features
    input_features = [
        sequence_feature(max_len=10,
                         encoder='rnn',
                         cell_type=enc_cell_type,
                         reduce_output=None)
    ]
    output_features = [
        sequence_feature(
            max_len=10,
            decoder='tagger',
            attention=attention,
            reduce_input=None,
        )
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # setup sampled softmax loss
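    # sampled softmax approximates the full softmax by scoring the target
    # class against a small set of randomly sampled negative classes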
    output_features[0].update({
        'loss': {
            'type': 'sampled_softmax_cross_entropy',
            'negative_samples': 7
        }
    })

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #3
def generate_sequence_training_data():
    input_features = [
        sequence_feature(
            vocab_size=TEST_VOCAB_SIZE,
            embedding_size=TEST_EMBEDDING_SIZE,
            state_size=TEST_STATE_SIZE,
            hidden_size=TEST_HIDDEN_SIZE,
            num_filters=TEST_NUM_FILTERS,
            min_len=5,
            max_len=10,
            encoder="rnn",
            cell_type="lstm",
            reduce_output=None,
        )
    ]

    output_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         decoder="generator",
                         cell_type="lstm",
                         attention="bahdanau",
                         reduce_input=None)
    ]

    # generate a synthetic dataset for testing
    dataset = build_synthetic_dataset(
        150,
        copy.deepcopy(input_features) + copy.deepcopy(output_features))
    raw_data = "\n".join([r[0] + "," + r[1] for r in dataset])
    df = pd.read_csv(StringIO(raw_data))

    return df, input_features, output_features
Example #4
def test_sequence_generator(enc_encoder, enc_cell_type, dec_cell_type,
                            dec_attention, dec_beam_width, dec_num_layers,
                            csv_filename):
    # Define input and output features
    input_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         encoder="rnn",
                         cell_type="lstm",
                         reduce_output=None)
    ]
    output_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         decoder="generator",
                         cell_type="lstm",
                         attention="bahdanau",
                         reduce_input=None)
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # setup encoder specification
    input_features[0]["encoder"] = enc_encoder
    input_features[0]["cell_type"] = enc_cell_type

    # setup decoder specification
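    # beam_width > 1 switches the generator to beam-search decoding;
    # num_layers sets how many recurrent layers the decoder stacks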
    output_features[0]["cell_type"] = dec_cell_type
    output_features[0]["attention"] = dec_attention
    output_features[0]["beam_width"] = dec_beam_width
    output_features[0]["num_layers"] = dec_num_layers

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #5
def test_experiment_sequence_combiner(sequence_encoder, csv_filename):
    config = {
        "input_features": [
            sequence_feature(
                name="seq1", min_len=5, max_len=5, encoder=sequence_encoder, cell_type="lstm", reduce_output=None
            ),
            sequence_feature(
                name="seq2", min_len=5, max_len=5, encoder=sequence_encoder, cell_type="lstm", reduce_output=None
            ),
            category_feature(vocab_size=5),
        ],
        "output_features": [category_feature(reduce_input="sum", vocab_size=5)],
        "training": {"epochs": 2},
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    exp_dir_name = experiment_cli(
        config,
        skip_save_processed_input=False,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        dataset=rel_path,
    )
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #6
def test_experiment_sequence_combiner(sequence_encoder, csv_filename):
    config = {
        "input_features": [
            sequence_feature(name="seq1",
                             min_len=5,
                             max_len=5,
                             encoder=sequence_encoder,
                             cell_type="lstm",
                             reduce_output=None),
            sequence_feature(name="seq2",
                             min_len=5,
                             max_len=5,
                             encoder=sequence_encoder,
                             cell_type="lstm",
                             reduce_output=None),
            category_feature(vocab_size=5),
        ],
        "output_features":
        [category_feature(reduce_input="sum", vocab_size=5)],
        TRAINER: {
            "epochs": 2
        },
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"],
                             config["output_features"], csv_filename)

    run_experiment(config=config, dataset=rel_path)
Example #7
def test_sequence_tagger(
        enc_cell_type,
        csv_filename
):
    # Define input and output features
    input_features = [
        sequence_feature(
            max_len=10,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        )
    ]
    output_features = [
        sequence_feature(
            max_len=10,
            decoder='tagger',
            reduce_input=None
        )
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # setup encoder specification
    input_features[0]['cell_type'] = enc_cell_type

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #8
def test_experiment_sequence_combiner(sequence_combiner_encoder, csv_filename):
    # Sequence combiner
    input_features = [
        sequence_feature(
            name='seq1',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        sequence_feature(
            name='seq2',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        category_feature(vocab_size=5)
    ]
    output_features = [
        category_feature(reduce_input='sum', vocab_size=5)
    ]

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        },
        'combiner': {
            'type': 'sequence',
            'encoder': 'rnn',
            'main_sequence_feature': 'seq1',
            'reduce_output': None,
        }
    }

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in ENCODERS[:-2]:
        logger.error('sequence combiner. encoders: {0}, {1}'.format(
            encoder,
            encoder
        ))
        input_features[0]['encoder'] = encoder
        input_features[1]['encoder'] = encoder

        model_definition['input_features'] = input_features

        exp_dir_name = experiment_cli(
            model_definition,
            skip_save_processed_input=False,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            dataset=rel_path
        )
        shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #9
def test_sequence_generator(enc_encoder, enc_cell_type, dec_cell_type,
                            dec_attention, dec_beam_width, dec_num_layers,
                            csv_filename):
    # Define input and output features
    input_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         encoder='rnn',
                         cell_type='lstm',
                         reduce_output=None)
    ]
    output_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         decoder='generator',
                         cell_type='lstm',
                         attention='bahdanau',
                         reduce_input=None)
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # setup encoder specification
    input_features[0]['encoder'] = enc_encoder
    input_features[0]['cell_type'] = enc_cell_type

    # setup decoder specification
    output_features[0]['cell_type'] = dec_cell_type
    output_features[0]['attention'] = dec_attention
    output_features[0]['beam_width'] = dec_beam_width
    output_features[0]['num_layers'] = dec_num_layers

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #10
def generate_sequence_training_data():
    input_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         encoder='rnn',
                         cell_type='lstm',
                         reduce_output=None)
    ]

    output_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         decoder='generator',
                         cell_type='lstm',
                         attention='bahdanau',
                         reduce_input=None)
    ]

    # generate a synthetic dataset for testing
    dataset = build_synthetic_dataset(
        150,
        copy.deepcopy(input_features) + copy.deepcopy(output_features))
    raw_data = '\n'.join([r[0] + ',' + r[1] for r in dataset])
    df = pd.read_csv(StringIO(raw_data))

    return df, input_features, output_features
Example #11
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #12
def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #13
def test_ray_sequence():
    input_features = [
        sequence_feature(max_len=10,
                         encoder='rnn',
                         cell_type='lstm',
                         reduce_output=None)
    ]
    output_features = [
        sequence_feature(max_len=10,
                         decoder='tagger',
                         attention=False,
                         reduce_input=None)
    ]
    run_test_parquet(input_features, output_features)
Example #14
def test_scale_lr(learning_rate_scaling, expected_lr, tmpdir, ray_test_cluster):
    base_lr = 1.0
    num_workers = 4
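    # expected_lr depends on the scaling policy: "constant" keeps base_lr,
    # "linear" multiplies it by num_workers, "sqrt" by sqrt(num_workers)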

    outdir = os.path.join(tmpdir, "output")

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    csv_filename = os.path.join(tmpdir, "training.csv")
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {
            "epochs": 2,
            "learning_rate": base_lr,
            "learning_rate_scaling": learning_rate_scaling,
        },
    }

    actual_lr = ray.get(run_scale_lr.remote(config, data_csv, num_workers, outdir))
    assert actual_lr == expected_lr
Example #15
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        INPUT_FEATURES: all_input_features,
        OUTPUT_FEATURES: all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINER] = {"batch_size": 42}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT][EXECUTOR][SCHEDULER] = SCHEDULER_DICT
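        # merge_with_defaults is expected to force early_stop to -1
        # (disabled), since schedulers manage trial stopping themselves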

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else ECDTrainerConfig().early_stop
    assert merged_config[TRAINER]["early_stop"] == expected
Example #16
def test_api_training_set(csv_filename):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=5, reduce_input="sum")]

        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, "validation.csv"))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
        }
        model = LudwigModel(config)
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
        model.predict(dataset=test_csv)

        # Train again, this time the HDF5 cache will be used
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
Example #17
def _prepare_data(csv_filename, model_definition_filename):
    # Single sequence input, single category output
    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    dataset_filename = generate_data(input_features, output_features,
                                     csv_filename)

    # generate model definition file
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    with open(model_definition_filename, 'w') as f:
        yaml.dump(model_definition, f)

    return dataset_filename
Example #18
def test_api_training_set(csv_filename):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [sequence_feature(reduce_output='sum')]
        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, 'validation.csv'))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
        }
        model = LudwigModel(config)
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
        model.predict(dataset=test_csv)

        # Train again, this time the HDF5 cache will be used
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
Example #19
def _run_test(input_features=None, output_features=None, combiner=None):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = input_features or [
            sequence_feature(reduce_output="sum"),
            number_feature(),
        ]
        output_features = output_features or [
            category_feature(vocab_size=2, reduce_input="sum")
        ]
        combiner = combiner or {"type": "concat"}

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            TRAINER: {
                "epochs": 2
            },
        }

        model = LudwigModel(config, backend=LocalTestBackend())
        _, _, output_directory = model.train(
            dataset=data_csv,
            output_directory=tmpdir,
        )
        model.predict(dataset=data_csv, output_directory=output_directory)
Example #20
def test_experiment_model_resume(tmpdir):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder="rnn", reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]
    # Generate test data
    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        TRAINER: {
            "epochs": 2
        },
    }

    _, _, _, _, output_dir = experiment_cli(config,
                                            dataset=rel_path,
                                            output_directory=tmpdir)

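    # resuming with model_resume_path picks up the weights and training
    # progress saved by the first run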
    experiment_cli(config, dataset=rel_path, model_resume_path=output_dir)

    predict_cli(os.path.join(output_dir, "model"), dataset=rel_path)
    shutil.rmtree(output_dir, ignore_errors=True)
Example #21
def test_sample_ratio(backend, tmpdir):
    num_examples = 100
    sample_ratio = 0.25
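    # with 100 examples and a 0.25 ratio, preprocessing should keep 25 rows
    # across the train, validation and test splits combined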

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=5, reduce_input="sum")]
    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, "dataset.csv"),
                             num_examples=num_examples)
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {
            "epochs": 2,
        },
        "preprocessing": {
            "sample_ratio": sample_ratio
        },
    }

    with init_backend(backend):
        model = LudwigModel(config, backend=backend)
        train_set, val_set, test_set, _ = model.preprocess(
            data_csv,
            skip_save_processed_input=True,
        )

        sample_size = num_examples * sample_ratio
        count = len(train_set) + len(val_set) + len(test_set)
        assert sample_size == count
Example #22
def _prepare_data(csv_filename, config_filename):
    # Single sequence input, single category output
    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    # Generate test data
    dataset_filename = generate_data(input_features, output_features,
                                     csv_filename)

    # generate config file
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "fc_size": 14
        },
        "training": {
            "epochs": 2
        },
    }

    with open(config_filename, "w") as f:
        yaml.dump(config, f)

    return dataset_filename
Example #23
def test_config_bad_preprocessing_param():
    config = {
        'input_features': [
            sequence_feature(reduce_output='sum', encoder='fake'),
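            # 'fake' is not a registered sequence encoder, so validation
            # is expected to fail with a ValidationError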
            image_feature(
                '/tmp/destination_folder',
                preprocessing={
                    'in_memory': True,
                    'height': 12,
                    'width': 12,
                    'num_channels': 3,
                    'tokenizer': 'space',
                },
            ),
        ],
        'output_features':
        [category_feature(vocab_size=2, reduce_input='sum')],
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
    }

    with pytest.raises(ValidationError, match=r"^'fake' is not one of .*"):
        validate_config(config)
Example #24
def test_experiment_model_resume(csv_filename):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    _, _, _, _, output_dir = experiment_cli(config, dataset=rel_path)
    logger.info('Experiment Directory: {0}'.format(output_dir))

    experiment_cli(config, dataset=rel_path, model_resume_path=output_dir)

    predict_cli(os.path.join(output_dir, 'model'), dataset=rel_path)
    shutil.rmtree(output_dir, ignore_errors=True)
Example #25
def test_config_bad_preprocessing_param():
    config = {
        "input_features": [
            sequence_feature(reduce_output="sum", encoder="fake"),
            image_feature(
                "/tmp/destination_folder",
                preprocessing={
                    "in_memory": True,
                    "height": 12,
                    "width": 12,
                    "num_channels": 3,
                    "tokenizer": "space",
                },
            ),
        ],
        "output_features":
        [category_feature(vocab_size=2, reduce_input="sum")],
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
    }

    with pytest.raises(ValidationError, match=r"^'fake' is not one of .*"):
        validate_config(config)
Example #26
def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
Example #27
def test_experiment_model_resume(csv_filename):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
    output_features = [categorical_feature(vocab_size=2, reduce_input='sum')]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    exp_dir_name = experiment(model_definition, data_csv=rel_path)
    logging.info('Experiment Directory: {0}'.format(exp_dir_name))

    experiment(model_definition,
               data_csv=rel_path,
               model_resume_path=exp_dir_name)

    full_predict(os.path.join(exp_dir_name, 'model'), data_csv=rel_path)
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #28
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINING] = {"batch_size": "42"}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else default_early_stop
    assert merged_config[TRAINING]["early_stop"] == expected
Example #29
def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
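    # DROP_ROW drops any row where one of these output features is missing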
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = read_csv_with_nan(training_data_csv_path, nan_percent=0.1)

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)
Example #30
def test_confidence_thresholding_2thresholds_3d_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [
        text_feature(vocab_size=10, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        category_feature(vocab_size=2, reduce_input='sum')
    ]
    encoder = 'parallel_cnn'
    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    input_features[0]['encoder'] = encoder
    model = run_api_experiment(input_features, output_features)
    test_df, train_df, val_df = obtain_df_splits(data_csv)
    _, _, output_dir = model.train(training_set=train_df,
                                   validation_set=val_df)
    test_stats, predictions, _ = model.evaluate(dataset=test_df,
                                                collect_predictions=True,
                                                output_directory=output_dir)

    output_feature_name1 = output_features[0]['name']
    output_feature_name2 = output_features[1]['name']
    # probabilities must be a list of arrays, one per output feature, holding
    # each row's values from the per-class probability columns
    # ref: https://ludwig-ai.github.io/ludwig-docs/api/#test - Return
    probability1 = predictions.iloc[:, [2, 3, 4]].values
    probability2 = predictions.iloc[:, [7, 8, 9]].values

    ground_truth_metadata = model.training_set_metadata
    target_predictions1 = test_df[output_feature_name1]
    target_predictions2 = test_df[output_feature_name2]
    ground_truth1 = np.asarray([
        ground_truth_metadata[output_feature_name1]['str2idx'][prediction]
        for prediction in target_predictions1
    ])
    ground_truth2 = np.asarray([
        ground_truth_metadata[output_feature_name2]['str2idx'][prediction]
        for prediction in target_predictions2
    ])
    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern = os.path.join(output_dir,
                                          '*.{}'.format(viz_output))
        visualize.confidence_thresholding_2thresholds_3d(
            [probability1, probability2], [ground_truth1, ground_truth2],
            [output_feature_name1, output_feature_name2],
            labels_limit=0,
            output_directory=output_dir,
            file_format=viz_output)
        figure_paths = glob.glob(vis_output_pattern)
        assert 1 == len(figure_paths)
    shutil.rmtree(output_dir, ignore_errors=True)