Example #1
def test_server_integration_with_audio(single_record, csv_filename):
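    # End-to-end check: serve a trained model with FastAPI's TestClient and
    # verify that /predict and /batch_predict agree with model.predict().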
    # Audio Inputs
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    # Audio, text, and numerical input features
    input_features = [
        audio_feature(
            folder=audio_dest_folder,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=4), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.get("/")
    assert response.status_code == 200

    response = client.post("/predict")
    # posting with no features supplied should return HTTP 400
    assert response.status_code == 400
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    if single_record:
        # Single record prediction
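        # data_df.T.to_dict() maps row label -> {column: value}; [0] selects
        # the first row as a plain dict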
        first_entry = data_df.T.to_dict()[0]
        data, files = convert_to_form(first_entry)
        server_response = client.post("/predict", data=data, files=files)
        assert server_response.status_code == 200
        server_response = server_response.json()

        server_response_keys = sorted(list(server_response.keys()))
        assert server_response_keys == sorted(output_keys_for(output_features))

        model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
        model_output = model_output.to_dict("records")[0]
        assert model_output == server_response
    else:
        # Batch prediction
        assert len(data_df) > 1
        files = convert_to_batch_form(data_df)
        server_response = client.post("/batch_predict", files=files)
        assert server_response.status_code == 200
        server_response = server_response.json()

        server_response_keys = sorted(server_response["columns"])
        assert server_response_keys == sorted(output_keys_for(output_features))
        assert len(data_df) == len(server_response["data"])

        model_output, _ = model.predict(dataset=data_df)
        model_output = model_output.to_dict("split")
        assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(audio_dest_folder, ignore_errors=True)
Example #2

# test for single output feature
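# each TestCase pairs an output feature type with the validation metrics it is
# expected to expose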
@pytest.mark.parametrize(
    'test_case',
    [
        TestCase(
            [numerical_feature()],
            ['loss', 'mean_squared_error', 'mean_absolute_error', 'r2']
        ),
        TestCase(
            [binary_feature()],
            ['loss', 'accuracy']
        ),
        TestCase(
            [category_feature()],
            ['loss', 'accuracy', 'hits_at_k']
        ),
        TestCase(
            [text_feature()],
            ['loss', 'token_accuracy', 'last_accuracy', 'edit_distance',
             'perplexity']
        )
    ]
)
def test_validation_metrics(test_case: TestCase, csv_filename: str):
    # setup test scenarios
    test_scenarios = []
    for output_feature in test_case.output_features:
        # a single output feature captures feature-specific metrics
        of_name = output_feature[NAME]
Example #3
def test_cache_dataset(use_cache_dir, use_split, tmpdir):
    dataset_manager = PandasDatasetManager(backend=LocalTestBackend())
    cache_dir = os.path.join(tmpdir, "cache") if use_cache_dir else None
    manager = CacheManager(dataset_manager, cache_dir=cache_dir)

    config = {
        "input_features": [sequence_feature(reduce_output="sum")],
        "output_features":
        [category_feature(vocab_size=2, reduce_input="sum")],
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
    }

    def touch(basename):
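        # create an empty placeholder CSV in tmpdir; only the path matters for
        # exercising the cache-key and cache-map logic below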
        path = os.path.join(tmpdir, f"{basename}.csv")
        Path(path).touch()
        return path

    dataset = training_set = test_set = validation_set = None
    if not use_split:
        dataset = touch("dataset")
        cache_key = manager.get_cache_key(dataset, config)
    else:
        training_set = touch("train")
        test_set = touch("test")
        validation_set = touch("validation")
        cache_key = manager.get_cache_key(training_set, config)

    training_set_metadata = {
        CHECKSUM: cache_key,
    }

    cache = manager.get_dataset_cache(config, dataset, training_set, test_set,
                                      validation_set)
    cache_map = cache.cache_map
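    # one cache entry each for META, TRAINING, TEST, and VALIDATION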
    assert len(cache_map) == 4

    if use_cache_dir:
        train_path = os.path.join(cache_dir, alphanum(cache_key))
    else:
        train_path = os.path.join(tmpdir, "dataset")
    test_path = val_path = train_path

    if use_split and not use_cache_dir:
        train_path = os.path.join(tmpdir, "train")
        test_path = os.path.join(tmpdir, "test")
        val_path = os.path.join(tmpdir, "validation")

    assert cache_map[META] == f"{train_path}.meta.json"
    assert cache_map[TRAINING] == f"{train_path}.{TRAINING_PREPROC_FILE_NAME}"
    assert cache_map[TEST] == f"{test_path}.test.hdf5"
    assert cache_map[VALIDATION] == f"{val_path}.validation.hdf5"

    for cache_path in cache_map.values():
        assert not os.path.exists(cache_path)

    training_set = pd.DataFrame()
    test_set = pd.DataFrame()
    validation_set = pd.DataFrame()

    if use_cache_dir:
        os.makedirs(cache_dir)
    cache.put(training_set, test_set, validation_set, training_set_metadata)

    for cache_path in cache_map.values():
        assert os.path.exists(cache_path)

    cache.delete()

    for cache_path in cache_map.values():
        assert not os.path.exists(cache_path)
Example #4
def hyperopt_results():
    """
    This function generates hyperopt results
    """
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    csv_filename = uuid.uuid4().hex[:10].upper() + '.csv'
    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001}
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
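        # parameter names are dot paths into the config: a "training." prefix
        # tunes trainer settings, a "<feature_name>." prefix tunes that feature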
        "parameters": {
            "training.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_size": {
                "type": "int",
                "low": 32,
                "high": 256,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                'type': 'int',
                'low': 1,
                'high': 5,
                'space': 'linear',
                'steps': 4
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'serial'},
        'sampler': {'type': 'random', 'num_samples': 2}
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt(
        config,
        dataset=rel_path,
        output_directory='results'
    )

    return os.path.abspath('results')
Example #5
# test for single output feature
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            [numerical_feature()],
            [
                "root_mean_squared_percentage_error",
                "mean_squared_error",
                "mean_absolute_error",
                "r2",
                "root_mean_squared_error",
            ],
        ),
        TestCase([binary_feature()], ["loss", "accuracy"]),
        TestCase([category_feature()], ["loss", "accuracy", "hits_at_k"]),
        # TODO(#1333): Re-enable.
        # TestCase(
        #     [text_feature()],
        #     ['loss', 'token_accuracy', 'last_accuracy', 'edit_distance',
        #      'perplexity']
        # )
    ],
)
def test_validation_metrics(test_case: TestCase, csv_filename: str):
    # setup test scenarios
    test_scenarios = []
    for output_feature in test_case.output_features:
        # a single output feature captures feature-specific metrics
        of_name = output_feature[COLUMN]
        for metric in test_case.validation_metrics:
Example #6
    input_features = [category_feature(vocab_size=10)]
    output_features = [
        set_feature(vocab_size=3, loss={"class_weights": [0, 1, 2, 3]})
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)


@pytest.mark.parametrize(
    "output_features",
    [
        # baseline test case
        [
            category_feature(vocab_size=2, reduce_input="sum"),
            sequence_feature(vocab_size=10, max_len=5),
            number_feature(),
        ],
        # use generator as decoder
        [
            category_feature(vocab_size=2, reduce_input="sum"),
            sequence_feature(vocab_size=10, max_len=5, decoder="generator"),
            number_feature(),
        ],
        # Generator decoder and reduce_input = None
        [
            category_feature(vocab_size=2, reduce_input="sum"),
            sequence_feature(max_len=5, decoder="generator",
                             reduce_input=None),
            number_feature(normalization="minmax"),
Example #7
        (numerical_feature(normalization="minmax"), numerical_feature(), {"loss": {"type": "mean_squared_error"}}),
        (numerical_feature(normalization="zscore"), numerical_feature(), {"loss": {"type": "mean_absolute_error"}}),
        # binary feature
        (binary_feature(), binary_feature(), None),
        # Categorical feature
        (category_feature(), category_feature(), None),
        (category_feature(), category_feature(), {"loss": {"type": "softmax_cross_entropy"}}),
        # (
        #     category_feature(),
        #     category_feature(),
        #     {'loss': {
        #         'type': 'sampled_softmax_cross_entropy',
        #         'sampler': 'fixed_unigram',
        #         'negative_samples': 10
        #     }
        #     }
        # ),
        # (
        #     category_feature(),
        #     category_feature(),
        #     {'loss': {
Example #8
def run_hyperopt_executor(sampler,
                          executor,
                          csv_filename,
                          validate_output_feature=False,
                          validation_metric=None):
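    # Train a small text+category model and drive one full hyperopt run
    # through the given sampler/executor pair.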
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        },
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == 'bohb':
        # bohb does not support grid_search search space
        del parameters['utterance.cell_type']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal,
                                                                   parameters,
                                                                   **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(config, dataset=rel_path)
Example #9
def test_hyperopt_run_hyperopt(csv_filename, ray_start_4_cpus):
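    # Run hyperopt with the Ray executor and sampler; parameter spaces use
    # Ray Tune-style definitions (loguniform, randint).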
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
            output_feature_name + ".fc_size": {
                "space": "randint",
                "lower": 32,
                "upper": 256
            },
            output_feature_name + ".num_fc_layers": {
                "space": "randint",
                "lower": 2,
                "upper": 6
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {
            'type': 'ray'
        },
        'sampler': {
            'type': 'ray',
            'num_samples': 2
        }
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt_results = hyperopt(config,
                                dataset=rel_path,
                                output_directory='results_hyperopt')

    # check for return results
    assert isinstance(hyperopt_results, list)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join('results_hyperopt', 'hyperopt_statistics.json'))
Example #10
def test_torchscript(csv_filename, should_load_model):
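    # Round-trip a model through save/load and torchscript export, then check
    # that weights and predictions match the original.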
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # One input feature of each type, several output feature types
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        #############################################
        # obtain predictions and weights of original
        #############################################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        #################
        # save torchscript
        #################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

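        # Build the torchscript input batch: one tensor per input feature,
        # keyed by feature name, from the preprocessed dataset columns.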
        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )

        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check that weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])

        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
Example #11
def test_gbm_model_save_reload_api(tmpdir, csv_filename, tmp_path):
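    # Train a GBM (tree) model, save and reload it two ways, and verify that
    # predictions and weights survive the round trip.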
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3)
    ]
    output_features = [category_feature(vocab_size=3)]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train tree model
    #############
    config = {
        "model_type": "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(
        data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert all(a == b
                       for a, b in zip(preds_1[key], preds_2[key])), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(),
                                    if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        tree1 = ludwig_model1.model.compiled_model
        tree2 = ludwig_model2.model.compiled_model
        for t1_w, t2_w in zip(tree1.parameters(), tree2.parameters()):
            assert torch.allclose(t1_w, t2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(),
                                    of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"),
                                        backend=backend)
    check_model_equal(ludwig_model_exp)
Example #12
def test_hyperopt_run_hyperopt(csv_filename, samplers):
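    # Sweep the learning rate plus per-feature fc_layers, output_size, and
    # num_fc_layers; fc_layers uses a categorical space of layer stacks.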
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        TRAINER: {
            "epochs": 2,
            "learning_rate": 0.001
        },
    }

    output_feature_name = output_features[0]["name"]

    hyperopt_configs = {
        "parameters": {
            "trainer.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_layers": {
                "type":
                "category",
                "values": [
                    [{
                        "output_size": 64
                    }, {
                        "output_size": 32
                    }],
                    [{
                        "output_size": 64
                    }],
                    [{
                        "output_size": 32
                    }],
                ],
            },
            output_feature_name + ".output_size": {
                "type": "int",
                "low": 16,
                "high": 36,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                "type": "int",
                "low": 1,
                "high": 5,
                "space": "linear",
                "steps": 4
            },
        },
        "goal": "minimize",
        "output_feature": output_feature_name,
        "validation_metrics": "loss",
        "executor": {
            "type": "serial"
        },
        "sampler": {
            "type": samplers["type"],
            "num_samples": 2
        },
    }

    # add hyperopt parameter space to the config
    config["hyperopt"] = hyperopt_configs

    hyperopt_results = hyperopt(config,
                                dataset=rel_path,
                                output_directory="results_hyperopt")

    # check for return results
    assert isinstance(hyperopt_results, HyperoptResults)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join("results_hyperopt", "hyperopt_statistics.json"))

    if os.path.isfile(
            os.path.join("results_hyperopt", "hyperopt_statistics.json")):
        os.remove(os.path.join("results_hyperopt", "hyperopt_statistics.json"))
Example #13
def test_confidence_thresholding_2thresholds_3d_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [
        text_feature(vocab_size=10, min_len=1, encoder="stacked_cnn"),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder="embed"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
    ]
    encoder = "parallel_cnn"
    with TemporaryDirectory() as tmpvizdir:
        # Generate test data
        data_csv = generate_data(input_features, output_features,
                                 os.path.join(tmpvizdir, csv_filename))
        input_features[0]["encoder"] = encoder
        model = run_api_experiment(input_features, output_features)
        test_df, train_df, val_df = obtain_df_splits(data_csv)
        _, _, output_dir = model.train(training_set=train_df,
                                       validation_set=val_df,
                                       output_directory=os.path.join(
                                           tmpvizdir, "results"))
        test_stats, predictions, _ = model.evaluate(
            dataset=test_df,
            collect_predictions=True,
            output_directory=output_dir)

        output_feature_name1 = output_features[0]["name"]
        output_feature_name2 = output_features[1]["name"]

        ground_truth_metadata = model.training_set_metadata
        feature1_cols = [
            f"{output_feature_name1}_probabilities_{label}"
            for label in ground_truth_metadata[output_feature_name1]["idx2str"]
        ]
        feature2_cols = [
            f"{output_feature_name2}_probabilities_{label}"
            for label in ground_truth_metadata[output_feature_name2]["idx2str"]
        ]

        # probabilities must be a list of lists holding each row's values from
        # the probability columns; ref: https://ludwig-ai.github.io/ludwig-docs/api/#test
        probability1 = predictions.loc[:, feature1_cols].values
        probability2 = predictions.loc[:, feature2_cols].values

        target_predictions1 = test_df[output_feature_name1]
        target_predictions2 = test_df[output_feature_name2]
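        # map label strings to integer class ids (str2idx) to build the
        # ground-truth arrays the visualization API expects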
        ground_truth1 = np.asarray([
            ground_truth_metadata[output_feature_name1]["str2idx"][prediction]
            for prediction in target_predictions1
        ])
        ground_truth2 = np.asarray([
            ground_truth_metadata[output_feature_name2]["str2idx"][prediction]
            for prediction in target_predictions2
        ])
        viz_outputs = ("pdf", "png")
        for viz_output in viz_outputs:
            vis_output_pattern = os.path.join(output_dir, f"*.{viz_output}")
            visualize.confidence_thresholding_2thresholds_3d(
                [probability1, probability2],
                [ground_truth1, ground_truth2],
                model.training_set_metadata,
                [output_feature_name1, output_feature_name2],
                labels_limit=0,
                output_directory=output_dir,
                file_format=viz_output,
            )
            figure_paths = glob.glob(vis_output_pattern)
            assert len(figure_paths) == 1
Example #14

@pytest.mark.parametrize(
    "input_features,output_features",
    [
        (
            [
                number_feature(num_layers=2,
                               encoder="dense",
                               preprocessing={"normalization": "zscore"})
            ],
            [number_feature()],
        ),
        ([image_feature(IMAGE_DIR, encoder="stacked_cnn")], [number_feature()]),
        ([image_feature(IMAGE_DIR, encoder="resnet")], [category_feature()]),
        (
            [category_feature(representation="dense")],
            [
                number_feature(decoder="regressor",
                               loss={"type": "mean_squared_error"},
                               num_fc_layers=5)
            ],
        ),
        ([date_feature()], [binary_feature()]),
        ([sequence_feature(encoder="parallel_cnn", cell_type="gru")], [binary_feature()]),
        ([set_feature()], [set_feature()]),
    ],
)
def test_regularizers(
Example #15
def test_savedmodel(csv_filename):
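    # TF1-era test: export a trained model as a TensorFlow SavedModel and
    # check its weights and predictions against the original Ludwig model.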
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sf = sequence_feature()
    sf['encoder'] = 'parallel_cnn'
    input_features = [sf]
    input_feature_name = input_features[0]['name']
    input_feature_tensor_name = '{}/{}_placeholder:0'.format(
        input_feature_name, input_feature_name)
    output_features = [category_feature(vocab_size=2)]
    output_feature_name = output_features[0]['name']
    output_feature_tensor_name = '{}/predictions_{}/predictions_{}:0'.format(
        output_feature_name, output_feature_name, output_feature_name)
    predictions_column_name = '{}_predictions'.format(output_feature_name)
    weight_tensor_name = '{}/fc_0/weights:0'.format(input_feature_name)

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # save savedmodel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    ##############################
    # collect weight tensors names
    ##############################
    with ludwig_model.model.session as sess:
        all_variables = tf.compat.v1.trainable_variables()
        all_variables_names = [v.name for v in all_variables]
    ludwig_model.close()

    ###################################################
    # load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    ludwig_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    ludwig_weights = ludwig_model.model.collect_weights(all_variables_names)
    ludwig_model.close()

    #################################################
    # load savedmodel, obtain predictions and weights
    #################################################
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    with tf.compat.v1.Session() as sess:
        tf.saved_model.loader.load(sess, [tf.saved_model.SERVING],
                                   savedmodel_path)

        predictions = sess.run(output_feature_tensor_name,
                               feed_dict={
                                   input_feature_tensor_name:
                                   dataset.get(input_feature_name),
                               })

        savedmodel_prediction_df = pd.DataFrame(
            data=[
                train_set_metadata[output_feature_name]["idx2str"][p]
                for p in predictions
            ],
            columns=[predictions_column_name])

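        # fetching {name: name} makes sess.run return a dict mapping each
        # variable name to its value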
        savedmodel_weights = sess.run({n: n for n in all_variables_names})

    #########
    # Cleanup
    #########
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################

    for var in all_variables_names:
        print("Are the weights in {} identical?".format(var),
              np.all(ludwig_weights[var] == savedmodel_weights[var]))
    print(
        "Are loaded model predictions identical to original ones?",
        np.all(
            original_predictions_df[predictions_column_name] == \
            ludwig_prediction_df[predictions_column_name]
        )
    )
    print(
        "Are savedmodel predictions identical to loaded model?",
        np.all(
            ludwig_prediction_df[predictions_column_name] == \
            savedmodel_prediction_df[predictions_column_name]
        )
    )

    for var in all_variables_names:
        assert np.all(ludwig_weights[var] == savedmodel_weights[var])
    assert np.all(
        original_predictions_df[predictions_column_name] == \
        ludwig_prediction_df[predictions_column_name]
    )
    assert np.all(
        ludwig_prediction_df[predictions_column_name] == \
        savedmodel_prediction_df[predictions_column_name]
    )
Example #16
def test_config_bad_combiner_types_enums():
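    # Exercise schema validation of combiner fields: each invalid value must
    # raise ValidationError with a matching message.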
    config = {
        "input_features": [
            category_feature(vocab_size=2, reduce_input="sum"),
            number_feature(),
        ],
        "output_features": [binary_feature(weight_regularization=None)],
        "combiner": {
            "type": "concat",
            "weights_initializer": "zeros"
        },
    }

    # config is valid at this point
    validate_config(config)

    # Test weights initializer:
    config["combiner"]["weights_initializer"] = {"test": "fail"}
    with pytest.raises(ValidationError, match=r"{'test': 'fail'} is not of*"):
        validate_config(config)
    config["combiner"]["weights_initializer"] = "fail"
    with pytest.raises(ValidationError, match=r"'fail' is not of*"):
        validate_config(config)
    config["combiner"]["weights_initializer"] = {}
    with pytest.raises(ValidationError, match=r"Failed validating 'type'"):
        validate_config(config)
    config["combiner"]["weights_initializer"] = {"type": "fail"}
    with pytest.raises(ValidationError, match=r"'fail' is not one of*"):
        validate_config(config)
    config["combiner"]["weights_initializer"] = {"type": "normal", "stddev": 0}
    validate_config(config)

    # Test bias initializer:
    del config["combiner"]["weights_initializer"]
    config["combiner"]["bias_initializer"] = "kaiming_uniform"
    validate_config(config)
    config["combiner"]["bias_initializer"] = "fail"
    with pytest.raises(ValidationError, match=r"'fail' is not of*"):
        validate_config(config)
    config["combiner"]["bias_initializer"] = {}
    with pytest.raises(ValidationError, match=r"Failed validating 'type'"):
        validate_config(config)
    config["combiner"]["bias_initializer"] = {"type": "fail"}
    with pytest.raises(ValidationError, match=r"'fail' is not one of*"):
        validate_config(config)
    config["combiner"]["bias_initializer"] = {"type": "zeros", "stddev": 0}
    validate_config(config)

    # Test norm:
    del config["combiner"]["bias_initializer"]
    config["combiner"]["norm"] = "batch"
    validate_config(config)
    config["combiner"]["norm"] = "fail"
    with pytest.raises(ValidationError, match=r"'fail' is not one of*"):
        validate_config(config)

    # Test activation:
    del config["combiner"]["norm"]
    config["combiner"]["activation"] = "relu"
    validate_config(config)
    config["combiner"]["activation"] = 123
    with pytest.raises(ValidationError, match=r"123 is not of type*"):
        validate_config(config)

    # Test reduce_output:
    del config["combiner"]["activation"]
    config2 = {**config}
    config2["combiner"]["type"] = "tabtransformer"
    config2["combiner"]["reduce_output"] = "sum"
    validate_config(config2)
    config2["combiner"]["reduce_output"] = "fail"
    with pytest.raises(ValidationError, match=r"'fail' is not one of*"):
        validate_config(config2)

    # Test reduce_output = None:
    config2["combiner"]["reduce_output"] = None
    validate_config(config2)
Example #17
def t_neuropod(csv_filename):
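    # Export a trained model as a Neuropod package and check that Neuropod
    # inference matches the original model's predictions and probabilities.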
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder),
        timeseries_feature(),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature()
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    ################
    # build neuropod
    ################
    neuropod_path = os.path.join(dir_path, 'neuropod')
    export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

    ########################
    # predict using neuropod
    ########################
    data_df = pd.read_csv(data_csv_path)
    if_dict = {
        input_feature['name']: np.expand_dims(
            np.array([str(x) for x in data_df[input_feature['name']].tolist()],
                     dtype='str'), 1)
        for input_feature in input_features
    }

    from neuropod.loader import load_neuropod
    neuropod_model = load_neuropod(neuropod_path)
    preds = neuropod_model.infer(if_dict)

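    # squeeze singleton dimensions so predictions align with the original
    # DataFrame columns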
    for key in preds:
        preds[key] = np.squeeze(preds[key])

    #########
    # cleanup
    #########
    # Delete the temporary data created
    for path in [
            ludwigmodel_path, neuropod_path, image_dest_folder,
            audio_dest_folder
    ]:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        if (output_feature_name + "_predictions" in preds
                and output_feature_name + "_predictions"
                in original_predictions_df):
            neuropod_pred = preds[output_feature_name +
                                  "_predictions"].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = list(map(lambda x: str2bool(x), neuropod_pred))
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = list(map(lambda x: x.split(), neuropod_pred))

            original_pred = original_predictions_df[output_feature_name +
                                                    "_predictions"].tolist()

            assert neuropod_pred == original_pred

        if (output_feature_name + "_probability" in preds
                and output_feature_name + "_probability"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probability"].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = list(
                    map(lambda x: [float(n) for n in x.split()],
                        neuropod_prob))
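            # pad ragged per-row probability lists with zeros so they can be
            # compared as a rectangular array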
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probability"].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))).T

            assert np.isclose(neuropod_prob, original_prob).all()

        if (output_feature_name + "_probabilities" in preds
                and output_feature_name + "_probabilities"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probabilities"].tolist()

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probabilities"].tolist()

            assert np.isclose(neuropod_prob, original_prob).all()
Example #18
def test_hyperopt_run_hyperopt(csv_filename, samplers):
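    # Sweep the learning rate plus per-feature fc_layers, fc_size, and
    # num_fc_layers using the given sampler with a serial executor.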
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_layers": {
                'type': 'category',
                'values': [
                    [{'fc_size': 512}, {'fc_size': 256}],
                    [{'fc_size': 512}],
                    [{'fc_size': 256}],
                ]
            },
            output_feature_name + ".fc_size": {
                "type": "int",
                "low": 32,
                "high": 256,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                'type': 'int',
                'low': 1,
                'high': 5,
                'space': 'linear',
                'steps': 4
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {
            'type': 'serial'
        },
        'sampler': {
            'type': samplers["type"],
            'num_samples': 2
        }
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt_results = hyperopt(config,
                                dataset=rel_path,
                                output_directory='results_hyperopt')

    # check for return results
    assert isinstance(hyperopt_results, list)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join('results_hyperopt', 'hyperopt_statistics.json'))

    if os.path.isfile(
            os.path.join('results_hyperopt', 'hyperopt_statistics.json')):
        os.remove(os.path.join('results_hyperopt', 'hyperopt_statistics.json'))
Example #19
def test_experiment_image_dataset(train_format, train_in_memory, test_format,
                                  test_in_memory, tmpdir):
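    # Train on one data format and evaluate/predict on another, covering
    # preprocessed hdf5 and raw formats with in_memory toggled per phase.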
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="stacked_cnn",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=16,
            num_filters=8,
        ),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
    ]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
        TRAINER: {
            "epochs": 2
        },
    }

    # create temporary name for train and test data sets
    train_csv_filename = os.path.join(
        tmpdir, "train_" + uuid.uuid4().hex[:10].upper() + ".csv")
    test_csv_filename = os.path.join(
        tmpdir, "test_" + uuid.uuid4().hex[:10].upper() + ".csv")

    # setup training data format to test
    train_data = generate_data(input_features, output_features,
                               train_csv_filename)
    config["input_features"][0]["preprocessing"]["in_memory"] = train_in_memory
    training_set_metadata = None

    backend = LocalTestBackend()
    if train_format == "hdf5":
        # hdf5 format
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=train_data,
            backend=backend,
        )
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(
        config=config,
        backend=backend,
    )
    model.train(dataset=train_dataset_to_use,
                training_set_metadata=training_set_metadata)

    model.config["input_features"][0]["preprocessing"]["in_memory"] = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features,
                              test_csv_filename)

    if test_format == "hdf5":
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config,
            dataset=test_data,
            backend=backend,
        )
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)
Example #20
     # input feature
     [
         number_feature(normalization="zscore"),
         number_feature(normalization="zscore")
     ],
     # output feature
     [binary_feature()],
 ),
 FeaturesToUse(
     # input feature
     [
         number_feature(normalization="zscore"),
         number_feature(normalization="zscore")
     ],
     # output feature
     [category_feature(vocab_size=4, reduce_input="sum")],
 ),
 FeaturesToUse(
     # input feature
     # [sequence_feature(min_len=5, max_len=10, encoder="rnn", cell_type="lstm", reduce_output=None)],
     [
         number_feature(normalization="zscore"),
         number_feature(normalization="zscore")
     ],
     # output feature
     [
         sequence_feature(min_len=5,
                          max_len=10,
                          decoder="generator",
                          cell_type="lstm",
                          attention="bahdanau",
Example #21
def test_mlflow_callback(tmpdir):
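    # Verify MlflowCallback logs the run: experiment and run ids, the
    # config.yaml artifact, and a loadable pyfunc model whose schema and
    # predictions match the trained Ludwig model.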
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': epochs,
            'batch_size': batch_size
        },
    }

    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, 'train.csv'),
                             num_examples=num_examples)
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'validation.csv'))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

    mlflow_uri = f'file://{tmpdir}/mlruns'
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = 'mlflow_test'
    callback = MlflowCallback()

    model = LudwigModel(config, callbacks=[callback])
    model.train(training_set=data_csv,
                validation_set=val_csv,
                test_set=test_csv,
                experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    artifacts = [
        f.path for f in client.list_artifacts(callback.run.info.run_id, "")
    ]
    local_dir = f'{tmpdir}/local_artifacts'
    os.makedirs(local_dir)

    assert 'config.yaml' in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id,
                                                  "config.yaml", local_dir)

    with open(local_config_path, 'r') as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f'runs:/{callback.run.info.run_id}/model'
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert 'ludwig' in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors['ludwig']

    def compare_features(key):
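        # the mlflow flavor stores the Ludwig schema; feature names and types
        # must match the training config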
        assert len(model.config[key]) == len(flavor['ludwig_schema'][key])
        for feature, schema_feature in zip(model.config[key],
                                           flavor['ludwig_schema'][key]):
            assert feature['name'] == schema_feature['name']
            assert feature['type'] == schema_feature['type']

    compare_features('input_features')
    compare_features('output_features')

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
Example #22
def test_server_integration(csv_filename):
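    # Image, text, and numerical variant of the server integration test:
    # checks /, /predict (single record), and /batch_predict against
    # model.predict().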
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3
                      },
                      fc_size=16,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [category_feature(vocab_size=2), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features,
                                    output_features,
                                    data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.get('/')
    assert response.status_code == 200

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    # One-off prediction
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post('/predict', data=data, files=files)
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict('records')[0]
    assert model_output == server_response

    # Batch prediction
    assert len(data_df) > 1
    files = convert_to_batch_form(data_df)
    server_response = client.post('/batch_predict', files=files)
    server_response = server_response.json()

    server_response_keys = sorted(server_response['columns'])
    assert server_response_keys == sorted(output_keys_for(output_features))
    assert len(data_df) == len(server_response['data'])

    model_output, _ = model.predict(dataset=data_df)
    model_output = model_output.to_dict('split')
    assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder, ignore_errors=True)
Example #23
def test_savedmodel(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sf = sequence_feature()
    sf['encoder'] = 'parallel_cnn'
    input_features = [sf]

    output_features = [category_feature(vocab_size=2)]

    predictions_column_name = '{}_predictions'.format(
        output_features[0]['name'])

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # save savedmodel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    ###########################################
    # collect original predictions and weights
    ###########################################
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)
    original_weights = deepcopy(ludwig_model.model.model.trainable_variables)
    ludwig_model.close()

    ###################################################
    # load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    loaded_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    loaded_weights = deepcopy(ludwig_model.model.model.trainable_variables)

    #####################################################
    # restore savedmodel, obtain predictions and weights
    #####################################################
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    restored_model = tf.saved_model.load(savedmodel_path)

    if_name = list(ludwig_model.model.model.input_features.keys())[0]
    of_name = list(ludwig_model.model.model.output_features.keys())[0]

    data_to_predict = {
        if_name: tf.convert_to_tensor(dataset.dataset[if_name], dtype=tf.int32)
    }

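    # invoke the restored SavedModel directly; the positional arguments mirror
    # a Keras call signature (inputs, training=False, mask=None)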
    logits = restored_model(data_to_predict, False, None)

    restored_predictions = tf.argmax(logits[of_name]['logits'],
                                     -1,
                                     name='predictions_{}'.format(of_name))
    restored_predictions = tf.map_fn(
        lambda idx: train_set_metadata[of_name]['idx2str'][idx],
        restored_predictions,
        dtype=tf.string)

    restored_weights = deepcopy(restored_model.trainable_variables)

    #########
    # Cleanup
    #########
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################

    # check for same number of weights as original model
    assert len(original_weights) == len(loaded_weights)
    assert len(original_weights) == len(restored_weights)

    # check to ensure weight values match the original model
    loaded_weights_match = np.all([
        np.all(
            np.isclose(original_weights[i].numpy(), loaded_weights[i].numpy()))
        for i in range(len(original_weights))
    ])
    restored_weights_match = np.all([
        np.all(
            np.isclose(original_weights[i].numpy(),
                       restored_weights[i].numpy()))
        for i in range(len(original_weights))
    ])

    assert loaded_weights_match and restored_weights_match

    # Are predictions identical to the original ones?
    loaded_predictions_match = np.all(
        original_predictions_df[predictions_column_name] ==
        loaded_prediction_df[predictions_column_name])

    restored_predictions_match = np.all(
        original_predictions_df[predictions_column_name] ==
        restored_predictions.numpy().astype('str'))

    assert loaded_predictions_match and restored_predictions_match
Example #24
def test_model_save_reload_api(csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3,
                     encoder="rnn",
                     cell_type="lstm",
                     num_layers=2,
                     bidirectional=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        # TODO(#1333): Reintroduce sequence and text after sequence output feature.
        # sequence_feature(vocab_size=3),
        # text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "training": {
            "epochs": 2
        }
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT)
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            # np.all over a bare generator object is always truthy, so
            # materialize the element-wise comparisons into a list first
            assert np.all([a == b
                           for a, b in zip(preds_1[key], preds_2[key])]), key

        # Compare model weights
        # this has to be done after the predict calls because of lazy weight restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(),
                                    if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(),
                                    of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"),
                                        backend=backend)
    check_model_equal(ludwig_model_exp)
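
# A more compact equality check (sketch, not part of the original test;
# assumes ludwig_model.model is a torch.nn.Module): compare entire
# state_dicts instead of walking encoders, combiner, and decoders by hand.
def _state_dicts_equal(module1, module2):
    sd1, sd2 = module1.state_dict(), module2.state_dict()
    return sd1.keys() == sd2.keys() and all(
        torch.allclose(sd1[key], sd2[key]) for key in sd1
    )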
Example #25
def test_experiment_image_dataset(
        train_format, train_in_memory,
        test_format, test_in_memory
):
    # the primary focus of this test is to determine whether exceptions are
    # raised for different dataset formats and in_memory settings

    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='stacked_cnn',
            preprocessing={
                'in_memory': True,
                'height': 12,
                'width': 12,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        ),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
    ]

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {'epochs': 2}
    }

    # create temporary name for train and test data sets
    train_csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'
    test_csv_filename = 'test_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    train_data = generate_data(input_features, output_features,
                               train_csv_filename)
    model_definition['input_features'][0]['preprocessing']['in_memory'] \
        = train_in_memory
    training_set_metadata = None
    if train_format == 'csv':
        train_dataset_to_use = train_data

    elif train_format in {'df', 'dict'}:
        train_dataset_to_use = pd.read_csv(train_data)
        if train_format == 'dict':
            train_dataset_to_use = train_dataset_to_use.to_dict(orient='list')

    else:
        # hdf5 format
        train_set, _, _, training_set_metadata = preprocess_for_training(
            model_definition,
            dataset=train_data
        )
        train_dataset_to_use = train_set.data_hdf5_fp

    # define Ludwig model
    model = LudwigModel(
        model_definition=model_definition,
        random_seed=default_random_seed
    )
    model.train(
        dataset=train_dataset_to_use,
        training_set_metadata=training_set_metadata
    )

    model.model_definition['input_features'][0]['preprocessing']['in_memory'] \
        = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features,
                              test_csv_filename)
    if test_format == 'csv':
        test_dataset_to_use = test_data

    elif test_format in {'df', 'dict'}:
        test_dataset_to_use = pd.read_csv(test_data)
        if test_format == 'dict':
            test_dataset_to_use = test_dataset_to_use.to_dict(orient='list')

    else:
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.model_definition,
            dataset=test_data
        )
        test_dataset_to_use = test_set.data_hdf5_fp

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
    delete_temporary_data(train_csv_filename)
    delete_temporary_data(test_csv_filename)
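
# Sketch (illustrative only, not part of the original test; the hdf5 branch
# is omitted): the per-format branching above amounts to a small dispatch
# from format name to dataset object.
def _dataset_for_format(fmt, csv_path):
    if fmt == 'csv':
        return csv_path
    df = pd.read_csv(csv_path)
    return df.to_dict(orient='list') if fmt == 'dict' else df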
Example #26
def test_cache_dataset(use_cache_dir, use_split, tmpdir):
    dataset_manager = PandasDatasetManager(backend=LocalTestBackend())
    cache_dir = os.path.join(tmpdir, 'cache') if use_cache_dir else None
    manager = CacheManager(dataset_manager, cache_dir=cache_dir)

    config = {
        'input_features': [sequence_feature(reduce_output='sum')],
        'output_features': [category_feature(vocab_size=2, reduce_input='sum')],
        'combiner': {'type': 'concat', 'fc_size': 14},
        'preprocessing': {},
    }

    def touch(basename):
        path = os.path.join(tmpdir, f'{basename}.csv')
        Path(path).touch()
        return path

    dataset = training_set = test_set = validation_set = None
    if not use_split:
        dataset = touch('dataset')
        cache_key = manager.get_cache_key(dataset, config)
    else:
        training_set = touch('train')
        test_set = touch('test')
        validation_set = touch('validation')
        cache_key = manager.get_cache_key(training_set, config)

    training_set_metadata = {
        CHECKSUM: cache_key,
    }

    cache = manager.get_dataset_cache(
        config, dataset, training_set, test_set, validation_set
    )
    cache_map = cache.cache_map
    assert len(cache_map) == 4

    train_path = (os.path.join(cache_dir, alphanum(cache_key))
                  if use_cache_dir
                  else os.path.join(tmpdir, 'dataset'))
    test_path = val_path = train_path

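    # separate per-split base paths apply only when split files are provided
    # without a shared cache directory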
    if use_split and not use_cache_dir:
        train_path = os.path.join(tmpdir, 'train')
        test_path = os.path.join(tmpdir, 'test')
        val_path = os.path.join(tmpdir, 'validation')

    assert cache_map[META] == f'{train_path}.meta.json'
    assert cache_map[TRAINING] == f'{train_path}.training.hdf5'
    assert cache_map[TEST] == f'{test_path}.test.hdf5'
    assert cache_map[VALIDATION] == f'{val_path}.validation.hdf5'

    for cache_path in cache_map.values():
        assert not os.path.exists(cache_path)

    training_set = pd.DataFrame()
    test_set = pd.DataFrame()
    validation_set = pd.DataFrame()

    if use_cache_dir:
        os.makedirs(cache_dir)
    cache.put(training_set, test_set, validation_set, training_set_metadata)

    for cache_path in cache_map.values():
        assert os.path.exists(cache_path)

    cache.delete()

    for cache_path in cache_map.values():
        assert not os.path.exists(cache_path)
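
# Sketch of the naming scheme asserted above (not part of the original test;
# assumes the same suffix-per-artifact convention used by the cache map):
def _expected_cache_paths(base_path):
    return {
        META: f'{base_path}.meta.json',
        TRAINING: f'{base_path}.training.hdf5',
        TEST: f'{base_path}.test.hdf5',
        VALIDATION: f'{base_path}.validation.hdf5',
    }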
Example #27
@contextlib.contextmanager
def graph_mode():
    prev_mode = tf.config.experimental_functions_run_eagerly()
    try:
        tf.config.experimental_run_functions_eagerly(False)
        yield
    finally:
        tf.config.experimental_run_functions_eagerly(prev_mode)

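# Usage sketch (not part of the original): graph_mode() disables eager
# function execution for the enclosed block and restores the previous mode
# even if the block raises:
#
#     with graph_mode():
#         restored_model = tf.saved_model.load(savedmodel_path)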

@pytest.mark.parametrize(
    'output_features',
    [
        # baseline test case
        [
            category_feature(vocab_size=2, reduce_input='sum'),
            sequence_feature(vocab_size=10, max_len=5),
            numerical_feature()
        ],

        # use generator as decoder
        [
            category_feature(vocab_size=2, reduce_input='sum'),
            sequence_feature(vocab_size=10, max_len=5, decoder='generator'),
            numerical_feature()
        ],

        # Generator decoder and reduce_input = None
        [
            category_feature(vocab_size=2, reduce_input='sum'),
            sequence_feature(max_len=5, decoder='generator',
Example #28
def test_binary_predictions_with_number_dtype(tmpdir, backend,
                                              distinct_values):
    input_features = [
        category_feature(vocab_size=3),
    ]

    feature = binary_feature()
    output_features = [
        feature,
    ]

    data_csv_path = generate_data(
        input_features,
        output_features,
        os.path.join(tmpdir, "dataset.csv"),
        num_examples=100,
    )
    data_df = pd.read_csv(data_csv_path)

    # Optionally convert bool values to strings, e.g., {'Yes', 'No'}
    false_value, true_value = distinct_values
    data_df[feature[NAME]] = data_df[feature[NAME]].map(
        lambda x: true_value if x else false_value)
    data_df.to_csv(data_csv_path, index=False)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 1
        }
    }

    patch_args = (
        "ludwig.features.binary_feature.BinaryOutputFeature.logits",
        partial(random_binary_logits, num_predict_samples=len(data_df)),
    )

    preds_df, _ = predict_with_backend(tmpdir,
                                       config,
                                       data_csv_path,
                                       backend,
                                       patch_args=patch_args)
    cols = set(preds_df.columns)
    assert f"{feature[NAME]}_predictions" in cols
    assert f"{feature[NAME]}_probabilities_False" in cols
    assert f"{feature[NAME]}_probabilities_True" in cols
    assert f"{feature[NAME]}_probability" in cols

    for pred, prob_0, prob_1, prob in zip(
            preds_df[f"{feature[NAME]}_predictions"],
            preds_df[f"{feature[NAME]}_probabilities_False"],
            preds_df[f"{feature[NAME]}_probabilities_True"],
            preds_df[f"{feature[NAME]}_probability"],
    ):
        assert isinstance(pred, bool)
        if pred:
            assert prob_1 == prob
        else:
            assert prob_0 == prob
        assert np.allclose(prob_0, 1 - prob_1)
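
# Sketch of the invariant asserted in the loop above (not part of the
# original test): the scalar probability column carries the probability of
# the predicted class, and the two class probabilities sum to one.
def _check_binary_probabilities(pred, prob_false, prob_true, prob):
    assert prob == (prob_true if pred else prob_false)
    assert np.isclose(prob_false + prob_true, 1.0)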
Example #29
def test_kfold_cv():
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        training_data_fp = os.path.join(tmpdir, 'train.csv')
        model_definition_fp = os.path.join(tmpdir, 'model_definition.yaml')
        results_dir = os.path.join(tmpdir, 'results')
        statistics_fp = os.path.join(results_dir,
                                     'kfold_training_statistics.json')
        indices_fp = os.path.join(results_dir, 'kfold_split_indices.json')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        generate_data(input_features, output_features, training_data_fp)

        # generate model definition file
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
            'training': {
                'epochs': 2
            }
        }

        with open(model_definition_fp, 'w') as f:
            yaml.dump(model_definition, f)

        # run k-fold cv
        kfold_cross_validate(k_fold=num_folds,
                             model_definition_file=model_definition_fp,
                             data_csv=training_data_fp,
                             output_directory=results_dir,
                             logging_level='warn')

        # check for expected results
        # check for existence and structure of statistics file
        assert os.path.isfile(statistics_fp)

        # check for required keys
        cv_statistics = load_json(statistics_fp)
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in cv_statistics

        # check for existence and structure of split indices file
        assert os.path.isfile(indices_fp)

        # check for required keys
        cv_indices = load_json(indices_fp)
        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in cv_indices
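
# Concretely, for num_folds == 3 the keys checked above are:
#   statistics file: ['fold_1', 'fold_2', 'fold_3', 'overall']
#   indices file:    ['fold_1', 'fold_2', 'fold_3']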
Example #30
def test_server_integration_with_images(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            preprocessing={"in_memory": True, "height": 8, "width": 8, "num_channels": 3},
            fc_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=4), numerical_feature()]

    np.random.seed(123)  # reproducible synthetic data
    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.get("/")
    assert response.status_code == 200

    response = client.post("/predict")
    # expect the HTTP 400 error code for this situation
    assert response.status_code == 400
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    # One-off prediction
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post("/predict", data=data, files=files)
    assert server_response.status_code == 200
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict("records")[0]
    assert model_output == server_response

    # Batch prediction
    assert len(data_df) > 1
    files = convert_to_batch_form(data_df)
    server_response = client.post("/batch_predict", files=files)
    assert server_response.status_code == 200
    server_response = server_response.json()

    server_response_keys = sorted(server_response["columns"])
    assert server_response_keys == sorted(output_keys_for(output_features))
    assert len(data_df) == len(server_response["data"])

    model_output, _ = model.predict(dataset=data_df)
    model_output = model_output.to_dict("split")
    assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder, ignore_errors=True)