예제 #1
0
def test_torchscript_e2e_audio(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")

    input_features = [
        audio_feature(audio_dest_folder),
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    # NOTE: audio preprocessing mismatches by very small margins ~O(1e-6) but causes flakiness in e2e test.
    # Increasing tolerance is a workaround to reduce flakiness for now.
    # TODO: remove this workaround when audio preprocessing is fixed.
    validate_torchscript_outputs(tmpdir,
                                 config,
                                 backend,
                                 training_data_csv_path,
                                 tolerance=1e-6)
예제 #2
0
def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
예제 #3
0
def test_audio_feature(enc_encoder):
    # synthetic audio tensor
    audio_tensor = torch.randn([BATCH_SIZE, SEQ_SIZE, AUDIO_W_SIZE],
                               dtype=torch.float32)

    # generate audio feature config
    audio_feature_config = audio_feature(folder=".",
                                         encoder=enc_encoder,
                                         max_sequence_length=SEQ_SIZE,
                                         embedding_size=AUDIO_W_SIZE)

    # instantiate audio input feature object
    audio_input_feature = AudioInputFeature(audio_feature_config)

    # pass synthetic audio tensor through the audio input feature
    encoder_output = audio_input_feature(audio_tensor)

    # confirm correctness of the the audio encoder output
    assert isinstance(encoder_output, dict)
    assert "encoder_output" in encoder_output
    assert isinstance(encoder_output["encoder_output"], torch.Tensor)
    if enc_encoder == "passthrough":
        assert encoder_output["encoder_output"].shape == (BATCH_SIZE, SEQ_SIZE,
                                                          AUDIO_W_SIZE)
    else:
        assert encoder_output["encoder_output"].shape == (BATCH_SIZE,
                                                          DEFAULT_OUTPUT_SIZE)
예제 #4
0
def test_experiment_audio_inputs(tmpdir):
    # Audio Inputs
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")

    input_features = [audio_feature(folder=audio_dest_folder)]
    output_features = [binary_feature()]

    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    run_experiment(input_features, output_features, dataset=rel_path)
예제 #5
0
def test_dask_lazy_load_audio_error():
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')
        input_features = [
            audio_feature(folder=audio_dest_folder,
                          preprocessing={
                              'in_memory': False,
                          })
        ]
        output_features = [binary_feature()]
        run_test_parquet(input_features, output_features, expect_error=True)
예제 #6
0
def test_experiment_audio_inputs(csv_filename):
    # Audio Inputs
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [audio_feature(folder=audio_dest_folder)]
    output_features = [binary_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(audio_dest_folder)
예제 #7
0
def test_ray_lazy_load_audio_error(tmpdir):
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    input_features = [
        audio_feature(
            folder=audio_dest_folder,
            preprocessing={
                "in_memory": False,
            },
        )
    ]
    output_features = [binary_feature()]
    run_test_with_features(input_features, output_features, expect_error=True)
예제 #8
0
파일: test_schema.py 프로젝트: cxz/ludwig
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
예제 #9
0
def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
예제 #10
0
def test_ray_audio(tmpdir, dataset_type, feature_type):
    preprocessing_params = {
        "audio_file_length_limit_in_s": 3.0,
        "missing_value_strategy": BACKFILL,
        "in_memory": True,
        "padding_value": 0,
        "norm": "per_file",
        "type": feature_type,
        "window_length_in_s": 0.04,
        "window_shift_in_s": 0.02,
        "num_filter_bands": 80,
    }
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    input_features = [audio_feature(folder=audio_dest_folder, preprocessing=preprocessing_params)]
    output_features = [binary_feature()]
    run_test_with_features(input_features, output_features, dataset_type=dataset_type, nan_percent=0.1)
예제 #11
0
def test_add_feature_data(feature_type, tmpdir):
    preprocessing_params = {
        "audio_file_length_limit_in_s": 3.0,
        "missing_value_strategy": BACKFILL,
        "in_memory": True,
        "padding_value": 0,
        "norm": "per_file",
        "type": feature_type,
        "window_length_in_s": 0.04,
        "window_shift_in_s": 0.02,
        "num_fft_points": None,
        "window_type": "hamming",
        "num_filter_bands": 80,
    }
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    audio_feature_config = audio_feature(audio_dest_folder, preprocessing=preprocessing_params)
    data_df_path = generate_data(
        [audio_feature_config],
        [category_feature(vocab_size=5, reduce_input="sum")],
        os.path.join(tmpdir, "data.csv"),
        num_examples=10,
    )
    data_df = pd.read_csv(data_df_path)
    metadata = {
        audio_feature_config["name"]: AudioFeatureMixin.get_feature_meta(
            data_df[audio_feature_config["name"]], preprocessing_params, LOCAL_BACKEND
        )
    }

    proc_df = {}
    AudioFeatureMixin.add_feature_data(
        feature_config=audio_feature_config,
        input_df=data_df,
        proc_df=proc_df,
        metadata=metadata,
        preprocessing_parameters=preprocessing_params,
        backend=LOCAL_BACKEND,
        skip_save_processed_input=False,
    )

    assert len(proc_df[audio_feature_config[PROC_COLUMN]]) == 10
예제 #12
0
def test_ray_read_binary_files(tmpdir, df_engine):
    preprocessing_params = {
        "audio_file_length_limit_in_s": 3.0,
        "missing_value_strategy": BACKFILL,
        "in_memory": True,
        "padding_value": 0,
        "norm": "per_file",
        "audio_feature": {
            "type": "fbank",
            "window_length_in_s": 0.04,
            "window_shift_in_s": 0.02,
            "num_filter_bands": 80,
        },
    }
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    audio_params = audio_feature(folder=audio_dest_folder, preprocessing=preprocessing_params)

    dataset_path = os.path.join(tmpdir, "dataset.csv")
    dataset_path = generate_data([audio_params], [], dataset_path, num_examples=100)
    dataset_path = create_data_set_to_use("csv", dataset_path, nan_percent=0.1)

    with ray_start(num_cpus=2, num_gpus=None):
        backend_config = {**RAY_BACKEND_CONFIG}
        backend_config["processor"]["type"] = df_engine
        backend = initialize_backend(backend_config)
        df = backend.df_engine.df_lib.read_csv(dataset_path)
        series = df[audio_params[COLUMN]]
        proc_col = backend.read_binary_files(series)
        proc_col = backend.df_engine.compute(proc_col)

        backend = initialize_backend(LOCAL_BACKEND)
        df = backend.df_engine.df_lib.read_csv(dataset_path)
        series = df[audio_params[COLUMN]]
        proc_col_expected = backend.read_binary_files(series)

        assert proc_col.equals(proc_col_expected)
예제 #13
0
def t_neuropod(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder),
        timeseries_feature(),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature()
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    ################
    # build neuropod
    ################
    neuropod_path = os.path.join(dir_path, 'neuropod')
    export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

    ########################
    # predict using neuropod
    ########################
    data_df = pd.read_csv(data_csv_path)
    if_dict = {
        input_feature['name']: np.expand_dims(
            np.array([str(x) for x in data_df[input_feature['name']].tolist()],
                     dtype='str'), 1)
        for input_feature in input_features
    }

    from neuropod.loader import load_neuropod
    neuropod_model = load_neuropod(neuropod_path)
    preds = neuropod_model.infer(if_dict)

    for key in preds:
        preds[key] = np.squeeze(preds[key])

    #########
    # cleanup
    #########
    # Delete the temporary data created
    for path in [
            ludwigmodel_path, neuropod_path, image_dest_folder,
            audio_dest_folder
    ]:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        if (output_feature_name + "_predictions" in preds
                and output_feature_name + "_predictions"
                in original_predictions_df):
            neuropod_pred = preds[output_feature_name +
                                  "_predictions"].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = list(map(lambda x: str2bool(x), neuropod_pred))
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = list(map(lambda x: x.split(), neuropod_pred))

            original_pred = original_predictions_df[output_feature_name +
                                                    "_predictions"].tolist()

            assert neuropod_pred == original_pred

        if (output_feature_name + "_probability" in preds
                and output_feature_name + "_probability"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probability"].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = list(
                    map(lambda x: [float(n) for n in x.split()],
                        neuropod_prob))
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probability"].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))).T

            assert np.isclose(neuropod_prob, original_prob).all()

        if (output_feature_name + "_probabilities" in preds
                and output_feature_name + "_probabilities"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probabilities"].tolist()

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probabilities"].tolist()

            assert np.isclose(neuropod_prob, original_prob).all()
예제 #14
0
def test_dask_audio():
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')
        input_features = [audio_feature(folder=audio_dest_folder)]
        output_features = [binary_feature()]
        run_test_parquet(input_features, output_features, num_examples=25)
예제 #15
0
def test_ray_audio():
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")
        input_features = [audio_feature(folder=audio_dest_folder)]
        output_features = [binary_feature()]
        run_test_parquet(input_features, output_features)
예제 #16
0
def test_model_save_reload_api(tmpdir, csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3,
                     encoder="rnn",
                     cell_type="lstm",
                     num_layers=2,
                     bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features,
                                  output_features,
                                  csv_filename,
                                  num_examples=50)

    #############
    # Train model
    #############
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(
        data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b
                          for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(),
                                    if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(),
                                    of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"),
                                        backend=backend)
    check_model_equal(ludwig_model_exp)
예제 #17
0
def test_model_save_reload_api(csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder="rnn", cell_type="lstm", num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
<<<<<<< HEAD
        # TODO(shreya): Reintroduce sequence and text after sequence output feature.
=======
        # TODO(#1333): Reintroduce sequence and text after sequence output feature.
>>>>>>> upstream/master
        # sequence_feature(vocab_size=3),
        # text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT)
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)
예제 #18
0
def test_neuropod(csv_filename):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature(),
        ]

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=LocalTestBackend())
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
            output_directory=dir_path,
        )

        data_df = pd.read_csv(data_csv_path)
        original_predictions_df, _ = ludwig_model.predict(dataset=data_df)

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ################
        # build neuropod
        ################
        neuropod_path = os.path.join(dir_path, "neuropod")
        shutil.rmtree(neuropod_path, ignore_errors=True)
        export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path, entrypoint="get_test_model")

        ########################
        # predict using neuropod
        ########################
        if_dict = {
            input_feature["name"]: np.expand_dims(
                np.array([str(x) for x in data_df[input_feature["name"]].tolist()], dtype="str"), 1
            )
            for input_feature in input_features
        }

        from neuropod.loader import load_neuropod

        neuropod_model = load_neuropod(neuropod_path, _always_use_native=False)
        preds = neuropod_model.infer(if_dict)

        for key in preds:
            preds[key] = np.squeeze(preds[key])

        #########
        # cleanup
        #########
        # Delete the temporary data created
        for path in [ludwigmodel_path, neuropod_path, image_dest_folder, audio_dest_folder]:
            if os.path.exists(path):
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path, ignore_errors=True)

        ########
        # checks
        ########
        for output_feature in output_features:
            output_feature_name = output_feature["name"]
            output_feature_type = output_feature["type"]

            if (
                output_feature_name + "_predictions" in preds
                and output_feature_name + "_predictions" in original_predictions_df
            ):
                neuropod_pred = preds[output_feature_name + "_predictions"].tolist()
                if output_feature_type == BINARY:
                    neuropod_pred = [str2bool(x) for x in neuropod_pred]
                if output_feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_pred = [x.split() for x in neuropod_pred]

                original_pred = original_predictions_df[output_feature_name + "_predictions"].tolist()

                assert neuropod_pred == original_pred

            if (
                output_feature_name + "_probability" in preds
                and output_feature_name + "_probability" in original_predictions_df
            ):
                neuropod_prob = preds[output_feature_name + "_probability"].tolist()
                if output_feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_prob = [[float(n) for n in x.split()] for x in neuropod_prob]
                if any(isinstance(el, list) for el in neuropod_prob):
                    neuropod_prob = np.array(list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

                original_prob = original_predictions_df[output_feature_name + "_probability"].tolist()
                if any(isinstance(el, list) for el in original_prob):
                    original_prob = np.array(list(itertools.zip_longest(*original_prob, fillvalue=0))).T

                assert np.allclose(neuropod_prob, original_prob)

            if (
                output_feature_name + "_probabilities" in preds
                and output_feature_name + "_probabilities" in original_predictions_df
            ):
                neuropod_prob = preds[output_feature_name + "_probabilities"].tolist()

                original_prob = original_predictions_df[output_feature_name + "_probabilities"].tolist()

                assert np.allclose(neuropod_prob, original_prob)
예제 #19
0
def test_model_save_reload_api(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder='rnn', cell_type='lstm',
                     num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(
        data_df,
        SPLIT
    )
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory='results'  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights,
                                    if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights,
                                    of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(output_dir, 'model'),
        backend=backend
    )
    check_model_equal(ludwig_model_exp)
예제 #20
0
def test_server_integration_with_audio(single_record, csv_filename):
    # Audio Inputs
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    # Resnet encoder
    input_features = [
        audio_feature(
            folder=audio_dest_folder,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=4), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.get("/")
    assert response.status_code == 200

    response = client.post("/predict")
    # expect the HTTP 400 error code for this situation
    assert response.status_code == 400
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    if single_record:
        # Single record prediction
        first_entry = data_df.T.to_dict()[0]
        data, files = convert_to_form(first_entry)
        server_response = client.post("/predict", data=data, files=files)
        assert server_response.status_code == 200
        server_response = server_response.json()

        server_response_keys = sorted(list(server_response.keys()))
        assert server_response_keys == sorted(output_keys_for(output_features))

        model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
        model_output = model_output.to_dict("records")[0]
        assert model_output == server_response
    else:
        # Batch prediction
        assert len(data_df) > 1
        files = convert_to_batch_form(data_df)
        server_response = client.post("/batch_predict", files=files)
        assert server_response.status_code == 200
        server_response = server_response.json()

        server_response_keys = sorted(server_response["columns"])
        assert server_response_keys == sorted(output_keys_for(output_features))
        assert len(data_df) == len(server_response["data"])

        model_output, _ = model.predict(dataset=data_df)
        model_output = model_output.to_dict("split")
        assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(audio_dest_folder, ignore_errors=True)
예제 #21
0
def test_savedmodel(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature()
        ]

        predictions_column_name = '{}_predictions'.format(
            output_features[0]['name'])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {
                'epochs': 2
            }
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(
            dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # save savedmodel
        #################
        savedmodel_path = os.path.join(dir_path, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################################################
        # restore savedmodel, obtain predictions and weights
        #################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: tf.convert_to_tensor(dataset.dataset[feature.proc_column],
                                       dtype=feature.get_input_dtype())
            for name, feature in ludwig_model.model.input_features.items()
        }

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(logits[of_name]['logits'],
                                         -1,
                                         name='predictions_{}'.format(of_name))
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string)

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(savedmodel_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # check for same number of weights as original model
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # check to ensure weight valuess match the original model
        loaded_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           loaded_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)

        restored_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           restored_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        assert loaded_weights_match and restored_weights_match

        #  Are predictions identical to original ones?
        loaded_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            loaded_prediction_df[predictions_column_name])

        restored_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            restored_predictions.numpy().astype('str'))

        assert loaded_predictions_match and restored_predictions_match
예제 #22
0
def test_torchscript(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        #################
        # save torchscript
        #################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )

        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check to weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])

        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)