Example #1
def test_incorrect_input_features_config():
    config = {
        "input_features": [
            category_feature(preprocessing={"normalization": "zscore"}),
        ],
        "output_features": [binary_feature()],
    }

    # Not a preprocessing param for category feature
    with pytest.raises(ValidationError):
        validate_config(config)

    config = {
        "input_features": [
            text_feature(preprocessing={"padding_symbol": 0}),
        ],
        "output_features": [binary_feature()],
    }

    # Incorrect type for padding_symbol preprocessing param
    with pytest.raises(ValidationError):
        validate_config(config)

    config = {
        "input_features": [
            binary_feature(),
        ],
        "output_features": [binary_feature()],
    }
    del config["input_features"][0]["type"]

    # Missing required "type" key for the input feature
    with pytest.raises(ValidationError):
        validate_config(config)
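For contrast with the failing cases above, a minimal sketch of a config that should validate cleanly, assuming the same helpers (category_feature, binary_feature, validate_config) and assuming "most_common" is a valid category preprocessing parameter:

# Hypothetical counterpart to the failures above: expected to pass validation.
config = {
    "input_features": [
        # "most_common" is assumed to be a valid category preprocessing param
        category_feature(preprocessing={"most_common": 10}),
    ],
    "output_features": [binary_feature()],
}
validate_config(config)  # should not raise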
Example #2
def test_config_fill_values():
    vector_fill_values = ["1.0 0.0 1.04 10.49", "1 2 3 4 5" "0" "1.0" ""]
    binary_fill_values = ["yes", "No", "1", "TRUE", 1]
    for vector_fill_value, binary_fill_value in zip(vector_fill_values,
                                                    binary_fill_values):
        config = {
            "input_features": [
                vector_feature(
                    preprocessing={"fill_value": vector_fill_value}),
            ],
            "output_features":
            [binary_feature(preprocessing={"fill_value": binary_fill_value})],
        }
        validate_config(config)

    bad_vector_fill_values = ["one two three", "1,2,3", 0]
    bad_binary_fill_values = ["one", 2, "maybe"]
    for vector_fill_value, binary_fill_value in zip(
            vector_fill_values[:3] + bad_vector_fill_values,
            bad_binary_fill_values + binary_fill_values[:3]):
        config = {
            "input_features": [
                vector_feature(
                    preprocessing={"fill_value": vector_fill_value}),
            ],
            "output_features":
            [binary_feature(preprocessing={"fill_value": binary_fill_value})],
        }
        with pytest.raises(ValidationError):
            validate_config(config)
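The test data above implies the parsing rules being validated: a vector fill_value is a whitespace-separated string of numbers (the empty string included), while a binary fill_value must be a boolean-like token. A standalone sketch of the vector rule in plain Python (an illustration of the apparent rule, not Ludwig's actual validator code):

def looks_like_vector_fill(value):
    # Heuristic mirror of the schema rule suggested by the test data.
    if not isinstance(value, str):
        return False  # e.g. the bad value 0
    try:
        [float(token) for token in value.split()]
        return True
    except ValueError:
        return False  # e.g. "one two three" or "1,2,3"

assert looks_like_vector_fill("1.0 0.0 1.04 10.49")
assert looks_like_vector_fill("")  # no tokens, vacuously numeric
assert not looks_like_vector_fill("1,2,3")
assert not looks_like_vector_fill(0)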
Example #3
def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}

    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)
    df = read_csv_with_nan(training_data_csv_path, nan_percent=0.1)

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)
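The test only checks that preprocessing completes without error; a hypothetical extension (not part of the original test) could additionally assert that rows were dropped, reusing the four-value return and the to_df() accessor that appear in other examples on this page:

    # Hypothetical extension of the test body: verify DROP_ROW removed rows.
    train_ds, val_ds, test_ds, _ = ludwig_model.preprocess(dataset=df)
    n_processed = (len(train_ds.to_df()) + len(val_ds.to_df())
                   + len(test_ds.to_df()))
    assert n_processed < len(df)  # nan_percent=0.1 should trigger drops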
Example #4
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]
    encoder = 'parallel_cnn'

    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    output_feature_name = output_features[0]['name']
    input_features[0]['encoder'] = encoder
    model = run_api_experiment(input_features, output_features)
    data_df = read_csv(data_csv)
    model.train(data_df=data_df)
    test_stats = model.test(data_df=data_df)[1]
    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern = model.exp_dir_name + '/*.{}'.format(
            viz_output)
        visualize.roc_curves_from_test_statistics(
            [test_stats, test_stats],
            output_feature_name,
            model_names=['Model1', 'Model2'],
            output_directory=model.exp_dir_name,
            file_format=viz_output)
        figure_cnt = glob.glob(vis_output_pattern)
        assert 1 == len(figure_cnt)
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
Example #5
File: test_schema.py Project: cxz/ludwig
def test_config_fill_values():
    vector_fill_values = ['1.0 0.0 1.04 10.49', '1 2 3 4 5', '0', '1.0', '']
    binary_fill_values = ['yes', 'No', '1', 'TRUE', 1]
    for vector_fill_value, binary_fill_value in zip(vector_fill_values,
                                                    binary_fill_values):
        config = {
            'input_features': [
                vector_feature(
                    preprocessing={'fill_value': vector_fill_value}),
            ],
            'output_features':
            [binary_feature(preprocessing={'fill_value': binary_fill_value})],
        }
        validate_config(config)

    bad_vector_fill_values = ['one two three', '1,2,3', 0]
    bad_binary_fill_values = ['one', 2, 'maybe']
    for vector_fill_value, binary_fill_value in zip(
            vector_fill_values[:3] + bad_vector_fill_values,
            bad_binary_fill_values + binary_fill_values[:3]):
        config = {
            'input_features': [
                vector_feature(
                    preprocessing={'fill_value': vector_fill_value}),
            ],
            'output_features':
            [binary_feature(preprocessing={'fill_value': binary_fill_value})],
        }
        with pytest.raises(ValidationError):
            validate_config(config)
Example #6
def test_torchscript_e2e_tabnet_combiner(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        COMBINER: {
            "type": "tabnet",
            "num_total_blocks": 2,
            "num_shared_blocks": 2,
        },
        TRAINER: {
            "epochs": 2
        },
    }

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend,
                                 training_data_csv_path)
Example #7
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]

    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    output_feature_name = output_features[0]['name']

    model = run_api_experiment(input_features, output_features)
    data_df = read_csv(data_csv)
    _, _, output_dir = model.train(dataset=data_df)
    # extract test metrics
    test_stats, _, _ = model.evaluate(dataset=data_df,
                                      collect_overall_stats=True,
                                      output_directory=output_dir)
    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern = os.path.join(output_dir, '*.{}'.format(
            viz_output))
        visualize.roc_curves_from_test_statistics(
            [test_stats, test_stats],
            output_feature_name,
            model_names=['Model1', 'Model2'],
            output_directory=output_dir,
            file_format=viz_output
        )
        figure_cnt = glob.glob(vis_output_pattern)
        assert 1 == len(figure_cnt)
    shutil.rmtree(output_dir, ignore_errors=True)
Example #8
File: test_schema.py Project: cxz/ludwig
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #9
def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #10
def test_torchscript_e2e_tabular(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    bin_str_feature = binary_feature()
    transformed_number_features = [
        number_feature(preprocessing={"normalization": numeric_transformer})
        for numeric_transformer in numeric_transformation_registry.keys()
    ]
    input_features = [
        bin_str_feature,
        binary_feature(),
        *transformed_number_features,
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        # TODO: future support
        # date_feature(),
        # h3_feature(),
    ]
    output_features = [
        bin_str_feature,
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    # Convert bool values to strings, e.g., {'Yes', 'No'}
    df = pd.read_csv(training_data_csv_path)
    false_value, true_value = "No", "Yes"
    df[bin_str_feature[NAME]] = df[bin_str_feature[NAME]].map(
        lambda x: true_value if x else false_value)
    df.to_csv(training_data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend,
                                 training_data_csv_path)
Example #11
def test_ray_calibration(calibration):
    input_features = [
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
    ]
    output_features = [
        binary_feature(calibration=calibration),
        category_feature(vocab_size=3, calibration=calibration),
    ]
    run_test_with_features(
        input_features,
        output_features,
    )
Example #12
def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
Example #13
def test_strip_whitespace_category(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    input_features = [binary_feature()]
    cat_feat = category_feature(vocab_size=3)
    output_features = [cat_feat]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # prefix with whitespace
    df[cat_feat[COLUMN]] = df[cat_feat[COLUMN]].apply(lambda s: " " + s)

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    train_ds, _, _, metadata = ludwig_model.preprocess(dataset=df)

    # expect values containing whitespaces to be properly mapped to vocab_size unique values
    assert len(np.unique(
        train_ds.dataset[cat_feat[PROC_COLUMN]])) == cat_feat["vocab_size"]
Example #14
def test_empty_split_error(backend, tmpdir):
    """Tests that an error is raised if one or more of the splits is empty after preprocessing."""
    data_csv_path = os.path.join(tmpdir, "data.csv")

    out_feat = binary_feature()
    input_features = [number_feature()]
    output_features = [out_feat]
    config = {
        "input_features": input_features,
        "output_features": output_features
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # Convert all the output features rows to null. Because the default missing value strategy is to drop empty output
    # rows, this will result in the dataset being empty after preprocessing.
    df[out_feat[COLUMN]] = None

    with init_backend(backend):
        ludwig_model = LudwigModel(config, backend=backend)
        with pytest.raises(ValueError,
                           match="Dataset is empty following preprocessing"):
            ludwig_model.preprocess(dataset=df)
Example #15
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINING] = {"batch_size": "42"}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else default_early_stop
    assert merged_config[TRAINING]["early_stop"] == expected
Example #16
def test_torchscript_e2e_audio(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")

    input_features = [
        audio_feature(audio_dest_folder),
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    # NOTE: audio preprocessing mismatches by very small margins ~O(1e-6) but causes flakiness in e2e test.
    # Increasing tolerance is a workaround to reduce flakiness for now.
    # TODO: remove this workaround when audio preprocessing is fixed.
    validate_torchscript_outputs(tmpdir,
                                 config,
                                 backend,
                                 training_data_csv_path,
                                 tolerance=1e-6)
Example #17
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        INPUT_FEATURES: all_input_features,
        OUTPUT_FEATURES: all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINER] = {"batch_size": 42}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT][EXECUTOR][SCHEDULER] = SCHEDULER_DICT

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else ECDTrainerConfig().early_stop
    assert merged_config[TRAINER]["early_stop"] == expected
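The use_train and use_hyperopt_scheduler flags are presumably supplied by pytest parametrization in the upstream suite; a plausible decorator stack (assumed, not visible in this snippet) exercising all four combinations:

@pytest.mark.parametrize("use_train", [False, True])
@pytest.mark.parametrize("use_hyperopt_scheduler", [False, True])
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    ...  # body as above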
Example #18
def test_number_feature_wrong_dtype(csv_filename, tmpdir):
    """Tests that a number feature with all string values is treated as having missing values by default."""
    data_csv_path = os.path.join(tmpdir, csv_filename)

    num_feat = number_feature()
    input_features = [num_feat]
    output_features = [binary_feature()]
    config = {
        "input_features": input_features,
        "output_features": output_features
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # convert numbers to random strings
    def random_string():
        letters = string.ascii_lowercase
        return "".join(random.choice(letters) for _ in range(10))

    df[num_feat[COLUMN]] = df[num_feat[COLUMN]].apply(
        lambda _: random_string())

    # run preprocessing
    backend = LocalTestBackend()
    ludwig_model = LudwigModel(config, backend=backend)
    train_ds, val_ds, test_ds, _ = ludwig_model.preprocess(dataset=df)

    concatenated_df = concatenate_df(train_ds.to_df(), val_ds.to_df(),
                                     test_ds.to_df(), backend)

    # check that invalid values across all splits were replaced with the missing value (0.0)
    assert len(concatenated_df) == len(df)
    assert np.all(concatenated_df[num_feat[PROC_COLUMN]] == 0.0)
Example #19
def test_binary_feature(enc_encoder):
    # synthetic binary tensor
    binary_tensor = torch.randn([BATCH_SIZE, SEQ_SIZE],
                                dtype=torch.float32)

    # generate binary feature config
    binary_feature_config = binary_feature(
        folder='.',
        encoder=enc_encoder,
        max_sequence_length=SEQ_SIZE
    )

    # instantiate binary input feature object
    binary_input_feature = BinaryInputFeature(binary_feature_config)

    # pass synthetic binary tensor through the input feature
    encoder_output = binary_input_feature(binary_tensor)

    # confirm correctness of the binary encoder output
    assert isinstance(encoder_output, dict)
    assert 'encoder_output' in encoder_output
    assert isinstance(encoder_output['encoder_output'], torch.Tensor)
    if enc_encoder == 'passthrough':
        assert encoder_output['encoder_output'].shape \
               == (BATCH_SIZE, 1, SEQ_SIZE)
    else:
        assert encoder_output['encoder_output'].shape \
               == (BATCH_SIZE, DEFAULT_FC_SIZE)
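The snippet depends on module-level constants that are not shown; self-contained, illustrative definitions (the actual values in the test module may differ):

# Assumed module-level constants referenced by the test; values illustrative.
BATCH_SIZE = 2
SEQ_SIZE = 10
DEFAULT_FC_SIZE = 256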
Example #20
def test_experiment_various_feature_types(csv_filename):
    input_features = [binary_feature(), bag_feature()]
    output_features = [set_feature(max_len=3, vocab_size=5)]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)
Example #21
def test_experiment_vector_feature_1(csv_filename):
    input_features = [vector_feature()]
    output_features = [binary_feature()]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    run_experiment(input_features, output_features, data_csv=rel_path)
Example #22
def test_missing_value_prediction(csv_filename):
    random.seed(1)
    np.random.seed(1)
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [
            category_feature(
                vocab_size=2,
                reduce_input="sum",
                preprocessing=dict(missing_value_strategy="fill_with_mode"))
        ]
        output_features = [binary_feature()]

        dataset = pd.read_csv(
            generate_data(input_features, output_features, csv_filename))

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
        }
        model = LudwigModel(config)
        _, _, output_dir = model.train(dataset=dataset,
                                       output_directory=tmpdir)

        # Set the input column to None, we should be able to replace the missing value with the mode
        # from the training set
        dataset[input_features[0]["name"]] = None
        model.predict(dataset=dataset)

        model = LudwigModel.load(os.path.join(output_dir, "model"))
        model.predict(dataset=dataset)
Example #23
def run_test_gbm_multiple_outputs(tmpdir, backend_config):
    """Test that an error is raised when the model is trained with multiple outputs."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_features = [
        category_feature(vocab_size=3),
        binary_feature(),
        category_feature(vocab_size=3),
    ]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    model = LudwigModel(config, backend=backend_config)
    with pytest.raises(ValueError,
                       match="Only single task currently supported"):
        model.train(dataset=dataset_filename, output_directory=tmpdir)
Example #24
def test_config_trainer_bad_optimizer():
    config = {
        "input_features": [
            category_feature(vocab_size=2, reduce_input="sum"),
            number_feature(),
        ],
        "output_features": [binary_feature(weight_regularization=None)],
        "combiner": {
            "type": "tabnet",
        },
        TRAINER: {},
    }
    validate_config(config)

    # Test manually set-to-null optimizer vs unspecified:
    config[TRAINER]["optimizer"] = None
    with pytest.raises(ValidationError):
        validate_config(config)
    assert ECDTrainerConfig.Schema().load({}).optimizer is not None

    # Test all types in optimizer_registry supported:
    for key in optimizer_registry.keys():
        config[TRAINER]["optimizer"] = {"type": key}
        validate_config(config)

    # Test invalid optimizer type:
    config[TRAINER]["optimizer"] = {"type": 0}
    with pytest.raises(ValidationError):
        validate_config(config)
    config[TRAINER]["optimizer"] = {"type": {}}
    with pytest.raises(ValidationError):
        validate_config(config)
    config[TRAINER]["optimizer"] = {"type": "invalid"}
    with pytest.raises(ValidationError):
        validate_config(config)
Example #25
def test_binary_predictions(tmpdir, backend, distinct_values):
    input_features = [
        category_feature(vocab_size=3),
    ]

    feature = binary_feature()
    output_features = [
        feature,
    ]

    data_csv_path = generate_data(
        input_features,
        output_features,
        os.path.join(tmpdir, "dataset.csv"),
        num_examples=100,
    )
    data_df = pd.read_csv(data_csv_path)

    # Optionally convert bool values to strings, e.g., {'Yes', 'No'}
    false_value, true_value = distinct_values
    data_df[feature[NAME]] = data_df[feature[NAME]].map(
        lambda x: true_value if x else false_value)
    data_df.to_csv(data_csv_path, index=False)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 1
        }
    }

    patch_args = (
        "ludwig.features.binary_feature.BinaryOutputFeature.logits",
        partial(random_binary_logits, num_predict_samples=len(data_df)),
    )

    preds_df, _ = predict_with_backend(tmpdir,
                                       config,
                                       data_csv_path,
                                       backend,
                                       patch_args=patch_args)
    cols = set(preds_df.columns)
    assert f"{feature[NAME]}_predictions" in cols
    assert f"{feature[NAME]}_probabilities_{str(false_value)}" in cols
    assert f"{feature[NAME]}_probabilities_{str(true_value)}" in cols
    assert f"{feature[NAME]}_probability" in cols

    for pred, prob_0, prob_1, prob in zip(
            preds_df[f"{feature[NAME]}_predictions"],
            preds_df[f"{feature[NAME]}_probabilities_{str(false_value)}"],
            preds_df[f"{feature[NAME]}_probabilities_{str(true_value)}"],
            preds_df[f"{feature[NAME]}_probability"],
    ):
        assert pred == false_value or pred == true_value
        if pred == true_value:
            assert prob_1 == prob
        else:
            assert prob_0 == prob
        assert np.allclose(prob_0, 1 - prob_1)
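Here distinct_values is a (false_value, true_value) pair injected by the test harness; an assumed parametrization consistent with the comment about {'Yes', 'No'} (the actual upstream decorator is not shown):

@pytest.mark.parametrize("distinct_values", [(False, True), ("No", "Yes")])
def test_binary_predictions(tmpdir, backend, distinct_values):
    ...  # body as above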
Example #26
def test_missing_value_prediction(csv_filename):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [
            category_feature(
                vocab_size=2,
                reduce_input='sum',
                preprocessing=dict(missing_value_strategy='fill_with_mode'))
        ]
        output_features = [binary_feature()]

        dataset = pd.read_csv(
            generate_data(input_features, output_features, csv_filename))

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
        }
        model = LudwigModel(config)
        _, _, output_dir = model.train(dataset=dataset,
                                       output_directory=tmpdir)

        # Set the input column to None, we should be able to replace the missing value with the mode
        # from the training set
        dataset[input_features[0]['name']] = None
        model.predict(dataset=dataset)

        model = LudwigModel.load(os.path.join(output_dir, 'model'))
        model.predict(dataset=dataset)
Example #27
def test_missing_values_fill_with_mean(backend, csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": FILL_WITH_MEAN}}
    input_features = [
        number_feature(**kwargs),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [binary_feature()]
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)

    config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
    with init_backend(backend):
        # run preprocessing
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.preprocess(dataset=training_data_csv_path)
Example #28
def test_experiment_timeseries(csv_filename):
    input_features = [timeseries_feature()]
    output_features = [binary_feature()]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    input_features[0]["encoder"] = "transformer"
    run_experiment(input_features, output_features, dataset=rel_path)
Example #29
def test_ray_tabular():
    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(),
        numerical_feature(normalization="zscore"),
    ]
    run_test_parquet(input_features, output_features)
Example #30
def test_experiment_h3(csv_filename):
    input_features = [h3_feature()]
    output_features = [binary_feature()]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    for encoder in h3_encoder_registry:
        input_features[0]['encoder'] = encoder
        run_experiment(input_features, output_features, data_csv=rel_path)