Python CategoricalEncoder示例，datawig.column_encoders.CategoricalEncoder Python示例

示例#1

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_imputer_load_read_exec_only_dir(tmpdir, data_frame):
    """Check that ``Imputer.load`` works on a directory without write permission.

    An imputer is fitted with ``tmpdir`` as its output path, the directory is
    then restricted to read/execute for owner, group and others, and
    ``Imputer.load`` is expected to finish without raising ``AssertionError``.
    The original permission bits are restored afterwards so the temporary
    directory stays removable.
    """
    import stat

    # work with a plain string path; the original note says tests on the
    # shared build fleet fail around the tmpdir-to-string conversion
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'

    df = data_frame(feature, label, n_samples=100)

    # fit and output model + metrics to tmpdir
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)
    imputer.fit(train_df=df, num_epochs=1)

    # remember the current permission bits so they can be put back afterwards
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # make tmpdir read/exec-only by owner/group/others
    os.chmod(
        tmpdir, stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH | stat.S_IREAD
        | stat.S_IRGRP | stat.S_IROTH)

    try:
        Imputer.load(tmpdir)
    except AssertionError as e:
        print(e)
        pytest.fail(
            'Loading imputer from read-only directory should not fail.')
    finally:
        # give write access back, whatever the outcome of the test
        os.chmod(tmpdir, original_mode)

示例#2

0

显示文件

文件： test_imputer_iterators.py 项目： yadevi/datawig

def get_new_iterator_df(df):
    """Return a fresh ``ImputerIterDf`` over ``df`` with a batch size of 2.

    The module-level ``label_col`` is used both as the sequentially encoded
    data column (sequence length 2) and as the categorically encoded label
    column; both encoders are capped at the module-level ``max_tokens``.
    """
    sequential_inputs = [
        SequentialEncoder(label_col, max_tokens=max_tokens, seq_len=2)
    ]
    categorical_labels = [CategoricalEncoder(label_col, max_tokens=max_tokens)]
    return ImputerIterDf(df,
                         data_columns=sequential_inputs,
                         label_columns=categorical_labels,
                         batch_size=2)

示例#3

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_imputer_unrepresentative_test_df(test_dir, data_frame):
    """Fit with a test set that lacks one of the training labels.

    The label value of the first training row is removed from the test split
    before fitting, so the metrics and precision/recall curves are computed on
    data that does not contain it. Afterwards, predicting the training rows
    carrying that label with a 0.99 precision threshold must give an empty
    tuple for every row.
    """
    # generate some random data
    samples = data_frame(n_samples=100)

    train_part, test_part, _ = random_split(samples, [.8, .1, .1])

    # pick the label of the first training row and drop it from the test split
    held_out_label = train_part['labels'].values[0]
    test_part = test_part[test_part['labels'] != held_out_label]

    encoders_for_data = [BowEncoder('features')]
    encoders_for_label = [CategoricalEncoder('labels')]
    featurizers = [BowFeaturizer('features')]

    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment")

    unfitted = Imputer(data_featurizers=featurizers,
                       label_encoders=encoders_for_label,
                       data_encoders=encoders_for_data,
                       output_path=model_dir)
    imputer = unfitted.fit(train_df=train_part,
                           test_df=test_part,
                           num_epochs=10)

    # only the training rows whose label was never seen in the test split
    held_out_rows = train_part[train_part['labels'] == held_out_label]
    above_threshold = imputer.predict_above_precision(
        held_out_rows, precision_threshold=.99)
    imputations = above_threshold['labels']
    assert all(value == () for value in imputations)

示例#4

0

显示文件

文件： test_imputer.py 项目： jbdatascience/datawig

def test_not_explainable(test_dir, data_frame):
    """Check the error path of the explain methods for a non-explainable imputer.

    An imputer fitted with a ``BowEncoder`` data encoder must report
    ``is_explainable`` as false, and both ``explain`` and ``explain_instance``
    must raise ``ValueError`` whose first argument is
    'No explainable data encoders available.'.
    """
    label_col = 'label'
    df = data_frame(n_samples=100, label_col=label_col)

    data_encoder_cols = [BowEncoder('features')]
    label_encoder_cols = [CategoricalEncoder(label_col)]
    data_cols = [BowFeaturizer('features')]

    output_path = os.path.join(test_dir, "tmp", "out")

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df, num_epochs=1)

    assert not imputer.is_explainable

    expected_error = 'No explainable data encoders available.'

    # pytest.raises fails the test by itself when nothing is raised, which
    # replaces the former try / `raise pytest.fail(...)` / except construct
    with pytest.raises(ValueError) as excinfo:
        imputer.explain('some label')
    assert excinfo.value.args[0] == expected_error

    instance = pd.Series({'features': 'some feature text'})
    with pytest.raises(ValueError) as excinfo:
        imputer.explain_instance(instance)
    assert excinfo.value.args[0] == expected_error

示例#5

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_imputer_without_train_df(test_dir):
    """
    Test asserting that imputer.fit fails without training data or training data in wrong format

    ``fit`` is called once with a plain list and once with ``None``; both
    calls must raise ``ValueError``.
    """
    df_train = ['ffffffooooo']

    data_encoder_cols = [BowEncoder('item_name')]
    label_encoder_cols = [CategoricalEncoder('brand')]

    data_cols = [BowFeaturizer('item_name')]

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment")

    imputer = Imputer(
        data_featurizers=data_cols,
        label_encoders=label_encoder_cols,
        data_encoders=data_encoder_cols,
        output_path=output_path,
    )

    # The ``message`` keyword of pytest.raises was removed in pytest 5.0 and
    # now causes a TypeError. Calling pytest.fail after the statement under
    # test keeps the custom failure text when no ValueError is raised.
    failure_message = "Need a non-empty DataFrame for fitting Imputer model"

    for invalid_train_df in (df_train, None):
        with pytest.raises(ValueError):
            imputer.fit(train_df=invalid_train_df)
            pytest.fail(failure_message)

示例#6

0

显示文件

def test_drop_missing():
    """
    Exercise the name-mangled private helper ``__drop_missing_labels`` of Imputer.

    The label column repeats the pattern ``[1, None, nan, 2]`` four times.
    After fitting with a label encoder limited to one token, calling the
    helper with ``how='any'`` is expected to keep only rows 3, 7, 11 and 15.
    """
    frame = pd.DataFrame(
        {'label': [1, None, np.nan, 2] * 4, 'data': ['bla', 'drop', 'drop', 'fasl'] * 4})
    frame_copy = frame.copy()

    token_limit = int(2 ** 15)
    rows_per_batch = 16

    encoders_for_data = [BowEncoder('data', max_tokens=token_limit)]
    encoders_for_label = [CategoricalEncoder('label', max_tokens=1)]
    featurizers = [BowFeaturizer('data', vocab_size=token_limit)]

    model_dir = os.path.join(dir_path, "resources", "tmp", "real_data_experiment")

    unfitted = Imputer(
        data_featurizers=featurizers,
        label_encoders=encoders_for_label,
        data_encoders=encoders_for_data,
        output_path=model_dir
    )
    imputer = unfitted.fit(
        train_df=frame,
        test_df=frame_copy,
        batch_size=rows_per_batch
    )

    # access the private method through its mangled name
    remaining = imputer._Imputer__drop_missing_labels(frame, how='any')

    surviving_rows = (3, 7, 11, 15)
    expected = pd.DataFrame({'data': {row: 'fasl' for row in surviving_rows},
                             'label': {row: 2.0 for row in surviving_rows}})

    compared_columns = ['data', 'label']
    assert remaining[compared_columns].equals(expected[compared_columns])

示例#7

0

显示文件

def test_imputer_without_test_set_random_split():
    """
    Fit an imputer on a training frame only, without passing a test set.

    The test fails if ``fit`` raises a ``TypeError``; on success the output
    directory is removed again.
    """
    text_column = "string_feature"
    target_column = "label"

    sample_count = 5000
    label_count = 3
    words_per_sample = 20
    vocabulary = int(2 ** 10)

    # generate some random data
    train_frame = generate_string_data_frame(feature_col=text_column,
                                             label_col=target_column,
                                             vocab_size=vocabulary,
                                             num_labels=label_count,
                                             num_words=words_per_sample,
                                             n_samples=sample_count)

    fit_arguments = dict(train_df=train_frame,
                         learning_rate=1e-3,
                         num_epochs=1,
                         batch_size=64)

    encoders_for_data = [BowEncoder(text_column, max_tokens=vocabulary)]
    encoders_for_label = [CategoricalEncoder(target_column, max_tokens=label_count)]
    featurizers = [BowFeaturizer(text_column, vocab_size=vocabulary)]

    model_dir = os.path.join(dir_path, "resources", "tmp", "real_data_experiment")

    imputer = Imputer(
        data_featurizers=featurizers,
        label_encoders=encoders_for_label,
        data_encoders=encoders_for_data,
        output_path=model_dir
    )

    try:
        imputer.fit(**fit_arguments)
    except TypeError:
        pytest.fail("Didn't expect a TypeError exception with missing test data")

    # clean up the artifacts written during fitting
    shutil.rmtree(model_dir)

示例#8

0

显示文件

文件： test_imputer_iterators.py 项目： yadevi/datawig

def test_iter_decoder_df():
    """Decoding the first batch's labels must reproduce the first rows of the input.

    The column named by the module-level ``feature_col`` is filled with
    stringified integer parts of 1000 exponential draws and serves both as
    data column and as label column.
    """
    # draw skewed brands
    draws = np.random.exponential(scale=1, size=1000)
    records = [{feature_col: str(int(draw))} for draw in draws]

    brand_df = pd.DataFrame(records)
    it = ImputerIterDf(
        brand_df,
        data_columns=[SequentialEncoder(feature_col, max_tokens=10, seq_len=2)],
        label_columns=[CategoricalEncoder(feature_col, max_tokens=100)],
        batch_size=2)

    first_batch = next(it)
    decoded = it.decode(first_batch.label)
    expected_head = brand_df[feature_col].head(it.batch_size).values
    np.testing.assert_array_equal(decoded[0], expected_head)

示例#9

0

显示文件

def test_iter_padding_offset():
    """``start_padding_idx`` must equal the number of rows handed to the iterator.

    Half of a 36-row frame is sampled and iterated with a batch size of 32.
    """
    col = 'brand'
    draws = np.random.exponential(scale=1, size=36)
    df = pd.DataFrame([{col: str(int(draw))} for draw in draws])
    half = df.sample(frac=0.5)

    data_side = [BowEncoder(col)]
    label_side = [CategoricalEncoder(col, max_tokens=5)]
    it = ImputerIterDf(half,
                       data_columns=data_side,
                       label_columns=label_side,
                       batch_size=32)

    row_count = half.shape[0]
    assert it.start_padding_idx == row_count

示例#10

0

显示文件

def test_imputer_init():
    """Check argument validation and default paths of the ``Imputer`` constructor.

    Five invalid argument combinations must each raise ``ValueError``. A valid
    construction without ``output_path`` must then expose paths derived from
    the label column name ('brand'), and a label column named 'B Rand' must
    give the output path 'b_rand'.
    """
    # data_featurizers passed as a plain string, label_encoders as a list of strings
    with pytest.raises(ValueError):
        Imputer(data_featurizers='item_name', label_encoders=['brand'], data_encoders='')

    # label_encoders passed as a plain string
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders="brand",
                data_encoders='')

    # data_encoders passed as an empty string
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders=[CategoricalEncoder("brand")],
                data_encoders='')

    # data encoder on a column for which no featurizer was given
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders=[CategoricalEncoder("brand")],
                data_encoders=[BowEncoder('not_in_featurizers')])

    # data encoder on the label column
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders=[CategoricalEncoder("brand")],
                data_encoders=[BowEncoder('brand')])

    label_encoders = [CategoricalEncoder('brand', max_tokens=10)]
    data_featurizers = [LSTMFeaturizer('item_name'), EmbeddingFeaturizer('manufacturer')]

    data_encoders = [
        SequentialEncoder(
            'item_name'
        ),
        CategoricalEncoder(
            'manufacturer'
        )
    ]

    imputer = Imputer(
        data_featurizers=data_featurizers,
        label_encoders=label_encoders,
        data_encoders=data_encoders
    )

    # without an explicit output_path the paths are derived from the label column
    assert imputer.output_path == "brand"
    assert imputer.module_path == 'brand/model'
    assert imputer.metrics_path == 'brand/fit-test-metrics.json'

    imputer = Imputer(
        data_featurizers=data_featurizers,
        label_encoders=[CategoricalEncoder('B Rand', max_tokens=10)],
        data_encoders=data_encoders
    )
    assert imputer.output_path == "b_rand"

    shutil.rmtree("b_rand")

示例#11

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_imputer_tfidf(test_dir, data_frame):
    """Fit an imputer on TF-IDF encoded features and check its average precision.

    Metrics are computed on the training frame itself; 'avg_precision' for the
    label column must exceed 0.80.
    """
    target = 'label'
    frame = data_frame(n_samples=100, label_col=target)

    tfidf_encoders = [TfIdfEncoder('features')]
    target_encoders = [CategoricalEncoder(target)]
    featurizers = [BowFeaturizer('features')]

    model_dir = os.path.join(test_dir, "tmp", "out")

    unfitted = Imputer(data_featurizers=featurizers,
                       label_encoders=target_encoders,
                       data_encoders=tfidf_encoders,
                       output_path=model_dir)
    imputer = unfitted.fit(train_df=frame, num_epochs=1)

    # the first element of the returned pair is not needed here
    _, metrics = imputer.transform_and_compute_metrics(frame)
    label_metrics = metrics['label']
    assert label_metrics['avg_precision'] > 0.80

示例#12

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_inplace_prediction(test_dir, data_frame):
    """``predict(..., inplace=True)`` must return the very DataFrame object it was given."""
    target = 'label'
    frame = data_frame(n_samples=100, label_col=target)

    tfidf_encoders = [TfIdfEncoder('features')]
    target_encoders = [CategoricalEncoder(target)]
    featurizers = [BowFeaturizer('features')]

    model_dir = os.path.join(test_dir, "tmp", "out")

    unfitted = Imputer(data_featurizers=featurizers,
                       label_encoders=target_encoders,
                       data_encoders=tfidf_encoders,
                       output_path=model_dir)
    imputer = unfitted.fit(train_df=frame, num_epochs=1)

    result = imputer.predict(frame, inplace=True)

    # identity, not equality: no copy may have been made
    assert result is frame

示例#13

0

显示文件

文件： test_imputer.py 项目： stjordanis/datawig

def test_non_writable_output_path(test_dir, data_frame):
    """Load and predict from an output directory that is not writable.

    A fitted imputer is saved, then loaded and used for prediction twice:
    first with a read-only directory and a read-only ``imputer.log``, then
    with a read-only directory after the log file was removed. Any exception
    fails the test. Write access to the directory is restored in every case so
    it can be cleaned up.
    """
    label_col = 'label'
    df = data_frame(n_samples=100, label_col=label_col)

    data_encoder_cols = [TfIdfEncoder('features')]
    label_encoder_cols = [CategoricalEncoder(label_col)]
    data_cols = [BowFeaturizer('features')]

    output_path = os.path.join(test_dir, 'non_writable')

    Imputer(data_featurizers=data_cols,
            label_encoders=label_encoder_cols,
            data_encoders=data_encoder_cols,
            output_path=output_path).fit(train_df=df, num_epochs=1).save()

    from datawig.utils import logger

    log_path = os.path.join(output_path, "imputer.log")
    read_only = S_IREAD | S_IXUSR
    writable = S_IREAD | S_IXUSR | S_IWUSR

    try:
        # make output dir of imputer read-only
        os.chmod(output_path, read_only)

        # make log file read only
        os.chmod(log_path, S_IREAD)
        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")

        # remove log file (needs write access to the file and its directory)
        os.chmod(log_path, writable)
        os.chmod(output_path, writable)
        os.remove(log_path)

        # make output dir of imputer read-only
        os.chmod(output_path, read_only)

        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")
    except Exception as e:
        print(e)
        pytest.fail("This invocation should not raise any Exception")
    finally:
        # restore write access even when the test failed half-way through
        os.chmod(output_path, writable)

示例#14

0

显示文件

文件： test_imputer.py 项目： felixbiessmann/datawig

def test_imputer_load_with_invalid_context(tmpdir, data_frame):
    """Save an imputer whose ``ctx`` was set to ``None``, then load it and predict.

    The test passes when loading and predicting finish without an exception.
    """
    # work with a plain string path; the original note says tests on the
    # shared build fleet fail around the tmpdir-to-string conversion
    tmpdir = str(tmpdir)
    feature, label = 'feature', 'label'

    frame = data_frame(feature, label, n_samples=100)

    # fit and output model + metrics to tmpdir
    featurizers = [BowFeaturizer(feature)]
    target_encoders = [CategoricalEncoder(label)]
    input_encoders = [BowEncoder(feature)]
    imputer = Imputer(data_featurizers=featurizers,
                      label_encoders=target_encoders,
                      data_encoders=input_encoders,
                      output_path=tmpdir)
    imputer.fit(train_df=frame, num_epochs=1)

    # overwrite the context before persisting the imputer
    imputer.ctx = None
    imputer.save()

    restored = Imputer.load(tmpdir)
    _ = restored.predict(frame)

示例#15

0

显示文件

文件： test_imputer.py 项目： jbdatascience/datawig

def test_explain_instance_without_label(test_dir, data_frame):
    """``explain_instance`` must accept an instance that carries no label value.

    An imputer fitted on TF-IDF encoded features must report itself as
    explainable, and explaining a series with only the 'features' entry must
    not raise.
    """
    target = 'label'
    frame = data_frame(n_samples=100, label_col=target)

    tfidf_encoders = [TfIdfEncoder('features')]
    target_encoders = [CategoricalEncoder(target)]
    featurizers = [BowFeaturizer('features')]

    model_dir = os.path.join(test_dir, "tmp", "out")

    unfitted = Imputer(data_featurizers=featurizers,
                       label_encoders=target_encoders,
                       data_encoders=tfidf_encoders,
                       output_path=model_dir)
    imputer = unfitted.fit(train_df=frame, num_epochs=1)

    assert imputer.is_explainable

    unlabeled = pd.Series({'features': 'some feature text'})
    # reaching the end without an exception is the success criterion
    _ = imputer.explain_instance(unlabeled)

示例#16

0

显示文件

文件： test_imputer.py 项目： stjordanis/datawig

def test_fit_resumes(test_dir, data_frame):
    """Fit the same imputer twice and compare the resulting modules.

    ``imputer.module`` must be ``None`` before the first fit, and the module
    seen after the second fit must compare equal to the one seen after the
    first.
    """
    feature_col = "feature"
    label_col = "label"

    frame = data_frame(feature_col=feature_col, label_col=label_col)

    input_encoders = [TfIdfEncoder([feature_col])]
    featurizers = [datawig.mxnet_input_symbols.BowFeaturizer(feature_col)]
    target_encoders = [CategoricalEncoder(label_col)]

    imputer = Imputer(data_encoders=input_encoders,
                      data_featurizers=featurizers,
                      label_encoders=target_encoders,
                      output_path=test_dir)

    assert imputer.module is None

    # collect the module attribute after each of two identical fit calls
    modules_after_fit = []
    for _ in range(2):
        imputer.fit(frame, num_epochs=20)
        modules_after_fit.append(imputer.module)

    module_after_first, module_after_second = modules_after_fit
    assert module_after_first == module_after_second

示例#17

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_imputer_fit_fail_non_writable_output_dir(tmpdir, data_frame):
    """Check that ``fit`` raises ``AssertionError`` for a non-writable output directory.

    The imputer is created with ``tmpdir`` as output path, the directory is
    restricted to read/execute for owner, group and others, and ``fit`` must
    then fail. The original permission bits are restored afterwards so the
    temporary directory stays removable.
    """
    import stat

    # work with a plain string path; the original note says tests on the
    # shared build fleet fail around the tmpdir-to-string conversion
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'
    df = data_frame(feature, label, n_samples=100)

    # the imputer would output model + metrics to tmpdir
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)

    # remember the current permission bits so they can be put back afterwards
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # make tmpdir read/exec-only by owner/group/others
    os.chmod(
        tmpdir, stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH | stat.S_IREAD
        | stat.S_IRGRP | stat.S_IROTH)

    try:
        # fail if imputer.fit does not raise an AssertionError
        with pytest.raises(AssertionError):
            imputer.fit(df, num_epochs=1)
    finally:
        # give write access back, whatever the outcome of the test
        os.chmod(tmpdir, original_mode)

示例#18

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_mxnet_module_wrapper(data_frame):
    """Build a module through ``_MXNetModule`` and inspect its names and parameters.

    With no hidden units in the final fully connected part, the module must
    expose the label column as its only label name, the feature column as its
    only data name, and exactly two argument parameters.
    """
    from datawig.imputer import _MXNetModule
    import mxnet as mx
    from datawig.iterators import ImputerIterDf

    feature_col = "feature"
    label_col = "label"
    frame = data_frame(n_samples=100,
                       feature_col=feature_col,
                       label_col=label_col)

    target_encoders = [CategoricalEncoder(label_col)]
    input_encoders = [BowEncoder(feature_col)]
    featurizers = [BowFeaturizer(feature_col, vocab_size=100)]
    train_iterator = ImputerIterDf(frame, input_encoders, target_encoders)

    # the wrapper is instantiated first and then called with the iterator
    module_builder = _MXNetModule(mx.current_context(),
                                  target_encoders,
                                  featurizers,
                                  final_fc_hidden_units=[])
    mod = module_builder(train_iterator)

    assert mod._label_names == [label_col]
    assert mod.data_names == [feature_col]
    # weights and biases
    parameter_count = len(mod._arg_params)
    assert parameter_count == 2

示例#19

0

显示文件

def get_new_iterator_df_bow(df):
    """Return a fresh ``ImputerIterDf`` over ``df`` with a batch size of 2.

    The module-level ``feature_col`` is bag-of-words encoded (capped at
    ``max_tokens``) and ``label_col`` is categorically encoded (capped at
    ``num_labels``).
    """
    bow_inputs = [BowEncoder(feature_col, max_tokens=max_tokens)]
    categorical_labels = [CategoricalEncoder(label_col, max_tokens=num_labels)]
    return ImputerIterDf(df,
                         data_columns=bow_inputs,
                         label_columns=categorical_labels,
                         batch_size=2)

示例#20

0

显示文件

def test_imputer_image_data():
    """Fit imputers on image input, alone and combined with a numeric column.

    One test image per colour is written below the module-level ``dir_path``;
    a 32-row frame pairs image file paths with the colour name as label. Both
    fits only have to complete; the created directories are removed afterwards.
    """
    img_path = os.path.join(dir_path, "resources", "test_images")
    os.makedirs(img_path, exist_ok=True)

    colors = ['red', 'green', 'blue']
    for color in colors:
        image_file = os.path.join(img_path, color + ".png")
        create_test_image(image_file, color)

    n_samples = 32
    color_labels = [random.choice(colors) for _ in range(n_samples)]

    df = pd.DataFrame({"image_files": color_labels,
                       "label": color_labels})

    # turn the colour names into paths of the generated image files
    df['image_files'] = img_path + "/" + df['image_files'] + ".png"

    # hyper-parameters shared by both fit calls below
    shared_fit_options = dict(
        learning_rate=1e-3,
        num_epochs=2,
        patience=5,
        test_split=.1,
        weight_decay=.0001,
        batch_size=16
    )

    image_only_dir = os.path.join(dir_path, "resources", "tmp", "experiment_images")

    image_encoders = [ImageEncoder(['image_files'])]
    image_featurizers = [ImageFeaturizer('image_files')]
    target_encoders = [CategoricalEncoder(['label'])]

    image_imputer = Imputer(
        data_featurizers=image_featurizers,
        label_encoders=target_encoders,
        data_encoders=image_encoders,
        output_path=image_only_dir
    ).fit(train_df=df, **shared_fit_options)

    shutil.rmtree(image_only_dir)

    # Test with image + numeric inputs
    df['numeric'] = np.random.uniform(-np.pi, np.pi, (n_samples,))

    combined_dir = os.path.join(dir_path, "resources", "tmp", "experiment_images_with_num")

    combined_encoders = [ImageEncoder(['image_files']), NumericalEncoder(['numeric'])]
    combined_featurizers = [ImageFeaturizer('image_files'),
                            NumericalFeaturizer('numeric', latent_dim=100)]
    target_encoders = [CategoricalEncoder(['label'])]

    combined_imputer = Imputer(
        data_featurizers=combined_featurizers,
        label_encoders=target_encoders,
        data_encoders=combined_encoders,
        output_path=combined_dir
    ).fit(train_df=df, **shared_fit_options)

    shutil.rmtree(img_path)
    shutil.rmtree(combined_dir)

示例#21

0

显示文件

文件： test_calibration.py 项目： andrey-tpt/datawig

def test_automatic_calibration(data_frame):
    """
    Fit a model with bag-of-words, LSTM and embedding featurizers and assert
    that 'ece_pre' in ``calibration_info`` is larger than 'ece_post'.
    """
    text_column = "string_feature"
    category_column = "categorical_feature"
    target_column = "label"

    sample_count = 2000
    label_count = 3
    sequence_length = 20
    vocabulary = int(2**10)

    lstm_latent_dim = 30
    embedding_dim = 30

    # generate some random data
    samples = data_frame(feature_col=text_column,
                         label_col=target_column,
                         vocab_size=vocabulary,
                         num_labels=label_count,
                         num_words=sequence_length,
                         n_samples=sample_count)

    # the first two characters of each label act as a dummy categorical input
    samples[category_column] = samples[target_column].apply(
        lambda label: label[:2])

    train_part, _unused_test_part, validation_part = random_split(
        samples, [.8, .1, .1])

    bow_output = text_column + "_bow"
    lstm_output = text_column + "_lstm"

    bow_encoder = BowEncoder(text_column, bow_output, max_tokens=vocabulary)
    sequence_encoder = SequentialEncoder(text_column,
                                         lstm_output,
                                         max_tokens=vocabulary,
                                         seq_len=sequence_length)
    category_encoder = CategoricalEncoder(category_column,
                                          max_tokens=label_count)
    input_encoders = [bow_encoder, sequence_encoder, category_encoder]

    target_encoders = [CategoricalEncoder(target_column, max_tokens=label_count)]

    bow_featurizer = BowFeaturizer(bow_output, vocab_size=vocabulary)
    lstm_featurizer = LSTMFeaturizer(field_name=lstm_output,
                                     seq_len=sequence_length,
                                     latent_dim=lstm_latent_dim,
                                     num_hidden=30,
                                     embed_dim=embedding_dim,
                                     num_layers=2,
                                     vocab_size=label_count)
    embedding_featurizer = EmbeddingFeaturizer(field_name=category_column,
                                               embed_dim=embedding_dim,
                                               vocab_size=label_count)
    featurizers = [bow_featurizer, lstm_featurizer, embedding_featurizer]

    unfitted = Imputer(data_featurizers=featurizers,
                       label_encoders=target_encoders,
                       data_encoders=input_encoders)
    imputer = unfitted.fit(train_df=train_part,
                           test_df=validation_part,
                           learning_rate=1e-2,
                           num_epochs=20,
                           batch_size=32)

    error_before = imputer.calibration_info['ece_pre']
    error_after = imputer.calibration_info['ece_post']
    assert error_before > error_after

示例#22

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_imputer_duplicate_encoder_output_columns(test_dir, data_frame):
    """
    Expect a ValueError when two data encoders are given the same column names.

    Both the bag-of-words and the sequential encoder receive ``feature_col``
    as first and second positional argument (presumably input and output
    column; confirm against the encoder signature). Creating and fitting the
    imputer with this setup must raise ``ValueError``.
    """
    text_column = "string_feature"
    category_column = "categorical_feature"
    target_column = "label"

    sample_count = 1000
    label_count = 10
    sequence_length = 100
    vocabulary = int(2**10)

    lstm_latent_dim = 30
    embedding_dim = 30

    # generate some random data
    samples = data_frame(feature_col=text_column,
                         label_col=target_column,
                         vocab_size=vocabulary,
                         num_labels=label_count,
                         num_words=sequence_length,
                         n_samples=sample_count)

    # the first two characters of each label act as a dummy categorical input
    samples[category_column] = samples[target_column].apply(
        lambda label: label[:2])

    train_part, _unused_test_part, validation_part = random_split(
        samples, [.8, .1, .1])

    # both text encoders deliberately share the same column names
    bow_encoder = BowEncoder(text_column, text_column, max_tokens=vocabulary)
    sequence_encoder = SequentialEncoder(text_column,
                                         text_column,
                                         max_tokens=vocabulary,
                                         seq_len=sequence_length)
    category_encoder = CategoricalEncoder(category_column,
                                          max_tokens=label_count)
    input_encoders = [bow_encoder, sequence_encoder, category_encoder]

    target_encoders = [CategoricalEncoder(target_column, max_tokens=label_count)]

    bow_featurizer = BowFeaturizer(text_column, vocab_size=vocabulary)
    lstm_featurizer = LSTMFeaturizer(field_name=text_column,
                                     seq_len=sequence_length,
                                     latent_dim=lstm_latent_dim,
                                     num_hidden=30,
                                     embed_dim=embedding_dim,
                                     num_layers=2,
                                     vocab_size=label_count)
    embedding_featurizer = EmbeddingFeaturizer(field_name=category_column,
                                               embed_dim=embedding_dim,
                                               vocab_size=label_count)
    featurizers = [bow_featurizer, lstm_featurizer, embedding_featurizer]

    model_dir = os.path.join(test_dir, "tmp",
                             "imputer_experiment_synthetic_data")

    fit_options = dict(learning_rate=1e-3, num_epochs=20, batch_size=16)

    # construction and fitting both happen inside the expectation block
    with pytest.raises(ValueError) as excinfo:
        imputer = Imputer(data_featurizers=featurizers,
                          label_encoders=target_encoders,
                          data_encoders=input_encoders,
                          output_path=model_dir)
        imputer.fit(train_df=train_part,
                    test_df=validation_part,
                    **fit_options)

示例#23

0

显示文件

文件： test_imputer.py 项目： andrey-tpt/datawig

def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as inputs
    this could be run as part of integration test suite.

    Checks, in order: ``transform`` returns one prediction per test row and
    every prediction matches the true label; ``predict_proba_top_k`` runs;
    'avg_f1' exceeds 0.9 for the fitted and for the re-loaded imputer; and an
    imputer trained on only 50 rows still produces the ``<label>_imputed`` and
    ``<label>_imputed_proba`` columns from ``predict``.
    """

    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col + "_lstm",
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    assert len(pred[label_col]) == len_df_before_predict

    # every single prediction has to match the true label
    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)

    assert metrics[label_col]['avg_f1'] > 0.9

    # the imputer re-loaded from disk must reach the same quality bar
    deserialized = Imputer.load(imputer.output_path)

    _, metrics_deserialized = deserialized.transform_and_compute_metrics(
        df_test)

    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get an imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(
                                         train_df=df_train[:50],
                                         test_df=df_test,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         batch_size=batch_size,
                                         calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(
        df_test, precision_threshold=.5, imputation_suffix="_imputed")

    # Index.contains() was removed in pandas 1.0; the `in` operator performs
    # the same membership check on all pandas versions
    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns