Пример #1
0
def test_imputer_load_read_exec_only_dir(tmpdir, data_frame):
    """Check that ``Imputer.load`` works on a read/execute-only directory.

    An imputer is fitted for one epoch with ``tmpdir`` as its output path.
    The directory is then stripped of all write permissions and the imputer
    is loaded back from it; an ``AssertionError`` during loading fails the
    test. The directory's original permissions are restored afterwards so
    the temporary directory can be cleaned up regardless of the outcome.

    :param tmpdir: temporary directory fixture, used as the output path
    :param data_frame: fixture producing the training data frame
    """
    import stat

    # The path is passed around as a plain string; the original note here
    # mentions test failures on the shared build fleet related to the
    # tmpdir-to-string conversion.
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'

    df = data_frame(feature, label, n_samples=100)

    # fit and output model + metrics to tmpdir
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)
    imputer.fit(train_df=df, num_epochs=1)

    # remember the current permission bits so they can be put back later
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # read/exec-only for owner/group/others
    read_exec_only = (stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
                      | stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)
    os.chmod(tmpdir, read_exec_only)

    try:
        Imputer.load(tmpdir)
    except AssertionError as e:
        print(e)
        pytest.fail(
            'Loading imputer from read-only directory should not fail.')
    finally:
        # always hand the directory back in its original, writable state
        os.chmod(tmpdir, original_mode)
Пример #2
0
def test_non_writable_output_path(test_dir, data_frame):
    """Check that load/predict/logging work with a read-only output path.

    An imputer is fitted for one epoch and saved under
    ``<test_dir>/non_writable``. The test then runs two scenarios, each of
    which loads the imputer, predicts and emits a log warning:

    1. output directory and ``imputer.log`` are both read-only;
    2. ``imputer.log`` has been deleted and the directory is read-only.

    Any ``Exception`` raised in either scenario fails the test. Write
    permissions are restored in a ``finally`` clause so that a failure does
    not leave a read-only directory behind.

    :param test_dir: fixture providing the base directory for test output
    :param data_frame: fixture producing the training data frame
    """
    label_col = 'label'
    df = data_frame(n_samples=100, label_col=label_col)

    data_encoder_cols = [TfIdfEncoder('features')]
    label_encoder_cols = [CategoricalEncoder(label_col)]
    data_cols = [BowFeaturizer('features')]

    output_path = os.path.join(test_dir, 'non_writable')
    log_path = os.path.join(output_path, "imputer.log")

    # permission masks used throughout the test
    read_only_dir = S_IREAD | S_IXUSR
    writable = S_IREAD | S_IXUSR | S_IWUSR

    Imputer(data_featurizers=data_cols,
            label_encoders=label_encoder_cols,
            data_encoders=data_encoder_cols,
            output_path=output_path).fit(train_df=df, num_epochs=1).save()

    from datawig.utils import logger

    try:
        # make output dir of imputer read-only
        os.chmod(output_path, read_only_dir)

        # make log file read only
        os.chmod(log_path, S_IREAD)
        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")

        # remove log file (needs write access to both file and directory)
        os.chmod(log_path, writable)
        os.chmod(output_path, writable)
        os.remove(log_path)

        # make output dir of imputer read-only
        os.chmod(output_path, read_only_dir)

        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")
    except Exception as e:
        print(e)
        pytest.fail("This invocation not raise any Exception")
    finally:
        # restore write access even when the test failed part-way through;
        # the directory goes first so the log file inside it can be handled
        os.chmod(output_path, writable)
        if os.path.exists(log_path):
            os.chmod(log_path, writable)
Пример #3
0
def test_imputer_load_with_invalid_context(tmpdir, data_frame):
    """Save an imputer whose ``ctx`` was set to ``None`` and load it again.

    The imputer is fitted for one epoch, its ``ctx`` attribute is cleared,
    and it is saved to ``tmpdir``. Loading it back and calling ``predict``
    must complete without raising.

    :param tmpdir: temporary directory fixture, used as the output path
    :param data_frame: fixture producing the training data frame
    """
    # work with a plain string path rather than the fixture object
    tmpdir = str(tmpdir)
    feature, label = 'feature', 'label'

    df = data_frame(feature, label, n_samples=100)

    # model + metrics are written to tmpdir
    featurizers = [BowFeaturizer(feature)]
    label_encoders = [CategoricalEncoder(label)]
    data_encoders = [BowEncoder(feature)]
    imputer = Imputer(data_featurizers=featurizers,
                      label_encoders=label_encoders,
                      data_encoders=data_encoders,
                      output_path=tmpdir)
    imputer.fit(train_df=df, num_epochs=1)

    # persist the model with its context cleared
    imputer.ctx = None
    imputer.save()

    # the deserialized imputer must still be able to predict
    restored = Imputer.load(tmpdir)
    _ = restored.predict(df)
Пример #4
0
def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as inputs
    this could be run as part of integration test suite.

    Steps:

    1. generate random data and derive a categorical column from the label
       prefix,
    2. fit an imputer using bag-of-words, LSTM and embedding featurizers,
    3. check predictions and the ``avg_f1`` metric, both for the fitted
       imputer and for a copy loaded from disk,
    4. fit a second imputer on only 50 rows and check that ``predict``
       adds the ``_imputed`` / ``_imputed_proba`` columns.

    :param test_dir: fixture providing the base directory for test output
    :param data_frame: fixture producing the random data frame
    """

    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col + "_lstm",
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        # NOTE(review): vocab_size is num_labels here although the matching
        # SequentialEncoder uses max_tokens=vocab_size - confirm this is
        # intended before changing it.
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    # one prediction per input row
    assert len(pred[label_col]) == len_df_before_predict

    # every prediction has to match the true label
    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)

    assert metrics[label_col]['avg_f1'] > 0.9

    # the imputer loaded from disk has to reach the same quality bar
    deserialized = Imputer.load(imputer.output_path)

    _, metrics_deserialized = deserialized.transform_and_compute_metrics(
        df_test)

    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get an imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(
                                         train_df=df_train[:50],
                                         test_df=df_test,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         batch_size=batch_size,
                                         calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(
        df_test, precision_threshold=.5, imputation_suffix="_imputed")

    # membership test via ``in``: Index.contains() was deprecated in
    # pandas 0.25 and removed in pandas 1.0
    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns
Пример #5
0
def test_explain_method_synthetic(test_dir):
    """Exercise the imputer's explain methods on simulated data.

    The simulated frame has three input columns and one output column:

    * ``in_cat``: categorical, randomly 'foo' or 'dummy',
    * ``in_text``: text, randomly 'fff' or 'ddd',
    * ``in_text_hash``: constant 'h',
    * ``out_cat``: 'foo' if the letter 'f' occurs in ``in_cat`` or
      ``in_text`` of the same row, otherwise 'bar'.

    After training, the test checks prediction precision, per-instance
    explanations, per-class explanations, and per-class explanations of an
    imputer loaded back from disk.

    :param test_dir: fixture providing the base directory for test output
    """
    N = 100

    # --- simulate data (random draws happen in this order: cat, text) ---
    cat_in_col = ['foo' if r > (1 / 2) else 'dummy' for r in np.random.rand(N)]
    text_in_col = ['fff' if r > (1 / 2) else 'ddd' for r in np.random.rand(N)]
    hash_in_col = ['h'] * N
    cat_out_col = [
        'foo' if 'f' in cat_value + text_value else 'bar'
        for cat_value, text_value in zip(cat_in_col, text_in_col)
    ]

    df = pd.DataFrame({
        'in_cat': cat_in_col,
        'in_text': text_in_col,
        'in_text_hash': hash_in_col,
        'out_cat': cat_out_col,
    })

    # --- encoders and featurizers ---
    encoders = datawig.column_encoders
    data_encoder_cols = [
        encoders.TfIdfEncoder('in_text', tokens="chars"),
        encoders.CategoricalEncoder('in_cat', max_tokens=10),
        encoders.BowEncoder('in_text_hash', tokens="chars"),
    ]

    featurizers = datawig.mxnet_input_symbols
    data_featurizer_cols = [
        featurizers.BowFeaturizer('in_text'),
        featurizers.EmbeddingFeaturizer('in_cat'),
        featurizers.BowFeaturizer('in_text_hash'),
    ]

    label_encoder_cols = [encoders.CategoricalEncoder('out_cat')]

    # --- model ---
    model_dir = os.path.join(test_dir, "tmp", "explanation_tests")
    imputer = datawig.Imputer(data_featurizers=data_featurizer_cols,
                              label_encoders=label_encoder_cols,
                              data_encoders=data_encoder_cols,
                              output_path=model_dir)

    # --- train on a 90-row sample, split 80/20 ---
    tr, te = random_split(df.sample(90), [.8, .2])
    imputer.fit(train_df=tr, test_df=te, num_epochs=20, learning_rate=1e-2)
    predictions = imputer.predict(te)

    # --- evaluate ---
    weighted_precision = precision_score(predictions.out_cat,
                                         predictions.out_cat_imputed,
                                         average='weighted')
    assert weighted_precision > .99

    # --- instance explanations for a handful of random rows ---
    for row_idx in np.random.choice(N, 10):
        explanation = imputer.explain_instance(df.iloc[row_idx])
        top_label = explanation['explained_label']

        if top_label == 'bar':
            # both inputs have to point away from 'foo'
            assert (explanation['in_text'][0][0] == 'd'
                    and explanation['in_cat'][0][0] == 'dummy')
        elif top_label == 'foo':
            # a single 'f'-bearing input is enough
            assert (explanation['in_text'][0][0] == 'f'
                    or explanation['in_cat'][0][0] == 'foo')

    # --- class explanations ---
    text_hits = [
        'f' in token for token, _ in imputer.explain('foo')['in_text']
    ]
    assert np.all(text_hits[:3])

    cat_hits = [
        'f' in token for token, _ in imputer.explain('foo')['in_cat']
    ]
    assert cat_hits[0]

    # --- serialisation to disk ---
    imputer.save()
    imputer_from_disk = Imputer.load(imputer.output_path)
    text_hits_from_disk = [
        'f' in token
        for token, _ in imputer_from_disk.explain('foo')['in_text']
    ]
    assert np.all(text_hits_from_disk[:3])