Example #1
    def process(self, store: Store) -> dict:
        """
        Calculate the Sørensen dice coefficient between two columns
        :param store:
        :return: CheckResult
        """

        df1, df2 = store[NGram(n=self.n, ngram_type=self.ngram_type)]

        df1a, df1b = random_split(df1, [0.95, 0.05], seed=11)           # Baseline for df1
        df2a, df2b = random_split(df2, [0.95, 0.05], seed=11)           # Baseline for df2

        if df1b.empty or df2b.empty:
            raise ValueError('Dataset too small for split ratio, or n={} too big'.format(self.n))

        result = {}
        for i in df1:
            result[i] = (self.calculate_sdc(self.join_and_normalize_ngrams(df1a[i]),
                                            self.join_and_normalize_ngrams(df1b[i])),
                         self.calculate_sdc(self.join_and_normalize_ngrams(df2a[i]),
                                            self.join_and_normalize_ngrams(df2b[i])),
                         self.calculate_sdc(self.join_and_normalize_ngrams(df1[i]),
                                            self.join_and_normalize_ngrams(df2[i])))

        return result
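
For reference, a minimal sketch of the Sørensen–Dice coefficient on plain Python sets (this is just the formula the check is named after, not the calculate_sdc implementation above):

def sdc(a: set, b: set) -> float:
    """Sørensen–Dice coefficient: 2 * |A ∩ B| / (|A| + |B|), in [0, 1]."""
    if not a and not b:
        return 1.0  # convention: two empty sets are considered identical
    return 2 * len(a & b) / (len(a) + len(b))

# identical n-gram sets score 1.0, disjoint sets score 0.0
assert sdc({'ab', 'bc'}, {'ab', 'bc'}) == 1.0
assert sdc({'ab', 'bc'}, {'cd', 'de'}) == 0.0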
Example #2
    def process(self, store: Store) -> dict:
        """
        Calculate the euclidean distance between two embeddings.
        :param store:
        :return: CheckResult
        """

        df1, df2 = store[TextEmbeddingPrecalculation(
            model=self.model, trained_model=self.trained_model, agg='sum')]

        df1a, df1b = random_split(df1, [0.95, 0.05])  # Baseline for df1
        df2a, df2b = random_split(df2, [0.95, 0.05])  # Baseline for df2

        if df1a.empty or df1b.empty or df2a.empty or df2b.empty:
            raise ValueError('Dataset too small for split ratio')

        result = {}
        for i in df1:
            result[i] = (
                norm(self.sum_and_normalize_vectors(df1a[i]) -
                     self.sum_and_normalize_vectors(df1b[i])),
                norm(self.sum_and_normalize_vectors(df2a[i]) -
                     self.sum_and_normalize_vectors(df2b[i])),
                norm(self.sum_and_normalize_vectors(df1[i]) -
                     self.sum_and_normalize_vectors(df2[i])))

        return result
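
Assuming norm here is numpy.linalg.norm, the quantity computed per column is the Euclidean distance between two normalized embedding sums; a minimal sketch:

import numpy as np
from numpy.linalg import norm

v1 = np.array([1.0, 0.0])  # stand-in for sum_and_normalize_vectors(df1[i])
v2 = np.array([0.0, 1.0])  # stand-in for sum_and_normalize_vectors(df2[i])

print(norm(v1 - v2))  # Euclidean distance; sqrt(2) ≈ 1.414 for these unit vectors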
Example #3
def test_hpo_defaults(test_dir, data_frame):
    label_col = "label"

    n_samples = 500
    num_labels = 3
    seq_len = 10

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = [
        'foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)
    ]

    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(input_columns=[
        'string_feature', 'categorical_feature', 'numeric_feature'
    ],
                            output_column='label',
                            output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=10, num_epochs=5)

    assert imputer.hpo.results.precision_weighted.max() > .9
Example #4
def test_label_shift_weight_computation():
    """
    Tests that label shift detection can determine the label marginals of validation data.
    """

    train_proportion = [.7, .3]
    target_proportion = [.3, .7]

    data = synthetic_label_shift_simple(N=2000, label_proportions=train_proportion,
                                        error_proba=.1, covariates=['foo', 'bar'])

    # original train test splits
    tr, te = random_split(data, [.5, .5])

    # train domain classifier
    imputer = SimpleImputer(
        input_columns=['covariate'],
        output_column='label',
        output_path='/tmp/imputer_model')

    # Fit an imputer model on the train data
    imputer.fit(tr, te, num_epochs=15, learning_rate=3e-4, weight_decay=0)

    target_data = synthetic_label_shift_simple(1000, target_proportion,
                                               error_proba=.1, covariates=['foo', 'bar'])

    weights = imputer.check_for_label_shift(target_data)

    # compare the product of weights and training marginals
    # (i.e. estimated target marginals) with the true target marginals.
    for w, p_train, p_target in zip(weights.values(), train_proportion, target_proportion):
        assert abs(w * p_train - p_target) < .1
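
The assertion checks the label-shift identity p_target(y) ≈ w(y) · p_train(y). A worked instance with the proportions used above, assuming hypothetical perfect weights:

train_proportion = [.7, .3]
target_proportion = [.3, .7]

# perfect importance weights would be p_target(y) / p_train(y)
perfect_weights = [t / s for t, s in zip(target_proportion, train_proportion)]  # ≈ [0.43, 2.33]

for w, p_train, p_target in zip(perfect_weights, train_proportion, target_proportion):
    assert abs(w * p_train - p_target) < .1  # recovers the target marginals exactly here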
Example #5
def test_imputer_numeric_data(test_dir):
    """
    Tests numeric encoder/featurizer only

    """
    # Training data
    N = 1000
    x = np.random.uniform(-np.pi, np.pi, (N, ))
    df = pd.DataFrame({'x': x, 'cos': np.cos(x), '*2': x * 2, '**2': x**2})

    df_train, df_test = random_split(df, [.6, .4])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_numeric")

    data_encoder_cols = [NumericalEncoder(['x'])]
    data_cols = [NumericalFeaturizer('x', numeric_latent_dim=100)]

    for target in ['*2', '**2', 'cos']:
        label_encoder_cols = [NumericalEncoder([target], normalize=False)]

        imputer = Imputer(data_featurizers=data_cols,
                          label_encoders=label_encoder_cols,
                          data_encoders=data_encoder_cols,
                          output_path=output_path)
        imputer.fit(train_df=df_train,
                    learning_rate=1e-1,
                    num_epochs=100,
                    patience=5,
                    test_split=.3,
                    weight_decay=.0,
                    batch_size=128)

        pred, metrics = imputer.transform_and_compute_metrics(df_test)
        df_test['predictions_' + target] = pred[target].flatten()
        print("Numerical metrics: {}".format(metrics[target]))
        assert metrics[target] < 10
Example #6
def test_imputer_hpo_numeric():
    """

    Tests SimpleImputer HPO for numeric data/imputation

    """

    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N, ))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data**2 + np.random.normal(0, .1, (N, )),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(dir_path, "resources", "tmp",
                               "real_data_experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(input_columns=['x'],
                                    output_column="**2",
                                    output_path=output_path).fit_hpo(
                                        train_df=df_train,
                                        learning_rate=1e-3,
                                        num_epochs=100,
                                        patience=10,
                                        num_hash_bucket_candidates=[2**10],
                                        tokens_candidates=['words'],
                                        latent_dim_candidates=[10, 50, 100],
                                        hidden_layers_candidates=[1, 2])

    imputer_numeric.predict(df_test)

    assert mean_squared_error(df_test['**2'], df_test['**2_imputed']) < 1.0

    shutil.rmtree(output_path)
Example #7
def test_imputer_unrepresentative_test_df(test_dir, data_frame):
    """

    Tests whether the imputer runs through in cases when test data set (and hence metrics and precision/recall curves)
    doesn't contain values present in training data

    """
    # generate some random data
    random_data = data_frame(n_samples=100)

    df_train, df_test, _ = random_split(random_data, [.8, .1, .1])

    excluded = df_train['labels'].values[0]
    df_test = df_test[df_test['labels'] != excluded]

    data_encoder_cols = [BowEncoder('features')]
    label_encoder_cols = [CategoricalEncoder('labels')]
    data_cols = [BowFeaturizer('features')]

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment")

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_test,
                                                   num_epochs=10)

    only_excluded_df = df_train[df_train['labels'] == excluded]
    imputations = imputer.predict_above_precision(
        only_excluded_df, precision_threshold=.99)['labels']
    assert all([x == () for x in imputations])
Example #8
def test_hpo_many_columns(test_dir, data_frame):
    """

    """
    label_col = "label"

    n_samples = 300
    num_labels = 3
    ncols = 10
    seq_len = 4

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    for col in range(ncols):
        df['string_feature_' + str(col)] = df['string_feature']

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column='label',
        output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=2)

    assert imputer.hpo.results.precision_weighted.max() > .8
Example #9
def test_simple_imputer_label_shift(test_dir):
    """
    Test capabilities for detecting and correcting label shift
    """

    tr = synthetic_label_shift_simple(N=1000,
                                      label_proportions=[.2, .8],
                                      error_proba=.05,
                                      covariates=['foo', 'bar'])
    val = synthetic_label_shift_simple(N=500,
                                       label_proportions=[.9, .1],
                                       error_proba=.05,
                                       covariates=['foo', 'bar'])

    # randomly make covariate uninformative
    rand_idxs = np.random.choice(range(val.shape[0]),
                                 size=int(val.shape[0] / 3),
                                 replace=False)
    val.loc[rand_idxs, 'covariate'] = 'foo bar'

    tr, te = random_split(tr, [.8, .2])

    # train domain classifier
    imputer = SimpleImputer(input_columns=['covariate'],
                            output_column='label',
                            output_path=os.path.join(
                                test_dir, "tmp",
                                "label_weighting_experiments"))

    # Fit an imputer model on the train data
    imputer.fit(tr, te, num_epochs=15, learning_rate=3e-4, weight_decay=0)
    pred = imputer.predict(val)

    # compute estimate of ratio of marginals and add corresponding label to the training data
    weights = imputer.check_for_label_shift(val)

    # retrain classifier with balancing
    imputer_balanced = SimpleImputer(input_columns=['covariate'],
                                     output_column='label',
                                     output_path=os.path.join(
                                         test_dir, "tmp",
                                         "label_weighting_experiments"))

    # Fit an imputer model on the train data, weighted to correct for label shift
    imputer_balanced.fit(tr,
                         te,
                         num_epochs=15,
                         learning_rate=3e-4,
                         weight_decay=0,
                         class_weights=weights)

    pred_balanced = imputer_balanced.predict(val)

    acc_balanced = (
        pred_balanced.label == pred_balanced['label_imputed']).mean()
    acc_classic = (pred.label == pred['label_imputed']).mean()

    # check that weighted performance is better
    assert acc_balanced > acc_classic
Example #10
    def prepare_dfs(
        self, df1: pd.DataFrame, df2: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Create a train and a test dataset in which the number of tuples that
        come from the first dataset and the number that come from the second
        are equal.
        :param df1: first dataset
        :param df2: second dataset
        :return: train and test splits for both datasets
                 (df1_train, df1_test, df2_train, df2_test)
        """
        df1, df2 = self.label_dfs(df1, df2)
        df1_sampled, df2_sampled = self.sample_dfs(df1, df2)

        df1_train, df1_test = random_split(df1_sampled)
        df2_train, df2_test = random_split(df2_sampled)

        return df1_train, df1_test, df2_train, df2_test
Example #11
def test_imputer_hpo_text(test_dir, data_frame):
    """

    Tests SimpleImputer HPO with text data and categorical imputations

    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])

    output_path = os.path.join(test_dir, "tmp", "experiment_text_hpo")

    imputer_string = SimpleImputer(
        input_columns=[feature_col],
        output_column=label_col,
        output_path=output_path
    )

    hps = dict()
    hps[feature_col] = {}
    hps[feature_col]['type'] = ['string']
    hps[feature_col]['tokens'] = [['words'], ['chars']]

    hps['global'] = {}
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['learning_rate'] = [1e-3]
    hps['global']['weight_decay'] = [0]
    hps['global']['num_epochs'] = [30]

    imputer_string.fit_hpo(df_train, hps=hps, num_epochs=10, num_evals=3)

    assert max(imputer_string.hpo.results['f1_micro']) > 0.7
Example #12
def test_imputer_hpo_text():
    """

    Tests SimpleImputer HPO with text data and categorical imputations

    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = generate_string_data_frame(feature_col=feature_col,
                                    label_col=label_col,
                                    num_labels=num_labels,
                                    num_words=seq_len,
                                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(dir_path, "resources", "tmp",
                               "real_data_experiment_text_hpo")

    imputer_string = SimpleImputer(
        input_columns=[feature_col],
        output_column=label_col,
        output_path=output_path).fit_hpo(
            train_df=df_train,
            num_epochs=100,
            patience=3,
            num_hash_bucket_candidates=[2**10, 2**15],
            tokens_candidates=['words'],
            latent_dim_candidates=[10],
            hpo_max_train_samples=1000)

    imputer_string.predict(df_test)

    assert f1_score(df_test[label_col],
                    df_test[label_col + '_imputed'],
                    average="weighted") > .7

    shutil.rmtree(output_path)
Example #13
def test_imputer_hpo_numeric(test_dir):
    """

    Tests SimpleImputer HPO for numeric data/imputation

    """

    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N, ))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data**2 + np.random.normal(0, .1, (N, )),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(input_columns=['x'],
                                    output_column="**2",
                                    output_path=output_path)

    feature_col = 'x'

    hps = {}
    hps[feature_col] = {}
    hps[feature_col]['type'] = ['numeric']
    hps[feature_col]['numeric_latent_dim'] = [30]
    hps[feature_col]['numeric_hidden_layers'] = [1]

    hps['global'] = {}
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['learning_rate'] = [1e-3, 1e-4]
    hps['global']['weight_decay'] = [0]
    hps['global']['num_epochs'] = [200]
    hps['global']['patience'] = [100]
    hps['global']['concat_columns'] = [False]

    imputer_numeric.fit_hpo(df_train, hps=hps)
    results = imputer_numeric.hpo.results

    assert results['mse'].min() < .3
Example #14
def test_explain_method_synthetic(test_dir):
    # Generate simulated data for testing explain method
    # Predict the output column, with entries in ['foo', 'bar'], from two input columns:
    # one categorical in ['foo', 'dummy'], one text in ['fff', 'ddd'].
    # The output column is deterministically 'foo' if an 'f' occurs anywhere in any input column.
    N = 100
    cat_in_col = ['foo' if r > (1 / 2) else 'dummy' for r in np.random.rand(N)]
    text_in_col = ['fff' if r > (1 / 2) else 'ddd' for r in np.random.rand(N)]
    hash_in_col = ['h' for r in range(N)]
    cat_out_col = [
        'foo' if 'f' in cat + text else 'bar'
        for cat, text in zip(cat_in_col, text_in_col)
    ]

    df = pd.DataFrame()
    df['in_cat'] = cat_in_col
    df['in_text'] = text_in_col
    df['in_text_hash'] = hash_in_col
    df['out_cat'] = cat_out_col

    # Specify encoders and featurizers #
    data_encoder_cols = [
        datawig.column_encoders.TfIdfEncoder('in_text', tokens="chars"),
        datawig.column_encoders.CategoricalEncoder('in_cat', max_tokens=10),
        datawig.column_encoders.BowEncoder('in_text_hash', tokens="chars")
    ]
    data_featurizer_cols = [
        datawig.mxnet_input_symbols.BowFeaturizer('in_text'),
        datawig.mxnet_input_symbols.EmbeddingFeaturizer('in_cat'),
        datawig.mxnet_input_symbols.BowFeaturizer('in_text_hash')
    ]

    label_encoder_cols = [
        datawig.column_encoders.CategoricalEncoder('out_cat')
    ]

    # Specify model
    imputer = datawig.Imputer(data_featurizers=data_featurizer_cols,
                              label_encoders=label_encoder_cols,
                              data_encoders=data_encoder_cols,
                              output_path=os.path.join(test_dir, "tmp",
                                                       "explanation_tests"))

    # Train
    tr, te = random_split(df.sample(90), [.8, .2])
    imputer.fit(train_df=tr, test_df=te, num_epochs=20, learning_rate=1e-2)
    predictions = imputer.predict(te)

    # Evaluate
    assert precision_score(predictions.out_cat,
                           predictions.out_cat_imputed,
                           average='weighted') > .99

    # assert item explanation, iterate over some inputs
    for i in np.random.choice(N, 10):
        explanation = imputer.explain_instance(df.iloc[i])
        top_label = explanation['explained_label']

        if top_label == 'bar':
            assert (explanation['in_text'][0][0] == 'd'
                    and explanation['in_cat'][0][0] == 'dummy')
        elif top_label == 'foo':
            assert (explanation['in_text'][0][0] == 'f'
                    or explanation['in_cat'][0][0] == 'foo')

    # assert class explanations
    assert np.all([
        'f' in token for token, weight in imputer.explain('foo')['in_text']
    ][:3])
    assert [
        'f' in token for token, weight in imputer.explain('foo')['in_cat']
    ][0]

    # test serialisation to disk
    imputer.save()
    imputer_from_disk = Imputer.load(imputer.output_path)
    assert np.all([
        'f' in token
        for token, weight in imputer_from_disk.explain('foo')['in_text']
    ][:3])
Example #15
def test_simple_imputer_real_data_default_args(test_dir, data_frame):
    """
    Tests SimpleImputer with default options

    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 2000
    num_labels = 3
    seq_len = 100
    vocab_size = int(2**15)

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_simple")

    df_train_cols_before = df_train.columns.tolist()

    input_columns = [feature_col]

    imputer = SimpleImputer(input_columns=input_columns,
                            output_column=label_col,
                            output_path=output_path).fit(train_df=df_train)

    logfile = os.path.join(imputer.output_path, 'imputer.log')
    assert os.path.exists(logfile)
    assert os.path.getsize(logfile) > 0

    assert imputer.output_path == output_path
    assert imputer.imputer.data_featurizers[0].__class__ == BowFeaturizer
    assert imputer.imputer.data_encoders[0].__class__ == BowEncoder
    assert set(
        imputer.imputer.data_encoders[0].input_columns) == set(input_columns)
    assert set(imputer.imputer.label_encoders[0].input_columns) == set(
        [label_col])

    assert all([
        after == before
        for after, before in zip(df_train.columns, df_train_cols_before)
    ])

    df_no_label_column = df_test.copy()
    true_labels = df_test[label_col]
    del df_no_label_column[label_col]
    df_test_cols_before = df_no_label_column.columns.tolist()

    df_test_imputed = imputer.predict(df_no_label_column, inplace=True)

    assert all([
        after == before for after, before in zip(df_no_label_column.columns,
                                                 df_test_cols_before)
    ])

    imputed_columns = df_test_cols_before + [
        label_col + "_imputed", label_col + "_imputed_proba"
    ]

    assert all([
        after == before
        for after, before in zip(df_test_imputed, imputed_columns)
    ])

    f1 = f1_score(true_labels,
                  df_test_imputed[label_col + '_imputed'],
                  average="weighted")

    assert f1 > .9

    new_path = imputer.output_path + "-" + rand_string()

    os.rename(imputer.output_path, new_path)

    deserialized = SimpleImputer.load(new_path)
    df_test = deserialized.predict(df_test,
                                   imputation_suffix="_deserialized_imputed")
    f1 = f1_score(df_test[label_col],
                  df_test[label_col + '_deserialized_imputed'],
                  average="weighted")

    assert f1 > .9

    retrained_simple_imputer = deserialized.fit(df_train, df_train)

    df_train_imputed = retrained_simple_imputer.predict(df_train.copy(),
                                                        inplace=True)
    f1 = f1_score(df_train[label_col],
                  df_train_imputed[label_col + '_imputed'],
                  average="weighted")

    assert f1 > .9

    metrics = retrained_simple_imputer.load_metrics()

    assert f1 == metrics['weighted_f1']
Example #16
def test_hpo_all_input_types(test_dir, data_frame):
    """

    Using sklearn advantages: parallelism, distributions of parameters, multiple cross-validation

    """
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 12

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = [
        'foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)
    ]

    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(input_columns=[
        'string_feature', 'categorical_feature', 'numeric_feature'
    ],
                            output_column='label',
                            output_path=output_path)

    # Define default hyperparameter choices for each column type (string, categorical, numeric)
    hps = dict()
    hps['global'] = {}
    hps['global']['learning_rate'] = [3e-4]
    hps['global']['weight_decay'] = [1e-8]
    hps['global']['num_epochs'] = [5, 50]
    hps['global']['patience'] = [5]
    hps['global']['batch_size'] = [16]
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['concat_columns'] = [True, False]

    hps['string_feature'] = {}
    hps['string_feature']['max_tokens'] = [2**15]
    hps['string_feature']['tokens'] = [['words', 'chars']]
    hps['string_feature']['ngram_range'] = {}
    hps['string_feature']['ngram_range']['words'] = [(1, 4), (2, 5)]
    hps['string_feature']['ngram_range']['chars'] = [(2, 4), (3, 5)]

    hps['categorical_feature'] = {}
    hps['categorical_feature']['type'] = ['categorical']
    hps['categorical_feature']['max_tokens'] = [2**15]
    hps['categorical_feature']['embed_dim'] = [10]

    hps['numeric_feature'] = {}
    hps['numeric_feature']['normalize'] = [True]
    hps['numeric_feature']['numeric_latent_dim'] = [10]
    hps['numeric_feature']['numeric_hidden_layers'] = [1]

    # user defined score function for hyperparameters
    def calibration_check(true, predicted, confidence):
        """
        expect kwargs: true, predicted, confidence
        here we compute a calibration sanity check
        """
        return (np.mean(true[confidence > .9] == predicted[confidence > .9]),
                np.mean(true[confidence > .5] == predicted[confidence > .5]))

    def coverage_check(true, predicted, confidence):
        return np.mean(confidence > .9)

    uds = [(calibration_check, 'calibration check'),
           (coverage_check, 'coverage at 90')]

    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test1_')

    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test2_',
                    max_running_hours=1 / 3600)

    results = imputer.hpo.results

    assert results[results['global:num_epochs'] == 50]['f1_micro'].iloc[0] > \
           results[results['global:num_epochs'] == 5]['f1_micro'].iloc[0]
Example #17
def test_numeric_or_text_imputer(test_dir, data_frame):
    """
    Tests SimpleImputer with default options

    """

    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 30
    vocab_size = int(2**10)

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    numeric_data = np.random.uniform(-np.pi, np.pi, (n_samples, ))
    df = pd.DataFrame({
        'x': numeric_data,
        '*2': numeric_data * 2. + np.random.normal(0, .1, (n_samples, )),
        '**2': numeric_data**2 + np.random.normal(0, .1, (n_samples, )),
        feature_col: random_data[feature_col].values,
        label_col: random_data[label_col].values
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_numeric")

    imputer_numeric_linear = SimpleImputer(input_columns=['x', feature_col],
                                           output_column="*2",
                                           output_path=output_path).fit(
                                               train_df=df_train,
                                               learning_rate=1e-3,
                                           )

    imputer_numeric_linear.predict(df_test, inplace=True)

    assert mean_squared_error(df_test['*2'], df_test['*2_imputed']) < 1.0

    imputer_numeric = SimpleImputer(input_columns=['x', feature_col],
                                    output_column="**2",
                                    output_path=output_path).fit(
                                        train_df=df_train, learning_rate=1e-3)

    imputer_numeric.predict(df_test, inplace=True)

    assert mean_squared_error(df_test['**2'], df_test['**2_imputed']) < 1.0

    imputer_string = SimpleImputer(
        input_columns=[feature_col, 'x'],
        output_column=label_col,
        output_path=output_path).fit(train_df=df_train)

    imputer_string.predict(df_test, inplace=True)

    assert f1_score(df_test[label_col],
                    df_test[label_col + '_imputed'],
                    average="weighted") > .7
Example #18
def test_random_split():
    df = pd.DataFrame([{'a': 1}, {'a': 2}])
    train_df, test_df = random_split(df, split_ratios=[.5, .5], seed=10)
    assert all(train_df.values.flatten() == np.array([1]))
    assert all(test_df.values.flatten() == np.array([2]))
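
The test pins seed=10 and expects a deterministic 50/50 split. An illustrative implementation of such a helper (a sketch for exposition, not datawig's actual code):

import numpy as np
import pandas as pd

def random_split_sketch(df: pd.DataFrame, split_ratios=(0.8, 0.2), seed: int = 10):
    """Shuffle rows once, then cut into contiguous slices sized by the ratios."""
    assert abs(sum(split_ratios) - 1.0) < 1e-9
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    bounds = np.cumsum([int(r * len(df)) for r in split_ratios])
    bounds[-1] = len(df)  # absorb integer rounding so no row is dropped
    starts = np.concatenate(([0], bounds[:-1]))
    return [shuffled.iloc[s:e] for s, e in zip(starts, bounds)]

train, test = random_split_sketch(pd.DataFrame({'a': range(10)}), (0.5, 0.5))
assert len(train) == len(test) == 5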
Example #19
def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as inputs
    this could be run as part of integration test suite.
    """

    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col + "_lstm",
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    assert len(pred[label_col]) == len_df_before_predict

    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)

    assert metrics[label_col]['avg_f1'] > 0.9

    deserialized = Imputer.load(imputer.output_path)

    _, metrics_deserialized = deserialized.transform_and_compute_metrics(
        df_test)

    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get an imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(
                                         train_df=df_train[:50],
                                         test_df=df_test,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         batch_size=batch_size,
                                         calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(
        df_test, precision_threshold=.5, imputation_suffix="_imputed")

    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns
Example #20
def test_automatic_calibration(data_frame):
    """
    Fit model with all featurisers and assert
    that calibration improves the expected calibration error.
    """

    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 2000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col + "_lstm",
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    num_epochs = 20
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols).fit(
                          train_df=df_train,
                          test_df=df_val,
                          learning_rate=learning_rate,
                          num_epochs=num_epochs,
                          batch_size=batch_size)

    assert imputer.calibration_info['ece_pre'] > imputer.calibration_info[
        'ece_post']
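
The assertion compares the expected calibration error (ECE) before and after calibration. A minimal sketch of ECE under the usual equal-width-bin definition (datawig's exact implementation may differ):

import numpy as np

def expected_calibration_error(confidences, correct, n_bins: int = 10) -> float:
    """Weighted mean gap between confidence and accuracy over confidence bins."""
    confidences = np.asarray(confidences)
    correct = np.asarray(correct, dtype=float)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            gap = abs(correct[mask].mean() - confidences[mask].mean())
            ece += mask.mean() * gap  # bin weight times calibration gap
    return ece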
Example #21

from datawig import SimpleImputer
from datawig.utils import random_split
from sklearn.metrics import f1_score, classification_report
import pandas as pd
"""
Load Data
"""
df = pd.read_csv('mae_train_dataset.csv').sample(n=1000)
df_train, df_test = random_split(df, split_ratios=[0.8, 0.2])

# ------------------------------------------------------------------------------------
"""
Run default SimpleImputer
"""
# Initialize a SimpleImputer model
imputer = SimpleImputer(
    input_columns=[
        'title', 'text'
    ],  # columns containing information about the column we want to impute
    output_column='finish',  # the column we'd like to impute values for
    output_path='imputer_model'  # stores model data and metrics
)

# Fit an imputer model on the train data
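# (Continuation inferred from the imports and the fit/predict patterns in the
# other examples; the source snippet ends at the comment above, and
# num_epochs=50 is an assumed value.)
imputer.fit(train_df=df_train, num_epochs=50)

# Impute the held-out rows and evaluate with the metrics imported above
predictions = imputer.predict(df_test)
print(f1_score(predictions['finish'], predictions['finish_imputed'], average='weighted'))
print(classification_report(predictions['finish'], predictions['finish_imputed']))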
Example #22
def test_imputer_duplicate_encoder_output_columns(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as inputs
    this could be run as part of integration test suite.
    """

    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 10
    seq_len = 100
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col, max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col,
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col, vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col,
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 20
    batch_size = 16
    learning_rate = 1e-3

    with pytest.raises(ValueError) as e:
        imputer = Imputer(data_featurizers=data_cols,
                          label_encoders=label_encoder_cols,
                          data_encoders=data_encoder_cols,
                          output_path=output_path)
        imputer.fit(train_df=df_train,
                    test_df=df_val,
                    learning_rate=learning_rate,
                    num_epochs=num_epochs,
                    batch_size=batch_size)
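
For contrast, a sketch of the configuration that avoids the ValueError, following the pattern Example #19 uses: give each encoder its own output column so the featurizers' field names don't collide.

data_encoder_cols = [
    BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
    SequentialEncoder(feature_col,
                      feature_col + "_lstm",
                      max_tokens=vocab_size,
                      seq_len=seq_len),
    CategoricalEncoder(categorical_col, max_tokens=num_labels)
]
data_cols = [
    BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
    LSTMFeaturizer(field_name=feature_col + "_lstm", seq_len=seq_len,
                   latent_dim=latent_dim, num_hidden=30, embed_dim=embed_dim,
                   num_layers=2, vocab_size=num_labels),
    EmbeddingFeaturizer(field_name=categorical_col, embed_dim=embed_dim,
                        vocab_size=num_labels)
]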