def process(self, store: Store) -> dict:
    """
    Compute Sørensen-Dice coefficients on the n-grams of every column.

    The ``NGram`` precalculation for ``self.n`` / ``self.ngram_type`` is read
    from ``store`` and unpacked into two data sets. Each data set is split
    95/5 with seed 11, so comparing the two parts of one data set gives a
    baseline for that data set.

    :param store: object indexed with an ``NGram`` precalculation
    :return: dict mapping every column of the first data set to the tuple
        ``(baseline of first, baseline of second, first vs. second)``
    :raises ValueError: if one of the two 5% parts is empty
    """
    first, second = store[NGram(n=self.n, ngram_type=self.ngram_type)]
    first_major, first_minor = random_split(first, [0.95, 0.05], seed=11)
    second_major, second_minor = random_split(second, [0.95, 0.05], seed=11)

    if first_minor.empty or second_minor.empty:
        raise ValueError('Dataset to small for split ratio or n={} to big'.format(self.n))

    def coefficient(left, right):
        # Both sides are joined and normalized before they are compared.
        return self.calculate_sdc(self.join_and_normalize_ngrams(left),
                                  self.join_and_normalize_ngrams(right))

    return {
        column: (coefficient(first_major[column], first_minor[column]),
                 coefficient(second_major[column], second_minor[column]),
                 coefficient(first[column], second[column]))
        for column in first
    }
def process(self, store: Store) -> dict:
    """
    Compute the euclidean distance between embeddings of every column.

    The ``TextEmbeddingPrecalculation`` (aggregation ``'sum'``) for
    ``self.model`` / ``self.trained_model`` is read from ``store`` and
    unpacked into two data sets. Each data set is split 95/5, so the distance
    between the two parts of one data set gives a baseline for that data set.

    :param store: object indexed with a ``TextEmbeddingPrecalculation``
    :return: dict mapping every column of the first data set to the tuple
        ``(baseline of first, baseline of second, first vs. second)``
    :raises ValueError: if any of the four split parts is empty
    """
    precalculation = TextEmbeddingPrecalculation(model=self.model,
                                                 trained_model=self.trained_model,
                                                 agg='sum')
    first, second = store[precalculation]
    first_major, first_minor = random_split(first, [0.95, 0.05])
    second_major, second_minor = random_split(second, [0.95, 0.05])

    parts = (first_major, first_minor, second_major, second_minor)
    if any(part.empty for part in parts):
        raise ValueError('Dataset to small for split ratio')

    def distance(left, right):
        # Norm of the difference of the two summed and normalized vectors.
        return norm(self.sum_and_normalize_vectors(left)
                    - self.sum_and_normalize_vectors(right))

    return {
        column: (distance(first_major[column], first_minor[column]),
                 distance(second_major[column], second_minor[column]),
                 distance(first[column], second[column]))
        for column in first
    }
def test_hpo_defaults(test_dir, data_frame):
    """
    Run ``SimpleImputer.fit_hpo`` on string, categorical and numeric inputs.

    Only ``num_evals`` and ``num_epochs`` are passed (no hyperparameter grid);
    the best weighted precision in the HPO results must exceed 0.9.
    """
    sample_count = 500

    # Text feature and label column come from the ``data_frame`` fixture.
    frame = data_frame(feature_col="string_feature",
                       label_col="label",
                       num_labels=3,
                       num_words=10,
                       n_samples=sample_count)

    # Categorical column: 'foo' or 'bar' depending on a random draw.
    frame['categorical_feature'] = [
        'foo' if draw > .5 else 'bar' for draw in np.random.rand(sample_count)
    ]
    # Numeric column of random values.
    frame['numeric_feature'] = np.random.rand(sample_count)

    train_part, _ = random_split(frame, [.8, .2])

    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")
    feature_columns = ['string_feature', 'categorical_feature', 'numeric_feature']
    imputer = SimpleImputer(input_columns=feature_columns,
                            output_column='label',
                            output_path=model_dir)

    imputer.fit_hpo(train_part, num_evals=10, num_epochs=5)

    best_precision = imputer.hpo.results.precision_weighted.max()
    assert best_precision > .9
def test_label_shift_weight_computation():
    """
    Tests that label shift detection can determine the label marginals of
    validation data.

    An imputer is fitted on data drawn with ``train_proportion``; the weights
    returned by ``check_for_label_shift`` for data drawn with
    ``target_proportion`` are multiplied with the training marginals and
    compared against the true target marginals.
    """
    train_proportion = [.7, .3]
    target_proportion = [.3, .7]

    data = synthetic_label_shift_simple(N=2000,
                                        label_proportions=train_proportion,
                                        error_proba=.1,
                                        covariates=['foo', 'bar'])

    # original train test splits
    tr, te = random_split(data, [.5, .5])

    # train domain classifier
    imputer = SimpleImputer(input_columns=['covariate'],
                            output_column='label',
                            output_path='/tmp/imputer_model')

    # Fit an imputer model on the train data
    imputer.fit(tr, te, num_epochs=15, learning_rate=3e-4, weight_decay=0)

    target_data = synthetic_label_shift_simple(1000,
                                               target_proportion,
                                               error_proba=.1,
                                               covariates=['foo', 'bar'])

    weights = imputer.check_for_label_shift(target_data)

    # Compare the product of weights and training marginals (i.e. the
    # estimated target marginals) with the true target marginals.
    # NOTE(review): pairing relies on weights.values() being ordered like the
    # proportion lists — confirm against check_for_label_shift.
    for weight, train_marginal, target_marginal in zip(weights.values(),
                                                       train_proportion,
                                                       target_proportion):
        # abs() is required: without it any under-estimate, however large,
        # yields a negative difference and passes the check.
        assert abs(weight * train_marginal - target_marginal) < .1
def test_imputer_numeric_data(test_dir):
    """
    Exercise only the numeric encoder/featurizer path of ``Imputer``.

    For each target column derived from ``x`` a new imputer is trained and the
    metric returned by ``transform_and_compute_metrics`` must stay below 10.
    """
    # Training data: x and three functions of x.
    row_count = 1000
    x = np.random.uniform(-np.pi, np.pi, (row_count, ))
    frame = pd.DataFrame({'x': x, 'cos': np.cos(x), '*2': x * 2, '**2': x**2})
    train_part, test_part = random_split(frame, [.6, .4])

    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment_numeric")

    input_encoders = [NumericalEncoder(['x'])]
    input_featurizers = [NumericalFeaturizer('x', numeric_latent_dim=100)]

    for target in ('*2', '**2', 'cos'):
        target_encoders = [NumericalEncoder([target], normalize=False)]

        imputer = Imputer(data_featurizers=input_featurizers,
                          label_encoders=target_encoders,
                          data_encoders=input_encoders,
                          output_path=model_dir)
        imputer.fit(train_df=train_part,
                    learning_rate=1e-1,
                    num_epochs=100,
                    patience=5,
                    test_split=.3,
                    weight_decay=.0,
                    batch_size=128)

        predictions, metrics = imputer.transform_and_compute_metrics(test_part)
        test_part['predictions_' + target] = predictions[target].flatten()

        target_metric = metrics[target]
        print("Numerical metrics: {}".format(target_metric))
        assert target_metric < 10
def test_imputer_hpo_numeric():
    """
    Tests SimpleImputer HPO for numeric data/imputation.

    Fits ``x -> x**2 + noise`` via ``fit_hpo`` with explicit candidate lists,
    predicts on the held-out part and requires a mean squared error below 1.0.
    The model directory is removed afterwards.
    """
    row_count = 200
    x_values = np.random.uniform(-np.pi, np.pi, (row_count, ))
    noisy_square = x_values**2 + np.random.normal(0, .1, (row_count, ))
    frame = pd.DataFrame({'x': x_values, '**2': noisy_square})

    train_part, test_part = random_split(frame, [.8, .2])
    model_dir = os.path.join(dir_path, "resources", "tmp",
                             "real_data_experiment_numeric_hpo")

    imputer = SimpleImputer(input_columns=['x'],
                            output_column="**2",
                            output_path=model_dir)
    fitted = imputer.fit_hpo(train_df=train_part,
                             learning_rate=1e-3,
                             num_epochs=100,
                             patience=10,
                             num_hash_bucket_candidates=[2**10],
                             tokens_candidates=['words'],
                             latent_dim_candidates=[10, 50, 100],
                             hidden_layers_candidates=[1, 2])

    # The assertion below reads the '**2_imputed' column from test_part, so
    # predict() is expected to add it to the frame it is given.
    fitted.predict(test_part)

    error = mean_squared_error(test_part['**2'], test_part['**2_imputed'])
    assert error < 1.0

    shutil.rmtree(model_dir)
def test_imputer_unrepresentative_test_df(test_dir, data_frame):
    """
    Check that training works when the test set lacks a label that occurs in
    the training set.

    The label of the first training row is removed from the test frame before
    fitting. Afterwards ``predict_above_precision`` with threshold 0.99 on the
    training rows carrying that label must return ``()`` for every row.
    """
    # generate some random data
    frame = data_frame(n_samples=100)
    train_part, test_part, _ = random_split(frame, [.8, .1, .1])

    # Drop every test row that carries the label of the first training row.
    held_out_label = train_part['labels'].values[0]
    test_part = test_part[test_part['labels'] != held_out_label]

    input_encoders = [BowEncoder('features')]
    target_encoders = [CategoricalEncoder('labels')]
    input_featurizers = [BowFeaturizer('features')]

    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment")

    imputer = Imputer(data_featurizers=input_featurizers,
                      label_encoders=target_encoders,
                      data_encoders=input_encoders,
                      output_path=model_dir)
    imputer = imputer.fit(train_df=train_part, test_df=test_part, num_epochs=10)

    held_out_rows = train_part[train_part['labels'] == held_out_label]
    result = imputer.predict_above_precision(held_out_rows,
                                             precision_threshold=.99)
    imputations = result['labels']

    assert all([value == () for value in imputations])
def test_hpo_many_columns(test_dir, data_frame):
    """
    Run ``fit_hpo`` with many input columns.

    The text feature is copied into ten additional columns; every column
    except 'label' is used as input. The best weighted precision in the HPO
    results must exceed 0.8.
    """
    copy_count = 10

    # generate some random data
    frame = data_frame(feature_col="string_feature",
                       label_col="label",
                       num_labels=3,
                       num_words=4,
                       n_samples=300)

    # Duplicate the text feature under numbered column names.
    for index in range(copy_count):
        frame['string_featur_' + str(index)] = frame['string_feature']

    train_part, _ = random_split(frame, [.8, .2])

    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")
    feature_columns = [name for name in frame.columns if name not in ['label']]

    imputer = SimpleImputer(input_columns=feature_columns,
                            output_column='label',
                            output_path=model_dir)
    imputer.fit_hpo(train_part, num_evals=2)

    best_precision = imputer.hpo.results.precision_weighted.max()
    assert best_precision > .8
def test_simple_imputer_label_shift(test_dir):
    """
    Test capabilities for detecting and correcting label shift.

    A classifier is trained on data with label proportions [.2, .8] and
    evaluated on data with proportions [.9, .1]. A second classifier is trained
    with the class weights returned by ``check_for_label_shift``; its accuracy
    on the shifted data must be higher than that of the unweighted one.
    """
    tr = synthetic_label_shift_simple(N=1000,
                                      label_proportions=[.2, .8],
                                      error_proba=.05,
                                      covariates=['foo', 'bar'])
    val = synthetic_label_shift_simple(N=500,
                                       label_proportions=[.9, .1],
                                       error_proba=.05,
                                       covariates=['foo', 'bar'])

    # Make the covariate uninformative for a random third of the shifted rows.
    val_rows = val.shape[0]
    rand_idxs = np.random.choice(range(val_rows),
                                 size=int(val_rows / 3),
                                 replace=False)
    val.loc[rand_idxs, 'covariate'] = 'foo bar'

    tr, te = random_split(tr, [.8, .2])

    fit_options = dict(num_epochs=15, learning_rate=3e-4, weight_decay=0)

    # Unweighted classifier.
    imputer = SimpleImputer(input_columns=['covariate'],
                            output_column='label',
                            output_path=os.path.join(test_dir, "tmp",
                                                     "label_weighting_experiments"))
    imputer.fit(tr, te, **fit_options)
    pred = imputer.predict(val)

    # Estimated ratio of label marginals, used as class weights below.
    weights = imputer.check_for_label_shift(val)

    # Classifier retrained with class weights; same output directory as above.
    imputer_balanced = SimpleImputer(input_columns=['covariate'],
                                     output_column='label',
                                     output_path=os.path.join(test_dir, "tmp",
                                                              "label_weighting_experiments"))
    imputer_balanced.fit(tr, te, class_weights=weights, **fit_options)
    pred_balanced = imputer_balanced.predict(val)

    balanced_hits = pred_balanced.label == pred_balanced['label_imputed']
    acc_balanced = balanced_hits.mean()
    classic_hits = pred.label == pred['label_imputed']
    acc_classic = classic_hits.mean()

    # check that weighted performance is better
    assert acc_balanced > acc_classic
def prepare_dfs(
        self, df1: pd.DataFrame, df2: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Label, sample and split two datasets into train and test parts.

    Both frames are passed through ``self.label_dfs`` and ``self.sample_dfs``;
    each sampled frame is then split with ``random_split`` using its default
    arguments.
    NOTE(review): the sampling step is presumably what equalises the number of
    rows taken from each dataset — confirm against ``sample_dfs``.

    :param df1: first dataset
    :param df2: second dataset
    :return: tuple ``(df1_train, df1_test, df2_train, df2_test)``
    """
    labelled_first, labelled_second = self.label_dfs(df1, df2)
    sampled_first, sampled_second = self.sample_dfs(labelled_first, labelled_second)

    first_train, first_test = random_split(sampled_first)
    second_train, second_test = random_split(sampled_second)

    return first_train, first_test, second_train, second_test
def test_imputer_hpo_text(test_dir, data_frame):
    """
    Tests SimpleImputer HPO with text data and categorical imputations.

    The hyperparameter grid varies only the tokenisation of the text column
    ('words' vs. 'chars'); the best micro F1 in the HPO results must exceed 0.7.
    """
    feature_col = "string_feature"
    label_col = "label"

    # generate some random data
    frame = data_frame(feature_col=feature_col,
                       label_col=label_col,
                       num_labels=3,
                       num_words=20,
                       n_samples=1000)
    train_part, _ = random_split(frame, [.8, .2])

    model_dir = os.path.join(test_dir, "tmp", "experiment_text_hpo")
    imputer_string = SimpleImputer(input_columns=[feature_col],
                                   output_column=label_col,
                                   output_path=model_dir)

    # Per-column settings first, then the settings shared by all columns.
    hps = {
        feature_col: {
            'type': ['string'],
            'tokens': [['words'], ['chars']],
        },
        'global': {
            'final_fc_hidden_units': [[]],
            'learning_rate': [1e-3],
            'weight_decay': [0],
            'num_epochs': [30],
        },
    }

    imputer_string.fit_hpo(train_part, hps=hps, num_epochs=10, num_evals=3)

    best_f1 = max(imputer_string.hpo.results['f1_micro'])
    assert best_f1 > 0.7
def test_imputer_hpo_text():
    """
    Tests SimpleImputer HPO with text data and categorical imputations.

    Runs ``fit_hpo`` with explicit candidate lists, predicts on the held-out
    part and requires a weighted F1 above 0.7. The model directory is removed
    afterwards.
    """
    feature_col = "string_feature"
    label_col = "label"

    # generate some random data
    frame = generate_string_data_frame(feature_col=feature_col,
                                       label_col=label_col,
                                       num_labels=3,
                                       num_words=20,
                                       n_samples=1000)
    train_part, test_part = random_split(frame, [.8, .2])

    model_dir = os.path.join(dir_path, "resources", "tmp",
                             "real_data_experiment_text_hpo")

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=model_dir)
    fitted = imputer.fit_hpo(train_df=train_part,
                             num_epochs=100,
                             patience=3,
                             num_hash_bucket_candidates=[2**10, 2**15],
                             tokens_candidates=['words'],
                             latent_dim_candidates=[10],
                             hpo_max_train_samples=1000)

    # The assertion below reads the '<label>_imputed' column from test_part,
    # so predict() is expected to add it to the frame it is given.
    fitted.predict(test_part)

    score = f1_score(test_part[label_col],
                     test_part[label_col + '_imputed'],
                     average="weighted")
    assert score > .7

    shutil.rmtree(model_dir)
def test_imputer_hpo_numeric(test_dir):
    """
    Tests SimpleImputer HPO for numeric data/imputation.

    Fits ``x -> x**2 + noise`` with a grid over two learning rates; the lowest
    'mse' recorded in the HPO results must be below 0.3.
    """
    row_count = 200
    x_values = np.random.uniform(-np.pi, np.pi, (row_count, ))
    noisy_square = x_values**2 + np.random.normal(0, .1, (row_count, ))
    frame = pd.DataFrame({'x': x_values, '**2': noisy_square})

    train_part, _ = random_split(frame, [.8, .2])
    model_dir = os.path.join(test_dir, "tmp", "experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(input_columns=['x'],
                                    output_column="**2",
                                    output_path=model_dir)

    feature_col = 'x'
    # Per-column settings first, then the settings shared by all columns.
    hps = {
        feature_col: {
            'type': ['numeric'],
            'numeric_latent_dim': [30],
            'numeric_hidden_layers': [1],
        },
        'global': {
            'final_fc_hidden_units': [[]],
            'learning_rate': [1e-3, 1e-4],
            'weight_decay': [0],
            'num_epochs': [200],
            'patience': [100],
            'concat_columns': [False],
        },
    }

    imputer_numeric.fit_hpo(train_part, hps=hps)

    results = imputer_numeric.hpo.results
    lowest_mse = min(results['mse'])
    best_rows = results[results['mse'] == lowest_mse]
    assert best_rows['mse'].iloc[0] < .3
def test_explain_method_synthetic(test_dir):
    """
    Check instance and class explanations on simulated data.

    The output column is 'foo' whenever the letter 'f' occurs in the
    categorical input ('foo' / 'dummy') or the text input ('fff' / 'ddd'),
    and 'bar' otherwise. A third input column is the constant 'h'.
    """
    N = 100
    cat_in_col = ['foo' if draw > (1 / 2) else 'dummy' for draw in np.random.rand(N)]
    text_in_col = ['fff' if draw > (1 / 2) else 'ddd' for draw in np.random.rand(N)]
    hash_in_col = ['h' for _ in range(N)]
    cat_out_col = [
        'foo' if 'f' in cat_value + text_value else 'bar'
        for cat_value, text_value in zip(cat_in_col, text_in_col)
    ]

    df = pd.DataFrame()
    for column_name, column_values in (('in_cat', cat_in_col),
                                       ('in_text', text_in_col),
                                       ('in_text_hash', hash_in_col),
                                       ('out_cat', cat_out_col)):
        df[column_name] = column_values

    # Encoders and featurizers, one per input column.
    encoders = datawig.column_encoders
    featurizers = datawig.mxnet_input_symbols
    data_encoder_cols = [
        encoders.TfIdfEncoder('in_text', tokens="chars"),
        encoders.CategoricalEncoder('in_cat', max_tokens=10),
        encoders.BowEncoder('in_text_hash', tokens="chars"),
    ]
    data_featurizer_cols = [
        featurizers.BowFeaturizer('in_text'),
        featurizers.EmbeddingFeaturizer('in_cat'),
        featurizers.BowFeaturizer('in_text_hash'),
    ]
    label_encoder_cols = [encoders.CategoricalEncoder('out_cat')]

    # Model
    model_dir = os.path.join(test_dir, "tmp", "explanation_tests")
    imputer = datawig.Imputer(data_featurizers=data_featurizer_cols,
                              label_encoders=label_encoder_cols,
                              data_encoders=data_encoder_cols,
                              output_path=model_dir)

    # Train on a sample of 90 rows.
    tr, te = random_split(df.sample(90), [.8, .2])
    imputer.fit(train_df=tr, test_df=te, num_epochs=20, learning_rate=1e-2)
    predictions = imputer.predict(te)

    # Evaluate
    precision = precision_score(predictions.out_cat,
                                predictions.out_cat_imputed,
                                average='weighted')
    assert precision > .99

    # Instance explanations for ten randomly chosen rows.
    for row_index in np.random.choice(N, 10):
        explanation = imputer.explain_instance(df.iloc[row_index])
        top_label = explanation['explained_label']
        top_text_token = explanation['in_text'][0][0]
        top_cat_token = explanation['in_cat'][0][0]

        if top_label == 'bar':
            assert (top_text_token == 'd' and top_cat_token == 'dummy')
        elif top_label == 'foo':
            assert (top_text_token == 'f' or top_cat_token == 'foo')

    # Class explanations: leading tokens for 'foo' must contain 'f'.
    text_flags = ['f' in token for token, _ in imputer.explain('foo')['in_text']]
    assert np.all(text_flags[:3])
    cat_flags = ['f' in token for token, _ in imputer.explain('foo')['in_cat']]
    assert cat_flags[0]

    # Explanations must survive serialisation to disk.
    imputer.save()
    imputer_from_disk = Imputer.load(imputer.output_path)
    restored_flags = [
        'f' in token for token, _ in imputer_from_disk.explain('foo')['in_text']
    ]
    assert np.all(restored_flags[:3])
def test_simple_imputer_real_data_default_args(test_dir, data_frame):
    """
    Tests SimpleImputer with default options.

    Covers: log file creation, default encoder/featurizer classes, column
    checks around ``fit``/``predict``, prediction quality, loading the model
    from a renamed directory, and retraining the loaded model.
    """
    feature_col = "string_feature"
    label_col = "label"
    imputed_col = label_col + "_imputed"

    def leading_columns_match(actual, expected):
        # Pairwise comparison; zip stops at the shorter of the two sequences.
        return all([after == before for after, before in zip(actual, expected)])

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=int(2**15),
                             num_labels=3,
                             num_words=100,
                             n_samples=2000)
    df_train, df_test, _ = random_split(random_data, [.8, .1, .1])

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_simple")
    train_columns_before = df_train.columns.tolist()
    input_columns = [feature_col]

    imputer = SimpleImputer(input_columns=input_columns,
                            output_column=label_col,
                            output_path=output_path)
    imputer = imputer.fit(train_df=df_train)

    # A non-empty log file must exist in the output directory.
    logfile = os.path.join(imputer.output_path, 'imputer.log')
    assert os.path.exists(logfile)
    assert os.path.getsize(logfile) > 0
    assert imputer.output_path == output_path

    # Defaults chosen for the text input column.
    inner = imputer.imputer
    assert inner.data_featurizers[0].__class__ == BowFeaturizer
    assert inner.data_encoders[0].__class__ == BowEncoder
    assert set(inner.data_encoders[0].input_columns) == set(input_columns)
    assert set(inner.label_encoders[0].input_columns) == {label_col}

    assert leading_columns_match(df_train.columns, train_columns_before)

    # Predict on a copy of the test data without the label column.
    df_no_label_column = df_test.copy()
    true_labels = df_test[label_col]
    del df_no_label_column[label_col]
    test_columns_before = df_no_label_column.columns.tolist()

    df_test_imputed = imputer.predict(df_no_label_column, inplace=True)

    assert leading_columns_match(df_no_label_column.columns, test_columns_before)

    imputed_columns = test_columns_before + [imputed_col, label_col + "_imputed_proba"]
    assert leading_columns_match(df_test_imputed, imputed_columns)

    f1 = f1_score(true_labels, df_test_imputed[imputed_col], average="weighted")
    assert f1 > .9

    # Move the model directory and load the imputer from the new location.
    new_path = imputer.output_path + "-" + rand_string()
    os.rename(imputer.output_path, new_path)
    deserialized = SimpleImputer.load(new_path)

    df_test = deserialized.predict(df_test,
                                   imputation_suffix="_deserialized_imputed")
    f1 = f1_score(df_test[label_col],
                  df_test[label_col + '_deserialized_imputed'],
                  average="weighted")
    assert f1 > .9

    # Retrain the loaded imputer, using the training data for both arguments.
    retrained_simple_imputer = deserialized.fit(df_train, df_train)
    df_train_imputed = retrained_simple_imputer.predict(df_train.copy(),
                                                        inplace=True)
    f1 = f1_score(df_train[label_col],
                  df_train_imputed[imputed_col],
                  average="weighted")
    assert f1 > .9

    # The stored metric must equal the F1 just computed on the same data.
    metrics = retrained_simple_imputer.load_metrics()
    assert f1 == metrics['weighted_f1']
def test_hpo_all_input_types(test_dir, data_frame):
    """
    Run ``fit_hpo`` with an explicit grid for string, categorical and numeric
    input columns and with user defined score functions.

    ``fit_hpo`` is called twice, the second time with a ``max_running_hours``
    limit of one second. In the final results the first run with 50 epochs
    must have a higher micro F1 than the first run with 5 epochs.
    """
    sample_count = 1000

    # generate some random data
    frame = data_frame(feature_col="string_feature",
                       label_col="label",
                       num_labels=3,
                       num_words=12,
                       n_samples=sample_count)

    # add categorical feature
    frame['categorical_feature'] = [
        'foo' if draw > .5 else 'bar' for draw in np.random.rand(sample_count)
    ]
    # add numerical feature
    frame['numeric_feature'] = np.random.rand(sample_count)

    train_part, _ = random_split(frame, [.8, .2])
    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=['string_feature', 'categorical_feature', 'numeric_feature'],
        output_column='label',
        output_path=model_dir)

    # Hyperparameter choices: shared settings first, then one entry per column.
    hps = {
        'global': {
            'learning_rate': [3e-4],
            'weight_decay': [1e-8],
            'num_epochs': [5, 50],
            'patience': [5],
            'batch_size': [16],
            'final_fc_hidden_units': [[]],
            'concat_columns': [True, False],
        },
        'string_feature': {
            'max_tokens': [2**15],
            'tokens': [['words', 'chars']],
            'ngram_range': {
                'words': [(1, 4), (2, 5)],
                'chars': [(2, 4), (3, 5)],
            },
        },
        'categorical_feature': {
            'type': ['categorical'],
            'max_tokens': [2**15],
            'embed_dim': [10],
        },
        'numeric_feature': {
            'normalize': [True],
            'numeric_latent_dim': [10],
            'numeric_hidden_layers': [1],
        },
    }

    # user defined score functions for hyperparameters
    def calibration_check(true, predicted, confidence):
        """
        Return the accuracy among predictions with confidence above 0.9 and
        among predictions with confidence above 0.5, as a tuple.
        """
        very_confident = confidence > .9
        confident = confidence > .5
        return (np.mean(true[very_confident] == predicted[very_confident]),
                np.mean(true[confident] == predicted[confident]))

    def coverage_check(true, predicted, confidence):
        """Return the fraction of predictions with confidence above 0.9."""
        return np.mean(confidence > .9)

    uds = [(calibration_check, 'calibration check'),
           (coverage_check, 'coverage at 90')]

    imputer.fit_hpo(train_part,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test1_')

    imputer.fit_hpo(train_part,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test2_',
                    max_running_hours=1 / 3600)

    results = imputer.hpo.results
    long_runs = results[results['global:num_epochs'] == 50]
    short_runs = results[results['global:num_epochs'] == 5]
    assert long_runs['f1_micro'].iloc[0] > short_runs['f1_micro'].iloc[0]
def test_numeric_or_text_imputer(test_dir, data_frame):
    """
    Tests SimpleImputer with mixed numeric and text input columns.

    Three imputers are trained into the same output directory: two with a
    numeric output column ('*2', '**2') and one with the categorical label as
    output column.
    """
    feature_col = "string_feature"
    label_col = "label"
    sample_count = 1000

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=int(2**10),
                             num_labels=3,
                             num_words=30,
                             n_samples=sample_count)

    x_values = np.random.uniform(-np.pi, np.pi, (sample_count, ))
    noisy_double = x_values * 2. + np.random.normal(0, .1, (sample_count, ))
    noisy_square = x_values**2 + np.random.normal(0, .1, (sample_count, ))

    frame = pd.DataFrame({
        'x': x_values,
        '*2': noisy_double,
        '**2': noisy_square,
        feature_col: random_data[feature_col].values,
        label_col: random_data[label_col].values
    })

    train_part, test_part = random_split(frame, [.8, .2])
    model_dir = os.path.join(test_dir, "tmp", "real_data_experiment_numeric")

    # Numeric output, linear in x.
    imputer_numeric_linear = SimpleImputer(input_columns=['x', feature_col],
                                           output_column="*2",
                                           output_path=model_dir)
    imputer_numeric_linear = imputer_numeric_linear.fit(train_df=train_part,
                                                        learning_rate=1e-3)
    imputer_numeric_linear.predict(test_part, inplace=True)
    linear_error = mean_squared_error(test_part['*2'], test_part['*2_imputed'])
    assert linear_error < 1.0

    # Numeric output, quadratic in x.
    imputer_numeric = SimpleImputer(input_columns=['x', feature_col],
                                    output_column="**2",
                                    output_path=model_dir)
    imputer_numeric = imputer_numeric.fit(train_df=train_part, learning_rate=1e-3)
    imputer_numeric.predict(test_part, inplace=True)
    square_error = mean_squared_error(test_part['**2'], test_part['**2_imputed'])
    assert square_error < 1.0

    # Categorical output.
    imputer_string = SimpleImputer(input_columns=[feature_col, 'x'],
                                   output_column=label_col,
                                   output_path=model_dir)
    imputer_string = imputer_string.fit(train_df=train_part)
    imputer_string.predict(test_part, inplace=True)
    label_f1 = f1_score(test_part[label_col],
                        test_part[label_col + '_imputed'],
                        average="weighted")
    assert label_f1 > .7
def test_random_split():
    """
    Split a two-row frame 50/50 with seed 10 and check which value lands in
    which part.
    """
    frame = pd.DataFrame({'a': [1, 2]})
    train_df, test_df = random_split(frame, split_ratios=[.5, .5], seed=10)

    train_values = train_df.values.flatten()
    test_values = test_df.values.flatten()

    assert all(train_values == np.array([1]))
    assert all(test_values == np.array([2]))
def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as
    inputs; this could be run as part of an integration test suite.

    Covers: ``transform``, ``predict_proba_top_k``, metrics of the trained and
    of the deserialized imputer, and the columns added by ``predict`` for an
    imputer trained on only 50 rows.
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col + "_lstm",
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        # NOTE(review): vocab_size=num_labels differs from the sequential
        # encoder's max_tokens=vocab_size — confirm this is intended.
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    # One prediction per row, and every prediction equals the true label.
    assert len(pred[label_col]) == len_df_before_predict
    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)
    assert metrics[label_col]['avg_f1'] > 0.9

    deserialized = Imputer.load(imputer.output_path)
    _, metrics_deserialized = deserialized.transform_and_compute_metrics(
        df_test)
    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get an imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(
                                         train_df=df_train[:50],
                                         test_df=df_test,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         batch_size=batch_size,
                                         calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(
        df_test, precision_threshold=.5, imputation_suffix="_imputed")

    # Membership is tested with ``in``: ``Index.contains`` was deprecated in
    # pandas 0.25 and removed in pandas 1.0, so calling it raises
    # AttributeError on current pandas versions.
    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns
def test_automatic_calibration(data_frame):
    """
    Fit a model with all featurizers and assert that calibration reduces the
    expected calibration error ('ece_pre' > 'ece_post').
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"
    bow_field = feature_col + "_bow"
    lstm_field = feature_col + "_lstm"

    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=2000)

    # The first two characters of the label serve as a dummy categorical input.
    random_data[categorical_col] = random_data[label_col].apply(
        lambda label: label[:2])

    df_train, _, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, bow_field, max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          lstm_field,
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels),
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(bow_field, vocab_size=vocab_size),
        # NOTE(review): vocab_size=num_labels differs from the sequential
        # encoder's max_tokens=vocab_size — confirm this is intended.
        LSTMFeaturizer(field_name=lstm_field,
                       seq_len=seq_len,
                       latent_dim=30,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels),
    ]

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols)
    imputer = imputer.fit(train_df=df_train,
                          test_df=df_val,
                          learning_rate=1e-2,
                          num_epochs=20,
                          batch_size=32)

    calibration = imputer.calibration_info
    assert calibration['ece_pre'] > calibration['ece_post']
# http://aws.amazon.com/apache2.0/ # # or in the "license" file accompanying this file. This file is distributed on # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either # express or implied. See the License for the specific language governing # permissions and limitations under the License. from datawig import SimpleImputer from datawig.utils import random_split from sklearn.metrics import f1_score, classification_report import pandas as pd """ Load Data """ df = pd.read_csv('mae_train_dataset.csv').sample(n=1000) df_train, df_test = random_split(df, split_ratios=[0.8, 0.2]) # ------------------------------------------------------------------------------------ """ Run default SimpleImputer """ # Initialize a SimpleImputer model imputer = SimpleImputer( input_columns=[ 'title', 'text' ], # columns containing information about the column we want to impute output_column='finish', # the column we'd like to impute values for output_path='imputer_model' # stores model data and metrics ) # Fit an imputer model on the train data
def test_imputer_duplicate_encoder_output_columns(test_dir, data_frame):
    """
    Check that encoders writing to the same output column are rejected.

    The bag-of-words and the sequential encoder both use the feature column
    itself as output column; constructing and fitting the imputer must raise
    ``ValueError``.
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    num_labels = 10
    seq_len = 100
    vocab_size = int(2**10)
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=1000)

    # The first two characters of the label serve as a dummy categorical input.
    random_data[categorical_col] = random_data[label_col].apply(
        lambda label: label[:2])

    df_train, _, df_val = random_split(random_data, [.8, .1, .1])

    # Both text encoders deliberately share the output column name.
    data_encoder_cols = [
        BowEncoder(feature_col, feature_col, max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col,
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels),
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col, vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col,
                       seq_len=seq_len,
                       latent_dim=30,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels),
    ]

    model_dir = os.path.join(test_dir, "tmp", "imputer_experiment_synthetic_data")

    with pytest.raises(ValueError) as exc_info:
        imputer = Imputer(data_featurizers=data_cols,
                          label_encoders=label_encoder_cols,
                          data_encoders=data_encoder_cols,
                          output_path=model_dir)
        imputer.fit(train_df=df_train,
                    test_df=df_val,
                    learning_rate=1e-3,
                    num_epochs=20,
                    batch_size=16)