def test_imputer_load_read_exec_only_dir(tmpdir, data_frame):
    """Check that a fitted Imputer can be loaded from a read/exec-only directory.

    An Imputer is fitted with ``tmpdir`` as its output path, the directory is
    then stripped of all write permissions, and ``Imputer.load`` is called on
    it. An ``AssertionError`` raised by the load is reported as a test failure.

    :param tmpdir: pytest temporary-directory fixture, used as output path
    :param data_frame: fixture that generates the training DataFrame
    """
    import stat

    # The Imputer is handed a plain string path rather than the fixture object
    # (the original note relates this conversion to failures on the shared
    # build fleet).
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'
    df = data_frame(feature, label, n_samples=100)

    # fit and output model + metrics to tmpdir
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)
    imputer.fit(train_df=df, num_epochs=1)

    # read/exec-only for owner, group and others
    read_exec_only = (stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
                      | stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)
    os.chmod(tmpdir, read_exec_only)

    try:
        Imputer.load(tmpdir)
    except AssertionError as e:
        print(e)
        pytest.fail(
            'Loading imputer from read-only directory should not fail.')
    finally:
        # Give the owner write access back so the temporary directory can be
        # cleaned up, whether the test passed or failed.
        os.chmod(tmpdir, read_exec_only | stat.S_IWUSR)
def test_non_writable_output_path(test_dir, data_frame):
    """Check that loading and predicting work when the output path is read-only.

    Two scenarios are exercised after fitting and saving an Imputer:

    1. output directory and ``imputer.log`` are both read-only;
    2. ``imputer.log`` has been removed and the output directory is read-only.

    In both cases ``Imputer.load``, ``predict`` and a subsequent
    ``logger.warning`` call must not raise.

    :param test_dir: fixture providing the base directory for test output
    :param data_frame: fixture that generates the training DataFrame
    """
    from datawig.utils import logger

    label_col = 'label'
    df = data_frame(n_samples=100, label_col=label_col)

    data_encoder_cols = [TfIdfEncoder('features')]
    label_encoder_cols = [CategoricalEncoder(label_col)]
    data_cols = [BowFeaturizer('features')]

    output_path = os.path.join(test_dir, 'non_writable')
    log_path = os.path.join(output_path, "imputer.log")

    Imputer(data_featurizers=data_cols,
            label_encoders=label_encoder_cols,
            data_encoders=data_encoder_cols,
            output_path=output_path).fit(train_df=df, num_epochs=1).save()

    read_only_dir = S_IREAD | S_IXUSR
    owner_writable = S_IREAD | S_IXUSR | S_IWUSR

    try:
        # make output dir of imputer read-only
        os.chmod(output_path, read_only_dir)
        # make log file read only
        os.chmod(log_path, S_IREAD)
        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")

        # remove log file (needs write access on both file and directory)
        os.chmod(log_path, owner_writable)
        os.chmod(output_path, owner_writable)
        os.remove(log_path)

        # make output dir of imputer read-only
        os.chmod(output_path, read_only_dir)
        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")
    except Exception as e:
        print(e)
        pytest.fail("This invocation should not raise any Exception")
    finally:
        # Always restore write access, also when the test fails, so the
        # output directory does not stay read-only for later cleanup/runs.
        os.chmod(output_path, owner_writable)
        if os.path.exists(log_path):
            os.chmod(log_path, owner_writable)
def test_imputer_load_with_invalid_context(tmpdir, data_frame):
    """Save an Imputer whose ``ctx`` was set to None, then load it and predict.

    The test has no explicit assertions; it passes when fitting, saving,
    loading and predicting complete without raising.

    :param tmpdir: pytest temporary-directory fixture, used as output path
    :param data_frame: fixture that generates the training DataFrame
    """
    # The Imputer is given a plain string path instead of the fixture object
    # (the original note relates this conversion to failures on the shared
    # build fleet).
    output_dir = str(tmpdir)
    feature_col, label_col = 'feature', 'label'
    frame = data_frame(feature_col, label_col, n_samples=100)

    # fit and write model + metrics into the temporary directory
    fitted = Imputer(
        data_featurizers=[BowFeaturizer(feature_col)],
        label_encoders=[CategoricalEncoder(label_col)],
        data_encoders=[BowEncoder(feature_col)],
        output_path=output_dir,
    )
    fitted.fit(train_df=frame, num_epochs=1)

    # drop the context before serialising
    fitted.ctx = None
    fitted.save()

    restored = Imputer.load(output_dir)
    restored.predict(frame)
def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as
    inputs; this could be run as part of an integration test suite.

    Steps:
      * fit on generated data and check that ``transform`` keeps the row
        count and leaves the existing label column untouched,
      * check ``avg_f1`` > 0.9 both for the fitted and the deserialized model,
      * fit a second imputer on only 50 rows and check that ``predict`` adds
        the ``<label>_imputed`` and ``<label>_imputed_proba`` columns.

    :param test_dir: fixture providing the base directory for test output
    :param data_frame: fixture that generates the random DataFrame
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col + "_lstm",
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        # NOTE(review): vocab_size is num_labels here although the matching
        # SequentialEncoder uses max_tokens=vocab_size -- confirm intended.
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    # transform must neither drop rows nor alter the existing label values
    assert len(pred[label_col]) == len_df_before_predict
    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)
    assert metrics[label_col]['avg_f1'] > 0.9

    # the model loaded from disk has to reach the same quality bar
    deserialized = Imputer.load(imputer.output_path)
    _, metrics_deserialized = deserialized.transform_and_compute_metrics(
        df_test)
    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get an imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(
                                         train_df=df_train[:50],
                                         test_df=df_test,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         batch_size=batch_size,
                                         calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(
        df_test, precision_threshold=.5, imputation_suffix="_imputed")

    # `Index.contains` was removed from pandas; membership via `in` works on
    # old and new versions alike.
    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns
def test_explain_method_synthetic(test_dir):
    """Train an Imputer on simulated data and sanity-check its explanations.

    Simulated columns:
      * ``in_cat``: categorical, values in ['foo', 'dummy']
      * ``in_text``: text, values in ['fff', 'ddd']
      * ``in_text_hash``: constant 'h'
      * ``out_cat``: 'foo' if the character 'f' occurs in the concatenation
        of ``in_cat`` and ``in_text``, otherwise 'bar'

    :param test_dir: fixture providing the base directory for test output
    """
    N = 100
    half = 1 / 2

    # two independent random draws, one per input column
    cat_in_col = [
        'foo' if draw > half else 'dummy' for draw in np.random.rand(N)
    ]
    text_in_col = [
        'fff' if draw > half else 'ddd' for draw in np.random.rand(N)
    ]
    hash_in_col = ['h'] * N
    cat_out_col = [
        'foo' if 'f' in cat_value + text_value else 'bar'
        for cat_value, text_value in zip(cat_in_col, text_in_col)
    ]

    df = pd.DataFrame(
        {
            'in_cat': cat_in_col,
            'in_text': text_in_col,
            'in_text_hash': hash_in_col,
            'out_cat': cat_out_col,
        },
        columns=['in_cat', 'in_text', 'in_text_hash', 'out_cat'],
    )

    # Encoders and featurizers for the three input columns and the label
    data_encoder_cols = [
        datawig.column_encoders.TfIdfEncoder('in_text', tokens="chars"),
        datawig.column_encoders.CategoricalEncoder('in_cat', max_tokens=10),
        datawig.column_encoders.BowEncoder('in_text_hash', tokens="chars"),
    ]
    data_featurizer_cols = [
        datawig.mxnet_input_symbols.BowFeaturizer('in_text'),
        datawig.mxnet_input_symbols.EmbeddingFeaturizer('in_cat'),
        datawig.mxnet_input_symbols.BowFeaturizer('in_text_hash'),
    ]
    label_encoder_cols = [
        datawig.column_encoders.CategoricalEncoder('out_cat'),
    ]

    # Model
    model_dir = os.path.join(test_dir, "tmp", "explanation_tests")
    imputer = datawig.Imputer(data_featurizers=data_featurizer_cols,
                              label_encoders=label_encoder_cols,
                              data_encoders=data_encoder_cols,
                              output_path=model_dir)

    # Training on a 90-row sample, split 80/20
    train_part, test_part = random_split(df.sample(90), [.8, .2])
    imputer.fit(train_df=train_part,
                test_df=test_part,
                num_epochs=20,
                learning_rate=1e-2)
    predictions = imputer.predict(test_part)

    # Evaluation
    weighted_precision = precision_score(predictions['out_cat'],
                                         predictions['out_cat_imputed'],
                                         average='weighted')
    assert weighted_precision > .99

    # Instance explanations for a handful of randomly chosen rows
    for row_idx in np.random.choice(N, 10):
        explanation = imputer.explain_instance(df.iloc[row_idx])
        top_label = explanation['explained_label']

        if top_label == 'bar':
            assert explanation['in_text'][0][0] == 'd'
            assert explanation['in_cat'][0][0] == 'dummy'
        elif top_label == 'foo':
            assert (explanation['in_text'][0][0] == 'f'
                    or explanation['in_cat'][0][0] == 'foo')

    # Class explanations
    text_hits = [
        'f' in token for token, _ in imputer.explain('foo')['in_text']
    ]
    assert np.all(text_hits[:3])

    cat_hits = ['f' in token for token, _ in imputer.explain('foo')['in_cat']]
    assert cat_hits[0]

    # Serialisation round trip: explanations must survive save/load
    imputer.save()
    imputer_from_disk = Imputer.load(imputer.output_path)
    restored_hits = [
        'f' in token
        for token, _ in imputer_from_disk.explain('foo')['in_text']
    ]
    assert np.all(restored_hits[:3])