def test_imputer_numeric_data(test_dir):
    """
    Exercise the numeric encoder/featurizer pair on its own.

    A single numeric column ``x`` is used as input and an Imputer is fitted
    once for each of three target columns derived from ``x``. For every
    target the metric reported by the imputer has to stay below 10.
    """
    # Synthetic data: x is drawn uniformly between -pi and pi
    num_rows = 1000
    samples = np.random.uniform(-np.pi, np.pi, (num_rows, ))
    df = pd.DataFrame({
        'x': samples,
        'cos': np.cos(samples),
        '*2': samples * 2,
        '**2': samples**2,
    })
    df_train, df_test = random_split(df, [.6, .4])

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_numeric")

    data_encoder_cols = [NumericalEncoder(['x'])]
    data_cols = [NumericalFeaturizer('x', numeric_latent_dim=100)]

    # Training settings shared by all targets
    fit_settings = dict(
        learning_rate=1e-1,
        num_epochs=100,
        patience=5,
        test_split=.3,
        weight_decay=.0,
        batch_size=128,
    )

    for target in ['*2', '**2', 'cos']:
        # Each target gets its own (un-normalized) label encoder and imputer
        label_encoder_cols = [NumericalEncoder([target], normalize=False)]
        imputer = Imputer(data_featurizers=data_cols,
                          label_encoders=label_encoder_cols,
                          data_encoders=data_encoder_cols,
                          output_path=output_path)
        imputer.fit(train_df=df_train, **fit_settings)

        pred, metrics = imputer.transform_and_compute_metrics(df_test)
        df_test['predictions_' + target] = pred[target].flatten()

        target_metric = metrics[target]
        print("Numerical metrics: {}".format(target_metric))
        assert target_metric < 10
def test_imputer_load_read_exec_only_dir(tmpdir, data_frame):
    """
    Check that a fitted Imputer can be loaded from a read/exec-only directory.

    The imputer is fitted with ``tmpdir`` as its output path, the directory is
    then made read/exec-only for owner, group and others, and
    ``Imputer.load`` must not raise an ``AssertionError``.

    The directory's original permission bits are restored afterwards so that
    the test does not leave a non-writable temporary directory behind.
    """
    import stat

    # on shared build-fleet tests fail with converting tmpdir to string
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'

    df = data_frame(feature, label, n_samples=100)

    # fit and output model + metrics to tmpdir
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)
    imputer.fit(train_df=df, num_epochs=1)

    # remember the current mode so it can be put back once the check is done
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # make tmpdir read/exec-only by owner/group/others
    read_exec_only = (stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
                      | stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)
    os.chmod(tmpdir, read_exec_only)

    try:
        Imputer.load(tmpdir)
    except AssertionError as e:
        print(e)
        pytest.fail(
            'Loading imputer from read-only directory should not fail.')
    finally:
        # runs on success and on failure alike
        os.chmod(tmpdir, original_mode)
def test_imputer_without_train_df(test_dir):
    """
    Test asserting that imputer.fit fails without training data or training
    data in wrong format.

    ``fit`` is called once with a plain list and once with ``None``; both
    calls have to raise a ``ValueError``.
    """
    df_train = ['ffffffooooo']

    data_encoder_cols = [BowEncoder('item_name')]
    label_encoder_cols = [CategoricalEncoder('brand')]
    data_cols = [BowFeaturizer('item_name')]

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment")

    imputer = Imputer(
        data_featurizers=data_cols,
        label_encoders=label_encoder_cols,
        data_encoders=data_encoder_cols,
        output_path=output_path,
    )

    # The ``message`` keyword of ``pytest.raises`` was deprecated in pytest 4.1
    # and removed in pytest 5.0. It never matched the exception text; it was
    # only the failure message shown when nothing was raised. Calling
    # ``pytest.fail`` at the end of the ``with`` body gives the same result on
    # old and new pytest versions.
    failure_message = "Need a non-empty DataFrame for fitting Imputer model"

    with pytest.raises(ValueError):
        imputer.fit(train_df=df_train)
        pytest.fail(failure_message)

    with pytest.raises(ValueError):
        imputer.fit(train_df=None)
        pytest.fail(failure_message)
def test_imputer_without_test_set_random_split():
    """
    Test asserting that the random split is working internally
    by calling imputer.fit only with a training set.

    The test fails if ``fit`` raises a ``TypeError``. The output directory is
    removed afterwards whether or not ``fit`` succeeded.
    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2 ** 10)

    # generate some random data
    df_train = generate_string_data_frame(feature_col=feature_col,
                                          label_col=label_col,
                                          vocab_size=vocab_size,
                                          num_labels=num_labels,
                                          num_words=seq_len,
                                          n_samples=n_samples)

    num_epochs = 1
    batch_size = 64
    learning_rate = 1e-3

    data_encoder_cols = [
        BowEncoder(feature_col, max_tokens=vocab_size)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col, vocab_size=vocab_size)
    ]

    output_path = os.path.join(dir_path, "resources", "tmp", "real_data_experiment")

    imputer = Imputer(
        data_featurizers=data_cols,
        label_encoders=label_encoder_cols,
        data_encoders=data_encoder_cols,
        output_path=output_path
    )

    try:
        imputer.fit(
            train_df=df_train,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            batch_size=batch_size
        )
    except TypeError:
        pytest.fail("Didn't expect a TypeError exception with missing test data")
    finally:
        # Clean up even when fit fails, so a broken run does not leave
        # artefacts under the resources directory. The existence check keeps
        # a missing directory from masking the original failure.
        if os.path.isdir(output_path):
            shutil.rmtree(output_path)
def test_imputer_load_with_invalid_context(tmpdir, data_frame):
    """
    Save an Imputer whose ``ctx`` attribute was set to ``None``, load it back
    from disk and run a prediction with the loaded instance.

    The test has no explicit assertion; it passes when none of the steps
    raises.
    """
    feature, label = 'feature', 'label'

    # on shared build-fleet tests fail with converting tmpdir to string
    model_dir = str(tmpdir)

    df = data_frame(feature, label, n_samples=100)

    # fit and output model + metrics to the temporary directory
    imputer = Imputer(
        data_featurizers=[BowFeaturizer(feature)],
        label_encoders=[CategoricalEncoder(label)],
        data_encoders=[BowEncoder(feature)],
        output_path=model_dir,
    )
    imputer.fit(train_df=df, num_epochs=1)

    # drop the context before persisting the model
    imputer.ctx = None
    imputer.save()

    reloaded = Imputer.load(model_dir)
    reloaded.predict(df)
def test_imputer_fit_fail_non_writable_output_dir(tmpdir, data_frame):
    """
    Check that ``Imputer.fit`` raises an ``AssertionError`` when the output
    directory is read/exec-only.

    The directory's original permission bits are restored afterwards so that
    the test does not leave a non-writable temporary directory behind.
    """
    import stat

    # on shared build-fleet tests fail with converting tmpdir to string
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'

    df = data_frame(feature, label, n_samples=100)

    # the imputer is only constructed here; fitting happens after the chmod
    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)

    # remember the current mode so it can be put back once the check is done
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # make tmpdir read/exec-only by owner/group/others
    read_exec_only = (stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
                      | stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)
    os.chmod(tmpdir, read_exec_only)

    try:
        # fail if imputer.fit does not raise an AssertionError
        with pytest.raises(AssertionError):
            imputer.fit(df, num_epochs=1)
    finally:
        # runs on success and on failure alike
        os.chmod(tmpdir, original_mode)
def test_fit_resumes(test_dir, data_frame):
    """
    Fit one Imputer twice on the same data and check that the module held by
    the imputer after the second fit compares equal to the one held after the
    first fit.
    """
    feature_col = "feature"
    label_col = "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    encoders = [TfIdfEncoder([feature_col])]
    featurizers = [datawig.mxnet_input_symbols.BowFeaturizer(feature_col)]
    label_encoders = [CategoricalEncoder(label_col)]

    imputer = Imputer(data_encoders=encoders,
                      data_featurizers=featurizers,
                      label_encoders=label_encoders,
                      output_path=test_dir)

    # before any fit there must be no module yet
    assert imputer.module is None

    modules_after_fit = []
    for _ in range(2):
        imputer.fit(df, num_epochs=20)
        modules_after_fit.append(imputer.module)

    first_fit_module, second_fit_module = modules_after_fit
    assert first_fit_module == second_fit_module
def test_imputer_duplicate_encoder_output_columns(test_dir, data_frame):
    """
    Check that constructing/fitting an Imputer raises a ``ValueError`` when
    two data encoders are configured with the same column arguments.

    Both the ``BowEncoder`` and the ``SequentialEncoder`` below receive
    ``feature_col`` as their first and second positional argument
    (presumably input and output column -- going by the test name, the
    duplicate output column is what should trigger the error).
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 10
    seq_len = 100
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    # only the train and validation parts are used by this test
    df_train, _, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col, max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col,
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col, vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col,
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 20
    batch_size = 16
    learning_rate = 1e-3

    # Construction and fit both stay inside the block: the ValueError may come
    # from either call.
    with pytest.raises(ValueError):
        imputer = Imputer(data_featurizers=data_cols,
                          label_encoders=label_encoder_cols,
                          data_encoders=data_encoder_cols,
                          output_path=output_path)
        imputer.fit(train_df=df_train,
                    test_df=df_val,
                    learning_rate=learning_rate,
                    num_epochs=num_epochs,
                    batch_size=batch_size)