import os
import random
import shutil

import pandas as pd
import pytest

# The imports below assume datawig's module layout.
from datawig.column_encoders import BowEncoder, CategoricalEncoder, SequentialEncoder
from datawig.imputer import Imputer
from datawig.mxnet_input_symbols import BowFeaturizer, EmbeddingFeaturizer, LSTMFeaturizer
from datawig.utils import random_split


def test_imputer_init():
    # data_featurizers must be a list of featurizers, not a string
    with pytest.raises(ValueError):
        Imputer(data_featurizers='item_name',
                label_encoders=['brand'],
                data_encoders='')

    # label_encoders must be a list of column encoders, not a string
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders="brand",
                data_encoders='')

    # data_encoders must be a list of column encoders, not a string
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders=[CategoricalEncoder("brand")],
                data_encoders='')

    # every data encoder output column needs a matching featurizer
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders=[CategoricalEncoder("brand")],
                data_encoders=[BowEncoder('not_in_featurizers')])

    # the label column must not also be used as a data encoder input
    with pytest.raises(ValueError):
        Imputer(data_featurizers=[BowFeaturizer('item_name')],
                label_encoders=[CategoricalEncoder("brand")],
                data_encoders=[BowEncoder('brand')])

    label_encoders = [CategoricalEncoder('brand', max_tokens=10)]
    data_featurizers = [LSTMFeaturizer('item_name'), EmbeddingFeaturizer('manufacturer')]
    data_encoders = [SequentialEncoder('item_name'), CategoricalEncoder('manufacturer')]

    imputer = Imputer(data_featurizers=data_featurizers,
                      label_encoders=label_encoders,
                      data_encoders=data_encoders)

    # output paths default to the label column name
    assert imputer.output_path == "brand"
    assert imputer.module_path == 'brand/model'
    assert imputer.metrics_path == 'brand/fit-test-metrics.json'

    # non-identifier characters in the label column are normalized for the output path
    imputer = Imputer(data_featurizers=data_featurizers,
                      label_encoders=[CategoricalEncoder('B Rand', max_tokens=10)],
                      data_encoders=data_encoders)

    assert imputer.output_path == "b_rand"

    shutil.rmtree("b_rand")
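
# The tests below rely on `data_frame` and `test_dir` fixtures defined outside
# this excerpt, presumably in conftest.py. The sketch below illustrates the
# contract the tests assume: a factory returning a synthetic DataFrame whose
# string feature is predictive of the label. Its signature and internals are
# assumptions, not datawig's actual fixture.
@pytest.fixture
def data_frame():
    def _build(feature_col, label_col, vocab_size=100, num_labels=3,
               num_words=20, n_samples=1000):
        # give labels distinct two-character prefixes so the prefix can serve
        # as a categorical input, as the tests below require
        labels = ['{0}{0}_label_{1}'.format(chr(ord('a') + i), i)
                  for i in range(num_labels)]
        vocab = ['tok{}'.format(i) for i in range(vocab_size)]
        rows = []
        for _ in range(n_samples):
            label = random.choice(labels)
            # seed the feature with the label itself so it is learnable
            words = [label] + random.choices(vocab, k=num_words - 1)
            rows.append({feature_col: ' '.join(words), label_col: label})
        return pd.DataFrame(rows)
    return _build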

def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words, and categorical variables as
    inputs; this could be run as part of an integration test suite.
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2 ** 10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col, feature_col + "_lstm", max_tokens=vocab_size, seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col, embed_dim=embed_dim, vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp", "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    assert len(pred[label_col]) == len_df_before_predict
    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)
    assert metrics[label_col]['avg_f1'] > 0.9

    # the model must survive a serialization round trip
    deserialized = Imputer.load(imputer.output_path)
    _, metrics_deserialized = deserialized.transform_and_compute_metrics(df_test)
    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get an imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(train_df=df_train[:50],
                                                                  test_df=df_test,
                                                                  learning_rate=learning_rate,
                                                                  num_epochs=num_epochs,
                                                                  batch_size=batch_size,
                                                                  calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(df_test,
                                                    precision_threshold=.5,
                                                    imputation_suffix="_imputed")

    # imputation columns should be appended with the requested suffix
    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns
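
# A minimal usage sketch for the precision-thresholded predictions produced at
# the end of the test above: rows whose predicted class did not reach the
# requested precision on held-out data are expected to be left empty in the
# "_imputed" column, so callers can fill in only the confident rows. The exact
# missing-value semantics are an assumption about datawig's behavior, and this
# helper is illustrative, not part of its API.
def confident_rows(predictions_df, label_col):
    imputed_col = label_col + "_imputed"
    proba_col = label_col + "_imputed_proba"
    # keep only rows where the imputer committed to a prediction
    mask = predictions_df[imputed_col].notnull()
    return predictions_df.loc[mask, [imputed_col, proba_col]]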

def test_imputer_duplicate_encoder_output_columns(test_dir, data_frame):
    """
    Tests that fitting an Imputer raises a ValueError when two data encoders
    write to the same output column (here, both the bag-of-words and the
    sequential encoder output to `feature_col`).
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 10
    seq_len = 100
    vocab_size = int(2 ** 10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    # both string encoders deliberately share the output column `feature_col`
    data_encoder_cols = [
        BowEncoder(feature_col, feature_col, max_tokens=vocab_size),
        SequentialEncoder(feature_col, feature_col, max_tokens=vocab_size, seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col, vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col,
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col, embed_dim=embed_dim, vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp", "imputer_experiment_synthetic_data")

    num_epochs = 20
    batch_size = 16
    learning_rate = 1e-3

    with pytest.raises(ValueError):
        imputer = Imputer(data_featurizers=data_cols,
                          label_encoders=label_encoder_cols,
                          data_encoders=data_encoder_cols,
                          output_path=output_path)
        imputer.fit(train_df=df_train,
                    test_df=df_val,
                    learning_rate=learning_rate,
                    num_epochs=num_epochs,
                    batch_size=batch_size)
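
# A sketch of the kind of validation the test above exercises (assumed logic,
# not a copy of datawig's implementation): encoder output columns must be
# unique, otherwise one encoder's features would silently overwrite another's.
# The `output_column` attribute is assumed from the column encoder interface.
def check_unique_encoder_outputs(data_encoders):
    output_cols = [encoder.output_column for encoder in data_encoders]
    duplicates = {col for col in output_cols if output_cols.count(col) > 1}
    if duplicates:
        raise ValueError(
            "Duplicate encoder output columns: {}".format(sorted(duplicates)))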

def test_automatic_calibration(data_frame):
    """
    Fits a model with all featurizers and asserts that calibration reduces
    the expected calibration error.
    """
    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 2000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2 ** 10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col, feature_col + "_lstm", max_tokens=vocab_size, seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col, embed_dim=embed_dim, vocab_size=num_labels)
    ]

    num_epochs = 20
    batch_size = 32
    learning_rate = 1e-2

    # calibration is on by default, so fit() records pre- and post-calibration
    # expected calibration error
    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols).fit(train_df=df_train,
                                                           test_df=df_val,
                                                           learning_rate=learning_rate,
                                                           num_epochs=num_epochs,
                                                           batch_size=batch_size)

    assert imputer.calibration_info['ece_pre'] > imputer.calibration_info['ece_post']
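
# The assertion above compares expected calibration error (ECE) before and
# after calibration. Below is a minimal sketch of the standard ECE metric,
# assuming `proba` is an (n_samples, n_classes) array of predicted class
# probabilities and `labels` holds integer class ids; datawig's own
# implementation may bin differently.
def expected_calibration_error(proba, labels, n_bins=10):
    import numpy as np

    confidences = proba.max(axis=1)
    accuracies = (proba.argmax(axis=1) == labels).astype(float)
    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)

    ece = 0.0
    for lo, hi in zip(bin_edges[:-1], bin_edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            # weight the accuracy/confidence gap by the fraction of samples
            # falling into this confidence bin
            ece += in_bin.mean() * abs(accuracies[in_bin].mean()
                                       - confidences[in_bin].mean())
    return ece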