def test_dense_preprocessor_inverse_transform(embed_cols, continuous_cols, scale):
    # encode the input DataFrame and then decode it back
    dense_preprocessor = DensePreprocessor(
        embed_cols=embed_cols, continuous_cols=continuous_cols, scale=scale
    )
    encoded = dense_preprocessor.fit_transform(df)
    decoded = dense_preprocessor.inverse_transform(encoded)
    # rebuild the original (embedding + continuous) columns for comparison
    try:
        if isinstance(embed_cols[0], tuple):
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    except Exception:
        emb_df = pd.DataFrame()
    try:
        cont_df = df[continuous_cols]
    except Exception:
        cont_df = pd.DataFrame()
    org_df = pd.concat([emb_df, cont_df], axis=1)
    # the decoded frame must match the original once dtypes are aligned
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)
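# Illustrative round-trip sketch (not part of the original suite): a toy
# DataFrame with hypothetical columns "cat_col" / "num_col" shows the property
# the parametrised test above asserts, i.e. that inverse_transform undoes
# fit_transform once dtypes are aligned.
#
# toy_df = pd.DataFrame({"cat_col": ["a", "b", "a"], "num_col": [1.0, 2.0, 3.0]})
# toy_prep = DensePreprocessor(
#     embed_cols=["cat_col"], continuous_cols=["num_col"], scale=False
# )
# toy_encoded = toy_prep.fit_transform(toy_df)
# toy_decoded = toy_prep.inverse_transform(toy_encoded)
# assert toy_decoded.astype(toy_df.dtypes.to_dict()).equals(toy_df)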
def test_prepare_deep_without_embedding_columns():
    errors = []
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ["col1", "col2"]
    preprocessor3 = DensePreprocessor(continuous_cols=["col1", "col2"])

    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception:
        errors.append("Fundamental Error")

    # with scaling enabled, every continuous column should come out standardised
    out_booleans = []
    means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
    for mean, std in zip(means, stds):
        out_booleans.append(np.isclose(mean, 0.0))
        out_booleans.append(np.isclose(std, 1.0))

    if not np.all(out_booleans):
        errors.append("There is something going on with the scaler")

    assert not errors, "errors occurred:\n{}".format("\n".join(errors))
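# For reference, the property checked above is standard z-scoring: assuming the
# preprocessor scales continuous columns with a scikit-learn style
# StandardScaler (an assumption, not taken from this file), an equivalent
# stand-alone check would be:
#
# from sklearn.preprocessing import StandardScaler
# X = np.random.choice(np.arange(100), (100, 2)).astype(float)
# X_scaled = StandardScaler().fit_transform(X)
# assert np.allclose(X_scaled.mean(axis=0), 0.0)
# assert np.allclose(X_scaled.std(axis=0), 1.0)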
continuous_cols = [
    "latitude",
    "longitude",
    "security_deposit",
    "extra_people",
]
already_standard = ["latitude", "longitude"]

# bucketise the continuous target into three classes for the multiclass example
df["yield_cat"] = pd.cut(df["yield"], bins=[0.2, 65, 163, 600], labels=[0, 1, 2])
df.drop("yield", axis=1, inplace=True)
target = "yield_cat"
target = np.array(df[target].values)

# wide and deep preprocessing
prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
prepare_deep = DensePreprocessor(
    embed_cols=cat_embed_cols, continuous_cols=continuous_cols
)
X_deep = prepare_deep.fit_transform(df)

# model components and training
wide = Wide(wide_dim=X_wide.shape[1], pred_dim=3)
deepdense = DeepDense(
    hidden_layers=[64, 32],
    dropout=[0.2, 0.2],
    deep_column_idx=prepare_deep.deep_column_idx,
    embed_input=prepare_deep.embeddings_input,
    continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deepdense=deepdense, pred_dim=3)
model.compile(method="multiclass", metrics=[Accuracy, F1Score])
model.fit(
    X_wide=X_wide,
    X_deep=X_deep,
"input_df, encoder, output_df", [(df_letters, le_letters, df_letters_le), (df_numbers, le_numbers, df_numbers_le)], ) def test_label_encoder(input_df, encoder, output_df): original_df = encoder.inverse_transform(output_df) assert original_df.equals(input_df) ################################################################################ # Test the DensePreprocessor: only categorical columns to be represented with # embeddings ############################################################################### cat_embed_cols = [("col1", 5), ("col2", 5)] preprocessor1 = DensePreprocessor(cat_embed_cols) # type: ignore[arg-type] X_letters = preprocessor1.fit_transform(df_letters) preprocessor2 = DensePreprocessor(cat_embed_cols) # type: ignore[arg-type] X_numbers = preprocessor2.fit_transform(df_numbers) error_list = [] @pytest.mark.parametrize( "input_df, X_deep, preprocessor", [(df_letters, X_letters, preprocessor1), (df_numbers, X_numbers, preprocessor2)], ) def test_prepare_deep_without_continous_columns(input_df, X_deep, preprocessor):
continuous_cols = ["latitude", "longitude", "security_deposit", "extra_people"]
already_standard = ["latitude", "longitude"]
text_col = "description"
word_vectors_path = "data/glove.6B/glove.6B.100d.txt"
img_col = "id"
img_path = "data/airbnb/property_picture"
target = "yield"
target = df[target].values

prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)

prepare_deep = DensePreprocessor(
    embed_cols=cat_embed_cols,  # type: ignore[arg-type]
    continuous_cols=continuous_cols,
    already_standard=already_standard,
)
X_deep = prepare_deep.fit_transform(df)

text_processor = TextPreprocessor(
    word_vectors_path=word_vectors_path, text_col=text_col
)
X_text = text_processor.fit_transform(df)

image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
X_images = image_processor.fit_transform(df)

wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
deepdense = DeepDense(
    hidden_layers=[64, 32],
"input_df, encoder, output_df", [(df_letters, le_letters, df_letters_le), (df_numbers, le_numbers, df_numbers_le)], ) def test_label_encoder(input_df, encoder, output_df): original_df = encoder.inverse_transform(output_df) assert original_df.equals(input_df) ################################################################################ # Test the DensePreprocessor: only categorical columns to be represented with # embeddings ############################################################################### cat_embed_cols = [("col1", 5), ("col2", 5)] preprocessor1 = DensePreprocessor(cat_embed_cols) X_letters = preprocessor1.fit_transform(df_letters) preprocessor2 = DensePreprocessor(cat_embed_cols) X_numbers = preprocessor2.fit_transform(df_numbers) error_list = [] @pytest.mark.parametrize( "input_df, X_deep, preprocessor", [(df_letters, X_letters, preprocessor1), (df_numbers, X_numbers, preprocessor2)], ) def test_prepare_deep_without_continous_columns(input_df, X_deep, preprocessor):
("native_country", "occupation")] cat_embed_cols = [ ("education", 10), ("relationship", 8), ("workclass", 10), ("occupation", 10), ("native_country", 10), ] continuous_cols = ["age", "hours_per_week"] target = "income_label" target = df[target].values prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = prepare_wide.fit_transform(df) prepare_deep = DensePreprocessor( embed_cols=cat_embed_cols, continuous_cols=continuous_cols # type: ignore[arg-type] ) X_deep = prepare_deep.fit_transform(df) wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1) deepdense = DeepDense( hidden_layers=[64, 32], dropout=[0.2, 0.2], deep_column_idx=prepare_deep.deep_column_idx, embed_input=prepare_deep.embeddings_input, continuous_cols=continuous_cols, ) # # To use DeepDenseResnet as the deepdense component simply: # deepdense = DeepDenseResnet(