def prepare_data(results_dir):
    train, test = load_dataset()

    # treat object columns and low-cardinality columns (fewer than 200 unique
    # values) as categorical; everything else, except the target, is continuous
    cat_embed_cols = []
    for col in train.columns:
        if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
            cat_embed_cols.append(col)
    num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

    args = read_best_model_args(results_dir)

    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        scale=True,
        for_tabtransformer=True,
    )
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    mlp_hidden_dims_same = len(cat_embed_cols)

    return mlp_hidden_dims_same, args, prepare_tab, X_train, X_test, y_train, y_test
def prepare_data(results_dir):
    train, test = load_dataset()

    # All columns will be treated as categorical. The column with the highest
    # number of categories has 308
    cat_embed_cols = [c for c in train.columns if c != "target"]

    prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols)
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    return prepare_tab, X_train, X_test, y_train, y_test
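# A minimal, hedged sketch of how the artifacts returned by prepare_data might
# be consumed downstream. The model sizes, Trainer settings and the RESULTS_DIR
# placeholder are illustrative assumptions, not values taken from this codebase.
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy

prepare_tab, X_train, X_test, y_train, y_test = prepare_data(RESULTS_DIR)

deeptabular = TabMlp(
    mlp_hidden_dims=[64, 32],
    column_idx=prepare_tab.column_idx,
    embed_input=prepare_tab.embeddings_input,
)
model = WideDeep(deeptabular=deeptabular)
trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(X_tab=X_train, target=y_train, n_epochs=5, batch_size=256)
preds = trainer.predict(X_tab=X_test)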
def test_tab_preprocessor_trasformer_raise_error(embed_cols, continuous_cols, scale):
    with pytest.raises(ValueError):
        tab_preprocessor = TabPreprocessor(  # noqa: F841
            embed_cols=embed_cols,
            continuous_cols=continuous_cols,
            scale=scale,
            for_tabtransformer=True,
        )
def test_prepare_deep_without_embedding_columns():
    errors = []
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ["col1", "col2"]
    preprocessor3 = TabPreprocessor(continuous_cols=["col1", "col2"])

    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception:
        errors.append("Fundamental Error")

    out_booleans = []

    # after scaling, every column should have mean ~0 and std ~1
    means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
    for mean, std in zip(means, stds):
        out_booleans.append(np.isclose(mean, 0.0))
        out_booleans.append(np.isclose(std, 1.0))

    if not np.all(out_booleans):
        errors.append("There is something going on with the scaler")

    assert not errors, "errors occurred:\n{}".format("\n".join(errors))
def test_tab_preprocessor_trasformer(embed_cols, continuous_cols, scale):
    tab_preprocessor = TabPreprocessor(
        embed_cols=embed_cols,
        continuous_cols=continuous_cols,
        scale=scale,
        for_tabtransformer=True,
        verbose=False,
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)

    try:
        if isinstance(embed_cols[0], tuple):
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    except Exception:
        emb_df = pd.DataFrame()

    try:
        cont_df = df[continuous_cols]
    except Exception:
        cont_df = pd.DataFrame()

    # the decoded frame should round-trip back to the original columns
    org_df = pd.concat([emb_df, cont_df], axis=1)
    decoded = decoded.astype(org_df.dtypes.to_dict())

    assert decoded.equals(org_df)
train = pd.read_pickle(PROCESSED_DATA_DIR / "fb_comments_train.p")
valid = pd.read_pickle(PROCESSED_DATA_DIR / "fb_comments_val.p")

# clip the top 1% of the target to limit the effect of extreme outliers
upper_limit = train.target.quantile(0.99)
train = train[train.target <= upper_limit]
valid = valid[valid.target <= upper_limit]

# treat object columns and low-cardinality columns (fewer than 200 unique
# values) as categorical; everything else, except the target, is continuous
cat_embed_cols = []
for col in train.columns:
    if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
        cat_embed_cols.append(col)
num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

prepare_tab = TabPreprocessor(
    embed_cols=cat_embed_cols, continuous_cols=num_cols, scale=args.scale_cont
)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_valid = prepare_tab.transform(valid)
y_valid = valid.target.values

if args.blocks_dims == "same":
    n_inp_dim = sum([e[2] for e in prepare_tab.embeddings_input])
    blocks_dims = [n_inp_dim, n_inp_dim, n_inp_dim]
else:
    blocks_dims = eval(args.blocks_dims)

if args.mlp_hidden_dims == "auto":
    n_inp_dim = blocks_dims[-1]
    mlp_hidden_dims = [4 * n_inp_dim, 2 * n_inp_dim]
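# A hedged sketch of where blocks_dims and mlp_hidden_dims plausibly end up: a
# TabResnet used as the deeptabular component. The objective and fit arguments
# below are assumptions for illustration (fb_comments is a regression target),
# not code recovered from this script.
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import TabResnet, WideDeep

deeptabular = TabResnet(
    column_idx=prepare_tab.column_idx,
    embed_input=prepare_tab.embeddings_input,
    continuous_cols=num_cols,
    blocks_dims=blocks_dims,
    mlp_hidden_dims=mlp_hidden_dims,
)
model = WideDeep(deeptabular=deeptabular)
trainer = Trainer(model, objective="regression")
trainer.fit(
    X_train={"X_tab": X_train, "target": y_train},
    X_val={"X_tab": X_valid, "target": y_valid},
    n_epochs=args.n_epochs,  # assumed CLI arguments
    batch_size=args.batch_size,
)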
already_standard = ["latitude", "longitude"] text_col = "description" word_vectors_path = "data/glove.6B/glove.6B.100d.txt" img_col = "id" img_path = "data/airbnb/property_picture" target = "yield" target = df[target].values wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = wide_preprocessor.fit_transform(df) tab_preprocessor = TabPreprocessor( embed_cols=cat_embed_cols, # type: ignore[arg-type] continuous_cols=continuous_cols, already_standard=already_standard, ) X_tab = tab_preprocessor.fit_transform(df) text_processor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col) X_text = text_processor.fit_transform(df) image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path) X_images = image_processor.fit_transform(df) wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1) deepdense = TabMlp( mlp_hidden_dims=[64, 32], mlp_dropout=[0.2, 0.2],
"latitude", "longitude", "security_deposit", "extra_people" ] already_standard = ["latitude", "longitude"] df["yield_cat"] = pd.cut(df["yield"], bins=[0.2, 65, 163, 600], labels=[0, 1, 2]) df.drop("yield", axis=1, inplace=True) target = "yield_cat" target = np.array(df[target].values) wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = wide_preprocessor.fit_transform(df) tab_preprocessor = TabPreprocessor( embed_cols=cat_embed_cols, continuous_cols=continuous_cols # type: ignore[arg-type] ) X_deep = tab_preprocessor.fit_transform(df) wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=3) deepdense = TabMlp( mlp_hidden_dims=[64, 32], mlp_dropout=[0.2, 0.2], column_idx=tab_preprocessor.column_idx, embed_input=tab_preprocessor.embeddings_input, continuous_cols=continuous_cols, ) model = WideDeep(wide=wide, deeptabular=deepdense, pred_dim=3) optimizer = torch.optim.Adam(model.parameters(), lr=0.03) trainer = wd.Trainer(model,
RESULTS_DIR = WORKDIR / "/".join(["results", args.bankm_dset, "tabresnet"])
if not RESULTS_DIR.is_dir():
    os.makedirs(RESULTS_DIR)

train = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_train.p")
valid = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_val.p")
colnames = [c.replace(".", "_") for c in train.columns]
train.columns = colnames
valid.columns = colnames

# All columns will be treated as categorical. The column with the highest
# number of categories has 308
cat_embed_cols = [c for c in train.columns if c != "target"]

# all columns will be represented by embeddings
prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_valid = prepare_tab.transform(valid)
y_valid = valid.target.values

if args.blocks_dims == "same":
    n_inp_dim = sum([e[2] for e in prepare_tab.embeddings_input])
    blocks_dims = [n_inp_dim, n_inp_dim, n_inp_dim]
else:
    blocks_dims = eval(args.blocks_dims)

if args.mlp_hidden_dims == "auto":
    n_inp_dim = blocks_dims[-1]
    mlp_hidden_dims = [4 * n_inp_dim, 2 * n_inp_dim]
else:
("native_country", "occupation")] cat_embed_cols = [ ("education", 10), ("relationship", 8), ("workclass", 10), ("occupation", 10), ("native_country", 10), ] continuous_cols = ["age", "hours_per_week"] target = "income_label" target = df[target].values prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = prepare_wide.fit_transform(df) prepare_deep = TabPreprocessor( embed_cols=cat_embed_cols, continuous_cols=continuous_cols # type: ignore[arg-type] ) X_tab = prepare_deep.fit_transform(df) wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1) deeptabular = TabMlp( mlp_hidden_dims=[200, 100], mlp_dropout=[0.2, 0.2], column_idx=prepare_deep.column_idx, embed_input=prepare_deep.embeddings_input, continuous_cols=continuous_cols, ) # # To use TabResnet as the deeptabular component simply: # deeptabular = TabResnet(
valid = pd.read_pickle(PROCESSED_DATA_DIR / "adult_val.p")
test = pd.read_pickle(PROCESSED_DATA_DIR / "adult_test.p")
for df in [train, valid, test]:
    df.drop("education_num", axis=1, inplace=True)
train = pd.concat([train, valid], ignore_index=True)

# 200 is rather arbitrary, but one has to decide somehow whether a column will
# be represented as embeddings or as continuous in a "kind-of" automated way
cat_embed_cols = []
for col in train.columns:
    if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
        cat_embed_cols.append(col)

# all columns will be represented by embeddings
prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_test = prepare_tab.transform(test)
y_test = test.target.values

args = read_best_model_args(RESULTS_DIR)

if args.mlp_hidden_dims == "same":
    mlp_hidden_dims = [
        len(cat_embed_cols) * args.input_dim,
        len(cat_embed_cols) * args.input_dim,
        (len(cat_embed_cols) * args.input_dim) // 2,
    ]
elif args.mlp_hidden_dims == "None":
    mlp_hidden_dims = None
    num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

    wide_cols = []
    for col in train.columns:
        if train[col].nunique() < 40 and col != "target":
            wide_cols.append(col)

    prepare_wide = WidePreprocessor(wide_cols)
    X_wide_train = prepare_wide.fit_transform(train)
    X_wide_valid = prepare_wide.transform(valid)

    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        for_tabtransformer=True,
        scale=False,
    )
    X_tab_train = prepare_tab.fit_transform(train)
    X_tab_valid = prepare_tab.transform(valid)
    y_train = train.target.values
    y_valid = valid.target.values

    wide = Wide(wide_dim=np.unique(X_wide_train).shape[0])

    X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train}
    X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid}
else:
    cat_embed_cols = []
if args.with_wide:
    cat_embed_cols = []
    for col in train.columns:
        if 5 < train[col].nunique() < 200 and col != "target":
            cat_embed_cols.append(col)

    wide_cols = []
    for col in train.columns:
        if train[col].nunique() < 40 and col != "target":
            wide_cols.append(col)

    prepare_wide = WidePreprocessor(wide_cols)
    X_wide_train = prepare_wide.fit_transform(train)
    X_wide_valid = prepare_wide.transform(valid)

    prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True)
    X_tab_train = prepare_tab.fit_transform(train)
    X_tab_valid = prepare_tab.transform(valid)
    y_train = train.target.values
    y_valid = valid.target.values

    wide = Wide(wide_dim=np.unique(X_wide_train).shape[0])

    X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train}
    X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid}
else:
    cat_embed_cols = []
    for col in train.columns:
        if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
target_column_list = ["income_label"]
target = train_df[target_column_list].values

"""
Preprocessing
"""
# Wide
wide_preprocessor = WidePreprocessor(
    wide_cols=wide_columns_list, crossed_cols=wide_cross_column_list
)
x_wide = wide_preprocessor.fit_transform(train_df)

# Deep
tab_preprocessor = TabPreprocessor(
    embed_cols=deep_embedding_columns_list,
    continuous_cols=deep_continuous_column_list,
)
x_deep = tab_preprocessor.fit_transform(train_df)

"""
Model definition
"""
# Model
wide = Wide(wide_dim=np.unique(x_wide).shape[0], pred_dim=1)
deeptabular = TabMlp(
    mlp_hidden_dims=[64, 32],
    column_idx=tab_preprocessor.column_idx,
    embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=deep_continuous_column_list,
)
model = WideDeep(wide=wide, deeptabular=deeptabular)

"""
Training
"""
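# The original training code is not shown here; what follows is a hedged
# sketch of a typical pytorch-widedeep training call for this binary model.
# The epoch count, batch size and validation split are illustrative assumptions.
from pytorch_widedeep import Trainer
from pytorch_widedeep.metrics import Accuracy

trainer = Trainer(model, objective="binary", metrics=[Accuracy])
trainer.fit(
    X_wide=x_wide,
    X_tab=x_deep,
    target=target.ravel(),  # target above is (n, 1); flatten to a 1-d array
    n_epochs=5,
    batch_size=256,
    val_split=0.1,
)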
def test_notfittederror():
    processor = TabPreprocessor(
        embed_cols=["col1", "col2"], continuous_cols=["col3", "col4"]
    )
    with pytest.raises(NotFittedError):
        processor.transform(df)
"input_df, encoder, output_df", [(df_letters, le_letters, df_letters_le), (df_numbers, le_numbers, df_numbers_le)], ) def test_label_encoder(input_df, encoder, output_df): original_df = encoder.inverse_transform(output_df) assert original_df.equals(input_df) ################################################################################ # Test the TabPreprocessor: only categorical columns to be represented with # embeddings ############################################################################### cat_embed_cols = [("col1", 5), ("col2", 5)] preprocessor1 = TabPreprocessor(cat_embed_cols) # type: ignore[arg-type] X_letters = preprocessor1.fit_transform(df_letters) preprocessor2 = TabPreprocessor(cat_embed_cols) # type: ignore[arg-type] X_numbers = preprocessor2.fit_transform(df_numbers) error_list = [] @pytest.mark.parametrize( "input_df, X_deep, preprocessor", [(df_letters, X_letters, preprocessor1), (df_numbers, X_numbers, preprocessor2)], ) def test_prepare_deep_without_continous_columns(input_df, X_deep, preprocessor):