def prepare_data(results_dir):
    train, test = load_dataset()

    # embed a column if it is non-numeric or has fewer than 200 unique values
    cat_embed_cols = []
    for col in train.columns:
        if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
            cat_embed_cols.append(col)
    num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

    args = read_best_model_args(results_dir)

    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        scale=True,
        for_tabtransformer=True,
    )
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    # len(cat_embed_cols) is later multiplied by the embedding dim to build
    # the "same" mlp_hidden_dims option
    mlp_hidden_dims_same = len(cat_embed_cols)

    return mlp_hidden_dims_same, args, prepare_tab, X_train, X_test, y_train, y_test
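# Hedged usage sketch: RESULTS_DIR and load_dataset are assumed to be defined
# elsewhere in the script; the "same" expansion below mirrors the logic used
# with args.input_dim further down.
(
    mlp_hidden_dims_same,
    args,
    prepare_tab,
    X_train,
    X_test,
    y_train,
    y_test,
) = prepare_data(RESULTS_DIR)
if args.mlp_hidden_dims == "same":
    mlp_hidden_dims = [
        mlp_hidden_dims_same * args.input_dim,
        mlp_hidden_dims_same * args.input_dim,
        (mlp_hidden_dims_same * args.input_dim) // 2,
    ]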
def prepare_data(results_dir):
    train, test = load_dataset()

    # All columns will be treated as categorical. The column with the highest
    # number of categories has 308
    cat_embed_cols = [c for c in train.columns if c != "target"]

    prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols)
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    return prepare_tab, X_train, X_test, y_train, y_test
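# Quick sanity check (a sketch; RESULTS_DIR is an assumption here):
# embeddings_input holds one (column_name, n_categories, embedding_dim)
# tuple per embedded column, as used elsewhere in these scripts.
prepare_tab, X_train, X_test, y_train, y_test = prepare_data(RESULTS_DIR)
for col, n_cat, embed_dim in prepare_tab.embeddings_input:
    print(f"{col}: {n_cat} categories -> {embed_dim}-dim embeddings")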
train = train[train.target <= upper_limit]
valid = valid[valid.target <= upper_limit]

cat_embed_cols = []
for col in train.columns:
    if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
        cat_embed_cols.append(col)
num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

prepare_tab = TabPreprocessor(
    embed_cols=cat_embed_cols, continuous_cols=num_cols, scale=args.scale_cont
)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_valid = prepare_tab.transform(valid)
y_valid = valid.target.values

# "same" uses the summed embedding dims for every resnet block
if args.blocks_dims == "same":
    n_inp_dim = sum([e[2] for e in prepare_tab.embeddings_input])
    blocks_dims = [n_inp_dim, n_inp_dim, n_inp_dim]
else:
    blocks_dims = eval(args.blocks_dims)

if args.mlp_hidden_dims == "auto":
    n_inp_dim = blocks_dims[-1]
    mlp_hidden_dims = [4 * n_inp_dim, 2 * n_inp_dim]
else:
    mlp_hidden_dims = eval(args.mlp_hidden_dims)

deeptabular = TabResnet(
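    # Hedged completion of the truncated call above: the original kwargs are
    # not shown in this fragment; column_idx/embeddings_input mirror the
    # TabNet construction further down, and blocks_dims/mlp_hidden_dims are
    # assumed to receive the two lists computed above.
    column_idx=prepare_tab.column_idx,
    embed_input=prepare_tab.embeddings_input,
    continuous_cols=num_cols,
    blocks_dims=blocks_dims,
    mlp_hidden_dims=mlp_hidden_dims,
)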
df.drop("education_num", axis=1, inplace=True) train = pd.concat([train, valid], ignore_index=True) # 200 is rather arbitraty but one has to make a decision as to how to decide # if something will be represented as embeddings or continuous in a "kind-of" # automated way cat_embed_cols = [] for col in train.columns: if train[col].dtype == "O" or train[col].nunique() < 200 and col != "target": cat_embed_cols.append(col) # all columns will be represented by embeddings prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True) X_train = prepare_tab.fit_transform(train) y_train = train.target.values X_test = prepare_tab.transform(test) y_test = test.target.values args = read_best_model_args(RESULTS_DIR) if args.mlp_hidden_dims == "same": mlp_hidden_dims = [ len(cat_embed_cols) * args.input_dim, len(cat_embed_cols) * args.input_dim, (len(cat_embed_cols) * args.input_dim) // 2, ] elif args.mlp_hidden_dims == "None": mlp_hidden_dims = None else: mlp_hidden_dims = eval(args.mlp_hidden_dims)
train = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_train.p")
valid = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_val.p")

# replace the dots in the column names with underscores
colnames = [c.replace(".", "_") for c in train.columns]
train.columns = colnames
valid.columns = colnames

# All columns will be treated as categorical. The column with the highest
# number of categories has 308
cat_embed_cols = [c for c in train.columns if c != "target"]

# all columns will be represented by embeddings
prepare_deep = TabPreprocessor(embed_cols=cat_embed_cols)
X_train = prepare_deep.fit_transform(train)
y_train = train.target.values
X_valid = prepare_deep.transform(valid)
y_valid = valid.target.values

deeptabular = TabNet(
    column_idx=prepare_deep.column_idx,
    embed_input=prepare_deep.embeddings_input,
    embed_dropout=args.embed_dropout,
    n_steps=args.n_steps,
    step_dim=args.step_dim,
    attn_dim=args.attn_dim,
    dropout=args.dropout,
    n_glu_step_dependent=args.n_glu_step_dependent,
    n_glu_shared=args.n_glu_shared,
    ghost_bn=args.ghost_bn,
    virtual_batch_size=args.virtual_batch_size,
    momentum=args.momentum,
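    # Hedged completion: any further kwargs the original script passes from
    # args are omitted here; WideDeep is assumed to be imported from
    # pytorch_widedeep.models.
)
model = WideDeep(deeptabular=deeptabular)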
def test_notfittederror():
    processor = TabPreprocessor(
        embed_cols=["col1", "col2"], continuous_cols=["col3", "col4"]
    )
    with pytest.raises(NotFittedError):
        processor.transform(df)
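# Self-contained sketch of the fixture plus a happy-path companion test
# (the original module defines df and the imports elsewhere; this toy frame
# and the shape assertion are assumptions):
import pandas as pd

df = pd.DataFrame(
    {
        "col1": ["a", "b", "a"],
        "col2": ["x", "y", "y"],
        "col3": [1.0, 2.0, 3.0],
        "col4": [4.0, 5.0, 6.0],
    }
)


def test_transform_after_fit():
    processor = TabPreprocessor(
        embed_cols=["col1", "col2"], continuous_cols=["col3", "col4"]
    )
    # fit_transform returns a numpy array with one column per input column
    X = processor.fit_transform(df)
    assert X.shape == (len(df), 4)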