Example #1
def prepare_data(results_dir):

    train, test = load_dataset()

    cat_embed_cols = []
    for col in train.columns:
        if (train[col].dtype == "O"
                or train[col].nunique() < 200) and col != "target":
            cat_embed_cols.append(col)
    num_cols = [
        c for c in train.columns if c not in cat_embed_cols + ["target"]
    ]

    args = read_best_model_args(results_dir)
    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        scale=True,
        for_tabtransformer=True,
    )
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    mlp_hidden_dims_same = len(cat_embed_cols)

    return mlp_hidden_dims_same, args, prepare_tab, X_train, X_test, y_train, y_test
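
# A minimal sketch of the column-split heuristic used above, in isolation.
# The toy DataFrame and the cardinality threshold of 3 are illustrative only.
import pandas as pd

toy = pd.DataFrame(
    {
        "city": ["ny", "sf", "ny", "la"],          # object dtype -> embedding
        "n_rooms": [1, 2, 2, 1],                   # low cardinality -> embedding
        "price": [950.0, 2100.0, 1800.0, 1500.0],  # high cardinality -> continuous
        "target": [0, 1, 1, 0],
    }
)

cat_embed_cols = [
    c
    for c in toy.columns
    if (toy[c].dtype == "O" or toy[c].nunique() < 3) and c != "target"
]
num_cols = [c for c in toy.columns if c not in cat_embed_cols + ["target"]]
print(cat_embed_cols, num_cols)  # ['city', 'n_rooms'] ['price']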
Example #2

def prepare_data(results_dir):

    train, test = load_dataset()

    # All columns will be treated as categorical. The column with the highest
    # number of categories has 308
    cat_embed_cols = [c for c in train.columns if c != "target"]

    prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols)
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    return prepare_tab, X_train, X_test, y_train, y_test
Example #3

def test_tab_preprocessor_trasformer_raise_error(embed_cols, continuous_cols,
                                                 scale):
    with pytest.raises(ValueError):
        tab_preprocessor = TabPreprocessor(  # noqa: F841
            embed_cols=embed_cols,
            continuous_cols=continuous_cols,
            scale=scale,
            for_tabtransformer=True,
        )
Example #4

def test_prepare_deep_without_embedding_columns():

    errors = []
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ["col1", "col2"]
    preprocessor3 = TabPreprocessor(continuous_cols=["col1", "col2"])

    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception:
        errors.append("Fundamental Error")

    out_booleans = []

    means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
    for mean, std in zip(means, stds):
        out_booleans.append(np.isclose(mean, 0.0))
        out_booleans.append(np.isclose(std, 1.0))

    if not np.all(out_booleans):
        errors.append("There is something going on with the scaler")

    assert not errors, "errors occurred:\n{}".format("\n".join(errors))
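
# A minimal sketch of the property asserted above: standardized columns have
# mean ~0 and std ~1. StandardScaler is used directly here for illustration;
# TabPreprocessor applies the same kind of scaling to continuous columns.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.choice(np.arange(100), (100, 2)).astype(float)
X_scaled = StandardScaler().fit_transform(X)

assert np.allclose(X_scaled.mean(axis=0), 0.0)
assert np.allclose(X_scaled.std(axis=0), 1.0)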
Example #5

def test_tab_preprocessor_trasformer(embed_cols, continuous_cols, scale):
    tab_preprocessor = TabPreprocessor(
        embed_cols=embed_cols,
        continuous_cols=continuous_cols,
        scale=scale,
        for_tabtransformer=True,
        verbose=False,
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)
    try:
        if isinstance(embed_cols[0], tuple):
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    except Exception:
        emb_df = pd.DataFrame()
    try:
        cont_df = df[continuous_cols]
    except Exception:
        cont_df = pd.DataFrame()
    org_df = pd.concat([emb_df, cont_df], axis=1)
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)
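
# A minimal sketch of the round-trip property asserted above (encode, then
# inverse_transform recovers the originals), shown with scikit-learn's
# LabelEncoder purely for illustration.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

s = pd.Series(["a", "b", "a", "c"])
le = LabelEncoder()
decoded = pd.Series(le.inverse_transform(le.fit_transform(s)))
assert decoded.equals(s)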
Example #6

train = pd.read_pickle(PROCESSED_DATA_DIR / "fb_comments_train.p")
valid = pd.read_pickle(PROCESSED_DATA_DIR / "fb_comments_val.p")

upper_limit = train.target.quantile(0.99)

train = train[train.target <= upper_limit]
valid = valid[valid.target <= upper_limit]

cat_embed_cols = []
for col in train.columns:
    if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
        cat_embed_cols.append(col)
num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

prepare_tab = TabPreprocessor(
    embed_cols=cat_embed_cols, continuous_cols=num_cols, scale=args.scale_cont
)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_valid = prepare_tab.transform(valid)
y_valid = valid.target.values

if args.blocks_dims == "same":
    n_inp_dim = sum([e[2] for e in prepare_tab.embeddings_input])
    blocks_dims = [n_inp_dim, n_inp_dim, n_inp_dim]
else:
    blocks_dims = eval(args.blocks_dims)

if args.mlp_hidden_dims == "auto":
    n_inp_dim = blocks_dims[-1]
    mlp_hidden_dims = [4 * n_inp_dim, 2 * n_inp_dim]
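
# A minimal sketch of the sizing logic above with hypothetical values;
# prepare_tab.embeddings_input holds (column, n_categories, embedding_dim)
# tuples, so e[2] is the embedding dimension.
embeddings_input = [("col_a", 12, 8), ("col_b", 308, 16)]      # hypothetical

n_inp_dim = sum(e[2] for e in embeddings_input)                # 24
blocks_dims = [n_inp_dim] * 3                                  # [24, 24, 24]
mlp_hidden_dims = [4 * blocks_dims[-1], 2 * blocks_dims[-1]]   # [96, 48]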
Example #7
    already_standard = ["latitude", "longitude"]
    text_col = "description"
    word_vectors_path = "data/glove.6B/glove.6B.100d.txt"
    img_col = "id"
    img_path = "data/airbnb/property_picture"
    target = "yield"

    target = df[target].values

    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols,
                                         crossed_cols=crossed_cols)
    X_wide = wide_preprocessor.fit_transform(df)

    tab_preprocessor = TabPreprocessor(
        embed_cols=cat_embed_cols,  # type: ignore[arg-type]
        continuous_cols=continuous_cols,
        already_standard=already_standard,
    )
    X_tab = tab_preprocessor.fit_transform(df)

    text_processor = TextPreprocessor(word_vectors_path=word_vectors_path,
                                      text_col=text_col)
    X_text = text_processor.fit_transform(df)

    image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
    X_images = image_processor.fit_transform(df)

    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
    deepdense = TabMlp(
        mlp_hidden_dims=[64, 32],
        mlp_dropout=[0.2, 0.2],
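
# The snippet is cut off above. As a sketch of the typical continuation, the
# four preprocessed blocks are assembled into a WideDeep model and passed to a
# Trainer. The deeptext/deepimage components are assumed to exist, and the
# objective and fit parameters are illustrative, not from the original example.
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import WideDeep

model = WideDeep(
    wide=wide, deeptabular=deepdense, deeptext=deeptext, deepimage=deepimage
)
trainer = Trainer(model, objective="regression")
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    X_text=X_text,
    X_img=X_images,
    target=target,
    n_epochs=1,
    batch_size=32,
)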
Example #8
        "latitude", "longitude", "security_deposit", "extra_people"
    ]
    already_standard = ["latitude", "longitude"]
    df["yield_cat"] = pd.cut(df["yield"],
                             bins=[0.2, 65, 163, 600],
                             labels=[0, 1, 2])
    df.drop("yield", axis=1, inplace=True)
    target = "yield_cat"

    target = np.array(df[target].values)
    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols,
                                         crossed_cols=crossed_cols)
    X_wide = wide_preprocessor.fit_transform(df)

    tab_preprocessor = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=continuous_cols  # type: ignore[arg-type]
    )
    X_deep = tab_preprocessor.fit_transform(df)

    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=3)
    deepdense = TabMlp(
        mlp_hidden_dims=[64, 32],
        mlp_dropout=[0.2, 0.2],
        column_idx=tab_preprocessor.column_idx,
        embed_input=tab_preprocessor.embeddings_input,
        continuous_cols=continuous_cols,
    )
    model = WideDeep(wide=wide, deeptabular=deepdense, pred_dim=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.03)

    trainer = wd.Trainer(model,
Example #9
RESULTS_DIR = WORKDIR / "/".join(["results", args.bankm_dset, "tabresnet"])
if not RESULTS_DIR.is_dir():
    os.makedirs(RESULTS_DIR)

train = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_train.p")
valid = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_val.p")
colnames = [c.replace(".", "_") for c in train.columns]
train.columns = colnames
valid.columns = colnames

# All columns will be treated as categorical. The column with the highest
# number of categories has 308
cat_embed_cols = [c for c in train.columns if c != "target"]

# all columns will be represented by embeddings
prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_valid = prepare_tab.transform(valid)
y_valid = valid.target.values

if args.blocks_dims == "same":
    n_inp_dim = sum([e[2] for e in prepare_tab.embeddings_input])
    blocks_dims = [n_inp_dim, n_inp_dim, n_inp_dim]
else:
    blocks_dims = eval(args.blocks_dims)

if args.mlp_hidden_dims == "auto":
    n_inp_dim = blocks_dims[-1]
    mlp_hidden_dims = [4 * n_inp_dim, 2 * n_inp_dim]
else:
Example #10
                    ("native_country", "occupation")]
    cat_embed_cols = [
        ("education", 10),
        ("relationship", 8),
        ("workclass", 10),
        ("occupation", 10),
        ("native_country", 10),
    ]
    continuous_cols = ["age", "hours_per_week"]
    target = "income_label"
    target = df[target].values
    prepare_wide = WidePreprocessor(wide_cols=wide_cols,
                                    crossed_cols=crossed_cols)
    X_wide = prepare_wide.fit_transform(df)
    prepare_deep = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=continuous_cols  # type: ignore[arg-type]
    )
    X_tab = prepare_deep.fit_transform(df)

    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

    deeptabular = TabMlp(
        mlp_hidden_dims=[200, 100],
        mlp_dropout=[0.2, 0.2],
        column_idx=prepare_deep.column_idx,
        embed_input=prepare_deep.embeddings_input,
        continuous_cols=continuous_cols,
    )

    # # To use TabResnet as the deeptabular component simply:
    # deeptabular = TabResnet(
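
# A sketch of the TabResnet variant the truncated comment above hints at;
# blocks_dims is illustrative and the parameter names are assumed to match
# the pytorch-widedeep version used in this example.
from pytorch_widedeep.models import TabResnet

deeptabular = TabResnet(
    blocks_dims=[200, 100, 100],  # illustrative
    column_idx=prepare_deep.column_idx,
    embed_input=prepare_deep.embeddings_input,
    continuous_cols=continuous_cols,
)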
Example #11
valid = pd.read_pickle(PROCESSED_DATA_DIR / "adult_val.p")
test = pd.read_pickle(PROCESSED_DATA_DIR / "adult_test.p")
for df in [train, valid, test]:
    df.drop("education_num", axis=1, inplace=True)
train = pd.concat([train, valid], ignore_index=True)

# The threshold of 200 is rather arbitrary, but one has to decide somehow
# whether a column is represented as embeddings or as continuous in a
# "kind-of" automated way
cat_embed_cols = []
for col in train.columns:
    if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
        cat_embed_cols.append(col)

# all columns will be represented by embeddings
prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_test = prepare_tab.transform(test)
y_test = test.target.values

args = read_best_model_args(RESULTS_DIR)

if args.mlp_hidden_dims == "same":
    mlp_hidden_dims = [
        len(cat_embed_cols) * args.input_dim,
        len(cat_embed_cols) * args.input_dim,
        (len(cat_embed_cols) * args.input_dim) // 2,
    ]
elif args.mlp_hidden_dims == "None":
    mlp_hidden_dims = None
Example #12
    num_cols = [
        c for c in train.columns if c not in cat_embed_cols + ["target"]
    ]

    wide_cols = []
    for col in train.columns:
        if train[col].nunique() < 40 and col != "target":
            wide_cols.append(col)

    prepare_wide = WidePreprocessor(wide_cols)
    X_wide_train = prepare_wide.fit_transform(train)
    X_wide_valid = prepare_wide.transform(valid)

    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        for_tabtransformer=True,
        scale=False,
    )
    X_tab_train = prepare_tab.fit_transform(train)
    X_tab_valid = prepare_tab.transform(valid)

    y_train = train.target.values
    y_valid = valid.target.values

    wide = Wide(wide_dim=np.unique(X_wide_train).shape[0])

    X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train}
    X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid}

else:
    cat_embed_cols = []
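
# The snippet is truncated here. For context, dictionaries shaped like the
# X_train/X_val built above (keys "X_wide", "X_tab", "target") are what
# Trainer.fit consumes; a sketch, assuming a trainer built as in the other
# examples, with illustrative fit parameters.
trainer.fit(X_train=X_train, X_val=X_val, n_epochs=1, batch_size=128)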
Example #13
if args.with_wide:
    cat_embed_cols = []
    for col in train.columns:
        if 5 < train[col].nunique() < 200 and col != "target":
            cat_embed_cols.append(col)

    wide_cols = []
    for col in train.columns:
        if train[col].nunique() < 40 and col != "target":
            wide_cols.append(col)

    prepare_wide = WidePreprocessor(wide_cols)
    X_wide_train = prepare_wide.fit_transform(train)
    X_wide_valid = prepare_wide.transform(valid)

    prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True)
    X_tab_train = prepare_tab.fit_transform(train)
    X_tab_valid = prepare_tab.transform(valid)

    y_train = train.target.values
    y_valid = valid.target.values

    wide = Wide(wide_dim=np.unique(X_wide_train).shape[0])

    X_train = {"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train}
    X_val = {"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid}

else:
    cat_embed_cols = []
    for col in train.columns:
        if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
Example #14
    target_column_list = ["income_label"]

    target = train_df[target_column_list].values
    """
    Preprocessing
    """

    # Wide
    wide_preprocessor = WidePreprocessor(wide_cols=wide_columns_list,
                                         crossed_cols=wide_cross_column_list)

    x_wide = wide_preprocessor.fit_transform(train_df)

    # Deep
    tab_preprocessor = TabPreprocessor(
        embed_cols=deep_embedding_columns_list,
        continuous_cols=deep_continuous_column_list)
    x_deep = tab_preprocessor.fit_transform(train_df)
    """
    Model 구조 정의
    """
    # Model
    wide = Wide(wide_dim=np.unique(x_wide).shape[0], pred_dim=1)
    deeptabular = TabMlp(mlp_hidden_dims=[64, 32],
                         column_idx=tab_preprocessor.column_idx,
                         embed_input=tab_preprocessor.embeddings_input,
                         continuous_cols=deep_continuous_column_list)
    model = WideDeep(wide=wide, deeptabular=deeptabular)
    """
    학습
    """
def test_notfittederror():
    processor = TabPreprocessor(embed_cols=["col1", "col2"],
                                continuous_cols=["col3", "col4"])
    with pytest.raises(NotFittedError):
        processor.transform(df)


@pytest.mark.parametrize(
    "input_df, encoder, output_df",
    [(df_letters, le_letters, df_letters_le),
     (df_numbers, le_numbers, df_numbers_le)],
)
def test_label_encoder(input_df, encoder, output_df):
    original_df = encoder.inverse_transform(output_df)
    assert original_df.equals(input_df)


################################################################################
# Test the TabPreprocessor: only categorical columns to be represented with
# embeddings
################################################################################
cat_embed_cols = [("col1", 5), ("col2", 5)]

preprocessor1 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_letters = preprocessor1.fit_transform(df_letters)

preprocessor2 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_numbers = preprocessor2.fit_transform(df_numbers)

error_list = []


@pytest.mark.parametrize(
    "input_df, X_deep, preprocessor",
    [(df_letters, X_letters, preprocessor1),
     (df_numbers, X_numbers, preprocessor2)],
)
def test_prepare_deep_without_continous_columns(input_df, X_deep,
                                                preprocessor):