# Rank the string columns by cardinality relative to the dataset size.
# Reuse the precomputed `nunique` Series (it is concatenated two lines
# below) instead of calling X[obj_cols].nunique() a second time.
# NOTE(review): assumes `nunique = X[obj_cols].nunique()` was computed
# in an earlier cell — confirm.
prop_unique = (nunique / len(df)).sort_values(
    ascending=False
)  # in order of most unique to least
unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = [
    "proportion",
    "nunique",
]
print(unique)

# Similarity-encode the "Subtitle" text column into 4 prototype-similarity
# numeric features, dropping the raw text afterwards.
ENCODE = True
if ENCODE:
    X = similarity_encode(
        X,
        encode_columns=[
            "Subtitle",
        ],
        n_prototypes=4,
        preran=False,
        drop_original=True,
    )

# Replace raw string columns with their character length
# (a cheap numeric proxy for free-text fields such as URLs).
LENGTH_ENCODE = True
if LENGTH_ENCODE:
    len_encode = ["URL"]
    for col in len_encode:
        X[f"{col}_len"] = X[col].apply(len)
        X = X.drop(col, axis=1)

# Optionally convert the remaining object columns to pandas categoricals
# (disabled in this run).
CATEGORIZE = False
if CATEGORIZE:
    X[obj_cols] = X[obj_cols].astype("category")
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd

from helpers import similarity_encode

# Load the forest-fires dataset. Build the path with Path (already
# imported but previously unused) so it resolves on any OS — the raw
# Windows string r"data\forestfires.csv" breaks on POSIX.
df = pd.read_csv(
    Path("data") / "forestfires.csv",
    parse_dates=[],
    index_col=[],
)

# Categorize the object columns without similarity-encoding anything yet.
X, y = similarity_encode(df, encode=False, categorize=True, preran=False)
# Drop "rain" from the features — presumably uninformative; confirm.
X = X.drop("rain", axis=1)

d = lgb.Dataset(X, y, silent=True)

# rmse: 98.18188205858038
NUM_BOOST_ROUND = 455
# Tuned LightGBM hyperparameters.
# NOTE(review): the key set resembles Optuna LightGBMTuner output — unconfirmed.
params = {
    "objective": "rmse",
    "metric": "rmse",
    "verbose": -1,
    "n_jobs": 6,
    "learning_rate": 0.004090619790710353,
    "feature_pre_filter": False,
    "lambda_l1": 6.99239231800302e-08,
    "lambda_l2": 9.330959145992983,
    "num_leaves": 9,
    "feature_fraction": 0.8999999999999999,
    "bagging_fraction": 1.0,
    "bagging_freq": 0,
    "min_child_samples": 20,
    # NOTE(review): this dict appears truncated at the chunk boundary;
    # any remaining keys are not visible here. Closing brace added to
    # keep the block syntactically valid.
}
# Cardinality summary of the string columns: absolute unique counts and
# their share of all rows, highest share first. Reuse the `nunique`
# Series computed just above rather than recomputing
# X[obj_cols].nunique() a second time.
obj_cols = X.select_dtypes("object").columns
nunique = X[obj_cols].nunique()
prop_unique = (nunique / len(df)).sort_values(
    ascending=False
)  # in order of most unique to least
unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = [
    "proportion",
    "nunique",
]
unique  # notebook-style cell output

# NOTE(review): ENCODE is not defined in this chunk — presumably set in
# an earlier cell; confirm before running standalone.
if ENCODE:
    X = similarity_encode(
        X,
        encode_columns=[],
        n_prototypes=5,
        train=True,
        drop_original=False,
    )
X[obj_cols] = X[obj_cols].astype("category")

# Visualize the target distribution.
sns.kdeplot(y)
plt.title("KDE distribution")
plt.show()

SEED = 0
SAMPLE_SIZE = 10000
Xt, Xv, yt, yv = train_test_split(
    X, y, random_state=SEED
)  # split into train and validation set
# Cardinality summary of the string columns: absolute unique counts and
# their share of all rows, highest share first. Reuse the `nunique`
# Series computed just above rather than recomputing
# X[obj_cols].nunique() a second time.
obj_cols = X.select_dtypes("object").columns
nunique = X[obj_cols].nunique()
prop_unique = (nunique / len(df)).sort_values(
    ascending=False
)  # in order of most unique to least
unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = [
    "proportion",
    "nunique",
]
unique  # notebook-style cell output

# Similarity-encode the high-cardinality text columns into 5
# prototype-similarity features each, keeping the original columns.
X = similarity_encode(
    X,
    encode_columns=["Name", "MeetName", "Division", "Federation"],
    n_prototypes=5,
    train=True,
    drop_original=False,
)
# Double quotes for consistency with the rest of the file.
X[obj_cols] = X[obj_cols].astype("category")

# Visualize the target distribution.
sns.kdeplot(y)
plt.title("KDE distribution")
plt.show()

SEED = 0
SAMPLE_SIZE = 10000
Xt, Xv, yt, yv = train_test_split(
    X, y, random_state=SEED
)  # NOTE(review): call truncated at the chunk boundary; closing paren added