def get_lgbm(train_x, val_x, train_y, val_y, cv, n_jobs, scoring):
    """
    Train a LightGBM model.

    Args:
        train_x: samples used for training
        val_x: validation set
        train_y: training targets
        val_y: validation targets
        cv: number of cross-validation folds
        n_jobs: number of parallel jobs
        scoring: scoring function to use, such as MAE

    Returns:
        Best estimator
    """
    # Find the converged number of boosting iterations with a high learning
    # rate, using MAE as the convergence criterion
    lgbm = lgb(
        n_estimators=1000,
        learning_rate=0.1,
        max_depth=5,
        num_leaves=100,
        objective="regression",
        # min_data_in_leaf=2,
        n_jobs=-1,
        verbose=-1,
    )
    lgbm.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        eval_metric="mae",
        # eval_metric='l1',
        early_stopping_rounds=10,
    )
    num_iteration = lgbm.best_iteration_
    print("num_iteration", num_iteration)
    print("in randomsearch cv")

    # Randomized search for optimal parameters; generally thousands of
    # iterations are needed. learning_rate and num_leaves are very important.
    param_dist = {
        #'boosting_type': ['dart'],
        #'boosting_type': ['gbdt', 'dart', 'rf'],
        #'num_leaves': sp.stats.randint(2, 1001),
        #'subsample_for_bin': sp.stats.randint(10, 1001),
        #'min_split_gain': sp.stats.uniform(0, 5.0),
        #'min_child_weight': sp.stats.uniform(1e-6, 1e-2),
        #'reg_alpha': sp.stats.uniform(0, 1e-2),
        #'reg_lambda': sp.stats.uniform(0, 1e-2),
        #'tree_learner': ['data', 'feature', 'serial', 'voting'],
        #'application': ['regression_l1', 'regression_l2', 'regression'],
        #'bagging_freq': sp.stats.randint(1, 11),
        #'bagging_fraction': sp.stats.uniform(.1, 0.9),
        #'feature_fraction': sp.stats.uniform(.1, 0.9),
        #'learning_rate': sp.stats.uniform(1e-3, 0.9),
        #'est__num_leaves': [2, 8, 16],
        #'est__min_data_in_leaf': [1, 2, 4],
        #'est__learning_rate': [0.005, 0.01, 0.1],
        #'est__max_depth': [1, 3, 5],  # sp.stats.randint(1, 501),
        #'est__n_estimators': [num_iteration, 2 * num_iteration, 5 * num_iteration],  # sp.stats.randint(100, 20001),
        #'gpu_use_dp': [True, False],
        #'est__num_leaves': sp.stats.randint(3, 1000),
        #'est__max_depth': sp.stats.randint(1, 5),
        "est__learning_rate": sp.stats.uniform(1e-3, 0.9),
    }
    lgbm = lgb(
        objective="regression",
        # device='gpu',
        n_estimators=num_iteration,
        n_jobs=n_jobs,
        verbose=-1,
    )
    pipe = Pipeline([
        ("stdscal", StandardScaler()),
        ("vart", VarianceThreshold(1e-4)),
        ("est", lgbm),
    ])
    n_iter = 10  # Increase n_iter for a more thorough search
    rscv = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        cv=cv,
        scoring=scoring,
        n_iter=n_iter,
        n_jobs=n_jobs,
        verbose=3,
        refit=True,
    )
    rscv = rscv.fit(train_x, train_y)
    return rscv.best_estimator_
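# Minimal usage sketch for get_lgbm, assuming the jdata loader and the
# train/test split used in __main__ below; the property name, cv, n_jobs, and
# scoring values here are illustrative, not fixed by this script.
def _example_get_lgbm_usage():
    x, y, jid = jdata(prop="form_enp")
    X_tr, X_val, y_tr, y_val, jid_tr, jid_val = train_test_split(
        x, y, jid, random_state=1, test_size=0.1)
    best_pipe = get_lgbm(
        train_x=X_tr,
        val_x=X_val,
        train_y=y_tr,
        val_y=y_val,
        cv=5,
        n_jobs=1,
        scoring="neg_mean_absolute_error",
    )
    # The returned object is the refit Pipeline (scaler -> variance filter ->
    # LGBMRegressor), so it predicts directly on raw feature matrices.
    return best_pipe.predict(X_val)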
f = open(file, "w")
f.write(json.dumps(info, cls=MontyEncoder, indent=4))
f.close()
os.chdir("../")


if __name__ == "__main__":
    # This may take a long time
    # run(version='version_1', scoring='neg_mean_absolute_error', cv=5, n_jobs=1, prop='op_gap', do_cv=False)

    # Smaller test fit of the model
    model = lgb(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        num_leaves=100,
        objective="regression",
        n_jobs=-1,
        verbose=-1,
    )
    x, y, jid = jdata(prop="form_enp")
    X_train, X_test, y_train, y_test, jid_train, jid_test = train_test_split(
        x, y, jid, random_state=1, test_size=0.1)
    len(X_train), len(X_test)
    # Take 500 samples of the training set as a quick example
    X = X_train[0:500]
    Y = y_train[0:500]
    model.fit(X, Y)
    info = {}
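    # A minimal sketch of one plausible next step: score the quick fit with
    # MAE, the metric used throughout this script. mean_absolute_error is
    # standard sklearn; the exact follow-up in the original is not shown here.
    from sklearn.metrics import mean_absolute_error
    pred = model.predict(X_test)
    print("quick-fit MAE on held-out set:", mean_absolute_error(y_test, pred))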
x = x.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1)


# In[70]:


from sklearn.model_selection import train_test_split as tts

param = {
    'n_estimators': [90, 100, 110],
    'learning_rate': [0.1, 0.13, 0.09],
    'max_depth': [5, 6, 7]
}
knn = {'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}


# In[71]:


p = lgb(max_depth=7)


# In[72]:


from sklearn.preprocessing import LabelEncoder as le

for c in x.columns:
    if x[c].dtype == 'object':
        x[c] = le().fit_transform(x[c].astype(str))


# In[73]:


x.Age = le().fit_transform(x.Age.astype(str))
x.Fare = le().fit_transform(x.Fare.astype(str))
y.Age = le().fit_transform(y.Age.astype(str))
y.Fare = le().fit_transform(y.Fare.astype(str))
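# A minimal sketch of how the `param` grid above could be consumed, assuming
# a plain GridSearchCV over the LightGBM estimator `p`; the 3-fold cv value is
# an illustrative choice, not taken from this notebook.
def _example_grid_search(features, target):
    from sklearn.model_selection import GridSearchCV
    gs = GridSearchCV(p, param_grid=param, cv=3)
    gs.fit(features, target)
    return gs.best_params_, gs.best_score_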
reg = DTR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'DTR', X_train, X_test)

from sklearn.ensemble import RandomForestRegressor as RF

reg = RF()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'RFR', X_train, X_test)
plotting(y_pred, 'RFR')

from lightgbm import LGBMRegressor as lgb

reg = lgb()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'LGBM', X_train, X_test)
plotting(y_pred, 'LGBM')

from xgboost import XGBRegressor

reg = XGBRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'XGB', X_train, X_test)
plotting(y_pred, 'XGB')

import pickle
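# A minimal sketch of what the pickle import above is presumably for: saving
# the last fitted regressor to disk. The filename "model.pkl" is an
# assumption, not taken from this notebook.
with open("model.pkl", "wb") as fh:
    pickle.dump(reg, fh)
# Reload later with:
# with open("model.pkl", "rb") as fh:
#     reg = pickle.load(fh)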
df3_['median_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: median_circle_particle_pz[x])

sum_circle_particle_pz = df4_.groupby(['jet_id'])['circle_particle_pz'].sum()
df3_['sum_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: sum_circle_particle_pz[x])

std_circle_particle_pz = df4_.groupby(['jet_id'])['circle_particle_pz'].std()
df3_['std_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: std_circle_particle_pz[x])

var_circle_particle_pz = df4_.groupby(['jet_id'])['circle_particle_pz'].var()
df3_['var_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: var_circle_particle_pz[x])

features = df3.columns
features = list(features)
features.remove('jet_id')
features.remove('event_id')
features.remove('label')

model = lgb()
y_predict = model.fit(df3[features], df3['label']).predict(df3_[features])

df5 = pd.DataFrame()
df5['id'] = df3_['jet_id']
df5['label'] = y_predict
df5.to_csv("submit.csv", index=False)

df3.to_csv("train_jet2.csv", index=False)
df4.to_csv("train_particle2.csv", index=False)
df3_.to_csv("test_jet2.csv", index=False)
df4_.to_csv("test_particle2.csv", index=False)
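# A more compact alternative to the repeated groupby/apply blocks above: one
# aggregation plus a merge produces the same per-jet statistics columns. This
# is a refactoring sketch, not code from the original script.
def _add_circle_pz_stats(jet_df, particle_df):
    stats = (particle_df.groupby('jet_id')['circle_particle_pz']
             .agg(['median', 'sum', 'std', 'var'])
             .rename(columns={
                 'median': 'median_circle_particle_pz',
                 'sum': 'sum_circle_particle_pz',
                 'std': 'std_circle_particle_pz',
                 'var': 'var_circle_particle_pz',
             }))
    return jet_df.merge(stats, left_on='jet_id', right_index=True, how='left')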