def train_it(train, holdout_df, filename):
    """Train a model on ``train``, validate it on ``holdout_df`` and return it.

    The rows of ``train`` are split 80/20 (fixed ``random_state=99``) into a
    fitting part and an evaluation part, and both parts are handed to
    ``pocket_lgb.GoldenLgb.do_train_sk``.  The resulting model is then checked
    against ``holdout_df`` through ``holdout_validator2.HoldoutValidator`` and
    its predictions are emitted with ``output_prediction(filename)``.

    Feature columns are taken from the module-level ``predict_col``; the
    target is the ``"is_attributed"`` column.  Progress is reported through
    the module-level ``timer`` after each stage.

    Args:
        train: Training table containing the ``predict_col`` columns and the
            ``"is_attributed"`` column.
        holdout_df: Hold-out table handed to the validator.
        filename: Passed unchanged to ``output_prediction``.

    Returns:
        The model object returned by ``do_train_sk``.
    """
    target = train["is_attributed"]
    features = train[predict_col]
    fit_x, eval_x, fit_y, eval_y = model_selection.train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=99,
    )
    timer.time("prepare train in ")

    trainer = pocket_lgb.GoldenLgb()
    fitted = trainer.do_train_sk(fit_x, eval_x, fit_y, eval_y)
    trainer.show_feature_importance(fitted)

    # Drop this function's references to the training data before the
    # validation step runs.
    del train, fit_x, eval_x, fit_y, eval_y
    gc.collect()
    timer.time("end train in ")

    holdout_check = holdout_validator2.HoldoutValidator(
        fitted, holdout_df, predict_col
    )
    holdout_check.validate()
    holdout_check.output_prediction(filename)
    timer.time("done validation in ")

    return fitted
# ---------------------------------------------------------------------------
# First round: fit a model on `train`, score the hold-out set with it, and
# keep those scores so they can be attached as pseudo labels further down.
# ---------------------------------------------------------------------------

# Columns used as model inputs. This is a module-level name, so functions in
# this file that refer to `predict_col` (e.g. train_it) pick up this value.
predict_col = column_selector.get_predict_col()

# Target column and feature columns of the training table.
train_y = train["is_attributed"]
train_x = train[predict_col]

# 80/20 split of the training rows; random_state is fixed so the split is
# the same on every run.
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train_x, train_y, test_size=0.2, random_state=99)
timer.time("prepare train in ")

# Train on the first part of the split, passing the second part along as the
# validation set, then display the feature importances of the result.
lgb = pocket_lgb.GoldenLgb()
model = lgb.do_train_sk(X_train, X_valid, y_train, y_valid)
lgb.show_feature_importance(model)

# Predictions of the first-round model on the hold-out features. They have to
# be computed here because `model` is deleted below; `y_pred` is reused in
# the second round.
y_pred = model.predict(holdout_df[predict_col])
timer.time("end train in ")

# Evaluate the model on the hold-out set.
validator = holdout_validator2.HoldoutValidator(model, holdout_df, predict_col)
validator.validate()
validator.validate_rmse(ERROR_ANALYSIS)
# Writing the predictions out is currently disabled:
#validator.output_prediction(PREDICTION)
timer.time("done validation in ")

# The first-round model and validator are not needed any more; drop the
# references and trigger a collection before the second round starts.
del validator
del model
gc.collect()

####################
# second round
####################
# Take a copy of the hold-out feature columns and attach the first-round
# predictions to it under "pseudo_label".
# NOTE(review): presumably holdout_df is a pandas DataFrame and the copy is
# there so the new column does not end up on holdout_df itself - confirm.
pl_data = holdout_df[predict_col].copy()
pl_data["pseudo_label"] = y_pred