def predict_sub(model_lgb, testdex, test, subfilename):
    print_header('Submission')
    print_doing_in_task('predicting')
    lgpred = model_lgb.predict(test)
    lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
    # clip raw regressor output into [0, 1]; the in-place Series.clip form is
    # deprecated, so assign the clipped column back instead
    lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)
    print('saving submission file to', subfilename)
    lgsub.to_csv(subfilename, index=True, header=True)
    print('done')
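# A minimal, self-contained sketch of the clip-and-save submission pattern used
# by predict_sub above, on synthetic data. _DummyModel and the index values are
# hypothetical stand-ins for the trained LightGBM booster and the real test
# item_ids; the point is that raw regressor output can fall outside [0, 1] and
# must be clipped before submission.
def _sketch_submission_clip():
    import numpy as np
    import pandas as pd

    class _DummyModel:
        def predict(self, X):
            # deliberately out-of-range values to show why clipping matters
            return np.array([-0.05, 0.4, 1.2])

    testdex_demo = pd.Index(['id_0', 'id_1', 'id_2'], name='item_id')
    preds = _DummyModel().predict(np.zeros((3, 1)))
    sub = pd.DataFrame(preds, columns=['deal_probability'], index=testdex_demo)
    sub['deal_probability'] = sub['deal_probability'].clip(0.0, 1.0)
    print(sub)  # all values now lie in [0, 1]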
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_TUNE_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION = args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    # boosting_list = ['gbdt', 'dart']
    boosting_list = ['gbdt']
    # num_leave_list[i] is paired with max_depth_list[i]
    num_leave_list = [7, 9, 15, 31, 63, 128]
    max_depth_list = [3, 4, 7, 15, 31, 64]

    model_list = []
    for i in range(len(num_leave_list)):
        num_leave = num_leave_list[i]
        max_depth = max_depth_list[i]
        for boosting_type in boosting_list:
            model_list = model_list + [
                '{}_{}_{}'.format(boosting_type, num_leave, max_depth)
            ]

    LOCAL_TUNE_RESULT = pd.DataFrame(
        index=model_list,
        columns=['running_time', 'num_round', 'train', 'val'])
    if DEBUG:
        print(LOCAL_TUNE_RESULT)

    option = 1
    is_textadded = True
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'
    print_header('Option {}'.format(option))
    print('is_textadded {} \n predictors {} \n mat filename {}'.format(
        is_textadded, PREDICTORS, mat_filename))

    # walk the paired grid from the largest setting down to the smallest
    for k in range(len(num_leave_list)):
        i = len(num_leave_list) - k - 1
        num_leave = num_leave_list[i]
        max_depth = max_depth_list[i]
        for boosting_type in boosting_list:
            DO(option, is_textadded, mat_filename, dir_feature, num_leave,
               max_depth, boosting_type)

    print_header('FINAL SUMMARY')
    print(LOCAL_TUNE_RESULT)
    LOCAL_TUNE_RESULT.to_csv('csv/tune_params.csv', index=True)
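# The tuning loop above walks paired (num_leaves, max_depth) settings by index.
# A sketch of the same pairing written with zip()/reversed(), which removes the
# manual index arithmetic; illustrative only, not a change to the pipeline.
def _sketch_paired_grid():
    num_leave_list = [7, 9, 15, 31, 63, 128]
    max_depth_list = [3, 4, 7, 15, 31, 64]
    # same order as the loop above: largest pair first
    for num_leaves, max_depth in reversed(list(zip(num_leave_list,
                                                   max_depth_list))):
        print('gbdt_{}_{}'.format(num_leaves, max_depth))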
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION = args.option
    print_debug(DEBUG)

    done_feature_df = load_csv('csv/forward_selection.csv')
    print(done_feature_df)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED

    # candidate features: every pickled feature file except the excluded ones
    feature_list = ['base']
    files = glob.glob(dir_feature + '*.pickle')
    REMOVED_LIST = [
        'cat_encode', 'len_feature_kernel', 'text_feature_kernel', 'time'
    ]
    for file in files:
        filename = os.path.basename(file)
        feature = re.sub(r'\.pickle$', '', filename)
        if is_added(filename, REMOVED_LIST):
            feature_list = feature_list + [feature]

    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=feature_list,
        columns=['running_time', 'num_round', 'train', 'val', 'diff'])
    if DEBUG:
        print(feature_list)
        print(LOCAL_VALIDATION_RESULT)

    # evaluate the base predictors alone, then each candidate added on top
    for feature in feature_list:
        if feature != 'base':
            PREDICTORS = PREDICTORS + [feature]
        DO(option, is_textadded, 'abc', dir_feature, 1988, feature)
        if feature != 'base':
            PREDICTORS.remove(feature)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    LOCAL_VALIDATION_RESULT.to_csv('forward_selection.csv', index=True)
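# is_added() is defined elsewhere in the repo; below is a minimal sketch of
# what it plausibly does given how it is called above: keep a feature file
# unless its name is on the exclusion list. This is an assumption about the
# helper's behavior, not the repo's actual implementation.
def _sketch_is_added(filename, removed_list):
    import re
    feature = re.sub(r'\.pickle$', '', filename)  # strip the extension
    return feature not in removed_list  # assumed exact-name exclusion

# e.g. _sketch_is_added('price_mean.pickle', ['time']) -> True
#      _sketch_is_added('time.pickle', ['time'])       -> False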
def prepare_training(mat_filename, dir_feature, predictors, is_textadded):
    print_header('Load features')
    df, y, len_train, traindex, testdex = load_train_test(['item_id'],
                                                          TARGET, DEBUG)
    del len_train
    gc.collect()
    df = drop_col(df, REMOVED_LIST)

    # add tabular features one pickle at a time
    print_doing('add tabular features')
    for feature in predictors:
        dir_feature_file = dir_feature + feature + '.pickle'
        if not os.path.exists(dir_feature_file):
            print('cannot find {}. Please check'.format(dir_feature_file))
        elif feature in df:
            print('{} already added'.format(feature))
        else:
            print_doing_in_task('adding {}'.format(feature))
            df = add_feature(df, dir_feature_file)
    print_memory()

    if is_textadded:
        # add text features
        print_doing_in_task('add text features')
        ready_df, tfvocab = get_text_matrix(mat_filename, 'all', 2, 0)

        # stack the dense tabular block with the sparse text block
        print_doing_in_task('stack')
        X = hstack([
            csr_matrix(df.loc[traindex, :].values),
            ready_df[0:traindex.shape[0]]
        ])  # sparse matrix
        testing = hstack([
            csr_matrix(df.loc[testdex, :].values),
            ready_df[traindex.shape[0]:]
        ])
        print_memory()

        print_doing_in_task('prepare vocab')
        tfvocab = df.columns.tolist() + tfvocab
        for mat in [X, testing]:
            print("{} Rows and {} Cols".format(*mat.shape))
        print("Feature Names Length: ", len(tfvocab))
    else:
        tfvocab = df.columns.tolist()
        # no text block to stack: convert the tabular frames directly to CSR
        testing = csr_matrix(df.loc[testdex, :].values)
        X = csr_matrix(df.loc[traindex, :].values)  # sparse matrix

    return X, y, testing, tfvocab, df.columns.tolist(), testdex
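# A self-contained sketch of the stacking step in prepare_training: dense
# tabular columns are converted to CSR and horizontally stacked with a sparse
# text matrix, so LightGBM sees one wide sparse design matrix. Data here is
# synthetic; scipy.sparse.random stands in for the pickled TF-IDF block.
def _sketch_stack_tabular_and_text():
    import numpy as np
    from scipy.sparse import csr_matrix, hstack
    from scipy.sparse import random as sparse_random

    tabular = np.random.rand(5, 3)  # 5 rows, 3 dense tabular features
    text = sparse_random(5, 10, density=0.2, format='csr')  # 5 rows of "TF-IDF"
    X_demo = hstack([csr_matrix(tabular), text])
    print(X_demo.shape)  # (5, 13): tabular block first, then text block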
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION = args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'

    # draw 100 distinct seeds reproducibly
    # seed_list = np.random.randint(2000, size=1000)
    random.seed(1992)
    seed_array = random.sample(range(0, 10000), 100)
    seed_list = []
    for seed in seed_array:
        seed_list = seed_list + ['seed_' + str(seed)]

    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=seed_list,
        columns=[
            'seed', 'running_time', 'num_round', 'train', 'val',
            'local_test', 'diff'
        ])
    print(seed_list)
    print(LOCAL_VALIDATION_RESULT)

    for seed in seed_array:
        DO(option, is_textadded, mat_filename, dir_feature, seed)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    LOCAL_VALIDATION_RESULT.to_csv('seed_select.csv', index=False)
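# A small sketch of the reproducible seed draw above: seeding Python's PRNG
# before random.sample() makes the 100-seed list identical across runs, so the
# seed search can be interrupted and later re-audited against seed_select.csv.
def _sketch_reproducible_seeds():
    import random
    random.seed(1992)
    first = random.sample(range(0, 10000), 100)
    random.seed(1992)
    second = random.sample(range(0, 10000), 100)
    assert first == second          # same seed, same draw
    assert len(set(first)) == 100   # sample() draws without replacement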
def train(X, y, num_leave, full_predictors, categorical, predictors,
          boosting_type, option, seed):
    if DEBUG:
        subfilename = '../sub/debug_findseed_{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            yearmonthdate_string, boosting_type, str(len(predictors)),
            num_leave, option)
        modelfilename = '../trained_models/debug_findseed_{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            yearmonthdate_string, boosting_type, str(len(predictors)),
            num_leave, option)
    else:
        subfilename = '../sub/findseed_{}_{}_{}features_num_leave{}_OPTION{}.csv'.format(
            yearmonthdate_string, boosting_type, str(len(predictors)),
            num_leave, option)
        modelfilename = '../trained_models/findseed_{}_{}_{}features_num_leave{}_OPTION{}.txt'.format(
            yearmonthdate_string, boosting_type, str(len(predictors)),
            num_leave, option)

    print_header("Training")
    start_time = time.time()

    print_doing_in_task('prepare dataset...')
    # hold out 20% as a fixed local validation set, then split the remainder
    # 90/10 into train and early-stopping sets
    X, X_local_valid, y, y_local_valid = train_test_split(
        X, y, test_size=0.2, random_state=seed)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=seed)
    print('training shape: {} \n'.format(X.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 15,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'verbose': 0
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    if DEBUG:
        num_boost_round = 300
        early_stopping_rounds = 10
    else:
        num_boost_round = 20000
        early_stopping_rounds = 30

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)
    print_memory()

    print_header("Model Report")
    running_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration

    print_doing_in_task('fit val')
    val_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
    print_doing_in_task('fit train')
    train_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_train, lgb_clf.predict(X_train))))
    print_doing_in_task('fit local val')
    local_valid_rmse = '{0:.4f}'.format(
        np.sqrt(
            metrics.mean_squared_error(y_local_valid,
                                       lgb_clf.predict(X_local_valid))))
    # gap between the local validation RMSE and the public-leaderboard score
    diff_lb = '{0:.4f}'.format(abs(float(local_valid_rmse) - 0.2300))

    print('OPTION', option)
    print('model training time: {} mins'.format(running_time))
    print('seed number: {}'.format(seed))
    print('num_boost_rounds_lgb: {}'.format(num_boost_rounds_lgb))
    print('train rmse: {}'.format(train_rmse))
    print('val rmse: {}'.format(val_rmse))
    print('local valid rmse: {}'.format(local_valid_rmse))
    print('diff compared to lb: {}'.format(diff_lb))

    print('saving model to', modelfilename)
    lgb_clf.save_model(modelfilename)

    # record this seed's results; .loc avoids pandas chained-assignment issues
    seed_name = 'seed_' + str(seed)
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'seed'] = seed
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'running_time'] = running_time
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'num_round'] = num_boost_rounds_lgb
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'train'] = train_rmse
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'val'] = val_rmse
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'local_test'] = local_valid_rmse
    LOCAL_VALIDATION_RESULT.loc[seed_name, 'diff'] = diff_lb

    return lgb_clf, subfilename
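# A sketch of the two-level split used in train() above, on synthetic data:
# first carve off a 20% local validation set, then split the remainder 90/10
# into train and early-stopping sets. Re-running with a different seed
# reshuffles all three partitions, which is exactly what the seed search
# measures via the local-test RMSE.
def _sketch_two_level_split():
    import numpy as np
    from sklearn.model_selection import train_test_split

    X_all = np.arange(100).reshape(-1, 1)
    y_all = np.arange(100, dtype=float)
    X_rest, X_local, y_rest, y_local = train_test_split(
        X_all, y_all, test_size=0.2, random_state=1992)
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_rest, y_rest, test_size=0.10, random_state=1992)
    print(len(X_tr), len(X_va), len(X_local))  # 72 8 20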
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION = args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option_list = []
    for option in range(10):
        option_list = option_list + ['option' + str(option)]
    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=option_list,
        columns=['running_time', 'num_round', 'train', 'val'])
    if DEBUG:
        print(option_list)
        print(LOCAL_VALIDATION_RESULT)

    test_list = [8]
    for option in test_list:
        # nothing here
        if option == 0:
            is_textadded = False
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel
        elif option == 1:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel max_feature = 1000
        elif option == 2:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_1000.pickle'
        # kernel max_feature = 30000
        elif option == 3:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'
        # kernel max_feature = infinite
        elif option == 4:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_-1.pickle'
        # kernel max_feature = 18000 + 'good' feature
        elif option == 5:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_GOOD
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel max_feature = 18000 + not-checked feature
        elif option == 6:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_NOTCHECKED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 7:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_OVERFIT
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 8:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_TRY
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'

        if DEBUG:
            print_header('Option {}'.format(option))
            print('is_textadded {} \n predictors {} \n mat filename {}'.format(
                is_textadded, PREDICTORS, mat_filename))
        DO(option, is_textadded, mat_filename, dir_feature)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
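# The if/elif chain above maps an option number to an (is_textadded,
# predictor-set, text-matrix file) triple. A sketch of the same dispatch as a
# dict-based config table, which keeps each option on one line and fails
# loudly on an unknown option; an illustrative alternative, not the repo's
# code (predictor sets are shown as placeholder strings).
def _sketch_option_table(option):
    options = {
        0: (False, 'PREDICTORS_BASED', 'text_feature_kernel.pickle'),
        2: (True, 'PREDICTORS_BASED', 'text_feature_kernel_1000.pickle'),
        8: (True, 'PREDICTORS_BASED + PREDICTORS_TRY',
            'text_feature_kernel_30000.pickle'),
    }
    is_textadded, predictors, mat_file = options[option]  # KeyError if unknown
    return is_textadded, predictors, mat_file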
def train(X, y, num_leave, max_depth, full_predictors, categorical,
          predictors, boosting_type, option):
    print_header("Training")
    start_time = time.time()

    print_doing_in_task('prepare dataset...')
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=SEED)
    print('training shape: {} \n'.format(X.shape))

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': max_depth,
        'num_leaves': num_leave,  # LightGBM's parameter name is 'num_leaves'
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'lambda_l1': 10,
        'max_bin': 512,
        'verbose': -1
    }
    print('params:', lgbm_params)

    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    if DEBUG:
        num_boost_round = 300
        early_stopping_rounds = 10
    else:
        num_boost_round = 20000
        early_stopping_rounds = 100

    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=10)
    print_memory()

    print_header("Model Report")
    running_time = '{0:.2f}'.format((time.time() - start_time) / 60)
    num_boost_rounds_lgb = lgb_clf.best_iteration

    print_doing_in_task('fit val')
    val_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
    print_doing_in_task('fit train')
    train_rmse = '{0:.4f}'.format(
        np.sqrt(metrics.mean_squared_error(y_train, lgb_clf.predict(X_train))))

    print('boosting_type {}, num_leaves {}, max_depth {}'.format(
        boosting_type, num_leave, max_depth))
    print('model training time: {} mins'.format(running_time))
    print('num_boost_rounds_lgb: {}'.format(num_boost_rounds_lgb))
    print('best rmse: {}'.format(val_rmse))

    # record this configuration's results; .loc avoids chained assignment
    model = '{}_{}_{}'.format(boosting_type, num_leave, max_depth)
    LOCAL_TUNE_RESULT.loc[model, 'running_time'] = running_time
    LOCAL_TUNE_RESULT.loc[model, 'num_round'] = num_boost_rounds_lgb
    LOCAL_TUNE_RESULT.loc[model, 'train'] = train_rmse
    LOCAL_TUNE_RESULT.loc[model, 'val'] = val_rmse
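# A quick sanity check on the paired grid driving this train(): with
# depth-limited growth a tree has at most 2**max_depth leaves, so num_leaves
# beyond that cap is wasted. The pairs used in the tuning loop stay at or
# below the bound.
def _sketch_leaf_cap():
    for num_leaves, max_depth in zip([7, 9, 15, 31, 63, 128],
                                     [3, 4, 7, 15, 31, 64]):
        cap = 2 ** max_depth
        print('num_leaves={:<4} max_depth={:<3} ok={}'.format(
            num_leaves, max_depth, num_leaves <= cap))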