def main():
    """Parse the command line and process the test and train datasets.

    Stores the parsed namespace in the module-level ``args`` and its
    ``debug`` attribute in ``DEBUG``, passes ``DEBUG`` to ``print_debug``,
    then calls ``do_dataset`` for ``'test'`` followed by ``'train'``.
    """
    global args, DEBUG

    args = parser.parse_args()
    DEBUG = args.debug
    print_debug(DEBUG)

    # 'test' is handled before 'train'; the order is preserved as written.
    for split_name in ('test', 'train'):
        do_dataset(split_name)
def main():
    """Run ``DO`` for every boosting type and (num_leaves, max_depth) pair.

    Parses the command line into the module-level ``args``, ``DEBUG`` and
    ``FRAC`` globals, picks the feature store and text-feature pickle paths
    (debug or full), sets the global ``PREDICTORS`` from ``get_predictors``
    and then calls ``DO`` once per configuration.
    """
    global args, DEBUG, FRAC, PREDICTORS

    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    print_debug(DEBUG)

    if DEBUG:
        # The debug value itself is embedded in the directory and file names.
        storename = '../processed_features_debug{}/{}_debug{}.h5'.format(
            DEBUG, 'train', DEBUG)
        mat_filename = (
            '../processed_features_debug{}/text_feature_kernel.pickle'
            .format(DEBUG))
    else:
        storename = '../processed_features/{}.h5'.format('train')
        mat_filename = '../processed_features/text_feature_kernel.pickle'

    PREDICTORS = get_predictors(storename)

    # num_leaves and max_depth are varied together, not as a full grid.
    tree_shapes = list(zip([7, 9, 31, 63], [3, 4, 7, 9]))

    for boosting_type in ['gbdt', 'dart']:
        for num_leaves, max_depth in tree_shapes:
            print('==============================================================')
            print('num leaves:', num_leaves)
            print('max depth:', max_depth)
            DO(mat_filename, storename, num_leaves, max_depth, 1,
               boosting_type)
def main():
    """Parse the command line, process both datasets, then write features.

    Stores the parsed namespace in the module-level ``args`` and its
    ``debug`` attribute in ``DEBUG``, passes ``DEBUG`` to ``print_debug``,
    calls ``do_dataset`` for ``'train'`` and then ``'test'``, and finally
    calls ``write_all_feature_to_text``.
    """
    global args, DEBUG

    args = parser.parse_args()
    DEBUG = args.debug
    print_debug(DEBUG)

    for split_name in ('train', 'test'):
        do_dataset(split_name)

    # NOTE(review): assumed to run once after both datasets are processed,
    # not once per dataset -- confirm against the original layout.
    write_all_feature_to_text()
def main():
    """Parse the command line into module globals and call ``DO``.

    Sets ``args``, ``DEBUG``, ``FRAC`` and ``TRAINMODE`` from the parsed
    arguments, passes ``DEBUG`` to ``print_debug`` and then invokes ``DO``
    with no arguments.
    """
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE

    args = parser.parse_args()
    DEBUG, FRAC, TRAINMODE = args.debug, args.frac, args.trainmode

    print_debug(DEBUG)
    DO()
def main():
    """Parse the command line and generate the aggregated kernel features.

    Sets the module-level ``args``, ``DATASET`` and ``DEBUG`` globals,
    passes ``DEBUG`` to ``print_debug``, chooses the output directory
    (debug or full) and calls ``gen_aggregated_kernel`` with that directory
    and the ``'.pickle'`` extension.
    """
    global args, DEBUG, DATASET

    args = parser.parse_args()
    DATASET = args.dataset
    DEBUG = args.debug
    print_debug(DEBUG)

    # In debug mode the debug value is part of the directory name.
    todir = (
        '../processed_features_debug{}/'.format(DEBUG)
        if DEBUG
        else '../processed_features/'
    )
    gen_aggregated_kernel(todir, '.pickle')
def main():
    """Run ``DO`` over a list of tree shapes and save the tuning table.

    Sets the module-level ``args``, ``DEBUG``, ``FRAC`` and ``TRAINMODE``
    globals from the command line, creates the global ``LOCAL_TUNE_RESULT``
    DataFrame with one row per ``'<boosting>_<num_leave>_<max_depth>'``
    label, calls ``DO`` for every configuration (largest tree shape first)
    and finally prints the table and writes it to ``csv/tune_params.csv``.
    """
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_TUNE_RESULT

    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    print_debug(DEBUG)

    dir_feature = (
        '../processed_features_debug2/' if DEBUG else '../processed_features/'
    )

    # Only 'gbdt' is tuned; 'dart' was disabled in the original list.
    boosting_list = ['gbdt']
    # num_leave and max_depth are varied together, not as a full grid.
    tree_shapes = list(zip([7, 9, 15, 31, 63, 128], [3, 4, 7, 15, 31, 64]))

    model_list = [
        '{}_{}_{}'.format(boosting_type, num_leave, max_depth)
        for num_leave, max_depth in tree_shapes
        for boosting_type in boosting_list
    ]
    LOCAL_TUNE_RESULT = pd.DataFrame(
        index=model_list,
        columns=['running_time', 'num_round', 'train', 'val'])
    if DEBUG:
        print(LOCAL_TUNE_RESULT)

    option = 1
    is_textadded = True
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'

    print_header('Option {}'.format(option))
    print('is_textadded {} \n predictors {} \n mat filename {}'.format(
        is_textadded, PREDICTORS, mat_filename))

    # Configurations are run from the last (largest) shape to the first.
    for num_leave, max_depth in reversed(tree_shapes):
        for boosting_type in boosting_list:
            DO(option, is_textadded, mat_filename, dir_feature, num_leave,
               max_depth, boosting_type)

    print_header('FINAL SUMMARY')
    print(LOCAL_TUNE_RESULT)
    LOCAL_TUNE_RESULT.to_csv('csv/tune_params.csv', index=True)
def main():
    """Extend the base predictors with ``get_good_local()`` and call ``DO``.

    Sets the module-level ``args``, ``DEBUG``, ``FRAC``, ``TRAINMODE`` and
    ``OPTION`` globals from the command line, then sets the global
    ``PREDICTORS`` to ``PREDICTORS_BASED`` plus the list returned by
    ``get_good_local`` and invokes ``DO`` with no arguments. In debug mode
    the extra features and the final predictor list are printed.
    """
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, OPTION

    args = parser.parse_args()
    DEBUG, FRAC = args.debug, args.frac
    TRAINMODE, OPTION = args.trainmode, args.option
    print_debug(DEBUG)

    extra_features = get_good_local()
    if DEBUG:
        print(extra_features)

    PREDICTORS = PREDICTORS_BASED + extra_features
    if DEBUG:
        print(PREDICTORS)

    DO()
def main():
    """Evaluate each candidate feature on top of the base predictors.

    Sets the module-level ``args``, ``DEBUG``, ``FRAC`` and ``TRAINMODE``
    globals from the command line, collects candidate feature names from the
    ``*.pickle`` files in the feature directory (kept only when
    ``is_added(filename, REMOVED_LIST)`` is truthy), creates the global
    ``LOCAL_VALIDATION_RESULT`` DataFrame with one row per candidate plus a
    ``'base'`` row, and calls ``DO`` once per row with the global
    ``PREDICTORS`` set to the base list plus that single feature.
    The table is finally printed and written to ``forward_selection.csv``.
    """
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT

    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    print_debug(DEBUG)

    # NOTE(review): this is read from 'csv/forward_selection.csv' while the
    # result below is written to 'forward_selection.csv' (no 'csv/' prefix),
    # and the loaded frame is only printed -- confirm this is intended.
    done_feature_df = load_csv('csv/forward_selection.csv')
    print(done_feature_df)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED

    REMOVED_LIST = [
        'cat_encode', 'len_feature_kernel', 'text_feature_kernel', 'time'
    ]

    feature_list = ['base']
    for path in glob.glob(dir_feature + '*.pickle'):
        filename = os.path.basename(path)
        if is_added(filename, REMOVED_LIST):
            # Raw string: '\.' in a plain literal is an invalid escape
            # sequence and triggers a warning on modern Python.
            feature_list.append(re.sub(r'\.pickle$', '', filename))

    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=feature_list,
        columns=['running_time', 'num_round', 'train', 'val', 'diff'])
    if DEBUG:
        # NOTE(review): both prints are assumed to be debug-only -- confirm.
        print(feature_list)
        print(LOCAL_VALIDATION_RESULT)

    for feature in feature_list:
        # Rebuild the predictor list from the base each time instead of
        # appending and then calling list.remove(): remove() drops the first
        # match, which would reorder the base list if it already contained
        # the candidate feature.
        if feature == 'base':
            PREDICTORS = PREDICTORS_BASED
        else:
            PREDICTORS = PREDICTORS_BASED + [feature]
        DO(option, is_textadded, 'abc', dir_feature, 1988, feature)

    # Leave the global holding just the base predictors, as before.
    PREDICTORS = PREDICTORS_BASED

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    LOCAL_VALIDATION_RESULT.to_csv('forward_selection.csv', index=True)
def main():
    """Call ``DO`` with 100 different seeds and save the result table.

    Sets the module-level ``args``, ``DEBUG``, ``FRAC`` and ``TRAINMODE``
    globals from the command line, draws 100 distinct seeds from
    ``range(0, 10000)`` with ``random`` seeded to 1992, creates the global
    ``LOCAL_VALIDATION_RESULT`` DataFrame indexed by ``'seed_<n>'`` labels,
    calls ``DO`` once per seed, then prints the table and writes it to
    ``seed_select.csv`` without the index.
    """
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT

    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    print_debug(DEBUG)

    dir_feature = (
        '../processed_features_debug2/' if DEBUG else '../processed_features/'
    )

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'

    # Fixed RNG seed so the same 100 seeds are drawn on every run.
    random.seed(1992)
    seed_array = random.sample(range(0, 10000), 100)
    seed_list = ['seed_' + str(seed) for seed in seed_array]

    result_columns = [
        'seed', 'running_time', 'num_round', 'train', 'val', 'local_test',
        'diff'
    ]
    LOCAL_VALIDATION_RESULT = pd.DataFrame(index=seed_list,
                                           columns=result_columns)
    print(seed_list)
    print(LOCAL_VALIDATION_RESULT)

    for seed in seed_array:
        DO(option, is_textadded, mat_filename, dir_feature, seed)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    # index=False: the 'seed_<n>' row labels are not written to the file.
    LOCAL_VALIDATION_RESULT.to_csv('seed_select.csv', index=False)
def main():
    """Run ``DO`` for each experiment option listed in ``test_list``.

    Sets the module-level ``args``, ``DEBUG``, ``FRAC`` and ``TRAINMODE``
    globals from the command line and creates the global
    ``LOCAL_VALIDATION_RESULT`` DataFrame with rows ``'option0'`` to
    ``'option9'``. Each option selects whether text features are added,
    which predictor list becomes the global ``PREDICTORS`` and which
    text-feature pickle is used; ``DO`` is then called with that setup.

    Raises:
        ValueError: if ``test_list`` contains an option with no branch.
            Previously such an option silently reused the settings of the
            preceding iteration, or failed on an unbound local on the first.
    """
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT

    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option_list = ['option' + str(number) for number in range(10)]
    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=option_list,
        columns=['running_time', 'num_round', 'train', 'val'])
    if DEBUG:
        # NOTE(review): both prints are assumed to be debug-only -- confirm.
        print(option_list)
        print(LOCAL_VALIDATION_RESULT)

    # Only the options listed here are executed.
    test_list = [8]
    for option in test_list:
        # The per-branch notes are the original author's labels.
        if option == 0:
            # nothing here
            is_textadded = False
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 1:
            # kernel
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 2:
            # kernel max_feature = 1000
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_1000.pickle'
        elif option == 3:
            # kernel max_feature = 30000
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'
        elif option == 4:
            # kernel max_feature = infinite
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_-1.pickle'
        elif option == 5:
            # kernel max_feature = 18000 + 'good' feature
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_GOOD
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 6:
            # kernel max_feature = 18000 + not-checked feature
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_NOTCHECKED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 7:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_OVERFIT
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 8:
            is_textadded = True
            # Kept as one expression; a bare "+ PREDICTORS_TRY" on its own
            # line would be a unary plus applied to a list.
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_TRY
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'
        else:
            raise ValueError('unknown option: {}'.format(option))

        if DEBUG:
            # NOTE(review): header and settings dump are assumed to be
            # debug-only, with DO() always executed -- confirm.
            print_header('Option {}'.format(option))
            print('is_textadded {} \n predictors {} \n mat filename {}'.format(
                is_textadded, PREDICTORS, mat_filename))

        DO(option, is_textadded, mat_filename, dir_feature)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
def main():
    """Prepare the training data and fit a LightGBM regressor.

    Sets the module-level ``args``, ``DEBUG`` and ``FRAC`` globals from the
    command line, builds the design matrix through ``prepare_training``,
    holds out 10% of the rows for validation and trains with
    ``lgb.train`` for up to 16000 rounds with early stopping after 200
    rounds.

    NOTE(review): ``target``, ``test``, ``predictors`` (after reassignment),
    ``modelstart`` and ``lgb_clf`` are not used in the visible code; the
    function may continue beyond this view -- confirm before pruning them.
    """
    global args, DEBUG, FRAC, PREDICTORS

    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    print_debug(DEBUG)

    print("\nData Load Stage")
    target = TARGET
    tabular_predictors = get_tabular_predictors()

    dir_feature = (
        '../processed_features_debug2/' if DEBUG else '../processed_features/'
    )
    mat_filename = dir_feature + 'text_feature_kernel.pickle'

    X, y, test, full_predictors, predictors = prepare_training(
        mat_filename, dir_feature, tabular_predictors)

    # The categorical list is derived before `predictors` is reassigned.
    categorical = get_categorical(predictors)
    predictors = get_predictors(predictors)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=SEED)
    print(X.shape)

    print("Light Gradient Boosting Regressor")
    # 'num_leaves' and 'bagging_freq' are not set here (they were commented
    # out in the original parameter dict).
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 15,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'learning_rate': 0.019,
        'verbose': 0,
    }
    print(lgbm_params)

    # Both datasets share the same feature names and categorical columns.
    dataset_options = {
        'feature_name': full_predictors,
        'categorical_feature': categorical,
    }
    lgtrain = lgb.Dataset(X_train, y_train, **dataset_options)
    lgvalid = lgb.Dataset(X_valid, y_valid, **dataset_options)

    modelstart = time.time()
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=16000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train', 'valid'],
        early_stopping_rounds=200,
        verbose_eval=200,
    )