def predictComplete(predict_path, backup_path):
    y_days = [2, 5, 10, 20]
    result_table_name = 'LGBM_LIVE_PREDICTION_FINAL'
    calendar_table_name = 'TRADE_CALENDAR'

    '__________________________________prediction___________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('predicting has been started')
    print(nowtime)

    # find the latest trade day
    sql_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))
    today, yesterday, trade_calendar = getLatestTradeDay(sql_engine, calendar_table_name)
    if today not in trade_calendar:
        print('%s is not a trade date' % today)
        return

    # read features
    test_x = loadData(yesterday, yesterday)
    with open('%s/xnamelist.pcl' % predict_path, 'rb') as tmp_fo:  # load feature names
        xnamelist = pickle.load(tmp_fo)

    resultscoredf_h = pd.DataFrame([])
    all_y_scores = test_x[['date', 'code']].copy()
    test_x = test_x[xnamelist]  # select the features used in model training

    # check the data types of the loaded data
    tmp_dtypes = test_x.dtypes
    tmp_dtypes = tmp_dtypes[tmp_dtypes == 'O']
    if tmp_dtypes.size > 0:
        print('data corrupted')
        raise ValueError('object-type columns found in feature data')

    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        clf = joblib.load('%s/model_%s.m' % (predict_path, y_name))
        y_score = clf.predict(test_x)

        # collect scores as backtest data
        all_y_scores.loc[:, y_name] = y_score

    # write predictions to the local backup folder
    all_y_scores.to_csv('%s/stockscore_%s.csv' % (backup_path, yesterday))

    # write predictions to the database
    writeDB(result_table_name, all_y_scores, sql_engine)

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(nowtime)
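# getLatestTradeDay() is a project helper that is not defined in this file.  The
# sketch below is a minimal, hypothetical version inferred from how it is used
# above: it returns today's date string, the latest trade day strictly before
# today, and the full trade calendar.  The `date` column name in TRADE_CALENDAR is
# an assumption (consistent with the inline SQL in the next script); this is an
# illustration, not the original helper.
def getLatestTradeDay(sql_engine, calendar_table_name):
    from datetime import datetime
    import pandas as pd

    today = datetime.now().strftime('%Y-%m-%d')
    with sql_engine.connect() as sql_conn:
        calendar = pd.read_sql('select `date` from %s' % calendar_table_name, sql_conn)
    trade_calendar = calendar['date'].astype(str).tolist()
    yesterday = max(d for d in trade_calendar if d < today)  # latest trade day before today
    return today, yesterday, trade_calendar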
def predictComplete(predict_path, backup_path):
    y_days = [2, 5, 10, 20]
    result_table_name = 'LGBM_LIVE_PREDICTION_FINAL'

    '__________________________________prediction___________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('predicting has been started')
    print(nowtime)

    # find the latest trade day
    today = datetime.strftime(datetime.now(), '%Y-%m-%d')
    sql_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))
    sql_statement = "select max(`date`) from %s where `date` < '%s'" % ('TRADE_CALENDAR', today)
    sql_conn = sql_engine.connect()
    yesterday = pd.read_sql(sql_statement, sql_conn)
    sql_conn.close()
    yesterday = yesterday.iloc[0, 0]

    # read features
    test_x = loadData(yesterday, yesterday)
    with open('%s/xnamelist.pcl' % predict_path, 'rb') as tmp_fo:  # load feature names
        xnamelist = pickle.load(tmp_fo)

    resultscoredf_h = pd.DataFrame([])
    all_y_scores = test_x[['date', 'code']].copy()
    test_x = test_x[xnamelist]  # select the features used in model training

    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        clf = joblib.load('%s/model_%s.m' % (predict_path, y_name))
        y_score = clf.predict(test_x)

        # collect scores as backtest data
        all_y_scores.loc[:, y_name] = y_score

    # write predictions to the local backup folder
    all_y_scores.to_csv('%s/stockscore_%s.csv' % (backup_path, yesterday))

    # write predictions to the database
    writeDB(result_table_name, all_y_scores, sql_engine)

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('predicting has finished')
    print(nowtime)
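# writeDB() is another project helper that is not defined in this file.  Below is a
# minimal sketch of the behaviour implied by the call above: appending the
# prediction DataFrame to a MySQL table.  `if_exists='append'` and the dropped
# index column are assumptions, not the original implementation.
def writeDB(table_name, df, sql_engine):
    # append the rows of `df` to `table_name`; the table is created on first use
    df.to_sql(table_name, sql_engine, if_exists='append', index=False)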
def run(year, season, output_path, predict_path):
    y_days = [2, 5, 10, 20]

    # ------------------ setting --------------------------------
    # season_start_date = {
    #     1: '-01-01',
    #     2: '-04-01',
    #     3: '-07-01',
    #     4: '-10-01'
    # }
    # season_end_date = {
    #     1: '-03-31',
    #     2: '-06-30',
    #     3: '-09-30',
    #     4: '-12-31'
    # }
    season_start_date = {
        1: '-01-01',
        2: '-03-01',
        3: '-05-01',
        4: '-07-01',
        5: '-09-01',
        6: '-11-01'
    }
    # note: some of these "day 31" endings are not real calendar dates; they are
    # only used as inclusive upper bounds for the date-range queries below
    season_end_date = {
        1: '-02-31',
        2: '-04-31',
        3: '-06-31',
        4: '-08-31',
        5: '-10-31',
        6: '-12-31'
    }

    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=30),
        '%Y-%m-%d')  # minus 30 days to avoid usage of future data
    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # # def the training function and testing function
    # # training function
    # # change the model names as needed
    # global train_x, train_y, val_x, val_y, train_data, val_data
    #
    # # ============== objective function =============
    # def objective(args):
    #     params = {
    #         'task': 'train', 'num_threads': 45, 'objective': 'binary',
    #         'boosting': 'dart', 'verbosity': -1, 'tree_learner': 'data',
    #         'seed': 66, 'min_data_in_leaf': 200, 'metric': 'auc',
    #         'max_depth': args['max_depth'] + 6,
    #         'learning_rate': args['learning_rate'],
    #         'feature_fraction': args['feature_fraction'],
    #         'bagging_fraction': args['bagging_fraction'],
    #         'num_leaves': np.math.floor(2 ** (args['max_depth'] + 6) * 0.7)
    #     }
    #     clf = lgb.train(params, train_data, num_boost_round=1000000,
    #                     valid_sets=[train_data, val_data], valid_names=['train', 'val'],
    #                     early_stopping_rounds=15, verbose_eval=1000)
    #
    #     y_score = clf.predict(val_x)
    #     fpr, tpr, threshods = roc_curve(val_y, y_score, pos_label=1)
    #     aucscore = auc(fpr, tpr)
    #     return -aucscore
    # # ==========================================
    #
    # # ============= optimization parameter space ===============
    # params_space = {
    #     'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
    #     'max_depth': hp.randint('max_depth', 10),
    #     'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
    #     'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    # }
    # # ==========================================
    #
    # x = loadData(starttime_train, endtime_train)
    # y = yload(starttime_train, endtime_train)
    #
    # # drop features with a large proportion of nan
    # drop_nan_ratio = 0.7
    # tmp_nan_ratio = x.isnull().sum(axis=0) / x.shape[0]
    # tmp_drop_cols = tmp_nan_ratio[tmp_nan_ratio > drop_nan_ratio].index.tolist()
    # if len(tmp_drop_cols) > 0:
    #     print('drop nan columns:', tmp_drop_cols)
    #     x = x.drop(tmp_drop_cols, axis=1)
    # xnamelist = x.columns.tolist()  # feature names (without code & date)
    # xnamelist.remove('code')
    # xnamelist.remove('date')
    # with open('%s/xnamelist.pcl' % output_path, 'wb') as tmp_fo:  # save feature names
    #     pickle.dump(xnamelist, tmp_fo)
    #
    # for tmp_day in y_days:
    #     y_name = 'Y_%dD' % tmp_day
    #     nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    #     print(y_name)
    #     print('training has started.')
    #     print(nowtime)
    #
    #     tmp_y = y[['code', 'date', y_name]]
    #     tmp_data = pd.merge(x, tmp_y, on=['date', 'code'], how='inner')
    #     tmp_data.dropna(subset=[y_name], inplace=True)
    #     tmp_y = tmp_data[y_name]
    #     tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
    #     train_x, val_x, train_y, val_y = train_test_split(tmp_x, tmp_y, test_size=0.1, random_state=68)
    #     del tmp_data, tmp_x, tmp_y
    #     gc.collect()
    #     train_data = lgb.Dataset(train_x, label=train_y)
    #     val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)
    #
    #     # max_evals can be increased to try more hyper-parameter settings
    #     best_sln = fmin(objective, space=params_space, algo=tpe.suggest, max_evals=10)
    #
    #     params = {
    #         'task': 'train', 'num_threads': 45, 'objective': 'binary',
    #         'boosting': 'dart', 'verbosity': -1, 'tree_learner': 'data',
    #         'seed': 66, 'min_data_in_leaf': 200, 'metric': 'auc',
    #         'learning_rate': best_sln['learning_rate'],
    #         'feature_fraction': best_sln['feature_fraction'],
    #         'max_depth': best_sln['max_depth'] + 6,
    #         'bagging_fraction': best_sln['bagging_fraction'],
    #         'num_leaves': np.math.floor(2 ** (best_sln['max_depth'] + 6) * 0.7),
    #     }
    #
    #     clf = lgb.train(params, train_data, num_boost_round=3000, valid_sets=[train_data, val_data],
    #                     valid_names=['train', 'val'], early_stopping_rounds=15, verbose_eval=1000)
    #
    #     joblib.dump(clf, '%s/model_%s_%s_%s.m' % (output_path, year + 1, season, y_name))
    #     importance = pd.DataFrame({'feature': clf.feature_name(), 'importance': clf.feature_importance('gain')})
    #     importance.to_excel('%s/feature_importance_%s_%s_%s.xlsx' % (output_path, year + 1, season, y_name))
    #
    #     nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    #     print('training has finished')
    #     print(nowtime)
    #     del train_x, train_y, val_x, val_y
    #     gc.collect()
    #
    # del x, y
    # gc.collect()

    '_____________________________________________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has been started')
    print(nowtime)

    # generate test results for the next season
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(starttime_test, endtime_test)

    with open('%s/xnamelist.pcl' % output_path, 'rb') as tmp_fo:  # load feature names
        xnamelist = pickle.load(tmp_fo)

    resultscoredf_h = pd.DataFrame([])
    all_y_scores = test_x[['date', 'code']].copy()

    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        tmp_y = test_y[['code', 'date', y_name]]
        tmp_data = pd.merge(test_x, tmp_y, on=['date', 'code'], how='inner')
        values = {y_name: int(0)}
        tmp_data.fillna(value=values, inplace=True)
        tmp_data.reindex()
        stock_index = tmp_data[['date', 'code']]
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        tmp_x = tmp_x[xnamelist]  # keep only the features used to train the model

        clf = joblib.load('%s/model_%s_%s_%s.m' % (output_path, year + 1, season, y_name))
        y_score = clf.predict(tmp_x)

        # rank stocks by predicted probability and label the top 1% as positive predictions
        y_boundle = pd.DataFrame({'proba': y_score, 'real': tmp_y})
        y_boundle.sort_values(by='proba', ascending=False, inplace=True)
        y_boundle.reindex()
        tmp_list = np.repeat(np.nan, len(y_boundle))
        tmp_list[:int(np.floor(len(y_boundle) / 100))] = 1
        tmp_list[int(np.floor(len(y_boundle) / 100)):] = 0
        y_boundle['predict'] = tmp_list

        accuracyscore = accuracy_score(y_boundle['real'], y_boundle['predict'])
        # use the sorted probabilities so labels and scores stay aligned
        fpr, tpr, threshods = roc_curve(y_boundle['real'], y_boundle['proba'], pos_label=1)
        ks = np.max(np.abs(tpr - fpr))
        aucscore = auc(fpr, tpr)
        precision = precision_score(y_boundle['real'], y_boundle['predict'], average='binary')
        recall = recall_score(y_boundle['real'], y_boundle['predict'], average='weighted')

        print('___________________________________________________________________')
        print('%s_%s_%s' % (year, season, y_name))
        print('precision:', precision)
        print('recall:', recall)
        print('auc:', aucscore)
        print('accuracyscore:', accuracyscore)
        print('K-S:', ks)
        print(classification_report(y_boundle['real'], y_boundle['predict']))
        print(confusion_matrix(y_boundle['real'], y_boundle['predict']))
        print('___________________________________________________________________')

        # check precision / recall under different probability thresholds
        threshold_list = list(range(50, 100, 5))
        threshold_list = [round(x * 0.01, 2) for x in threshold_list]
        scores_list = {}
        scores_list['year'] = [year, year]
        scores_list['season'] = [season, season]
        for tmp_thrhd in threshold_list:
            tmp_y_predict = np.int64(y_score > tmp_thrhd)
            tmp_precision = precision_score(tmp_y, tmp_y_predict, average='binary')
            tmp_recall = recall_score(tmp_y, tmp_y_predict, average='binary')
            scores_list[tmp_thrhd] = [tmp_precision, tmp_recall]
        scores_list = pd.DataFrame(
            scores_list,
            index=[
                '%dS%d_%dDprecision_1' % (year, season, tmp_day),
                '%dS%d_%dDrecall_1' % (year, season, tmp_day)
            ])
        tmp_columns = threshold_list + ['year', 'season']
        scores_list = scores_list[tmp_columns]  # rearrange columns
        if resultscoredf_h.empty:
            resultscoredf_h = scores_list
        else:
            resultscoredf_h = resultscoredf_h.append(scores_list)

        # collect scores as backtest data
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % tmp_day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        all_y_scores = all_y_scores.merge(stock_index, on=['date', 'code'], how='left')

    # all_y_scores.to_csv('%s/stockscore_%ds%d.csv' % (predict_path, year, season))

    result_csv_path = '%s/precision_recall_remake.csv' % predict_path
    if os.path.exists(result_csv_path):
        resultscoredf_h.to_csv(result_csv_path, header=False, mode='a')
    else:
        resultscoredf_h.to_csv(result_csv_path)

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has finished')
    print(nowtime)
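# A hypothetical driver for the seasonal walk-forward evaluation above; it is not
# part of the original scripts.  The year range and folder paths are placeholders.
def backtest_all_seasons(first_year, last_year, output_path, predict_path):
    # train on `year`, evaluate on the corresponding two-month "season" of `year + 1`
    for year in range(first_year, last_year + 1):
        for season in range(1, 7):  # six two-month seasons, matching the dicts in run()
            run(year, season, output_path, predict_path)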
def run(year, season, out_folder_path, out_predict_path):
    # ------------------ setting --------------------------------
    # season_start_date = {
    #     1: '-01-01',
    #     2: '-04-01',
    #     3: '-07-01',
    #     4: '-10-01'
    # }
    # season_end_date = {
    #     1: '-03-31',
    #     2: '-06-30',
    #     3: '-09-30',
    #     4: '-12-31'
    # }
    season_start_date = {
        1: '-01-01',
        2: '-03-01',
        3: '-05-01',
        4: '-07-01',
        5: '-09-01',
        6: '-11-01'
    }
    # note: some of these "day 31" endings are not real calendar dates; they are
    # only used as inclusive upper bounds for the date-range queries below
    season_end_date = {
        1: '-02-31',
        2: '-04-31',
        3: '-06-31',
        4: '-08-31',
        5: '-10-31',
        6: '-12-31'
    }

    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=30),
        '%Y-%m-%d')  # drop 30 days to avoid using future data in training
    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # starttime_train = '2012-01-01'
    # endtime_train = '2012-01-14'
    #
    # starttime_test = '2013-01-01'
    # endtime_test = '2013-01-04'

    # starttime_train1 = '%s-01-01' % year
    # endtime_train1 = '%s-06-30' % year
    # # endtime_train1 = '%s-01-04' % year
    # starttime_train2 = '%s-07-01' % year
    # endtime_train2 = '%s-12-31' % year
    # # endtime_train2 = '%s-07-04' % year
    # starttime_q1 = '%s-01-01' % (year + 1)
    # endtime_q1 = '%s-03-31' % (year + 1)
    # # endtime_q1 = '%s-01-04' % (year + 1)
    # # starttime_q2 = '%s-04-01' % (year + 1)
    # # endtime_q2 = '%s-06-30' % (year + 1)
    # # excel_h = 'resultscore_%s.xlsx' % (year)

    Y_table_name = 'STOCK_TOP_BOTTOM_Y'
    Y_days = [2, 5, 10, 20]

    # starttime_train = '%s-06-21' % year
    # endtime_train = '%s-06-21' % year
    # starttime_q1 = '%s-06-21' % year
    # endtime_q1 = '%s-06-21' % year
    # starttime_q2 = '%s-06-21' % year
    # endtime_q2 = '%s-06-21' % year
    # excel_h = 'resultscore_%s.xlsx' % (year)

    # the search space of the model parameters
    params_space = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'nthread': 50,
        'learning_rate': hp.uniform("learning_rate", 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'subsample': hp.uniform("subsample", 0.5, 0.9),
        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 0.9),
    }

    # parameters = {
    #     'silent': 1,              # 1 suppresses run-time messages (0 prints them)
    #     'nthread': 30,            # number of cpu threads, defaults to the maximum
    #     'learning_rate': 0.1,     # learning rate
    #     # min_child_weight=0.5,   # default 1: minimum sum of instance hessian per leaf; for imbalanced
    #     #                         # 0-1 classification with h around 0.01, a value of 1 means a leaf needs
    #     #                         # roughly 100 samples; smaller values overfit more easily
    #     'max_depth': 6,           # tree depth; deeper trees overfit more easily
    #     'gamma': 0,               # minimum loss reduction to split a leaf; larger is more conservative (typically 0.1-0.2)
    #     'subsample': 0.9,         # row subsample ratio of the training instances
    #     'max_delta_step': 0,      # maximum delta step allowed for each tree's weight estimate
    #     'colsample_bytree': 0.9,  # column subsample ratio when constructing each tree
    #     'reg_lambda': 1,          # L2 regularisation on weights; larger values resist overfitting
    #     # reg_alpha=0,            # L1 regularisation term
    #     # scale_pos_weight=1.3,   # >0 speeds up convergence on imbalanced classes by balancing positive/negative weights
    #     # objective='multi:softmax',  # for multi-class problems; sets the learning task and objective
    #     # num_class=10,           # number of classes, used together with multi:softmax
    #     'n_estimators': 500,      # number of trees
    #     'seed': 100,              # random seed
    #     'eval_metric': 'auc'
    # }

    # the return-rate thresholds of stocks
    # return_rate = {'rate_2': [1, 2, 3, 4, 5],
    #                'rate_5': [2, 3, 5, 7, 10],
    #                'rate_10': [3, 5, 7, 10, 15],
    #                'rate_20': [4, 7, 10, 15, 20],
    #                'rate_30': [5, 10, 15, 20, 25]
    #                }
    # ynamelist = []
    # for day in [2, 5, 10, 20, 30]:
    #     for rate in return_rate['rate_%s' % (str(day))]:
    #         ynamelist.append('Y_%sD_%sPCT' % (day, rate))

    # create logger
    logger = logging.getLogger('%d_s%d' % (year, season))
    tmp_log_path = '%s/%d_s%d.log' % (out_folder_path, year, season)
    tmp_log_file_handler = logging.FileHandler(tmp_log_path)
    tmp_fmt = logging.Formatter("%(asctime)s %(threadName)-10s %(message)s",
                                "%Y-%m-%d %H:%M:%S")
    tmp_log_file_handler.setFormatter(tmp_fmt)
    logger.addHandler(tmp_log_file_handler)
    logger.setLevel(logging.INFO)

    # logging.basicConfig(level=logging.INFO,
    #                     format='[%(asctime)s] %(message)s',
    #                     datefmt='%Y-%m-%d %H:%M:%S',
    #                     filename='%s/%d_s%d.log' % (out_folder_path, year, season),
    #                     filemode='w')
    # logger = logging.getLogger('%d_s%d' % (year, season))

    '''---------------------------- training -----------------------------------'''
    # prepare training data
    logger.info('training has been started.')

    # load the training features in two halves to limit memory usage
    tmp_dt_start = datetime.strptime(starttime_train, '%Y-%m-%d')
    tmp_dt_end = datetime.strptime(endtime_train, '%Y-%m-%d')
    tmp_dt_mid = tmp_dt_start + (tmp_dt_end - tmp_dt_start) / 2
    end1_train = datetime.strftime(tmp_dt_mid, '%Y-%m-%d')
    start2_train = datetime.strftime(tmp_dt_mid + timedelta(days=1), '%Y-%m-%d')

    train_x1 = loadData(starttime_train, end1_train)
    train_x2 = loadData(start2_train, endtime_train)
    train_x = train_x1.append(train_x2)
    del train_x1, train_x2
    gc.collect()
    # train_x = loadData(starttime_train, endtime_train)

    train_y = yload(Y_table_name, starttime_train, endtime_train, ConfigQuant)
    train_y.drop('time_stamp', axis=1, inplace=True)

    # drop features that are mostly nan
    drop_nan_ratio = 0.7
    tmp_nan_ratio = train_x.isnull().sum(axis=0) / train_x.shape[0]
    tmp_drop_cols = tmp_nan_ratio[tmp_nan_ratio > drop_nan_ratio].index.tolist()
    if len(tmp_drop_cols) > 0:
        print('drop nan columns:', tmp_drop_cols)
        train_x = train_x.drop(tmp_drop_cols, axis=1)

    xnamelist = train_x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')

    # check if there are object-type columns in the data
    tmp_dtypes = train_x[xnamelist].dtypes
    tmp_dtypes = tmp_dtypes[tmp_dtypes == 'O']
    if tmp_dtypes.size > 0:
        print(tmp_dtypes)

    # merge X, y
    train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
    del train_x, train_y
    gc.collect()

    # save training feature names
    out_path = '%s/xnamelist.pcl' % out_folder_path
    with open(out_path, 'wb') as out_file:
        pickle.dump(xnamelist, out_file)

    # preprocess training data
    try:
        train_data.drop_duplicates(['code', 'date'], inplace=True)
        train_data = train_data.sort_values('date', ascending=True)
    except Exception:
        logger.error('train_data error')
    train_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    # train the models
    # for day in [2, 5, 10, 20, 30]:
    for day in Y_days:
        model_training(day, train_data, xnamelist, params_space, logger, out_folder_path)

    # delete all the variables
    del day, params_space, train_data
    gc.collect()
    logger.info('training has finished')

    '''---------------------------- Testing -----------------------------------'''
    # S1
    # nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('testing_q1 has been started')
    # logger.info(nowtime)

    # load training feature names
    feature_list_path = '%s/xnamelist.pcl' % out_folder_path
    with open(feature_list_path, 'rb') as in_file:
        xnamelist = pickle.load(in_file)

    # load the test data
    # test_x = loadData(starttime_q1, endtime_q1)
    # test_y = yload(Y_table_name, starttime_q1, endtime_q1)
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(Y_table_name, starttime_test, endtime_test, ConfigQuant)
    test_y.drop('time_stamp', axis=1, inplace=True)

    test_data = pd.merge(test_x, test_y, on=['date', 'code'], how='left')
    del test_x, test_y
    gc.collect()

    # preprocess testing data
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
    except Exception:
        logger.error('test_data error')

    # stock_index_q1 = test_data[['date', 'code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    # dataframe to save the results
    resultscoredf_h = pd.DataFrame()
    for day in Y_days:
        result = model_testing_new(day, test_data, xnamelist, season, year, logger, out_folder_path)
        result = result.rename(
            {
                'precision_1': '%dS%d_%dD_precision_1' % (year, season, day),
                'recall_1': '%dS%d_%dD_recall_1' % (year, season, day)
            },
            axis=0)
        # y_score = pd.DataFrame(y_score)
        # y_score.columns = ["y_1_%sD_%sPCT" % (day, rate)]
        # stock_index_q1 = pd.concat([stock_index_q1, y_score], axis=1)
        resultscoredf_h = resultscoredf_h.append(result)

    # nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('testing s%d has finished' % season)
    # print(nowtime)

    # record statistics
    result_csv_path = '%s/precision_recall.csv' % out_predict_path
    tmp_columns = list(range(50, 100, 5))
    tmp_columns = [round(x * 0.01, 2) for x in tmp_columns]
    tmp_columns.append('year')
    tmp_columns.append('season')
    resultscoredf_h = resultscoredf_h[tmp_columns]
    if os.path.exists(result_csv_path):
        resultscoredf_h.to_csv(result_csv_path, header=False, mode='a')
    else:
        resultscoredf_h.to_csv(result_csv_path)

    '_________________________________ Record Prediction __________________________________'
    # nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('backtest data generating.')
    # print(nowtime)

    test_data = loadData(starttime_test, endtime_test)
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
        test_data.reindex()
    except Exception:
        logger.error('test_data error')

    stock_index = test_data[['date', 'code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)
    test_data = test_data[xnamelist]  # make the columns consistent with training
    test_data = xgb.DMatrix(test_data, feature_names=xnamelist)

    for day in Y_days:
        xgb1 = xgb.Booster()
        xgb1.load_model('%s/train_model_%dD.m' % (out_folder_path, day))
        y_score = xgb1.predict(test_data)
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        logger.info('day = %sD' % day)

    stock_index.to_csv("%s/stockscore_%ds%d.csv" % (out_predict_path, year, season),
                       index=False, sep=',')

    # nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('backtest data has been generated.')
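# yload() (like loadData()) is a data-access helper defined elsewhere in the
# project.  The sketch below shows one plausible implementation of the 4-argument
# form used above, reading the label table from MySQL; note that the training
# script further down calls a 3-argument variant.  The `date` column name and the
# connection-string template are assumptions borrowed from the prediction scripts
# above, not the original helper.
def yload(table_name, start_date, end_date, db_config):
    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**db_config))
    sql_statement = "select * from %s where `date` between '%s' and '%s'" % (
        table_name, start_date, end_date)
    with engine.connect() as conn:
        return pd.read_sql(sql_statement, conn)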
def run(starttime_train, endtime_train, output_path, predict_path):
    y_days = [2, 5, 10, 20]

    # change the model names as needed
    global train_x, train_y, val_x, val_y, train_data, val_data

    # ============== objective function =============
    def objective(args):
        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'max_depth': args['max_depth'] + 6,
            'learning_rate': args['learning_rate'],
            'feature_fraction': args['feature_fraction'],
            'bagging_fraction': args['bagging_fraction'],
            'num_leaves': np.math.floor(2 ** (args['max_depth'] + 6) * 0.7)
        }
        clf = lgb.train(params, train_data, num_boost_round=1000000,
                        valid_sets=[train_data, val_data], valid_names=['train', 'val'],
                        early_stopping_rounds=15, verbose_eval=1000)

        y_score = clf.predict(val_x)
        fpr, tpr, threshods = roc_curve(val_y, y_score, pos_label=1)
        aucscore = auc(fpr, tpr)
        return -aucscore
    # ==========================================

    # ============= optimization parameter space ===============
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    }
    # ==========================================

    x = loadData(starttime_train, endtime_train)
    y = yload(starttime_train, endtime_train, ConfigQuant)

    # drop features with a large proportion of nan
    drop_nan_ratio = 0.7
    tmp_nan_ratio = x.isnull().sum(axis=0) / x.shape[0]
    tmp_drop_cols = tmp_nan_ratio[tmp_nan_ratio > drop_nan_ratio].index.tolist()
    if len(tmp_drop_cols) > 0:
        print('drop nan columns:', tmp_drop_cols)
        x = x.drop(tmp_drop_cols, axis=1)

    xnamelist = x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')
    with open('%s/xnamelist.pcl' % output_path, 'wb') as tmp_fo:  # save feature names to a backup location
        pickle.dump(xnamelist, tmp_fo)
    with open('%s/xnamelist.pcl' % predict_path, 'wb') as tmp_fo:  # save feature names
        pickle.dump(xnamelist, tmp_fo)

    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(y_name)
        print('training has started.')
        print(nowtime)

        tmp_y = y[['code', 'date', y_name]]
        tmp_data = pd.merge(x, tmp_y, on=['date', 'code'], how='inner')
        tmp_data.dropna(subset=[y_name], inplace=True)
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        train_x, val_x, train_y, val_y = train_test_split(tmp_x, tmp_y, test_size=0.1, random_state=68)

        # dump the split datasets for inspection / reuse
        train_x.to_csv('train_features_%dD.csv' % tmp_day, index=False, encoding='gbk')
        train_y.to_csv('train_labels_%dD.csv' % tmp_day, index=False)
        val_x.to_csv('val_features_%dD.csv' % tmp_day, index=False, encoding='gbk')
        val_y.to_csv('val_labels_%dD.csv' % tmp_day, index=False)
        tmp_x.to_csv('whole_features_%dD.csv' % tmp_day, index=False, encoding='gbk')
        tmp_y.to_csv('whole_labels_%dD.csv' % tmp_day, index=False)

        del tmp_data, tmp_x, tmp_y
        gc.collect()

        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

        # max_evals can be increased to try more hyper-parameter settings
        best_sln = fmin(objective, space=params_space, algo=tpe.suggest, max_evals=10)

        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'learning_rate': best_sln['learning_rate'],
            'feature_fraction': best_sln['feature_fraction'],
            'max_depth': best_sln['max_depth'] + 6,
            'bagging_fraction': best_sln['bagging_fraction'],
            'num_leaves': np.math.floor(2 ** (best_sln['max_depth'] + 6) * 0.7),
        }

        clf = lgb.train(params, train_data, num_boost_round=3000,
                        valid_sets=[train_data, val_data], valid_names=['train', 'val'],
                        early_stopping_rounds=15, verbose_eval=1000)

        joblib.dump(clf, '%s/model_%s_%s.m' % (output_path, endtime_train, y_name))  # save model file to a backup location
        joblib.dump(clf, '%s/model_%s.m' % (predict_path, y_name))  # save model file
        importance = pd.DataFrame({'feature': clf.feature_name(),
                                   'importance': clf.feature_importance('gain')})
        importance.to_excel('%s/feature_importance_%s_%s.xlsx' % (output_path, endtime_train, y_name))

        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('training has finished')
        print(nowtime)

        del train_x, train_y, val_x, val_y
        gc.collect()

    del x, y
    gc.collect()

    '_____________________________________________________________________________'
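# A hypothetical scheduling wrapper for the training run() above; it is not part of
# the original scripts.  The 365-day window and the 30-day gap before "now" (so the
# longest 20-day-ahead labels are fully observable, mirroring the 30-day buffer used
# in the seasonal backtest) are assumptions.
from datetime import datetime, timedelta

def retrain_latest(output_path, predict_path, train_days=365):
    end = datetime.now() - timedelta(days=30)
    start = end - timedelta(days=train_days)
    run(start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d'), output_path, predict_path)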