# NOTE: loadData() and yload() are project-local data loaders; pandas (pd),
# numpy (np), xgboost (xgb), lightgbm (lgb), logging, pickle, gc, joblib,
# datetime, hyperopt (hp, fmin, tpe) and the sklearn metrics used below are
# imported in the individual scripts these functions come from.
def randomSelectPassSample(year, y_table_name):
    """Randomly sample a fraction of each past year's training data (2012 .. year-2)."""
    past_year = list(range(2012, year - 1))
    sample_ratio = 0.2
    train_data = pd.DataFrame([])
    for tmp_yr in past_year:
        starttime = '%d-01-01' % tmp_yr
        endtime = '%d-01-01' % (tmp_yr + 1)
        train_x = loadData(starttime, endtime)
        train_y = yload(y_table_name, starttime, endtime)
        train_y.drop('time_stamp', axis=1, inplace=True)
        xnamelist = train_x.columns.tolist()  # feature names (without code & date)
        xnamelist.remove('code')
        xnamelist.remove('date')
        sub_train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
        # preprocessing training data
        sub_train_data.drop_duplicates(['code', 'date'], inplace=True)
        # drop code & date
        sub_train_data.drop(['date', 'code'], axis=1, inplace=True)
        # random sample
        train_data = train_data.append(sub_train_data.sample(frac=sample_ratio))
    return train_data
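# Example (illustrative values, not from the original source): sample 20% of
# each year 2012..2016 as extra history for a 2018 training run.
#   his_train_data = randomSelectPassSample(2018, 'STOCK_TOP_BOTTOM_Y')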
def run(yesterday, out_folder_path):
    Y_days = [2, 5, 10, 20]
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename='%s/predict_live.log' % out_folder_path,
                        filemode='w')
    logger = logging.getLogger('predict_live')

    # load the training feature names (written by the training run below);
    # xnamelist is needed to reorder the live features.
    feature_list_path = '%s/xnamelist.pcl' % out_folder_path
    with open(feature_list_path, 'rb') as in_file:
        xnamelist = pickle.load(in_file)

    logger.info('backtest data generating.')

    test_data = loadData(yesterday, yesterday)
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
        # reset the index so the later pd.concat with the score frame aligns
        test_data.reset_index(drop=True, inplace=True)
    except Exception:
        logger.error('test_data error')

    stock_index = test_data[['date', 'code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)
    test_data = test_data[xnamelist]  # rearrange columns to match the training order
    logger.info('features:')
    logger.info(' , '.join(test_data.columns.tolist()))
    test_data = xgb.DMatrix(test_data, feature_names=xnamelist)

    for day in Y_days:
        xgb1 = xgb.Booster()
        xgb1.load_model('%s/train_model_%dD.m' % (out_folder_path, day))
        y_score = xgb1.predict(test_data)
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        logger.info('day = %sD' % day)

    stock_index.to_csv('%s/stockscore_live.csv' % out_folder_path,
                       index=False, sep=',')
    logger.info('backtest data has been generated.')
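# Example invocation (illustrative; the date format follows the loadData()
# calls above, and the folder must contain xnamelist.pcl plus the trained
# 'train_model_%dD.m' files):
#   run('2019-06-30', '/path/to/live_out')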
def run(starttime_train, endtime_train, out_folder_path):
    # ------------------ settings --------------------------------
    Y_table_name = 'STOCK_TOP_BOTTOM_Y'
    Y_days = [2, 5, 10, 20]
    parameters = {
        'silent': 1,              # 1 = print no messages while running; set to 0 to see them
        'nthread': 30,            # number of CPU threads (defaults to the maximum)
        'learning_rate': 0.1,     # shrinkage / learning rate
        # 'min_child_weight': 0.5,  # minimum sum of instance hessians per leaf (default 1);
        #                           # for unbalanced 0-1 classification with h around 0.01,
        #                           # a value of 1 means a leaf needs roughly 100 samples.
        #                           # Strongly affects results; smaller values overfit more easily.
        'max_depth': 6,           # tree depth; larger values overfit more easily
        'gamma': 0,               # minimum loss reduction to split a leaf further;
                                  # larger = more conservative (typically 0.1-0.2)
        'subsample': 0.9,         # row subsampling ratio of the training instances
        'max_delta_step': 0,      # maximum delta step allowed for each tree's weight estimate
        'colsample_bytree': 0.9,  # column subsampling ratio when constructing each tree
        'reg_lambda': 1,          # L2 regularization on weights; larger = less overfitting
        # 'reg_alpha': 0,         # L1 regularization term
        # 'scale_pos_weight': 1.3,  # >0 speeds convergence on unbalanced classes
        # 'objective': 'multi:softmax',  # for multi-class tasks
        # 'num_class': 10,        # number of classes (used with multi:softmax)
        'n_estimators': 500,      # number of trees (note: ignored by xgb.train,
                                  # which uses num_boost_round; kept for reference)
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'seed': 100,              # random seed
        'eval_metric': 'auc'
    }

    # create logger
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename='%s/train_live.log' % out_folder_path,
                        filemode='w')
    logger = logging.getLogger('train_live')

    '''---------------------------- training -----------------------------------'''
    # prepare training data
    logger.info('training has been started.')

    train_x = loadData(starttime_train, endtime_train)
    train_y = yload(Y_table_name, starttime_train, endtime_train)
    train_y.drop('time_stamp', axis=1, inplace=True)
    xnamelist = train_x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')
    logger.info('features:')
    logger.info(' , '.join(xnamelist))

    train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
    del train_x, train_y
    gc.collect()

    # preprocessing training data
    try:
        train_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in train_data.columns.tolist():
            train_data = train_data.drop(['index'], axis=1)
    except Exception:
        logger.error('train_data error')
    train_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    # train one model per prediction horizon
    for day in Y_days:
        model_training(day, train_data, xnamelist, parameters, logger,
                       out_folder_path)

    # delete all the variables
    del day, parameters, train_data
    gc.collect()

    # write the feature list for the live-prediction run
    out_path = '%s/xnamelist.pcl' % out_folder_path
    with open(out_path, 'wb') as out_file:
        pickle.dump(xnamelist, out_file)

    logger.info('training has finished')
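# model_training() is referenced above but not included in this excerpt. Below
# is a minimal sketch of what it plausibly does, inferred from how the models
# are consumed elsewhere (boosters saved as 'train_model_%dD.m', labels named
# 'Y_%dD'); treat the body as an assumption, not the original implementation.
def model_training_sketch(day, train_data, xnamelist, parameters, logger,
                          out_folder_path):
    y_name = 'Y_%dD' % day                     # assumed label column naming
    data = train_data.dropna(subset=[y_name])  # keep rows that have a label
    dtrain = xgb.DMatrix(data[xnamelist], label=data[y_name],
                         feature_names=xnamelist)
    booster = xgb.train(parameters, dtrain, num_boost_round=500)
    booster.save_model('%s/train_model_%dD.m' % (out_folder_path, day))
    logger.info('model for %dD trained and saved' % day)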
def run(year, season, output_path, predict_path):
    y_days = [2, 5, 10, 20]
    # ------------------ settings --------------------------------
    season_start_date = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
    season_end_date = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}
    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=30),
        '%Y-%m-%d')  # minus 30 days to avoid usage of future data
    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # define the training and testing functions; rename the model files as needed
    global train_x, train_y, val_x, val_y, train_data, val_data

    # ============== objective function =============
    def objective(args):
        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'max_depth': args['max_depth'] + 6,
            'learning_rate': args['learning_rate'],
            'feature_fraction': args['feature_fraction'],
            'bagging_fraction': args['bagging_fraction'],
            'num_leaves': np.math.floor(2**(args['max_depth'] + 6) * 0.7)
        }
        clf = lgb.train(params,
                        train_data,
                        num_boost_round=1000000,
                        valid_sets=[train_data, val_data],
                        valid_names=['train', 'val'],
                        early_stopping_rounds=20,
                        verbose_eval=1000)
        y_score = clf.predict(val_x)
        fpr, tpr, thresholds = roc_curve(val_y, y_score, pos_label=1)
        aucscore = auc(fpr, tpr)
        return -aucscore
    # ==========================================

    # ============= optimization parameter space ===============
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    }
    # ==========================================

    x = loadData(starttime_train, endtime_train)
    y = yload(starttime_train, endtime_train)
    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(y_name)
        print('training has started.')
        print(nowtime)
        tmp_y = y[['code', 'date', y_name]]
        tmp_data = pd.merge(x, tmp_y, on=['date', 'code'], how='inner')
        tmp_data.dropna(subset=[y_name], inplace=True)
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        train_x, val_x, train_y, val_y = train_test_split(tmp_x,
                                                          tmp_y,
                                                          test_size=0.1,
                                                          random_state=68)
        del tmp_data, tmp_x, tmp_y
        gc.collect()
        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

        # increase max_evals for a more thorough search
        best_sln = fmin(objective,
                        space=params_space,
                        algo=tpe.suggest,
                        max_evals=15)
        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'learning_rate': best_sln['learning_rate'],
            'feature_fraction': best_sln['feature_fraction'],
            'max_depth': best_sln['max_depth'] + 6,
            'bagging_fraction': best_sln['bagging_fraction'],
            'num_leaves': np.math.floor(2**(best_sln['max_depth'] + 6) * 0.7),
        }
        clf = lgb.train(params,
                        train_data,
                        num_boost_round=10000000,
                        valid_sets=[train_data, val_data],
                        valid_names=['train', 'val'],
                        early_stopping_rounds=20,
                        verbose_eval=1000)
        joblib.dump(
            clf,
            '%s/model_%s_%s_%s.m' % (output_path, year + 1, season, y_name))
        importance = pd.DataFrame({
            'feature': clf.feature_name(),
            'importance': clf.feature_importance('gain')
        })
        importance.to_excel('%s/feature_importance_%s_%s_%s.xlsx' %
                            (output_path, year + 1, season, y_name))
        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('training has finished')
        print(nowtime)
        del train_x, train_y, val_x, val_y
        gc.collect()

    del x, y
    gc.collect()

    '_____________________________________________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has been started')
    print(nowtime)
    # generate test results for the next season
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(starttime_test, endtime_test)
    all_y_scores = test_x[['date', 'code']].copy()
    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        tmp_y = test_y[['code', 'date', y_name]]
        tmp_data = pd.merge(test_x, tmp_y, on=['date', 'code'], how='inner')
        values = {y_name: int(0)}
        tmp_data.fillna(value=values, inplace=True)
        tmp_data.reset_index(drop=True, inplace=True)
        stock_index = tmp_data[['date', 'code']]
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        clf = joblib.load('%s/model_%s_%s_%s.m' %
                          (output_path, year + 1, season, y_name))
        y_score = clf.predict(tmp_x)
        y_boundle = pd.DataFrame({'proba': y_score, 'real': tmp_y})
        y_boundle.sort_values(by='proba', ascending=False, inplace=True)
        y_boundle.reset_index(drop=True, inplace=True)
        # label the top 1% of scores as positive predictions
        tmp_list = np.repeat(np.nan, len(y_boundle))
        tmp_list[:int(np.floor(len(y_boundle) / 100))] = 1
        tmp_list[int(np.floor(len(y_boundle) / 100)):] = 0
        y_boundle['predict'] = tmp_list
        accuracyscore = accuracy_score(y_boundle['real'], y_boundle['predict'])
        # score the sorted frame so labels and scores stay aligned
        fpr, tpr, thresholds = roc_curve(y_boundle['real'], y_boundle['proba'],
                                         pos_label=1)
        ks = np.max(np.abs(tpr - fpr))
        aucscore = auc(fpr, tpr)
        precision = precision_score(y_boundle['real'], y_boundle['predict'],
                                    average='binary')
        recall = recall_score(y_boundle['real'], y_boundle['predict'],
                              average='weighted')
        print('___________________________________________________________________')
        print('%s_%s_%s' % (year, season, y_name))
        print('precision:', precision)
        print('recall:', recall)
        print('auc:', aucscore)
        print('accuracyscore:', accuracyscore)
        print('K-S:', ks)
        print(classification_report(y_boundle['real'], y_boundle['predict']))
        print(confusion_matrix(y_boundle['real'], y_boundle['predict']))
        print('___________________________________________________________________')
        # generate backtest data
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % tmp_day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        all_y_scores = all_y_scores.merge(stock_index,
                                          on=['date', 'code'],
                                          how='left')

    all_y_scores.to_csv('%s/stockscore_%ds%d.csv' % (predict_path, year, season))
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has finished')
    print(nowtime)
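# Example invocation (illustrative paths): train on 2017-Q1 .. 2018-Q1 (minus
# 30 days) and score 2018-Q1 for all four horizons.
#   run(2017, 1, '/path/to/models', '/path/to/predictions')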
                     num_boost_round=100000,
                     evals=[(train_data, 'train'), (val_data, 'val')],
                     verbose_eval=5,
                     early_stopping_rounds=20)
    y_score = xgb1.predict(val_data)
    # y_predict = np.int64(y_score > 0.5)
    fpr, tpr, thresholds = roc_curve(val_data.get_label(), y_score, pos_label=1)
    aucscore = auc(fpr, tpr)
    print(aucscore)
    return -aucscore

# define the training and testing functions
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print('training has been started.')
print(nowtime)

# training function; rename the model file as needed
x = loadData(starttime_train, endtime_train)
y = yload(starttime_train, endtime_train)
y = y[['code', 'date', yname]]
tmp_data = pd.merge(x, y, on=['date', 'code'], how='inner')
tmp_data.dropna(subset=[yname], inplace=True)

# balance the classes by downsampling the larger one (the sampled frame must
# be assigned back, otherwise .sample() has no effect)
tmp_data_1 = tmp_data[tmp_data[yname] == 1]
tmp_data_0 = tmp_data[tmp_data[yname] == 0]
if len(tmp_data_1) > len(tmp_data_0):
    tmp_data_1 = tmp_data_1.sample(n=len(tmp_data_0), replace=False,
                                   random_state=68)
else:
    tmp_data_0 = tmp_data_0.sample(n=len(tmp_data_1), replace=False,
                                   random_state=68)
tmp_data = tmp_data_0.append(tmp_data_1)

y = tmp_data[yname]
x = tmp_data.drop(['code', 'date', yname], axis=1)
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.1,
                                                  random_state=68)
del tmp_data, x, y
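# The head of the objective() above and its hyperopt driver are cut off in
# this excerpt. A hypothetical reconstruction of the missing fmin() call,
# modeled on the LightGBM version elsewhere in this file set (names and
# ranges are assumptions):
#   params_space = {'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
#                   'max_depth': hp.randint('max_depth', 10)}
#   best_sln = fmin(objective, space=params_space, algo=tpe.suggest,
#                   max_evals=20)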
#!/usr/bin/env python3
    report = 'day = %sD, rate = %sPCT' % (str(day), str(rate))
    print(report)
    print('-------------------------------------------------------')
    # save the evaluation result
    resultscore = [precision, recall, aucscore, accuracyscore, ks,
                   str(classification_report(test_y, y_predict)),
                   str(confusion_matrix(test_y, y_predict)),
                   '%sD_%sPCT' % (str(day), str(rate)), stage]
    columnname = ['precision', 'recall', 'auc', 'accuracyscore', 'K-S',
                  'classification_report', 'confusion_matrix', 'modeltype',
                  'quantile']
    result = pd.DataFrame(np.array(resultscore).reshape(1, 9),
                          columns=columnname)
    return result, y_score

'''----------------------------split line-----------------------------------'''
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print('training has been started.')
print(nowtime)

# load the train data (two half-year parts)
train_x1 = loadData(starttime_train1, endtime_train1)
train_y1 = yload(Y_table_name, starttime_train1, endtime_train1)
train_x2 = loadData(starttime_train2, endtime_train2)
train_y2 = yload(Y_table_name, starttime_train2, endtime_train2)
train_x = train_x1.append(train_x2)
train_y = train_y1.append(train_y2)
# train_x = loadData(starttime_train, endtime_train)
# train_y = yload(starttime_train, endtime_train)
train_y.drop('time_stamp', axis=1, inplace=True)
xnamelist = train_x.columns.tolist()  # feature names (without code & date)
xnamelist.remove('code')
xnamelist.remove('date')
train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
del train_x, train_y, train_x1, train_x2, train_y1, train_y2
gc.collect()
def run(year, season, out_folder_path, out_predict_path):
    # ------------------ settings --------------------------------
    season_start_date = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
    season_end_date = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}
    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=1),
        '%Y-%m-%d')
    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    Y_table_name = 'STOCK_TOP_BOTTOM_Y'
    Y_days = [2, 5, 10, 20]

    # the parameters of the model
    parameters = {
        'silent': 1,              # 1 = print no messages while running; set to 0 to see them
        'nthread': 30,            # number of CPU threads (defaults to the maximum)
        'learning_rate': 0.1,     # shrinkage / learning rate
        # 'min_child_weight': 0.5,  # minimum sum of instance hessians per leaf (default 1);
        #                           # for unbalanced 0-1 classification with h around 0.01,
        #                           # a value of 1 means a leaf needs roughly 100 samples.
        #                           # Strongly affects results; smaller values overfit more easily.
        'max_depth': 6,           # tree depth; larger values overfit more easily
        'gamma': 0,               # minimum loss reduction to split a leaf further;
                                  # larger = more conservative (typically 0.1-0.2)
        'subsample': 0.9,         # row subsampling ratio of the training instances
        'max_delta_step': 0,      # maximum delta step allowed for each tree's weight estimate
        'colsample_bytree': 0.9,  # column subsampling ratio when constructing each tree
        'reg_lambda': 1,          # L2 regularization on weights; larger = less overfitting
        # 'reg_alpha': 0,         # L1 regularization term
        # 'scale_pos_weight': 1.3,  # >0 speeds convergence on unbalanced classes
        # 'objective': 'multi:softmax',  # for multi-class tasks
        # 'num_class': 10,        # number of classes (used with multi:softmax)
        'n_estimators': 500,      # number of trees (note: ignored by xgb.train,
                                  # which uses num_boost_round; kept for reference)
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'seed': 100,              # random seed
        'eval_metric': 'auc'
    }

    # create logger
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename='%s/%d_s%d.log' % (out_folder_path, year, season),
                        filemode='w')
    logger = logging.getLogger('%d_s%d' % (year, season))

    '''---------------------------- training -----------------------------------'''
    # prepare training data
    logger.info('training has been started.')

    his_train_data = randomSelectPassSample(year, Y_table_name)
    train_x = loadData(starttime_train, endtime_train)
    train_y = yload(Y_table_name, starttime_train, endtime_train)
    train_y.drop('time_stamp', axis=1, inplace=True)
    xnamelist = train_x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')
    train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
    del train_x, train_y
    gc.collect()

    # preprocessing training data
    try:
        train_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in train_data.columns.tolist():
            train_data = train_data.drop(['index'], axis=1)
    except Exception:
        logger.error('train_data error')
    train_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    # combine the randomly sampled historical data with the current training data
    train_data = train_data.append(his_train_data)
    del his_train_data
    gc.collect()

    # train one model per prediction horizon
    for day in Y_days:
        model_training(day, train_data, xnamelist, parameters, logger,
                       out_folder_path)

    # delete all the variables
    del day, parameters, train_data
    gc.collect()
    logger.info('training has finished')

    '''---------------------------- testing -----------------------------------'''
    logger.info('testing has been started')

    # load the test data
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(Y_table_name, starttime_test, endtime_test)
    test_y.drop('time_stamp', axis=1, inplace=True)
    test_data = pd.merge(test_x, test_y, on=['date', 'code'], how='left')
    del test_x, test_y
    gc.collect()

    # preprocessing testing data
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
    except Exception:
        logger.error('test_data error')
    test_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    # dataframe to save the evaluation results
    resultscoredf_h = pd.DataFrame()
    for day in Y_days:
        result = model_testing(day, test_data, xnamelist, season, logger,
                               out_folder_path)
        resultscoredf_h = resultscoredf_h.append(result)
    # resultscoredf_h.to_excel(...)  # write out the metrics if needed
    logger.info('testing s%d has finished' % season)

    '''------------------------- record prediction -----------------------------'''
    logger.info('backtest data generating.')
    test_data = loadData(starttime_test, endtime_test)
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
        # reset the index so the later pd.concat with the score frame aligns
        test_data.reset_index(drop=True, inplace=True)
    except Exception:
        logger.error('test_data error')

    stock_index = test_data[['date', 'code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)
    test_data = xgb.DMatrix(test_data, feature_names=xnamelist)
    for day in Y_days:
        xgb1 = xgb.Booster()
        xgb1.load_model('%s/train_model_%dD.m' % (out_folder_path, day))
        y_score = xgb1.predict(test_data)
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        logger.info('day = %sD' % day)

    stock_index.to_csv('%s/stockscore_%ds%d.csv' %
                       (out_predict_path, year, season),
                       index=False, sep=',')
    logger.info('backtest data has been generated.')
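# model_testing() is referenced above but not included in this excerpt. A
# hypothetical sketch consistent with the metric table assembled in the
# evaluation fragment earlier (precision / recall / auc / accuracyscore / K-S);
# the label naming, 0.5 cutoff, and return shape are assumptions.
def model_testing_sketch(day, test_data, xnamelist, season, logger,
                         out_folder_path):
    y_name = 'Y_%dD' % day
    data = test_data.dropna(subset=[y_name])
    dtest = xgb.DMatrix(data[xnamelist], feature_names=xnamelist)
    booster = xgb.Booster()
    booster.load_model('%s/train_model_%dD.m' % (out_folder_path, day))
    y_score = booster.predict(dtest)
    y_predict = np.int64(y_score > 0.5)  # assumed decision threshold
    fpr, tpr, _ = roc_curve(data[y_name], y_score, pos_label=1)
    result = pd.DataFrame({
        'precision': [precision_score(data[y_name], y_predict)],
        'recall': [recall_score(data[y_name], y_predict)],
        'auc': [auc(fpr, tpr)],
        'accuracyscore': [accuracy_score(data[y_name], y_predict)],
        'K-S': [np.max(np.abs(tpr - fpr))],
        'modeltype': ['%dD' % day],
        'season': [season],
    })
    logger.info('day = %dD tested' % day)
    return result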
def run(year, season, yname='Y_20D'):
    # ------------------ settings --------------------------------
    season_start_date = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
    season_end_date = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}
    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=1),
        '%Y-%m-%d')
    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # define the training and testing functions
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('training has started.')
    print(nowtime)

    # training function; rename the model file as needed
    global train_x, train_y, val_x, val_y, train_data, val_data
    x = loadData(starttime_train, endtime_train)
    y = yload(starttime_train, endtime_train)
    y = y[['code', 'date', yname]]
    tmp_data = pd.merge(x, y, on=['date', 'code'], how='inner')
    tmp_data.dropna(subset=[yname], inplace=True)
    y = tmp_data[yname]
    x = tmp_data.drop(['code', 'date', yname], axis=1)
    train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.1,
                                                      random_state=68)
    del tmp_data, x, y
    gc.collect()
    train_data = lgb.Dataset(train_x, label=train_y)
    val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

    def objective(args):
        params = {
            'task': 'train',
            'num_threads': 15,
            'objective': 'binary',
            'boosting': 'dart',
            'verbose': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 300,
            'metric': 'auc',
            'max_depth': args['max_depth'] + 6,
            'learning_rate': args['learning_rate'],
            'feature_fraction': args['feature_fraction'],
            'bagging_fraction': args['bagging_fraction'],
            'num_leaves': np.math.floor(2**(args['max_depth'] + 6) / 2)
        }
        clf = lgb.train(params,
                        train_data,
                        num_boost_round=1000000,
                        valid_sets=[train_data, val_data],
                        valid_names=['train', 'val'],
                        early_stopping_rounds=20,
                        verbose_eval=100)
        y_score = clf.predict(val_x)
        fpr, tpr, thresholds = roc_curve(val_y, y_score, pos_label=1)
        aucscore = auc(fpr, tpr)
        return -aucscore

    params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    }

    # increase max_evals for a more thorough search
    best_sln = fmin(objective, space=params_space, algo=tpe.suggest,
                    max_evals=20)

    params = {
        'task': 'train',
        'num_threads': 15,
        'objective': 'binary',
        'boosting': 'dart',
        'verbose': 0,
        'tree_learner': 'data',
        'seed': 66,
        'min_data_in_leaf': 300,
        'metric': 'auc',
        'learning_rate': best_sln['learning_rate'],
        'feature_fraction': best_sln['feature_fraction'],
        'max_depth': best_sln['max_depth'] + 6,
        'bagging_fraction': best_sln['bagging_fraction'],
        'num_leaves': np.math.floor(2**(best_sln['max_depth'] + 6) / 2),
    }
    clf = lgb.train(params,
                    train_data,
                    num_boost_round=1000000,
                    valid_sets=[train_data, val_data],
                    valid_names=['train', 'val'],
                    early_stopping_rounds=20,
                    verbose_eval=100)
    joblib.dump(clf, 'model_%s_%s.m' % (year + 1, season))
    importance = pd.DataFrame({
        'feature': clf.feature_name(),
        'importance': clf.feature_importance('gain')
    })
    importance.to_excel('feature_importance_%s_%s.xlsx' % (year + 1, season))
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('training has finished')
    print(nowtime)

    '_____________________________________________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has been started')
    print(nowtime)
    # generate test results for the next season
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(starttime_test, endtime_test)
    test_y = test_y[['code', 'date', yname]]
    tmp_data = pd.merge(test_x, test_y, on=['date', 'code'], how='inner')
    values = {yname: int(0)}
    tmp_data.fillna(value=values, inplace=True)
    tmp_data.reset_index(drop=True, inplace=True)
    stock_index = tmp_data[['date', 'code']]
    test_y = tmp_data[yname]
    test_x = tmp_data.drop(['code', 'date', yname], axis=1)
    clf = joblib.load('model_%s_%s.m' % (year + 1, season))
    y_score = clf.predict(test_x)
    y_predict = np.int64(y_score > 0.9)
    accuracyscore = accuracy_score(test_y, y_predict)
    fpr, tpr, thresholds = roc_curve(test_y, y_score, pos_label=1)
    ks = np.max(np.abs(tpr - fpr))
    aucscore = auc(fpr, tpr)
    precision = precision_score(test_y, y_predict, average='binary')
    recall = recall_score(test_y, y_predict, average='weighted')
    print('precision:', precision)
    print('recall:', recall)
    print('auc:', aucscore)
    print('accuracyscore:', accuracyscore)
    print('K-S:', ks)
    print(classification_report(test_y, y_predict))
    print(confusion_matrix(test_y, y_predict))
    # generate backtest data
    y_score = pd.DataFrame(y_score, columns=['proba_1'])
    stock_index = pd.concat([stock_index, y_score], axis=1)
    stock_index.to_csv('backtest_data_%s_%s.csv' % (year + 1, season))
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has finished')
    print(nowtime)
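# Example invocation (illustrative): train on year-2017 season-1 data for the
# 20-day label, then backtest on 2018 season 1.
#   run(2017, 1, yname='Y_20D')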