Example #1
def randomSelectPassSample(year, y_table_name):
    past_year = list(range(2012, year - 1))
    sample_ratio = 0.2

    train_data = pd.DataFrame([])
    for tmp_yr in past_year:
        starttime = '%d-01-01' % tmp_yr
        endtime = '%d-01-01' % (tmp_yr + 1)

        train_x = loadData(starttime, endtime)
        train_y = yload(y_table_name, starttime, endtime)
        train_y.drop('time_stamp', axis=1, inplace=True)

        xnamelist = train_x.columns.tolist()  # feature names (without code & date)
        xnamelist.remove('code')
        xnamelist.remove('date')
        sub_train_data = pd.merge(train_x,
                                  train_y,
                                  on=['date', 'code'],
                                  how='left')

        # preprocessing training data
        sub_train_data.drop_duplicates(['code', 'date'], inplace=True)

        # drop code & date
        sub_train_data.drop(['date', 'code'], axis=1, inplace=True)

        # random sample (DataFrame.append was removed in pandas 2.0)
        train_data = pd.concat(
            [train_data, sub_train_data.sample(frac=sample_ratio)])

    return train_data
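
A minimal usage sketch (the year and Y-table name are illustrative; it assumes the repo's loadData and yload helpers are importable):

history = randomSelectPassSample(2018, 'STOCK_TOP_BOTTOM_Y')
print(history.shape)  # roughly 20% of each past year's merged rows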
Example #2
def run(yesterday, out_folder_path):
    Y_days = [2, 5, 10, 20]

    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename='%s/predict_live.log' % out_folder_path,
                        filemode='w')

    logger = logging.getLogger('predict_live')

    # load the training feature list saved by the training run
    feature_list_path = '%s/xnamelist.pcl' % out_folder_path
    with open(feature_list_path, 'rb') as in_file:
        xnamelist = pickle.load(in_file)

    logger.info('generating backtest data.')

    test_data = loadData(yesterday, yesterday)
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
        test_data.reset_index(drop=True, inplace=True)
    except Exception:
        logger.error('test_data error')

    stock_index = test_data[['date', 'code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)
    test_data = test_data[xnamelist]  # rearrange columns

    logger.info('features:')
    logger.info(' , '.join(test_data.columns.tolist()))

    test_data = xgb.DMatrix(test_data, feature_names=xnamelist)

    for day in Y_days:
        xgb1 = xgb.Booster()
        xgb1.load_model('%s/train_model_%dD.m' % (out_folder_path, day))
        y_score = xgb1.predict(test_data)
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        logger.info('day = %sD' % day)
    stock_index.to_csv("%s/stockscore_live.csv" % (out_folder_path),
                       index=False,
                       sep=',')

    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('backtest data has been generated.')
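
For reference, a self-contained sketch of the Booster save/load pattern used in the loop above (synthetic data; the file name is illustrative, and newer XGBoost prefers a .json suffix over the .m files used here):

import numpy as np
import xgboost as xgb

X = np.random.rand(200, 5)
y = (np.random.rand(200) > 0.5).astype(np.int64)
dtrain = xgb.DMatrix(X, label=y)
booster = xgb.train({'objective': 'binary:logistic', 'eval_metric': 'auc'},
                    dtrain, num_boost_round=10)
booster.save_model('train_model_2D.json')

loaded = xgb.Booster()
loaded.load_model('train_model_2D.json')
proba = loaded.predict(xgb.DMatrix(X))  # probabilities under binary:logistic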
Example #3
def run(starttime_train, endtime_train, out_folder_path):
    # ------------------  setting --------------------------------

    Y_table_name = 'STOCK_TOP_BOTTOM_Y'
    Y_days = [2, 5, 10, 20]

    parameters = {
        'silent': 1,  # 1 silences runtime messages; 0 prints them (0 is usually better)
        'nthread': 30,  # number of CPU threads; defaults to the maximum
        'learning_rate': 0.1,
        # min_child_weight=0.5,  # default 1: minimum sum of instance hessians in a leaf.
        #   For imbalanced 0-1 classification with h around 0.01, a value of 1 means a
        #   leaf needs roughly 100 samples. This parameter strongly affects results:
        #   the smaller it is, the easier the model overfits.
        'max_depth': 6,  # tree depth; deeper trees overfit more easily
        'gamma': 0,  # minimum loss reduction to split a leaf further; larger is more conservative (typically 0.1 or 0.2)
        'subsample': 0.9,  # row subsampling ratio of the training instances
        'max_delta_step': 0,  # maximum delta step allowed for each tree's weight estimate
        'colsample_bytree': 0.9,  # column subsampling ratio when constructing each tree
        'reg_lambda': 1,  # L2 regularization on weights; larger values make overfitting harder
        # reg_alpha=0,  # L1 regularization term
        # scale_pos_weight=1.3,  # >0 speeds convergence on imbalanced classes by balancing positive/negative weights
        # objective='multi:softmax',  # multi-class task and learning objective
        # num_class=10,  # number of classes, used together with multi:softmax
        'n_estimators': 500,  # number of trees
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'seed': 100,  # random seed
        'eval_metric': 'auc'
    }

    # create logger
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename='%s/train_live.log' % out_folder_path,
                        filemode='w')

    logger = logging.getLogger('train_live')
    '''---------------------------- training -----------------------------------'''
    # prepare training data
    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('training has been started.')
    # logger.info(nowtime)
    # load the train data part1
    # train_x1 = loadData(starttime_train1,endtime_train1)
    # train_y1 = yload(Y_table_name, starttime_train1,endtime_train1)
    # train_x2 = loadData(starttime_train2,endtime_train2)
    # train_y2 = yload(Y_table_name, starttime_train2,endtime_train2)
    # train_x = train_x1.append(train_x2)
    # train_y = train_y1.append(train_y2)
    train_x = loadData(starttime_train, endtime_train)
    train_y = yload(Y_table_name, starttime_train, endtime_train)
    train_y.drop('time_stamp', axis=1, inplace=True)
    xnamelist = train_x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')

    logger.info('features:')
    logger.info(' , '.join(xnamelist))

    train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
    # del train_x,train_y,train_x1,train_x2,train_y1,train_y2
    del train_x, train_y
    gc.collect()

    # preprocessing training data
    try:
        train_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in train_data.columns.tolist():
            train_data = train_data.drop(['index'], axis=1)
    except Exception:
        logger.error('train_data error')

    train_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    # resultscoredf_h = pd.DataFrame()

    #training the model
    # for day in [2,5,10,20,30]:
    for day in Y_days:
        model_training(day, train_data, xnamelist, parameters, logger,
                       out_folder_path)
    #delete all the variables
    del day, parameters, train_data
    gc.collect()

    # write feature list
    out_path = '%s/xnamelist.pcl' % out_folder_path
    with open(out_path, 'wb') as out_file:
        pickle.dump(xnamelist, out_file)

    logger.info('training has finished')
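
The pickled feature list written above is what the prediction script reloads before building its DMatrix; a minimal round trip (the path and names are illustrative):

import pickle

with open('xnamelist.pcl', 'wb') as out_file:
    pickle.dump(['feat_a', 'feat_b'], out_file)  # hypothetical feature names
with open('xnamelist.pcl', 'rb') as in_file:
    xnamelist = pickle.load(in_file)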
Example #4
def run(year, season, output_path, predict_path):
    y_days = [2, 5, 10, 20]
    # ------------------  setting --------------------------------
    season_start_date = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
    season_end_date = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}

    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=30),
        '%Y-%m-%d')  # minus 30 days to avoid usage of future data

    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # define the training and testing routines
    # change the model name below as needed
    global train_x, train_y, val_x, val_y, train_data, val_data

    # ============== objective function =============
    def objective(args):
        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'max_depth': args['max_depth'] + 6,
            'learning_rate': args['learning_rate'],
            'feature_fraction': args['feature_fraction'],
            'bagging_fraction': args['bagging_fraction'],
            'num_leaves': int(2**(args['max_depth'] + 6) * 0.7)  # np.math was removed in NumPy 2.0
        }
        clf = lgb.train(params,
                        train_data,
                        num_boost_round=1000000,
                        valid_sets=[train_data, val_data],
                        valid_names=['train', 'val'],
                        # LightGBM >= 4 takes early stopping / eval logging as callbacks
                        callbacks=[lgb.early_stopping(20),
                                   lgb.log_evaluation(1000)])

        y_score = clf.predict(val_x)
        fpr, tpr, threshods = roc_curve(val_y, y_score, pos_label=1)
        aucscore = auc(fpr, tpr)
        return -aucscore

    # ==========================================
    # ============= optimization parameter space ===============
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    }
    # ==========================================

    x = loadData(starttime_train, endtime_train)
    y = yload(starttime_train, endtime_train)
    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(y_name)
        print('training has started.')
        print(nowtime)

        tmp_y = y[['code', 'date', y_name]]
        tmp_data = pd.merge(x, tmp_y, on=['date', 'code'], how='inner')
        tmp_data.dropna(subset=[y_name], inplace=True)
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        train_x, val_x, train_y, val_y = train_test_split(tmp_x,
                                                          tmp_y,
                                                          test_size=0.1,
                                                          random_state=68)
        del tmp_data, tmp_x, tmp_y
        gc.collect()
        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

        # increase max_evals for more optimization trials
        best_sln = fmin(objective,
                        space=params_space,
                        algo=tpe.suggest,
                        max_evals=15)

        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'learning_rate': best_sln['learning_rate'],
            'feature_fraction': best_sln['feature_fraction'],
            'max_depth': best_sln['max_depth'] + 6,
            'bagging_fraction': best_sln['bagging_fraction'],
            'num_leaves': int(2**(best_sln['max_depth'] + 6) * 0.7),
        }

        clf = lgb.train(params,
                        train_data,
                        num_boost_round=10000000,
                        valid_sets=[train_data, val_data],
                        valid_names=['train', 'val'],
                        callbacks=[lgb.early_stopping(20),
                                   lgb.log_evaluation(1000)])

        joblib.dump(
            clf,
            '%s/model_%s_%s_%s.m' % (output_path, year + 1, season, y_name))
        importance = pd.DataFrame({
            'feature': clf.feature_name(),
            'importance': clf.feature_importance('gain')
        })
        importance.to_excel('%s/feature_importance_%s_%s_%s.xlsx' %
                            (output_path, year + 1, season, y_name))

        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('training has finished')
        print(nowtime)
        del train_x, train_y, val_x, val_y
        gc.collect()

    del x, y
    gc.collect()
    '_____________________________________________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has been started')
    print(nowtime)
    # generate test results for the next quarter
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(starttime_test, endtime_test)

    all_y_scores = test_x[['date', 'code']].copy()
    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        tmp_y = test_y[['code', 'date', y_name]]
        tmp_data = pd.merge(test_x, tmp_y, on=['date', 'code'], how='inner')
        values = {y_name: int(0)}
        tmp_data.fillna(value=values, inplace=True)
        tmp_data.reset_index(drop=True, inplace=True)
        stock_index = tmp_data[['date', 'code']]
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        clf = joblib.load('%s/model_%s_%s_%s.m' %
                          (output_path, year + 1, season, y_name))
        y_score = clf.predict(tmp_x)
        y_boundle = pd.DataFrame({'proba': y_score, 'real': tmp_y})
        y_boundle.sort_values(by='proba', ascending=False, inplace=True)
        y_boundle.reset_index(drop=True, inplace=True)
        tmp_list = np.repeat(np.nan, len(y_boundle))
        tmp_list[:int(np.floor(len(y_boundle) / 100))] = 1
        tmp_list[int(np.floor(len(y_boundle) / 100)):] = 0

        y_boundle['predict'] = tmp_list
        accuracyscore = accuracy_score(y_boundle['real'], y_boundle['predict'])
        fpr, tpr, threshods = roc_curve(y_boundle['real'],
                                        y_score,
                                        pos_label=1)
        ks = np.max(np.abs(tpr - fpr))
        aucscore = auc(fpr, tpr)
        precision = precision_score(y_boundle['real'],
                                    y_boundle['predict'],
                                    average='binary')
        recall = recall_score(y_boundle['real'],
                              y_boundle['predict'],
                              average='weighted')
        print(
            '___________________________________________________________________'
        )
        print('%s_%s_%s' % (year, season, y_name))
        print('precision:', precision)
        print('recall:', recall)
        print('auc:', aucscore)
        print('accuracyscore:', accuracyscore)
        print('K-S:', ks)
        print(classification_report(y_boundle['real'], y_boundle['predict']))
        print(confusion_matrix(y_boundle['real'], y_boundle['predict']))
        print(
            '___________________________________________________________________'
        )

        # generate backtest data
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % tmp_day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        all_y_scores = all_y_scores.merge(stock_index,
                                          on=['date', 'code'],
                                          how='left')

    all_y_scores.to_csv('%s/stockscore_%ds%d.csv' %
                        (predict_path, year, season))

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has finished')
    print(nowtime)
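
A self-contained sketch of the hyperopt pattern above: fmin minimizes the objective over the search space with TPE and returns the raw best values (note that hp.randint yields an offset which the code shifts by +6 before use):

from hyperopt import fmin, hp, tpe

space = {'x': hp.uniform('x', -2.0, 2.0)}
best = fmin(fn=lambda args: (args['x'] - 1.0) ** 2,  # toy objective to minimize
            space=space,
            algo=tpe.suggest,
            max_evals=50)
print(best)  # close to {'x': 1.0}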
Example #5
def objective(params):  # reconstructed head: the original snippet is truncated here
    xgb1 = xgb.train(params,
                     train_data,
                     num_boost_round=100000,
                     evals=[(train_data, 'train'), (val_data, 'val')],
                     verbose_eval=5,
                     early_stopping_rounds=20)
    y_score = xgb1.predict(val_data)
    # y_predict = np.int64(y_score > 0.5)
    fpr, tpr, threshods = roc_curve(val_data.get_label(), y_score, pos_label=1)
    aucscore = auc(fpr, tpr)
    print(aucscore)
    return -aucscore

# define the training and testing routines
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print('training has been started.')
print(nowtime)
# training function
# change the model name as needed
x = loadData(starttime_train, endtime_train)
y = yload(starttime_train, endtime_train)
y = y[['code', 'date', yname]]
tmp_data = pd.merge(x, y, on=['date', 'code'], how='inner')
tmp_data.dropna(subset=[yname], inplace=True)
tmp_data_1 = tmp_data[tmp_data[yname] == 1]
tmp_data_0 = tmp_data[tmp_data[yname] == 0]
# down-sample the majority class (sample() returns a new frame, so assign the result)
if len(tmp_data_1) > len(tmp_data_0):
    tmp_data_1 = tmp_data_1.sample(n=len(tmp_data_0), replace=False, random_state=68)
else:
    tmp_data_0 = tmp_data_0.sample(n=len(tmp_data_1), replace=False, random_state=68)
tmp_data = pd.concat([tmp_data_0, tmp_data_1])  # DataFrame.append was removed in pandas 2.0
y = tmp_data[yname]
x = tmp_data.drop(['code', 'date', yname], axis=1)
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.1, random_state=68)
del tmp_data, x, y
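
The class-balancing step above, sketched on a synthetic frame (the label column name is illustrative):

import pandas as pd

df = pd.DataFrame({'Y_20D': [1] * 10 + [0] * 90, 'feat': range(100)})
ones = df[df['Y_20D'] == 1]
zeros = df[df['Y_20D'] == 0]
if len(ones) > len(zeros):
    ones = ones.sample(n=len(zeros), replace=False, random_state=68)
else:
    zeros = zeros.sample(n=len(ones), replace=False, random_state=68)
balanced = pd.concat([zeros, ones])  # 10 positives + 10 negatives
print(balanced['Y_20D'].value_counts())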
Example #6
#!/usr/bin/env python3
Example #7
    report = 'day = %sD, rate = %sPCT' % (str(day), str(rate))
    print(report)
    print('-------------------------------------------------------')
    # save the evaluation result
    resultscore = [precision, recall, aucscore, accuracyscore, ks,
                   str(classification_report(test_y, y_predict)),
                   str(confusion_matrix(test_y, y_predict)),
                   '%sD_%sPCT' % (str(day), str(rate)), stage]
    columnname = ['precision', 'recall', 'auc', 'accuracyscore', 'K-S',
                  'classification_report', 'confusion_matrix', 'modeltype',
                  'quantile']
    result = pd.DataFrame(np.array(resultscore).reshape(1, 9), columns=columnname)
    return result, y_score


'''----------------------------split line-----------------------------------'''
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print('training has been started.')
print(nowtime)
# load the train data, parts 1 and 2
train_x1 = loadData(starttime_train1, endtime_train1)
train_y1 = yload(Y_table_name, starttime_train1, endtime_train1)
train_x2 = loadData(starttime_train2, endtime_train2)
train_y2 = yload(Y_table_name, starttime_train2, endtime_train2)
train_x = pd.concat([train_x1, train_x2])  # DataFrame.append was removed in pandas 2.0
train_y = pd.concat([train_y1, train_y2])
# train_x = loadData(starttime_train, endtime_train)
# train_y = yload(starttime_train, endtime_train)
train_y.drop('time_stamp', axis=1, inplace=True)
xnamelist = train_x.columns.tolist()  # feature names (without code & date)
xnamelist.remove('code')
xnamelist.remove('date')
train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
# del train_x, train_y, train_x1, train_x2, train_y1, train_y2
gc.collect()
Example #8
def run(year, season, out_folder_path, out_predict_path):
    # ------------------  setting --------------------------------
    season_start_date = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
    season_end_date = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}
    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=1),
        '%Y-%m-%d')

    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # starttime_train1 = '%s-01-01'%year
    # endtime_train1 = '%s-06-30'%year
    # # endtime_train1 = '%s-01-04' % year
    # starttime_train2 = '%s-07-01'%year
    # endtime_train2 = '%s-12-31'%year
    # # endtime_train2 = '%s-07-04' % year
    # starttime_q1 = '%s-01-01'%(year+1)
    # endtime_q1 = '%s-03-31'%(year+1)
    # # endtime_q1 = '%s-01-04' % (year + 1)
    # # starttime_q2 = '%s-04-01'%(year+1)
    # # endtime_q2 = '%s-06-30'%(year+1)
    # # excel_h = 'resultscore_%s.xlsx'%(year)

    Y_table_name = 'STOCK_TOP_BOTTOM_Y'
    Y_days = [2, 5, 10, 20]

    #starttime_train = '%s-06-21'%year
    #endtime_train = '%s-06-21'%year
    #starttime_q1 = '%s-06-21'%year
    #endtime_q1 = '%s-06-21'%year
    #starttime_q2 = '%s-06-21'%year
    #endtime_q2 = '%s-06-21'%year
    #excel_h = 'resultscore_%s.xlsx'%(year)

    # the parameters of the model
    parameters = {
        'silent': 1,  # 1 silences runtime messages; 0 prints them (0 is usually better)
        'nthread': 30,  # number of CPU threads; defaults to the maximum
        'learning_rate': 0.1,
        # min_child_weight=0.5,  # default 1: minimum sum of instance hessians in a leaf.
        #   For imbalanced 0-1 classification with h around 0.01, a value of 1 means a
        #   leaf needs roughly 100 samples. This parameter strongly affects results:
        #   the smaller it is, the easier the model overfits.
        'max_depth': 6,  # tree depth; deeper trees overfit more easily
        'gamma': 0,  # minimum loss reduction to split a leaf further; larger is more conservative (typically 0.1 or 0.2)
        'subsample': 0.9,  # row subsampling ratio of the training instances
        'max_delta_step': 0,  # maximum delta step allowed for each tree's weight estimate
        'colsample_bytree': 0.9,  # column subsampling ratio when constructing each tree
        'reg_lambda': 1,  # L2 regularization on weights; larger values make overfitting harder
        # reg_alpha=0,  # L1 regularization term
        # scale_pos_weight=1.3,  # >0 speeds convergence on imbalanced classes by balancing positive/negative weights
        # objective='multi:softmax',  # multi-class task and learning objective
        # num_class=10,  # number of classes, used together with multi:softmax
        'n_estimators': 500,  # number of trees
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'seed': 100,  # random seed
        'eval_metric': 'auc'
    }

    #the return rate of stocks
    # return_rate = {'rate_2':[1,2,3,4,5],
    #                'rate_5':[2,3,5,7,10],
    #                'rate_10':[3,5,7,10,15],
    #                'rate_20':[4,7,10,15,20],
    #                'rate_30':[5,10,15,20,25]
    #         }

    # ynamelist = []
    # for day in [2,5,10,20,30]:
    #     for rate in return_rate['rate_%s'%(str(day))]:
    #         ynamelist.append('Y_%sD_%sPCT'%(day,rate))

    # create logger
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename='%s/%d_s%d.log' %
                        (out_folder_path, year, season),
                        filemode='w')

    logger = logging.getLogger('%d_s%d' % (year, season))
    '''---------------------------- training -----------------------------------'''
    # prepare training data
    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('training has been started.')
    # logger.info(nowtime)
    # load the train data part1
    # train_x1 = loadData(starttime_train1,endtime_train1)
    # train_y1 = yload(Y_table_name, starttime_train1,endtime_train1)
    # train_x2 = loadData(starttime_train2,endtime_train2)
    # train_y2 = yload(Y_table_name, starttime_train2,endtime_train2)
    # train_x = train_x1.append(train_x2)
    # train_y = train_y1.append(train_y2)
    his_train_data = randomSelectPassSample(year, Y_table_name)

    train_x = loadData(starttime_train, endtime_train)
    train_y = yload(Y_table_name, starttime_train, endtime_train)
    train_y.drop('time_stamp', axis=1, inplace=True)
    xnamelist = train_x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')
    train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
    # del train_x,train_y,train_x1,train_x2,train_y1,train_y2
    del train_x, train_y
    gc.collect()

    # preprocessing training data
    try:
        train_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in train_data.columns.tolist():
            train_data = train_data.drop(['index'], axis=1)
    except Exception:
        logger.error('train_data error')

    train_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    # combine historical data & train_data (DataFrame.append was removed in pandas 2.0)
    train_data = pd.concat([train_data, his_train_data])
    del his_train_data
    gc.collect()

    # resultscoredf_h = pd.DataFrame()

    #training the model
    # for day in [2,5,10,20,30]:
    for day in Y_days:
        model_training(day, train_data, xnamelist, parameters, logger,
                       out_folder_path)
    #delete all the variables
    del day, parameters, train_data
    gc.collect()

    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('training has finished')
    # logger.info(nowtime)
    '''---------------------------- testing S1 -----------------------------------'''
    #S1
    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('testing_q1 has been started')
    # logger.info(nowtime)

    #load the test data
    # test_x = loadData(starttime_q1,endtime_q1)
    # test_y = yload(Y_table_name, starttime_q1,endtime_q1)
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(Y_table_name, starttime_test, endtime_test)
    test_y.drop('time_stamp', axis=1, inplace=True)
    test_data = pd.merge(test_x, test_y, on=['date', 'code'], how='left')

    del test_x, test_y
    gc.collect()

    #preprocessing testing data
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
    except:
        logger.error('test_data error')

    # stock_index_q1 = test_data[['date','code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    #dataframe to save the result
    resultscoredf_h = pd.DataFrame()

    for day in Y_days:
        result = model_testing(day, test_data, xnamelist, season, logger,
                               out_folder_path)
        # y_score = pd.DataFrame(y_score)
        # y_score.columns = ["y_1_%sD_%sPCT"%(day,rate)]
        # stock_index_q1 = pd.concat([stock_index_q1,y_score],axis=1)
        resultscoredf_h = pd.concat([resultscoredf_h, result])

    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('testing s%d has finished' % season)
    # print(nowtime)

    # '''---------------------------- Training S2 -----------------------------------'''
    # #S2
    # # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # logger.info('testing_q2 has been started')
    # # print(nowtime)
    #
    # #load the test data
    # test_x = loadData(starttime_q2,endtime_q2)
    # test_y = yload(Y_table_name, starttime_q2,endtime_q2)
    # test_y.drop('time_stamp',axis=1,inplace=True)
    # test_data = pd.merge(test_x,test_y,on = ['date','code'],how='left')
    #
    #
    #
    # del test_x,test_y
    # gc.collect()
    # #preprocessing of the original data
    # try:
    #     test_data.drop_duplicates(['code','date'],inplace = True)
    #     if 'index' in test_data.columns.tolist():
    #         test_data = test_data.drop(['index'],axis = 1)
    # except:
    #     logger.error('train_data error')
    #
    # # stock_index_q2 = test_data[['date','code']]
    # test_data.drop(['date','code'],axis=1,inplace=True)
    #
    # #release the memory
    # gc.collect()
    # time.sleep(20)
    #
    # #dataframe to save the result
    # for day in [2,5,10,20,30]:
    #     for rate in return_rate['rate_%s'%(str(day))]:
    #         result = model_testing(day,rate,test_data,xnamelist,'Q2')
    #         # y_score = pd.DataFrame(y_score)
    #         # y_score.columns = ["y_1_%sD_%sPCT"%(day,rate)]
    #         # stock_index_q2 = pd.concat([stock_index_q2,y_score],axis=1)
    #         resultscoredf_h = resultscoredf_h.append(result)
    #
    # # stock_index = stock_index_q1.append(stock_index_q2)
    # # stock_index.to_excel(excel_h)
    #
    # resultscoredf_h.to_excel(excel_h)
    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # print('testing_q2 has finished')
    # print(nowtime)

    '_________________________________ Record Prediction __________________________________'
    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('generating backtest data.')
    # print(nowtime)

    test_data = loadData(starttime_test, endtime_test)
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
        test_data.reset_index(drop=True, inplace=True)
    except Exception:
        logger.error('test_data error')

    stock_index = test_data[['date', 'code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)
    test_data = xgb.DMatrix(test_data, feature_names=xnamelist)

    for day in Y_days:
        xgb1 = xgb.Booster()
        xgb1.load_model('%s/train_model_%dD.m' % (out_folder_path, day))
        y_score = xgb1.predict(test_data)
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        logger.info('day = %sD' % day)
    stock_index.to_csv("%s/stockscore_%ds%d.csv" %
                       (out_predict_path, year, season),
                       index=False,
                       sep=',')

    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('backtest data has been generated.')
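
The quarter windows these run() functions share, sketched standalone: training spans one year from the season start and ends one day (or 30 days, in the hyperopt variant) before the next year's season start, and testing covers the following quarter (the sample year and season are illustrative):

from datetime import datetime, timedelta

season_start_date = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
season_end_date = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}

year, season = 2018, 2
starttime_train = str(year) + season_start_date[season]      # '2018-04-01'
endtime_train = datetime.strftime(
    datetime.strptime(str(year + 1) + season_start_date[season], '%Y-%m-%d')
    - timedelta(days=1), '%Y-%m-%d')                         # '2019-03-31'
starttime_test = str(year + 1) + season_start_date[season]   # '2019-04-01'
endtime_test = str(year + 1) + season_end_date[season]       # '2019-06-30'
print(starttime_train, endtime_train, starttime_test, endtime_test)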
Example #9
def run(year, season, yname='Y_20D'):
    # ------------------  setting --------------------------------
    season_start_date = {1: '-01-01', 2: '-04-01', 3: '-07-01', 4: '-10-01'}
    season_end_date = {1: '-03-31', 2: '-06-30', 3: '-09-30', 4: '-12-31'}

    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=1),
        '%Y-%m-%d')

    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # define the training and testing routines
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('training has started.')
    print(nowtime)
    # training function
    # change the model name as needed
    global train_x, train_y, val_x, val_y, train_data, val_data

    x = loadData(starttime_train, endtime_train)
    y = yload(starttime_train, endtime_train)
    y = y[['code', 'date', yname]]
    tmp_data = pd.merge(x, y, on=['date', 'code'], how='inner')
    tmp_data.dropna(subset=[yname], inplace=True)
    y = tmp_data[yname]
    x = tmp_data.drop(['code', 'date', yname], axis=1)
    train_x, val_x, train_y, val_y = train_test_split(x,
                                                      y,
                                                      test_size=0.1,
                                                      random_state=68)
    del tmp_data, x, y
    gc.collect()
    train_data = lgb.Dataset(train_x, label=train_y)
    val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

    def objective(args):
        params = {
            'task': 'train',
            'num_threads': 15,
            'objective': 'binary',
            'boosting': 'dart',
            'verbose': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 300,
            'metric': 'auc',
            'max_depth': args['max_depth'] + 6,
            'learning_rate': args['learning_rate'],
            'feature_fraction': args['feature_fraction'],
            'bagging_fraction': args['bagging_fraction'],
            'num_leaves': 2**(args['max_depth'] + 6) // 2  # avoids np.math, removed in NumPy 2.0
        }
        clf = lgb.train(params,
                        train_data,
                        num_boost_round=1000000,
                        valid_sets=[train_data, val_data],
                        valid_names=['train', 'val'],
                        # LightGBM >= 4 takes early stopping / eval logging as callbacks
                        callbacks=[lgb.early_stopping(20),
                                   lgb.log_evaluation(100)])

        y_score = clf.predict(val_x)
        fpr, tpr, threshods = roc_curve(val_y, y_score, pos_label=1)
        aucscore = auc(fpr, tpr)
        return -aucscore

    params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    }

    # increase max_evals for more optimization trials
    best_sln = fmin(objective,
                    space=params_space,
                    algo=tpe.suggest,
                    max_evals=20)

    params = {
        'task': 'train',
        'num_threads': 15,
        'objective': 'binary',
        'boosting': 'dart',
        'verbose': 0,
        'tree_learner': 'data',
        'seed': 66,
        'min_data_in_leaf': 300,
        'metric': 'auc',
        'learning_rate': best_sln['learning_rate'],
        'feature_fraction': best_sln['feature_fraction'],
        'max_depth': best_sln['max_depth'] + 6,
        'bagging_fraction': best_sln['bagging_fraction'],
        'num_leaves': 2**(best_sln['max_depth'] + 6) // 2,
    }

    clf = lgb.train(params,
                    train_data,
                    num_boost_round=1000000,
                    valid_sets=[train_data, val_data],
                    valid_names=['train', 'val'],
                    callbacks=[lgb.early_stopping(20),
                               lgb.log_evaluation(100)])

    joblib.dump(clf, 'model_%s_%s.m' % (year + 1, season))
    importance = pd.DataFrame({
        'feature': clf.feature_name(),
        'importance': clf.feature_importance('gain')
    })
    importance.to_excel('feature_importance_%s_%s.xlsx' % (year + 1, season))

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('training has finished')
    print(nowtime)

    '_____________________________________________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has been started')
    print(nowtime)
    # generate test results for the next quarter
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(starttime_test, endtime_test)
    test_y = test_y[['code', 'date', yname]]
    tmp_data = pd.merge(test_x, test_y, on=['date', 'code'], how='inner')
    values = {yname: int(0)}
    tmp_data.fillna(value=values, inplace=True)
    tmp_data.reset_index(drop=True, inplace=True)
    stock_index = tmp_data[['date', 'code']]
    test_y = tmp_data[yname]
    test_x = tmp_data.drop(['code', 'date', yname], axis=1)
    clf = joblib.load('model_%s_%s.m' % (year + 1, season))
    y_score = clf.predict(test_x)
    y_predict = np.int64(y_score > 0.9)
    accuracyscore = accuracy_score(test_y, y_predict)
    fpr, tpr, threshods = roc_curve(test_y, y_score, pos_label=1)
    ks = np.max(np.abs(tpr - fpr))
    aucscore = auc(fpr, tpr)
    precision = precision_score(test_y, y_predict, average='binary')
    recall = recall_score(test_y, y_predict, average='weighted')
    print('precision:', precision)
    print('recall:', recall)
    print('auc:', aucscore)
    print('accuracyscore:', accuracyscore)
    print('K-S:', ks)
    print(classification_report(test_y, y_predict))
    print(confusion_matrix(test_y, y_predict))

    # generate backtest data
    y_score = pd.DataFrame(y_score, columns=['proba_1'])
    stock_index = pd.concat([stock_index, y_score], axis=1)
    stock_index.to_csv('backtest_data_%s_%s.csv' % (year + 1, season))

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has finished')
    print(nowtime)
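
Finally, a self-contained sketch of the lgb.train early-stopping pattern these examples rely on (synthetic data; LightGBM >= 4 expects callbacks instead of the old early_stopping_rounds/verbose_eval keywords):

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(66)
X = rng.random((300, 5))
y = (rng.random(300) > 0.5).astype(int)

train_set = lgb.Dataset(X[:240], label=y[:240])
val_set = lgb.Dataset(X[240:], label=y[240:], reference=train_set)

clf = lgb.train({'objective': 'binary', 'metric': 'auc', 'verbosity': -1},
                train_set,
                num_boost_round=500,
                valid_sets=[train_set, val_set],
                valid_names=['train', 'val'],
                callbacks=[lgb.early_stopping(20), lgb.log_evaluation(100)])
print(clf.best_iteration)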