def predictComplete(predict_path, backup_path):
    y_days = [2, 5, 10, 20]
    result_table_name = 'LGBM_LIVE_PREDICTION_FINAL'
    calendar_table_name = 'TRADE_CALENDAR'
    '__________________________________prediction___________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('predicting has started')
    print(nowtime)

    # find the latest trade day
    sql_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.
        format(**ConfigQuant))
    today, yesterday, trade_calendar = getLatestTradeDay(
        sql_engine, calendar_table_name)

    if today not in trade_calendar:
        print('%s is not trade date' % today)
        return

    # read features
    test_x = loadData(yesterday, yesterday)

    with open('%s/xnamelist.pcl' % predict_path,
              'rb') as tmp_fo:  # load feature name
        xnamelist = pickle.load(tmp_fo)

    all_y_scores = test_x[['date', 'code']].copy()
    test_x = test_x[xnamelist]  # keep only the features used in model training

    # check data type of loaded data
    tmp_dtypes = test_x.dtypes
    tmp_dtypes = tmp_dtypes[tmp_dtypes == 'O']
    if tmp_dtypes.size > 0:
        raise ValueError('data corrupted: object-type columns %s' %
                         tmp_dtypes.index.tolist())

    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        clf = joblib.load('%s/model_%s.m' % (predict_path, y_name))
        y_score = clf.predict(test_x)

        # generate backtest data
        all_y_scores.loc[:, y_name] = y_score

    # write prediction in local folder
    all_y_scores.to_csv('%s/stockscore_%s.csv' % (backup_path, yesterday))
    # write prediction to database
    writeDB(result_table_name, all_y_scores, sql_engine)

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('predicting has finished')
    print(nowtime)
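
# getLatestTradeDay, loadData and writeDB are project helpers that are not
# shown on this page. Judging from the inlined variant in the next example
# (which selects max(`date`) < today from TRADE_CALENDAR), a minimal sketch of
# getLatestTradeDay might look like the following -- an assumption for
# illustration, not the original implementation. It presumes the calendar
# stores dates as 'YYYY-MM-DD' strings (consistent with the string membership
# test `today not in trade_calendar` above) and reuses the module's datetime
# and pandas imports.
def getLatestTradeDay(sql_engine, calendar_table_name):
    today = datetime.now().strftime('%Y-%m-%d')
    with sql_engine.connect() as sql_conn:
        calendar = pd.read_sql(
            'select `date` from %s' % calendar_table_name, sql_conn)
    trade_calendar = calendar['date'].tolist()
    past_days = [d for d in trade_calendar if d < today]
    yesterday = max(past_days) if past_days else None  # latest prior trade day
    return today, yesterday, trade_calendar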

# ============================ Example #2 ============================
def predictComplete(predict_path, backup_path):
    y_days = [2, 5, 10, 20]
    result_table_name = 'LGBM_LIVE_PREDICTION_FINAL'
    '__________________________________prediction___________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('predicting has started')
    print(nowtime)

    # find the latest trade day
    today = datetime.strftime(datetime.now(), '%Y-%m-%d')
    sql_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.
        format(**ConfigQuant))
    sql_statement = "select max(`date`) from %s where `date` < '%s'" % (
        'TRADE_CALENDAR', today)
    sql_conn = sql_engine.connect()
    yesterday = pd.read_sql(sql_statement, sql_conn)
    sql_conn.close()
    yesterday = yesterday.iloc[0, 0]

    # read features
    test_x = loadData(yesterday, yesterday)

    with open('%s/xnamelist.pcl' % predict_path,
              'rb') as tmp_fo:  # load feature name
        xnamelist = pickle.load(tmp_fo)

    all_y_scores = test_x[['date', 'code']].copy()
    test_x = test_x[xnamelist]  # keep only the features used in model training
    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        clf = joblib.load('%s/model_%s.m' % (predict_path, y_name))
        y_score = clf.predict(test_x)

        # generate backtest data
        all_y_scores.loc[:, y_name] = y_score

    # write prediction in local folder
    all_y_scores.to_csv('%s/stockscore_%s.csv' % (backup_path, yesterday))
    # write prediction to database
    writeDB(result_table_name, all_y_scores, sql_engine)

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('predicting has finished')
    print(nowtime)
def run(year, season, output_path, predict_path):
    y_days = [2, 5, 10, 20]
    # ------------------  setting --------------------------------
    # season_start_date = {
    #     1: '-01-01',
    #     2: '-04-01',
    #     3: '-07-01',
    #     4: '-10-01'
    # }
    # season_end_date = {
    #     1: '-03-31',
    #     2: '-06-30',
    #     3: '-09-30',
    #     4: '-12-31'
    # }

    season_start_date = {
        1: '-01-01',
        2: '-03-01',
        3: '-05-01',
        4: '-07-01',
        5: '-09-01',
        6: '-11-01'
    }
    season_end_date = {
        1: '-02-31',
        2: '-04-31',
        3: '-06-31',
        4: '-08-31',
        5: '-10-31',
        6: '-12-31'
    }
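    # NOTE: several of these end dates (e.g. '-02-31', '-04-31') are not valid
    # calendar dates. They are never parsed with strptime; they only serve as
    # upper bounds in date-string comparisons, where '2021-02-28' <=
    # '2021-02-31' holds lexicographically.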

    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=30),
        '%Y-%m-%d')  # minus 30 days to avoid usage of future data

    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # # define the training and testing functions
    # # training function
    # # change the model name as needed
    # global train_x, train_y, val_x, val_y, train_data, val_data
    #
    # # ============== objective function =============
    # def objective(args):
    #     params = {
    #         'task': 'train',
    #         'num_threads': 45,
    #         'objective': 'binary',
    #         'boosting': 'dart',
    #         'verbosity': -1,
    #         'tree_learner': 'data',
    #         'seed': 66,
    #         'min_data_in_leaf': 200,
    #         'metric': 'auc',
    #         'max_depth': args['max_depth'] + 6,
    #         'learning_rate': args['learning_rate'],
    #         'feature_fraction': args['feature_fraction'],
    #         'bagging_fraction': args['bagging_fraction'],
    #         'num_leaves': np.math.floor(2 ** (args['max_depth'] + 6) * 0.7)
    #     }
    #     clf = lgb.train(params, train_data, num_boost_round=1000000,
    #                     valid_sets=[train_data, val_data], valid_names=['train', 'val'],
    #                     early_stopping_rounds=15, verbose_eval=1000)
    #
    #     y_score = clf.predict(val_x)
    #     fpr, tpr, threshods = roc_curve(val_y, y_score, pos_label=1)
    #     aucscore = auc(fpr, tpr)
    #     return -aucscore
    #
    # # ==========================================
    # # ============= optimization parameter space ===============
    # params_space = {
    #     'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
    #     'max_depth': hp.randint('max_depth', 10),
    #     'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
    #     'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    # }
    # # ==========================================
    #
    # x = loadData(starttime_train, endtime_train)
    # y = yload(starttime_train, endtime_train)
    #
    # # drop features with large proportion of nan
    # drop_nan_ratio = 0.7
    # tmp_nan_ratio = x.isnull().sum(axis=0) / x.shape[0]
    # tmp_drop_cols = tmp_nan_ratio[tmp_nan_ratio > drop_nan_ratio].index.tolist()
    # if len(tmp_drop_cols) > 0:
    #     print('drop nan columns:', tmp_drop_cols)
    #     x = x.drop(tmp_drop_cols, axis=1)
    # xnamelist = x.columns.tolist()  # feature names (without code & date)
    # xnamelist.remove('code')
    # xnamelist.remove('date')
    # with open('%s/xnamelist.pcl' % output_path, 'wb') as tmp_fo:  # save feature name
    #     pickle.dump(xnamelist, tmp_fo)
    #
    # for tmp_day in y_days:
    #     y_name = 'Y_%dD' % tmp_day
    #     nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    #     print(y_name)
    #     print('training has started.')
    #     print(nowtime)
    #
    #     tmp_y = y[['code', 'date', y_name]]
    #     tmp_data = pd.merge(x, tmp_y, on=['date', 'code'], how='inner')
    #     tmp_data.dropna(subset=[y_name], inplace=True)
    #     tmp_y = tmp_data[y_name]
    #     tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
    #     train_x, val_x, train_y, val_y = train_test_split(tmp_x, tmp_y, test_size=0.1, random_state=68)
    #     del tmp_data, tmp_x, tmp_y
    #     gc.collect()
    #     train_data = lgb.Dataset(train_x, label=train_y)
    #     val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)
    #
    #     # increase max_evals for more optimization trials
    #     best_sln = fmin(objective, space=params_space, algo=tpe.suggest, max_evals=10)
    #
    #     params = {
    #         'task': 'train',
    #         'num_threads': 45,
    #         'objective': 'binary',
    #         'boosting': 'dart',
    #         'verbosity': -1,
    #         'tree_learner': 'data',
    #         'seed': 66,
    #         'min_data_in_leaf': 200,
    #         'metric': 'auc',
    #         'learning_rate': best_sln['learning_rate'],
    #         'feature_fraction': best_sln['feature_fraction'],
    #         'max_depth': best_sln['max_depth'] + 6,
    #         'bagging_fraction': best_sln['bagging_fraction'],
    #         'num_leaves': np.math.floor(2 ** (best_sln['max_depth'] + 6) * 0.7),
    #     }
    #
    #     clf = lgb.train(params, train_data, num_boost_round=3000, valid_sets=[train_data, val_data],
    #                     valid_names=['train', 'val'], early_stopping_rounds=15, verbose_eval=1000)
    #
    #     joblib.dump(clf, '%s/model_%s_%s_%s.m' % (output_path, year + 1, season, y_name))
    #     importance = pd.DataFrame({'feature': clf.feature_name(), 'importance': clf.feature_importance('gain')})
    #     importance.to_excel('%s/feature_importance_%s_%s_%s.xlsx' % (output_path, year + 1, season, y_name))
    #
    #     nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    #     print('training has finished')
    #     print(nowtime)
    #     del train_x, train_y, val_x, val_y
    #     gc.collect()
    #
    # del x, y
    # gc.collect()
    '_____________________________________________________________________________'
    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has started')
    print(nowtime)
    # generate test results for the following season
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(starttime_test, endtime_test)

    with open('%s/xnamelist.pcl' % output_path,
              'rb') as tmp_fo:  # load feature name
        xnamelist = pickle.load(tmp_fo)

    resultscoredf_h = pd.DataFrame([])
    all_y_scores = test_x[['date', 'code']].copy()
    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        tmp_y = test_y[['code', 'date', y_name]]
        tmp_data = pd.merge(test_x, tmp_y, on=['date', 'code'], how='inner')
        values = {y_name: int(0)}
        tmp_data.fillna(value=values, inplace=True)
        tmp_data = tmp_data.reset_index(drop=True)  # .reindex() with no args was a no-op
        stock_index = tmp_data[['date', 'code']]
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        tmp_x = tmp_x[xnamelist]  # keep only the features used to train the model
        clf = joblib.load('%s/model_%s_%s_%s.m' %
                          (output_path, year + 1, season, y_name))
        y_score = clf.predict(tmp_x)
        y_boundle = pd.DataFrame({'proba': y_score, 'real': tmp_y})
        y_boundle.sort_values(by='proba', ascending=False, inplace=True)
        y_boundle = y_boundle.reset_index(drop=True)  # .reindex() with no args was a no-op
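        # label the top 1% of stocks, ranked by predicted probability, as
        # positive (1) and the rest as negative (0)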
        tmp_list = np.repeat(np.nan, len(y_boundle))
        tmp_list[:int(np.floor(len(y_boundle) / 100))] = 1
        tmp_list[int(np.floor(len(y_boundle) / 100)):] = 0

        y_boundle['predict'] = tmp_list
        accuracyscore = accuracy_score(y_boundle['real'], y_boundle['predict'])
        fpr, tpr, threshods = roc_curve(y_boundle['real'],
                                        y_score,
                                        pos_label=1)
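        # Kolmogorov-Smirnov statistic: the largest gap between TPR and FPR
        # across all thresholds of the ROC curve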
        ks = np.max(np.abs(tpr - fpr))
        aucscore = auc(fpr, tpr)
        precision = precision_score(y_boundle['real'],
                                    y_boundle['predict'],
                                    average='binary')
        recall = recall_score(y_boundle['real'],
                              y_boundle['predict'],
                              average='weighted')
        print(
            '___________________________________________________________________'
        )
        print('%s_%s_%s' % (year, season, y_name))
        print('precision:', precision)
        print('recall:', recall)
        print('auc:', aucscore)
        print('accuracyscore:', accuracyscore)
        print('K-S:', ks)
        print(classification_report(y_boundle['real'], y_boundle['predict']))
        print(confusion_matrix(y_boundle['real'], y_boundle['predict']))
        print(
            '___________________________________________________________________'
        )

        # check score under different thresholds
        threshold_list = list(range(50, 100, 5))
        threshold_list = [round(x * 0.01, 2) for x in threshold_list]
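        # evaluate precision/recall of the positive class at fixed probability
        # cutoffs 0.50, 0.55, ..., 0.95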
        scores_list = {}
        scores_list['year'] = [year, year]
        scores_list['season'] = [season, season]
        for tmp_thrhd in threshold_list:
            tmp_y_predict = (y_score > tmp_thrhd).astype(np.int64)
            tmp_precision = precision_score(tmp_y,
                                            tmp_y_predict,
                                            average='binary')
            tmp_recall = recall_score(tmp_y, tmp_y_predict, average='binary')
            scores_list[tmp_thrhd] = [tmp_precision, tmp_recall]
        scores_list = pd.DataFrame(
            scores_list,
            index=[
                '%dS%d_%dDprecision_1' % (year, season, tmp_day),
                '%dS%d_%dDrecall_1' % (year, season, tmp_day)
            ])
        tmp_columns = threshold_list + ['year', 'season']
        scores_list = scores_list[tmp_columns]  # rearrange columns
        if resultscoredf_h.empty:
            resultscoredf_h = scores_list
        else:
            resultscoredf_h = pd.concat(
                [resultscoredf_h, scores_list])  # DataFrame.append was removed in pandas 2.0

        # generate backtest data
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % tmp_day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        all_y_scores = all_y_scores.merge(stock_index,
                                          on=['date', 'code'],
                                          how='left')

    # all_y_scores.to_csv('%s/stockscore_%ds%d.csv' % (predict_path, year, season))
    result_csv_path = '%s/precision_recall_remake.csv' % predict_path
    if os.path.exists(result_csv_path):
        resultscoredf_h.to_csv(result_csv_path, header=False, mode='a')
    else:
        resultscoredf_h.to_csv(result_csv_path)

    nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('testing has finished')
    print(nowtime)

# ============================ Example #4 ============================
def run(year, season, out_folder_path, out_predict_path):
    # ------------------  setting --------------------------------
    # season_start_date ={
    #     1: '-01-01',
    #     2: '-04-01',
    #     3: '-07-01',
    #     4: '-10-01'
    # }
    # season_end_date = {
    #     1: '-03-31',
    #     2: '-06-30',
    #     3: '-09-30',
    #     4: '-12-31'
    # }
    season_start_date = {
        1: '-01-01',
        2: '-03-01',
        3: '-05-01',
        4: '-07-01',
        5: '-09-01',
        6: '-11-01'
    }
    season_end_date = {
        1: '-02-31',
        2: '-04-31',
        3: '-06-31',
        4: '-08-31',
        5: '-10-31',
        6: '-12-31'
    }
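    # as in the previous example, some of these end dates (e.g. '-02-31') are
    # not valid calendar dates; they are only used as lexicographic string
    # bounds and are never parsed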
    starttime_train = str(year) + season_start_date[season]
    endtime_train = str(year + 1) + season_start_date[season]
    endtime_train = datetime.strftime(
        datetime.strptime(endtime_train, '%Y-%m-%d') - timedelta(days=30),
        '%Y-%m-%d')  # drop 30 days to avoid using future data in training

    starttime_test = str(year + 1) + season_start_date[season]
    endtime_test = str(year + 1) + season_end_date[season]

    # starttime_train = '2012-01-01'
    # endtime_train = '2012-01-14'
    #
    # starttime_test = '2013-01-01'
    # endtime_test = '2013-01-04'

    # starttime_train1 = '%s-01-01'%year
    # endtime_train1 = '%s-06-30'%year
    # # endtime_train1 = '%s-01-04' % year
    # starttime_train2 = '%s-07-01'%year
    # endtime_train2 = '%s-12-31'%year
    # # endtime_train2 = '%s-07-04' % year
    # starttime_q1 = '%s-01-01'%(year+1)
    # endtime_q1 = '%s-03-31'%(year+1)
    # # endtime_q1 = '%s-01-04' % (year + 1)
    # # starttime_q2 = '%s-04-01'%(year+1)
    # # endtime_q2 = '%s-06-30'%(year+1)
    # # excel_h = 'resultscore_%s.xlsx'%(year)

    Y_table_name = 'STOCK_TOP_BOTTOM_Y'
    Y_days = [2, 5, 10, 20]

    #starttime_train = '%s-06-21'%year
    #endtime_train = '%s-06-21'%year
    #starttime_q1 = '%s-06-21'%year
    #endtime_q1 = '%s-06-21'%year
    #starttime_q2 = '%s-06-21'%year
    #endtime_q2 = '%s-06-21'%year
    #excel_h = 'resultscore_%s.xlsx'%(year)

    # search space for the model hyperparameters
    params_space = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'nthread': 50,
        'learning_rate': hp.uniform("learning_rate", 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'subsample': hp.uniform("subsample", 0.5, 0.9),
        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 0.9),
    }
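    # hp.randint('max_depth', 10) samples integers 0-9; model_training
    # (defined elsewhere) presumably maps this to a usable depth, as the
    # LightGBM example below does with an explicit +6 offset.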

    # parameters = {
    #             'silent': 1,                  # 1 = suppress run-time messages; better set to 0 to print them
    #             'nthread': 30,                # number of CPU threads (defaults to the maximum)
    #             'learning_rate': 0.1,         # learning rate
    #             # min_child_weight=0.5,       # defaults to 1: minimum sum of instance hessian (h) per leaf.
    #                                           # For unbalanced 0-1 classification, if h is around 0.01,
    #                                           # min_child_weight=1 means a leaf needs roughly 100 samples.
    #                                           # Strongly affects the result; smaller values overfit more easily.
    #             'max_depth': 6,               # tree depth; larger values overfit more easily
    #             'gamma': 0,                   # minimum loss reduction to split a leaf further; larger = more
    #                                           # conservative, typically around 0.1-0.2
    #             'subsample': 0.9,             # row subsampling ratio of the training instances
    #             'max_delta_step': 0,          # maximum delta step allowed for each tree's weight estimate
    #             'colsample_bytree': 0.9,      # column subsampling ratio when building each tree
    #             'reg_lambda': 1,              # L2 regularization on weights; larger = less prone to overfit
    #             # reg_alpha=0,                # L1 regularization term
    #             # scale_pos_weight=1.3,       # >0 speeds up convergence on unbalanced classes (balances +/- weights)
    #             # objective='multi:softmax',  # multi-class task: learning task and objective
    #             # num_class=10,               # number of classes, used together with multi:softmax
    #             'n_estimators': 500,          # number of trees
    #             'seed': 100,                  # random seed
    #             'eval_metric': 'auc'
    #         }

    #the return rate of stocks
    # return_rate = {'rate_2':[1,2,3,4,5],
    #                'rate_5':[2,3,5,7,10],
    #                'rate_10':[3,5,7,10,15],
    #                'rate_20':[4,7,10,15,20],
    #                'rate_30':[5,10,15,20,25]
    #         }

    # ynamelist = []
    # for day in [2,5,10,20,30]:
    #     for rate in return_rate['rate_%s'%(str(day))]:
    #         ynamelist.append('Y_%sD_%sPCT'%(day,rate))

    # create logger
    logger = logging.getLogger('%d_s%d' % (year, season))
    tmp_log_path = '%s/%d_s%d.log' % (out_folder_path, year, season)
    tmp_log_file_handler = logging.FileHandler(tmp_log_path)
    tmp_fmt = logging.Formatter("%(asctime)s %(threadName)-10s %(message)s",
                                "%Y-%m-%d %H:%M:%S")
    tmp_log_file_handler.setFormatter(tmp_fmt)
    logger.addHandler(tmp_log_file_handler)
    logger.setLevel(logging.INFO)
    # logging.basicConfig(level=logging.INFO,
    #                     format='[%(asctime)s] %(message)s',
    #                     datefmt='%Y-%m-%d %H:%M:%S',
    #                     filename='%s/%d_s%d.log' % (out_folder_path, year, season),
    #                     filemode='w')
    #
    # logger = logging.getLogger('%d_s%d'%(year, season))
    '''---------------------------- training -----------------------------------'''
    # prepare training data
    logger.info('training has been started.')

    tmp_dt_start = datetime.strptime(starttime_train, '%Y-%m-%d')
    tmp_dt_end = datetime.strptime(endtime_train, '%Y-%m-%d')
    tmp_dt_mid = tmp_dt_start + (tmp_dt_end - tmp_dt_start) / 2
    end1_train = datetime.strftime(tmp_dt_mid, '%Y-%m-%d')
    start2_train = datetime.strftime(tmp_dt_mid + timedelta(days=1),
                                     '%Y-%m-%d')
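    # load the training features in two halves, presumably to cap the memory
    # footprint of a single query, then concatenate them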

    train_x1 = loadData(starttime_train, end1_train)
    train_x2 = loadData(start2_train, endtime_train)
    train_x = pd.concat([train_x1, train_x2])  # DataFrame.append was removed in pandas 2.0
    del train_x1, train_x2
    gc.collect()

    # train_x = loadData(starttime_train,endtime_train)
    train_y = yload(Y_table_name, starttime_train, endtime_train, ConfigQuant)
    train_y.drop('time_stamp', axis=1, inplace=True)

    # drop features that are mostly nan
    drop_nan_ratio = 0.7
    tmp_nan_ratio = train_x.isnull().sum(axis=0) / train_x.shape[0]
    tmp_drop_cols = tmp_nan_ratio[
        tmp_nan_ratio > drop_nan_ratio].index.tolist()
    if len(tmp_drop_cols) > 0:
        print('drop nan columns:', tmp_drop_cols)
        train_x = train_x.drop(tmp_drop_cols, axis=1)

    xnamelist = train_x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')

    # check if there is object type columns in the data
    tmp_dtypes = train_x[xnamelist].dtypes
    tmp_dtypes = tmp_dtypes[tmp_dtypes == 'O']
    if tmp_dtypes.size > 0:
        print(tmp_dtypes)

    # merge X, y
    train_data = pd.merge(train_x, train_y, on=['date', 'code'], how='left')
    del train_x, train_y
    gc.collect()

    # save training feature name
    out_path = '%s/xnamelist.pcl' % out_folder_path
    with open(out_path, 'wb') as out_file:
        pickle.dump(xnamelist, out_file)

    # preprocessing training data
    try:
        train_data.drop_duplicates(['code', 'date'], inplace=True)
        train_data = train_data.sort_values('date', ascending=True)
    except Exception:
        logger.exception('train_data preprocessing error')

    train_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    #training the model
    # for day in [2,5,10,20,30]:
    for day in Y_days:
        model_training(day, train_data, xnamelist, params_space, logger,
                       out_folder_path)
    #delete all the variables
    del day, params_space, train_data
    gc.collect()

    logger.info('training has finished')
    '''---------------------------- Testing -----------------------------------'''
    #S1
    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('testing has started')
    # logger.info(nowtime)

    # load training feature name
    feature_list_path = '%s/xnamelist.pcl' % out_folder_path
    with open(feature_list_path, 'rb') as in_file:
        xnamelist = pickle.load(in_file)

    #load the test data
    # test_x = loadData(starttime_q1,endtime_q1)
    # test_y = yload(Y_table_name, starttime_q1,endtime_q1)
    test_x = loadData(starttime_test, endtime_test)
    test_y = yload(Y_table_name, starttime_test, endtime_test, ConfigQuant)
    test_y.drop('time_stamp', axis=1, inplace=True)
    test_data = pd.merge(test_x, test_y, on=['date', 'code'], how='left')

    del test_x, test_y
    gc.collect()

    #preprocessing testing data
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
    except Exception:
        logger.exception('test_data preprocessing error')

    # stock_index_q1 = test_data[['date','code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)  # drop code & date

    #dataframe to save the result
    resultscoredf_h = pd.DataFrame()

    for day in Y_days:
        result = model_testing_new(day, test_data, xnamelist, season, year,
                                   logger, out_folder_path)
        result = result.rename(
            {
                'precision_1': '%dS%d_%dD_precision_1' % (year, season, day),
                'recall_1': '%dS%d_%dD_recall_1' % (year, season, day)
            },
            axis=0)
        # y_score = pd.DataFrame(y_score)
        # y_score.columns = ["y_1_%sD_%sPCT"%(day,rate)]
        # stock_index_q1 = pd.concat([stock_index_q1,y_score],axis=1)
        resultscoredf_h = pd.concat(
            [resultscoredf_h, result])  # DataFrame.append was removed in pandas 2.0

    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('testing s%d has finished' % season)
    # print(nowtime)

    # record statistics
    result_csv_path = '%s/precision_recall.csv' % out_predict_path
    tmp_columns = list(range(50, 100, 5))
    tmp_columns = [round(x * 0.01, 2) for x in tmp_columns]
    tmp_columns.append('year')
    tmp_columns.append('season')
    resultscoredf_h = resultscoredf_h[tmp_columns]
    if os.path.exists(result_csv_path):
        resultscoredf_h.to_csv(result_csv_path, header=False, mode='a')
    else:
        resultscoredf_h.to_csv(result_csv_path)

    '_________________________________ Record Prediction __________________________________'
    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('backtest data generating.')
    # print(nowtime)

    test_data = loadData(starttime_test, endtime_test)
    try:
        test_data.drop_duplicates(['code', 'date'], inplace=True)
        if 'index' in test_data.columns.tolist():
            test_data = test_data.drop(['index'], axis=1)
        test_data = test_data.reset_index(drop=True)  # .reindex() with no args was a no-op
    except Exception:
        logger.exception('test_data preprocessing error')

    stock_index = test_data[['date', 'code']]
    test_data.drop(['date', 'code'], axis=1, inplace=True)
    test_data = test_data[
        xnamelist]  # make the columns consistent with training
    test_data = xgb.DMatrix(test_data, feature_names=xnamelist)
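    # the booster was trained with these feature names, so the DMatrix must
    # carry the same columns in the same order -- hence the xnamelist filter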

    for day in Y_days:
        xgb1 = xgb.Booster()
        xgb1.load_model('%s/train_model_%dD.m' % (out_folder_path, day))
        y_score = xgb1.predict(test_data)
        y_score = pd.DataFrame(y_score, columns=['proba_1_%dD' % day])
        stock_index = pd.concat([stock_index, y_score], axis=1)
        logger.info('day = %sD' % day)
    stock_index.to_csv("%s/stockscore_%ds%d.csv" %
                       (out_predict_path, year, season),
                       index=False,
                       sep=',')

    # nowtime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logger.info('backtest data has generated. ')

# ============================ Example #5 ============================
def run(starttime_train, endtime_train, output_path, predict_path):
    y_days = [2, 5, 10, 20]

    # change the model name as needed
    global train_x, train_y, val_x, val_y, train_data, val_data

    # ============== objective function =============
    def objective(args):
        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'max_depth': args['max_depth'] + 6,
            'learning_rate': args['learning_rate'],
            'feature_fraction': args['feature_fraction'],
            'bagging_fraction': args['bagging_fraction'],
            'num_leaves': int(2 ** (args['max_depth'] + 6) * 0.7)  # int() floors here; np.math is deprecated
        }
        clf = lgb.train(params, train_data, num_boost_round=1000000,
                        valid_sets=[train_data, val_data], valid_names=['train', 'val'],
                        early_stopping_rounds=15, verbose_eval=1000)

        y_score = clf.predict(val_x)
        fpr, tpr, threshods = roc_curve(val_y, y_score, pos_label=1)
        aucscore = auc(fpr, tpr)
        return -aucscore  # hyperopt minimizes the objective, so negate AUC

    # ==========================================
    # ============= optimization parameter space ===============
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 0.05, 0.15),
        'max_depth': hp.randint('max_depth', 10),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 0.9),
        'feature_fraction': hp.uniform('feature_fraction', 0.7, 0.9),
    }
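    # hp.randint('max_depth', 10) samples integers 0-9, so the +6 offset in
    # the objective searches depths 6-15; num_leaves is then kept below
    # 2**max_depth (factor 0.7) as a guard against overfitting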
    # ==========================================

    x = loadData(starttime_train, endtime_train)
    y = yload(starttime_train, endtime_train, ConfigQuant)

    # drop features with large proportion of nan
    drop_nan_ratio = 0.7
    tmp_nan_ratio = x.isnull().sum(axis=0) / x.shape[0]
    tmp_drop_cols = tmp_nan_ratio[tmp_nan_ratio > drop_nan_ratio].index.tolist()
    if len(tmp_drop_cols) > 0:
        print('drop nan columns:', tmp_drop_cols)
        x = x.drop(tmp_drop_cols, axis=1)
    xnamelist = x.columns.tolist()  # feature names (without code & date)
    xnamelist.remove('code')
    xnamelist.remove('date')
    with open('%s/xnamelist.pcl' % output_path, 'wb') as tmp_fo:  # save feature name to a backup position
        pickle.dump(xnamelist, tmp_fo)
    with open('%s/xnamelist.pcl' % predict_path, 'wb') as tmp_fo:  # save feature name
        pickle.dump(xnamelist, tmp_fo)

    for tmp_day in y_days:
        y_name = 'Y_%dD' % tmp_day
        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(y_name)
        print('training has started.')
        print(nowtime)

        tmp_y = y[['code', 'date', y_name]]
        tmp_data = pd.merge(x, tmp_y, on=['date', 'code'], how='inner')
        tmp_data.dropna(subset=[y_name], inplace=True)
        tmp_y = tmp_data[y_name]
        tmp_x = tmp_data.drop(['code', 'date', y_name], axis=1)
        train_x, val_x, train_y, val_y = train_test_split(tmp_x, tmp_y, test_size=0.1, random_state=68)

        train_x.to_csv('train_features_%dD.csv' % tmp_day, index=False, encoding='gbk')
        train_y.to_csv('train_labels_%dD.csv' % tmp_day, index=False)
        val_x.to_csv('val_features_%dD.csv' % tmp_day, index=False, encoding='gbk')
        val_y.to_csv('val_labels_%dD.csv' % tmp_day, index=False)
        tmp_x.to_csv('whole_features_%dD.csv' % tmp_day, index=False, encoding='gbk')
        tmp_y.to_csv('whole_labels_%dD.csv' % tmp_day, index=False)

        del tmp_data, tmp_x, tmp_y
        gc.collect()
        train_data = lgb.Dataset(train_x, label=train_y)
        val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

        # increase max_evals for more optimization trials
        best_sln = fmin(objective, space=params_space, algo=tpe.suggest, max_evals=10)

        params = {
            'task': 'train',
            'num_threads': 45,
            'objective': 'binary',
            'boosting': 'dart',
            'verbosity': -1,
            'tree_learner': 'data',
            'seed': 66,
            'min_data_in_leaf': 200,
            'metric': 'auc',
            'learning_rate': best_sln['learning_rate'],
            'feature_fraction': best_sln['feature_fraction'],
            'max_depth': best_sln['max_depth'] + 6,
            'bagging_fraction': best_sln['bagging_fraction'],
            'num_leaves': int(2 ** (best_sln['max_depth'] + 6) * 0.7),  # int() floors; np.math is deprecated
        }

        clf = lgb.train(params, train_data, num_boost_round=3000, valid_sets=[train_data, val_data],
                        valid_names=['train', 'val'], early_stopping_rounds=15, verbose_eval=1000)

        joblib.dump(clf, '%s/model_%s_%s.m' % (output_path, endtime_train, y_name)) # save model file to a backup location
        joblib.dump(clf, '%s/model_%s.m' % (predict_path, y_name))  # save model file
        importance = pd.DataFrame({'feature': clf.feature_name(), 'importance': clf.feature_importance('gain')})
        importance.to_excel('%s/feature_importance_%s_%s.xlsx' % (output_path, endtime_train, y_name))

        nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('training has finished')
        print(nowtime)
        del train_x, train_y, val_x, val_y
        gc.collect()

    del x, y
    gc.collect()
    '_____________________________________________________________________________'
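
# A hypothetical driver (dates and paths are illustrative placeholders, not
# from the original source): train models on a one-year window, then run the
# daily prediction job against the freshly saved artifacts.
if __name__ == '__main__':
    run('2018-01-01', '2019-01-01', './model_backup', './model_live')
    predictComplete('./model_live', './score_backup')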