# """
# Descriptive statistics for every factor.
factor_describe = {}
for fac, fac_frame in fac_data.items():
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # Run describe() once on the transposed frame and reuse the result for
    # both the stored table and the printed summary (it used to be computed
    # twice per factor).
    fac_stats = fac_frame.T.describe()
    # Store the descriptive statistics, transposed back.
    factor_describe[fac] = fac_stats.T
    # Print the mean of each statistic across the described columns.
    print(fac, fac_stats.mean(axis=1))
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
# """
# Prediction target: the return from the next day's first-hour-after-open
# tvwap to the following day's first-hour-after-open tvwap.
# NOTE(review): presumably a negative lag in uc.ts_delay brings future rows
# back onto the current date -- confirm against the helper.
begin = '2017-01-01'
end = '2021-03-02'
twap_field = 'stock_twap_0930_1030'
# Open question from the original author: should the adjusted series
# (adjtvwap) be fetched here instead?
tvwap = fetch_data.fetch(begin, end, [twap_field])
exit_price = uc.ts_delay(tvwap[twap_field], -2)
entry_price = uc.ts_delay(tvwap[twap_field], -1)
next_re = exit_price / entry_price - 1
# Drop rows in which every value is missing.
fac_data['next_re'] = next_re.dropna(how='all')

# Merge the per-day data: stack every table in fac_data into a single
# column named after its key, then align all columns side by side.
stacked_columns = []
for col_name, wide_frame in fac_data.items():
    long_frame = pd.DataFrame(wide_frame.stack())
    long_frame.columns = [col_name]
    stacked_columns.append(long_frame)
new_f = pd.concat(stacked_columns, axis=1)

# Persist the reshaped factor table (remember to update the output path).
# The context manager closes the file even when pickling raises, which the
# manual open()/close() pair did not guarantee.
with open(data_pat + '/fac_reshape.pkl', 'wb') as f:
    pickle.dump(new_f, f, -1)  # negative protocol = highest available
# new_f.reset_index().to_csv(data_pat + '/fac_reshape.csv',index=False,encoding='gbk')
# Example #2
# 0
mine_summary = query_data.get_alphafactors_info(user='******')
# Flip factor signs: multiply every known factor by the sign of its 1-day
# IC so that all adjusted factors point in the same direction.
factor_value_adj = {}
for summa in mine_summary:
    fac_name = summa['factor_name']
    # Direct membership test instead of building list(keys()) every pass.
    if fac_name not in factor_value:
        continue
    perf_1d = summa['perf']['1_d']
    # Performance records carry the IC under 'IC' or, failing that,
    # under 'ic-mean'.
    ic_1d = perf_1d['IC'] if 'IC' in perf_1d else perf_1d['ic-mean']
    factor_value_adj[fac_name] = factor_value[fac_name] * uc.sign(ic_1d)

# Build the up/down label of each stock over the next n days
# (n = pred_window).
oc_data = fetch_data.fetch(begin, end, ['stock_adjopen', 'stock_adjclose'])
future_close = uc.ts_delay(oc_data['stock_adjclose'], -pred_window)
entry_open = uc.ts_delay(oc_data['stock_adjopen'], -1)  # buy at the next day's open
forward_ret = future_close / entry_open - 1
# Positive returns become 1 and negative returns become 0; exact zeros and
# missing values are left untouched by the two masks.
ud_tag = forward_ret.mask(forward_ret > 0, 1)
ud_tag = ud_tag.mask(ud_tag < 0, 0)

# Reshape the sign-adjusted factor values: stack each table into a single
# column named after the factor, then align all columns side by side.
stacked_columns = []
for col_name, wide_frame in factor_value_adj.items():
    long_frame = pd.DataFrame(wide_frame.stack())
    long_frame.columns = [col_name]
    stacked_columns.append(long_frame)
new_f = pd.concat(stacked_columns, axis=1)

# Rolling generation of the up-move probability forecast.
# NOTE(review): `prediction` and `bay` are not used by the loop body visible
# here, and `coef` / `R_sq` are written without being initialised in this
# excerpt -- presumably they are created outside this view; confirm.
prediction = {}
bay = GaussianNB()
for date in trade_days:
    # Rows of new_f selected for this date.
    sub_data = new_f.loc[date, ]
    # Regress the last column on all preceding columns plus an intercept;
    # observations with missing values are dropped.
    model = sm.OLS(sub_data.iloc[:, -1],
                   sm.add_constant(sub_data.iloc[:, 0:-1]),
                   missing='drop').fit()
    # Keep the fitted parameters and the adjusted R-squared per date.
    coef[date] = model.params
    R_sq[date] = model.rsquared_adj
    print(date)
# Collect the per-date regression output: parameters first side by side
# (one column per date), then transposed to one row per date.
params_by_date = pd.concat(coef.values(), axis=1, keys=coef.keys())
coef_param = pd.DataFrame(params_by_date.values.T,
                          index=params_by_date.columns,
                          columns=params_by_date.index)
r2_param = pd.DataFrame(R_sq.values(),
                        index=R_sq.keys(),
                        columns=['R_square_adj'])

# Estimated parameters can only be used 2 days later, so lag each table by
# 2; the weights are then refreshed once per month by averaging within each
# 'YYYY-MM' bucket.
coef_param = uc.ts_delay(coef_param, 2)
coef_month = coef_param.index.strftime('%Y-%m')
coef_param = coef_param.groupby(coef_month).mean()

r2_param = uc.ts_delay(r2_param, 2)
r2_month = r2_param.index.strftime('%Y-%m')
r2_param = r2_param.groupby(r2_month).mean()

# Plot the monthly adjusted R-squared.
plt.figure()
plt.plot(r2_param.index, r2_param['R_square_adj'])
plt.show()

# Export both monthly tables.
for table, csv_suffix in ((coef_param, '/linear_regress_m/coef_param.csv'),
                          (r2_param, '/linear_regress_m/r2_param.csv')):
    table.to_csv(data_pat + csv_suffix, encoding='gbk')

# Plot the time series of every factor exposure (regression coefficient).
le = len(coef_param)
# HAC lag length: ceil(4 * (T / 100) ** (2 / 9)) with T = number of rows.
# (The original computed this identical value twice in a row.)
la = math.ceil(4 * (le / 100)**(2 / 9))
# Constant-only regressor, built once and shared by every regression below.
unit_regressor = [1] * le
for coef_name in coef_param.columns:
    plt.figure()
    plt.plot(coef_param.index, coef_param[coef_name])
    # Overlay the 20-row rolling mean of the same series.
    plt.plot(coef_param.index, coef_param[coef_name].rolling(20).mean())
    plt.title(coef_name, fontproperties="SimSun")
    plt.show()
    # Regress the coefficient series on a constant with HAC standard
    # errors, i.e. test whether its mean differs from zero.
    model = sm.OLS(coef_param[coef_name],
                   unit_regressor).fit(cov_type='HAC',
                                       cov_kwds={'maxlags': la})
    # Some factor coefficients are significantly negative? Possibly an
    # effect of the multi-factor regression.
    print(model.summary())


# Predicted return, using only the most recent cross-sectional regression
# coefficients.
fac = {}
new_f['const'] = 1
new_f = new_f.drop(['next_re'], axis=1)
# Attach to every row of new_f the coefficients lagged by 2 (parameters can
# only be used 2 days after they are estimated), then rebuild the two-level
# index from the existing index plus the 'level_1' column.
coef_param2 = pd.concat(
    [new_f.reset_index(level=1).iloc[:, 0],
     uc.ts_delay(coef_param, 2)],
    axis=1)
coef_param2 = coef_param2.set_index([coef_param2.index, 'level_1'])
# min_count=2: require at least one variable plus the const term.
pred = (coef_param2 * new_f).sum(axis=1, min_count=2)
pred = pred.unstack()
pred = pred.dropna(how='all')
fac['nearest'] = pred
# Remember to update the output path.  The context manager closes the file
# even when pickling raises.
with open(data_pat + '/linear_regress_7/nearest/fac.pkl', 'wb') as f:
    pickle.dump(fac, f, -1)  # negative protocol = highest available


# Predicted return, using the mean of the coefficients from the past 20
# daily cross-sectional regressions.
fac = {}
stock_col = new_f.reset_index(level=1).iloc[:, 0]
# Parameters can only be used 2 days after they are estimated.
smoothed_coef = uc.ts_delay(coef_param.rolling(20).mean(), 2)
coef_param3 = pd.concat([stock_col, smoothed_coef], axis=1)
coef_param3 = coef_param3.set_index([coef_param3.index, 'level_1'])
# min_count=2: require at least one variable plus the const term.
weighted_terms = coef_param3 * new_f
pred2 = weighted_terms.sum(axis=1, min_count=2)
    # NOTE(review): indented loop body whose header is not visible in this
    # excerpt; `coef_name`, `le` and `la` come from outside this view.
    plt.plot(coef_param.index, coef_param[coef_name])
    # Overlay the 20-row rolling mean of the same series.
    plt.plot(coef_param.index, coef_param[coef_name].rolling(20).mean())
    plt.title(coef_name, fontproperties="SimSun")
    plt.show()
    # Regress the series on a constant with HAC standard errors.
    model = sm.OLS(coef_param[coef_name],
                   [1 for i in range(le)]).fit(cov_type='HAC',
                                               cov_kwds={'maxlags': la})
    print(model.summary())  # Some factor coefficients are significantly negative? Possibly an effect of the multi-factor regression

# Predicted return, using only the most recent cross-sectional regression
# coefficients.
fac = {}
new_f['const'] = 1
new_f = new_f.drop(['next_re'], axis=1)
# Attach to every row of new_f the coefficients lagged by 2 (parameters can
# only be used 2 days after they are estimated), then rebuild the two-level
# index from the existing index plus the 'level_1' column.
coef_param2 = pd.concat(
    [new_f.reset_index(level=1).iloc[:, 0],
     uc.ts_delay(coef_param, 2)],
    axis=1)
coef_param2 = coef_param2.set_index([coef_param2.index, 'level_1'])
# min_count=2: require at least one variable plus the const term.
pred = (coef_param2 * new_f).sum(axis=1, min_count=2)
pred = pred.unstack()
pred = pred.dropna(how='all')
fac['nearest'] = pred
# Remember to update the output path.  The context manager closes the file
# even when pickling raises.
with open(data_pat + '/linear_regress_22/nearest/fac.pkl', 'wb') as f:
    pickle.dump(fac, f, -1)  # negative protocol = highest available

# Predicted return, using the mean of the coefficients from the past 20
# daily cross-sectional regressions.
# NOTE(review): the pd.concat call below is cut off in this excerpt -- its
# closing brackets and remaining arguments are not visible.
fac = {}
coef_param3 = pd.concat([
    new_f.reset_index(level=1).iloc[:, 0],
    uc.ts_delay(coef_param.rolling(20).mean(), 2)
from copy import deepcopy
import numpy as np
import time
import json
from collections import Counter

data_pat = 'E:/FT_Users/LihaiYang/Files/factor_comb_data/fac_meaning/5group'  # remember to update

# Forward returns over 1, 3, 5, 10 and 20 days, measured on the
# first-hour-after-open tvwap.
begin = '2015-01-01'  # remember to update
end = '2020-02-28'
end1 = '2019-12-31'
data = fetch_data.fetch(begin, end, ['stock_adjtwap_0930_1030'])
index_data = fetch_data.fetch(begin, end, ['index_close'], '000905')

adj_twap = data['stock_adjtwap_0930_1030']
# Every horizon shares the same entry price (series lagged by -1), so it is
# computed once instead of once per horizon.
entry_twap = uc.ts_delay(adj_twap, -1)
# Lag applied to the exit price of each horizon: -(horizon + 1).
exit_shift = {'1_d': -2, '3_d': -4, '5_d': -6, '10_d': -11, '20_d': -21}
stock_re = {
    label: uc.ts_delay(adj_twap, shift) / entry_twap - 1
    for label, shift in exit_shift.items()
}
# NOTE(review): this call is cut off in the excerpt -- the remaining
# arguments and the closing parenthesis are not visible.
trade_days = query_data.get_trade_days('d',
                                       from_trade_day=begin,
# Plot the adjusted R-squared together with its 20-row rolling mean.
plt.figure()
for r2_line in (r2_param['R_square_adj'],
                r2_param['R_square_adj'].rolling(20).mean()):
    plt.plot(r2_param.index, r2_line)
plt.show()

# Export the coefficient and R-squared tables.
for table, csv_suffix in ((coef_param, '/ols/coef_param.csv'),
                          (r2_param, '/ols/r2_param.csv')):
    table.to_csv(data_pat + csv_suffix, encoding='gbk')

# Predicted returns.
new_f['const'] = 1
new_f = new_f.drop(['stock_rela'], axis=1)
fac = {}

# Estimated parameters can only be used 11 days later.  The lag used to be
# hard-coded in two places (both flagged "remember to update"); it now
# lives here so the two predictions cannot drift apart.
coef_lag = 11  # remember to update
# First column left after moving index level 1 into the columns; it is
# reused below as the 'level_1' index level and is identical for both
# predictions, so it is extracted once.
stock_col = new_f.reset_index(level=1).iloc[:, 0]

# Using only the most recent cross-sectional regression coefficients.
coef_param2 = pd.concat([stock_col, uc.ts_delay(coef_param, coef_lag)],
                        axis=1)
coef_param2 = coef_param2.set_index([coef_param2.index, 'level_1'])
# NOTE(review): min_count=1 lets a row with a single non-missing term
# produce a prediction -- confirm this is intended.
pred = (coef_param2 * new_f).sum(axis=1, min_count=1)
pred = pred.unstack()
pred = pred.dropna(how='all')
fac['nearest'] = pred

# Using the mean of the coefficients from the past 20 daily cross-sectional
# regressions.
coef_param3 = pd.concat(
    [stock_col,
     uc.ts_delay(coef_param.rolling(20).mean(), coef_lag)],
    axis=1)
coef_param3 = coef_param3.set_index([coef_param3.index, 'level_1'])
pred2 = (coef_param3 * new_f).sum(axis=1, min_count=1)