Example #1
    def get_all_data(self, factor_name):
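        # Load the per-date factor CSVs for `factor_name`, keep only the stocks
        # in the universe (self.pool), and left-merge the result with the
        # return table into self.allData.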
        timeline = ts.get_trading_date(self.start, self.end)
        ret = pd.read_parquet('/home/sharedFold/zhaorui/ret.parquet')

        read_path = r'/home/xiaonan/factor_wxn/factor/'
        read_path = os.path.join(read_path, factor_name)
        
        ans = []
        
        for date in timeline:
            path = read_path + r'/' + str(date) + '.csv'
            if os.path.exists(path):
                factor = pd.read_csv(path, index_col=None, header=None)
            else:
                continue
            factor.columns = ['code', 'nouse', 'values']
            factor.loc[:, 'date'] = date
            factor = factor[factor['code'].isin(self.pool.loc[date,'code'].values)].copy()
            ans.append(factor)

        factor = pd.concat(ans)
        factor.reset_index(inplace=True)
        factor.drop(['index', 'nouse'], axis=1, inplace=True)

        self.allData = pd.merge(factor,
                                ret,
                                how='left',
                                left_on=['code', 'date'],
                                right_on=['code', 'dt'])
Example #2
def factorTimeInterval():
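    # Scan every factor directory, record the first and last date file it
    # contains, warn when trading dates (per ts.get_trading_date) are missing,
    # and write the summary to factorTimeInterval.csv.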
    ans = {}
    path = r'/home/xiaonan/factor_wxn/factor/'
    for i in os.listdir(path):
        if i == '.directory':
            continue
        ans[i] = {}
        factor_dir = os.path.join(path, i)
        datelist = os.listdir(factor_dir)
        datelist = [int(f[:8]) for f in datelist if f != '.directory']
        start = np.min(datelist)
        end = np.max(datelist)
        t = ts.get_trading_date(start, end)
        ans[i]['start'] = start
        ans[i]['end'] = end
        print('****************************************')
        print('factor_name: ', i)
        if len(t) != len(datelist):
            print('{}: the trading date is missing {}!!!'.format(
                i,
                len(t) - len(datelist)))
        print('start  time: ', start)
        print('end    time: ', end)
    print('****************************************')

    pd.DataFrame(ans).T.to_csv('factorTimeInterval.csv')
Example #3
    def get_factor(self, start, end, factor_name, delay):
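        # Read the per-date factor files in parallel and stamp each row with
        # the trading date shifted forward by `delay` days (presumably to align
        # the factor with its forward return).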
        def get_one(date):
            path = read_path + r'/' + str(date) + '.csv'
            factor = pd.read_csv(path, index_col=None, header=None)
            factor.columns = ['code', 'nouse', 'values']
            factor.loc[:, 'date'] = ts.get_nxt_trading_dates(date,
                                                             delay + 1)[-1]
            return factor

        timeline = ts.get_trading_date(start, end)

        read_path = os.path.join(self.factor_path, factor_name)

        ans = Parallel(10)(delayed(get_one)(date) for date in tqdm(timeline))
        '''
        for date in timeline:
            path = read_path + r'/' + str(date) + '.csv'
            factor = pd.read_csv(path, index_col=None, header=None)
            factor.columns = ['code', 'nouse','values']
            factor.loc[:,'date'] = date
            ans.append(factor)
        '''
        factor = pd.concat(ans)
        factor.reset_index(inplace=True)
        factor.drop(['index', 'nouse'], axis=1, inplace=True)

        return factor
Example #4
    def regression(start, end, factor, ret):
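        # Two-pass regression in the spirit of Fama-MacBeth: first a
        # time-series OLS per stock to get its beta on the factor, then a
        # cross-sectional OLS per date of returns on those betas to get the
        # factor premium (lambda).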
        path = r'/home/xiaonan/factor_wxn/factor/'
        path = os.path.join(path, factor)

        x = {}
        y = {}
        for date in ts.get_trading_date(start, end):
            _ans = pd.read_csv(path + r'/' + str(date) + r'.csv',
                               index_col=0,
                               header=None)
            _ans.columns = ['nouse', 'values']

            x[date] = _ans.loc[:, 'values']

            y[date] = ret.loc[ret['dt'] == date,
                              ['code', 'y_close_5']].set_index('code')['y_close_5']

        x = pd.DataFrame(x).T.fillna(0)
        y = pd.DataFrame(y).T.fillna(0)
        code_list = x.columns
        y = y.loc[:, x.columns]
        x = x.to_dict('series')
        y = y.to_dict('series')

        beta = {}
        for code in code_list:
            _x = sm.add_constant(x[code])
            _y = y[code]
            model = sm.OLS(_y, _x)
            r = model.fit()
            beta[code] = r.params.iloc[-1]

        beta = pd.DataFrame(beta, index=['beta']).T
        lambdai = {}
        y = pd.DataFrame(y).T.to_dict('series')
        for date in ts.get_trading_date(start, end):
            model = sm.OLS(y[date], sm.add_constant(beta))
            r = model.fit()
            lambdai[date] = r.params.iloc[-1]

        lambdai = pd.DataFrame(lambdai, index=['lambda']).T
        lambdai.to_csv('lambda.csv')
        print(lambdai.mean())
Example #5
def count_nan(start, end):
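    # Flag (factor_name, date) pairs whose raw factor file has 30 or more
    # missing values and dump the list to list.npy.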
    ans = []
    timeline = ts.get_trading_date(start, end)
    path = r'/home/xiaonan/factor_wxn/rawFactor/'
    for factor_name in tqdm(os.listdir(path)):
        if factor_name == '.directory':
            continue
        for date in timeline:
            read_path = path + factor_name + r'/' + str(date) + r'.csv'
            data = pd.read_csv(read_path, index_col=0, header=None)
            data.columns = ['nouse', 'values']
            if data['values'].isnull().sum() >= 30:
                ans.append((factor_name, date))
    np.save('list', ans)
Example #6
    def run(self):
        """
        start:int
        end:int
        factor_list:a list contains some functions
        """
        timeline = ts.get_trading_date(self.start, self.end)

        for date in tqdm(timeline):
            # ********************  load data and preprocessing  ********************
            start = datetime.datetime.now()
            x1 = x3 = x4 = datetime.timedelta(0)

            stkdata = data.data._get_snap(date)
            x1 = datetime.datetime.now() - start

            time1 = datetime.datetime.now()

            save_list0 = []  # save raw factor
            save_list1 = []  # save nor factor

            MAX_WORKERS = 12
            res = Parallel(n_jobs=MAX_WORKERS)(
                delayed(self.pipiline)(func, stkdata)
                for func in tqdm(self.factor_list))

            # ******************** save raw data and normalization data  ********************
            for factor, _factor, func in res:
                save_list0.append((factor, date, func.__name__))
                save_list1.append((_factor, date, func.__name__))

            time2 = datetime.datetime.now()
            x3 += time2 - time1
            time5 = datetime.datetime.now()

            tools.tools.save2raw(save_list0)
            tools.tools.save2nor(save_list1)

            end = datetime.datetime.now()
            x4 = end - time5

            print('snapshot data')
            print('load data:  ', x1)
            print('cal factor:  ', x3)
            print('save factor:  ', x4)
            print('all time:  ', end - start)
Example #7
    def group_test(self,
                   start,
                   end,
                   factor_name,
                   n=5,
                   trading_settlement='close2close',
                   delay=0,
                   day=1,
                   plot=True):
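        # Quantile (group) backtest: each day the universe is split into n
        # buckets by factor value, the mean forward return of every bucket is
        # recorded, and the cumulative group curves are saved and plotted.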
        self.preprocessing(start, end, factor_name, delay)

        _factor = self.factor[factor_name].copy()

        allData = pd.merge(_factor,
                           self.ret,
                           how='left',
                           left_on=['code', 'date'],
                           right_on=['code', 'dt'])
        allData = allData.values

        timeline = ts.get_trading_date(start, end)
        _ret = {}
        _val = {}
        for i in range(allData.shape[0]):
            if allData[i, 2] in _ret.keys():
                pass
            else:
                _ret[allData[i, 2]] = []
                _val[allData[i, 2]] = []

            if np.isnan(allData[i, 5]):
                _ret[allData[i, 2]].append(0)
            else:
                _ret[allData[i, 2]].append(allData[i, 5])
            if np.isnan(allData[i, 1]):
                _val[allData[i, 2]].append(0)
            else:
                _val[allData[i, 2]].append(allData[i, 1])

        # per-group lists of daily mean returns (a dict is used instead of the
        # original locals() trick, which is unreliable inside a function)
        groups = {j: [] for j in range(n)}

        for i in tqdm(timeline):
            _ret_i = np.array(_ret[i])

            _val_i = np.array(_val[i])
            percent = []
            for j in range(n + 1):
                percent.append(np.percentile(_val_i, 100 / n * j))
            percent[-1] += 1

            for j in range(n):
                lay = (_val_i >= percent[j]) & (_val_i < percent[j + 1])
                if np.sum(lay) > 0 and not np.isnan(np.mean(_ret_i[lay])):
                    groups[j].append(np.mean(_ret_i[lay]))
                else:
                    groups[j].append(np.mean(_ret_i))

        if not os.path.exists(self.write_path + factor_name + r'/'):
            os.mkdir(self.write_path + factor_name + r'/')

        group = {str(j): groups[j] for j in range(n)}
        cum = (pd.DataFrame(group, index=list(map(str, timeline))) + 1).cumprod()
        # cum = pd.DataFrame(group, index=list(map(str, timeline))).cumsum() + 1
        pd.DataFrame(group, index=list(map(str, timeline))).to_csv('group.csv')
        if plot:
            self.ReportFig.groupFig(cum.apply(lambda x: x - x.mean(), axis=1),
                                    self.write_path + factor_name + r'/')

        return group
Example #8
    def ratio(self,
              start,
              end,
              factor_name,
              freq='3M',
              trading_settlement='close2close',
              delay=0,
              day=1):
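        # Windowed performance summary of the long-short curve (long group '0',
        # short group '4'): for each `freq` window it reports returns, Sharpe,
        # win ratio, max drawdown, Calmar, etc. via the self.CRatio helpers.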
        fre = int(freq[:-1])
        fee = 0.9995
        group = self.group_test(start,
                                end,
                                factor_name,
                                trading_settlement='close2close',
                                delay=0,
                                day=1,
                                plot=False)

        l = np.cumsum(np.array(group['0'])) + 1
        s = np.cumsum(np.array(group['4'])) + 1
        longshort = np.cumsum(-np.array(group['4']) + np.array(group['0'])) + 1
        '''
        fig,ax = plt.subplots()
        ax.plot(longshort, color = 'y')
        ax.plot(l,color = 'b')
        ax.plot(s,color = 'r')
        plt.show(fig)
        '''
        # l = np.cumprod((np.array(group['0']) + 1) * fee)
        # s = np.cumprod((np.array(group['0']) + 1) * fee)

        # longshort = np.cumprod((1 - np.array(group['4']) + np.array(group['0'])) * fee)

        timeline = ts.get_trading_date(start, end)
        timeline = np.array(timeline)
        time_split = [timeline[0]]

        while time_split[-1] <= timeline[-1]:
            Y, a = divmod(time_split[-1], 10000)
            M, D = divmod(a, 100)
            # roll the month forward by `fre`, keeping months in 1..12 and
            # carrying the overflow into the year
            m0 = M - 1 + fre
            time_split.append(10000 * (Y + m0 // 12) + 100 * (m0 % 12 + 1) + D)

        ans = {
            'DateRange': {},
            'TradingDays': {},
            'ReturnRatio': {},
            'LongReturnRatio': {},
            'ShortReturnRatio': {},
            'SharpeRatio': {},
            'WinRatio': {},
            'ProfitCrossRatio': {},
            'MaxDrawdown': {},
            'CalmarRatio': {},
        }

        for i in range(len(time_split) - 1):
            lay = (timeline >= time_split[i]) & (timeline < time_split[i + 1])
            time = timeline[lay]
            data = longshort[lay]
            longdata = l[lay]
            shortdata = s[lay]

            ans['DateRange'][i] = '{}~{}'.format(time_split[i],
                                                 time_split[i + 1])
            ans['TradingDays'][i] = self.CRatio.dateRange(time)
            ans['ReturnRatio'][i], ans['LongReturnRatio'][i], ans[
                'ShortReturnRatio'][i] = self.CRatio.allRet(
                    time, data, longdata, shortdata)
            ans['SharpeRatio'][i] = self.CRatio.sharpe(time, data)
            ans['WinRatio'][i] = self.CRatio.win_ratio(time, data)
            ans['ProfitCrossRatio'][i] = self.CRatio.pcr(time, data)
            ans['MaxDrawdown'][i] = self.CRatio.maxdrawdown(time, data)
            ans['CalmarRatio'][i] = self.CRatio.calmar(time, data)

        # print dataframe
        printDataFrame(pd.DataFrame(ans))
        '''
        x = 18
        for i in pd.DataFrame(ans).columns:
            print(i + (x+1 - len(i)) * ' ' + '|', end = '')
        print('')
        for i,j in pd.DataFrame(ans).iterrows():
            for k in j.index:
                if isinstance(j[k], str):
                    res = j[k]
                else:
                    res = str(np.round(j[k],4))
                print(res, (x - len(res)) * ' ' + '|', end = '')
            print('')
        '''
        pd.DataFrame(ans).to_csv(self.write_path + factor_name + r'/ratio.csv',
                                 index=None)
Example #9
    def ic_test(self,
                start,
                end,
                factor_name,
                trading_settlement='close2close',
                delay=0,
                day=1):
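        # Cumulative IC curves: daily correlation between factor value and
        # forward return (via paraCorr.one_corr) for the whole cross-section
        # and for the top/bottom halves split by return (y) and by factor
        # value (x).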
        self.preprocessing(start, end, factor_name, delay)

        _factor = self.factor[factor_name].copy()

        allData = pd.merge(_factor,
                           self.ret,
                           how='left',
                           left_on=['code', 'date'],
                           right_on=['code', 'dt'])

        # print(allData.columns)
        allData = allData.values

        timeline = ts.get_trading_date(start, end)
        _ret = {}
        _val = {}
        for i in range(allData.shape[0]):
            if allData[i, 2] in _ret.keys():
                pass
            else:
                _ret[allData[i, 2]] = []
                _val[allData[i, 2]] = []

            if np.isnan(allData[i, 5]):
                _ret[allData[i, 2]].append(0)
            else:
                _ret[allData[i, 2]].append(allData[i, 5])
            if np.isnan(allData[i, 1]):
                _val[allData[i, 2]].append(0)
            else:
                _val[allData[i, 2]].append(allData[i, 1])

        kinds = ['AllIC', 'xBottom', 'xTop', 'yBottom', 'yTop']
        ic = {}
        for i in kinds:
            ic[i] = []

        for i in tqdm(timeline):
            a = paraCorr.one_corr(np.array(_ret[i]), np.array(_val[i]))
            if ic['AllIC']:
                ic['AllIC'].append(ic['AllIC'][-1] + a)
            else:
                ic['AllIC'].append(a)

            lay = (np.array(_ret[i]) > np.median(_ret[i]))
            ytop = paraCorr.one_corr(
                np.array(_ret[i])[lay],
                np.array(_val[i])[lay])
            if ic['yTop']:
                ic['yTop'].append(ic['yTop'][-1] + ytop)
            else:
                ic['yTop'].append(ytop)

            lay = (np.array(_ret[i]) < np.median(_ret[i]))
            ybottom = paraCorr.one_corr(
                np.array(_ret[i])[lay],
                np.array(_val[i])[lay])
            if ic['yBottom']:
                ic['yBottom'].append(ic['yBottom'][-1] + ybottom)
            else:
                ic['yBottom'].append(ybottom)

            lay = (np.array(_val[i]) > np.median(_val[i]))
            xtop = paraCorr.one_corr(
                np.array(_ret[i])[lay],
                np.array(_val[i])[lay])
            if ic['xTop']:
                ic['xTop'].append(ic['xTop'][-1] + xtop)
            else:
                ic['xTop'].append(xtop)

            lay = (np.array(_val[i]) < np.median(_val[i]))
            xbottom = paraCorr.one_corr(
                np.array(_ret[i])[lay],
                np.array(_val[i])[lay])
            if ic['xBottom']:
                ic['xBottom'].append(ic['xBottom'][-1] + xbottom)
            else:
                ic['xBottom'].append(xbottom)

        if not os.path.exists(self.write_path + factor_name + r'/'):
            os.mkdir(self.write_path + factor_name + r'/')

        self.ReportFig.icFig(pd.DataFrame(ic, index=list(map(str, timeline))),
                             self.write_path + factor_name + r'/')
        pd.DataFrame(ic, index=list(map(str, timeline))).to_csv('ic.csv')
        return ic
Example #10
def portfolio_beta(start,
                   end,
                   factor_list,
                   inSample=100,
                   outSample=10,
                   univ_name='TOP2000'):
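    # Rolling walk-forward backtest: fit a linear model of next-day returns on
    # the factors over an `inSample` window, predict the following `outSample`
    # days, hold the NUM highest-ranked stocks, and plot the in-sample,
    # out-of-sample and market-neutral equity curves.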
    NUM = 200
    print('loading data......')
    ret = pd.read_parquet('/home/sharedFold/zhaorui/ret.parquet')
    pool = dM.load_universe(start, end, univ_name=univ_name)
    timeline = ts.get_trading_date(start, end)
    allFactor = pd.DataFrame()
    for factor_name in factor_list:
        read_path = '/home/xiaonan/factor_wxn/factor/' + factor_name + r'/'
        ans = []
        for date in timeline:
            data = pd.read_csv(read_path + str(date) + r'.csv', header=None)
            data.drop([1], inplace=True, axis=1)
            data.columns = ['code', factor_name]
            data.loc[:, 'date'] = date
            ans.append(data)
        ans = pd.concat(ans)
        if allFactor.empty:
            allFactor = ans.copy()
        else:
            allFactor = pd.merge(allFactor,
                                 ans,
                                 left_on=['code', 'date'],
                                 right_on=['code', 'date'],
                                 how='outer')
    allData = pd.merge(allFactor,
                       ret.loc[:, ['y_close_1', 'code', 'dt']],
                       left_on=['code', 'date'],
                       right_on=['code', 'dt'],
                       how='left')

    print('loading data end\n')
    print('backtest started')
    allInSample, allOutSample, allNeutral = [], [], []
    predict_days = np.arange(inSample, len(timeline), outSample)
    for i in tqdm(predict_days):
        inPool = pool.loc[timeline[i - inSample]:timeline[i]]
        if i + outSample < len(timeline):
            outPool = pool.loc[timeline[i]:timeline[i + outSample]]
        else:
            outPool = pool.loc[timeline[i]:]

        data_inSample = allData[
            allData.code.isin(np.unique(inPool.values.flatten()))
            & allData.date.isin(timeline[i - inSample:i])].copy()
        data_outSample = allData[
            allData.code.isin(np.unique(outPool.values.flatten()))
            & allData.date.isin(timeline[i:i + outSample])].copy()

        x = data_inSample.loc[:, factor_list].fillna(
            data_inSample.loc[:, factor_list].mean())
        y = data_inSample.y_close_1.fillna(data_inSample.y_close_1.mean())
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        r = model.fit(y=y.to_frame(), X=x)

        x = data_inSample.loc[:, factor_list].fillna(
            data_inSample.loc[:, factor_list].mean())
        predict_inSample = r.predict(x)
        data_inSample.loc[:, 'predict'] = predict_inSample

        x = data_outSample.loc[:, factor_list].fillna(
            data_outSample.loc[:, factor_list].mean())
        predict_outSample = r.predict(x)
        data_outSample.loc[:, 'predict'] = predict_outSample

        _in = data_inSample.groupby('date').apply(
            lambda x: x[['predict', 'y_close_1']].sort_values('predict').iloc[
                -NUM:, 1].mean()).copy()
        _out = data_outSample.groupby('date').apply(
            lambda x: x[['predict', 'y_close_1']].sort_values('predict').iloc[
                -NUM:, 1].mean()).copy()
        _out_neutral = _out - data_outSample.groupby('date').apply(
            lambda x: x['y_close_1'].mean()).copy()

        allInSample.append(_in.loc[timeline[i - inSample]:timeline[i]])
        allOutSample.append(_out)
        allNeutral.append(_out_neutral)

    allInSample = pd.concat(allInSample).sort_index()
    allInSample.index = allInSample.index.map(str)

    allOutSample = pd.concat(allOutSample).sort_index()
    allOutSample.index = allOutSample.index.map(str)

    allNeutral = pd.concat(allNeutral).sort_index()
    allNeutral.index = allNeutral.index.map(str)

    fig, ax = plt.subplots(3, 1)

    ax[0].plot((0.9997 * (1 + allInSample)).cumprod(), color='r')
    ax[1].plot((0.9997 * (1 + allOutSample)).cumprod(), color='b')
    ax[2].plot((0.9997 * (1 + allNeutral)).cumprod(), color='y')

    for t in [0, 1, 2]:
        for i, tick in enumerate(ax[t].get_xticklabels()):
            if i % 128 == 0:
                tick.set_visible(True)
                tick.set_rotation(30)
            else:
                tick.set_visible(False)

        for i, tick in enumerate(ax[t].get_xticklines()):
            if i % 128 == 0:
                tick.set_visible(True)
            else:
                tick.set_visible(False)

    fig.savefig('model_beta.jpg')
    # plt.show(fig)

    return (0.9997 * (1 + allNeutral)).cumprod()
Example #11
def plot_IC(start, end, name, retinterval='y_close_5', plot=False):
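    # Daily Spearman rank IC between the factor and `retinterval` on the
    # TOP2000 universe, drawn as a bar chart with the cumulative IC on a
    # second axis and saved to ./rankIcFig/<name>.jpg.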
    plt.rcParams['figure.figsize'] = (18, 6)
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'

    timeline = ts.get_trading_date(start, end)
    ret = pd.read_parquet('/home/sharedFold/zhaorui/ret.parquet')
    # ret = ret[ret['dt'].isin(timeline)].copy()

    ic = {}
    ic['ic_values'] = {}
    read_path = r'/home/xiaonan/factor_wxn/factor/'
    read_path = os.path.join(read_path, name)
    pool = dM.load_universe(start, end, univ_name='TOP2000')
    pool = pool.apply(lambda col: col.apply(int)).copy()
    for date in timeline:
        if os.path.exists(read_path + r'/' + str(date) + '.csv'):
            factor = pd.read_csv(read_path + r'/' + str(date) + '.csv',
                                 index_col=0,
                                 header=None)
        else:
            continue
        factor.columns = ['nouse', 'values']
        _ret = ret[ret['dt'] == date].set_index('code').copy()
        comindex = _ret.index.intersection(factor.index)  # computed but not used below

        codes = pool.loc[date, 'code'].values
        _ic = factor.loc[codes, ['values']].corrwith(
            _ret.loc[codes, retinterval], method='spearman').iloc[0]
        ic['ic_values'][str(date)] = _ic

    ans = pd.DataFrame(ic).sort_index()
    print(ans.mean().iloc[0])

    fig, ax = plt.subplots()

    ax.bar(x=ans.index,
           height=ans['ic_values'].apply(lambda x: x if x >= 0 else 0).values,
           color='#13CCB1',
           label='ic(left +)')
    ax.bar(x=ans.index,
           height=ans['ic_values'].apply(lambda x: x if x < 0 else 0).values,
           color='#EACC80',
           label='ic(left -)')

    for i in ['top', 'bottom', 'left', 'right']:
        ax.spines[i].set_visible(True)
    ax.yaxis.grid(linestyle='--', alpha=0.3)
    ax.set_title('{} {}'.format(name, round(ans.mean().iloc[0], 3)))

    ax.legend(loc='upper left')
    ax1 = ax.twinx()
    ax1.plot(ans.cumsum(), color='darkgrey', label='acc_ic(right)')
    ax1.legend(loc='upper right')

    for i, tick in enumerate(ax.get_xticklabels()):
        if i % 128 == 0:
            tick.set_visible(True)
            tick.set_rotation(30)
        else:
            tick.set_visible(False)
    for i, tick in enumerate(ax.get_xticklines()):
        if i % 128 == 0:
            tick.set_visible(True)
        else:
            tick.set_visible(False)

    plt.savefig('./rankIcFig/{}.jpg'.format(name))
    if plot:
        plt.show()