Example No. 1
    def test_neutralize_explain_output(self):
        y = self.y[:, 0].flatten()

        calc_res, other_stats = neutralize(self.x, y, detail=True)

        model = LinearRegression(fit_intercept=False)
        model.fit(self.x, y)

        exp_res = y - self.x @ model.coef_.T
        exp_explained = self.x * model.coef_.T

        np.testing.assert_array_almost_equal(calc_res, exp_res.reshape(-1, 1))
        np.testing.assert_array_almost_equal(other_stats['explained'][:, :, 0],
                                             exp_explained)

        calc_res, other_stats = neutralize(self.x, self.y, detail=True)

        model = LinearRegression(fit_intercept=False)
        model.fit(self.x, self.y)

        exp_res = self.y - self.x @ model.coef_.T
        np.testing.assert_array_almost_equal(calc_res, exp_res)

        for i in range(self.y.shape[1]):
            exp_explained = self.x * model.coef_.T[:, i]
            np.testing.assert_array_almost_equal(
                other_stats['explained'][:, :, i], exp_explained)
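
Read together, these assertions pin down the semantics: neutralize regresses y on x without an intercept and returns the residual, and with detail=True also returns an 'explained' cube whose slice [:, :, i] is the per-factor contribution to target i (the real function reports further statistics such as 'exposure', seen in Example No. 7). A minimal NumPy sketch consistent with this test only, not the library's actual implementation (neutralize_sketch is a hypothetical name):

import numpy as np

def neutralize_sketch(x, y):
    # OLS without intercept; beta has shape (n_features, n_targets)
    y2d = y.reshape(-1, 1) if y.ndim == 1 else y
    beta, *_ = np.linalg.lstsq(x, y2d, rcond=None)
    res = y2d - x @ beta                           # the neutralized factor
    explained = x[:, :, None] * beta[None, :, :]   # (n_samples, n_features, n_targets)
    return res, {'explained': explained}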
Example No. 2
    def test_neutralize_explain_output_with_group(self):
        y = self.y[:, 0].flatten()

        calc_res, other_stats = neutralize(self.x, y, self.groups, detail=True)

        model = LinearRegression(fit_intercept=False)
        for i in range(30):
            curr_x = self.x[self.groups == i]
            curr_y = y[self.groups == i]
            model.fit(curr_x, curr_y)
            exp_res = curr_y - curr_x @ model.coef_.T
            exp_explained = curr_x * model.coef_.T
            np.testing.assert_array_almost_equal(calc_res[self.groups == i], exp_res.reshape(-1, 1))
            np.testing.assert_array_almost_equal(other_stats['explained'][self.groups == i, :, 0],
                                                 exp_explained)

        calc_res, other_stats = neutralize(self.x, self.y, self.groups, detail=True)

        model = LinearRegression(fit_intercept=False)
        for i in range(30):
            curr_x = self.x[self.groups == i]
            curr_y = self.y[self.groups == i]
            model.fit(curr_x, curr_y)
            exp_res = curr_y - curr_x @ model.coef_.T
            np.testing.assert_array_almost_equal(calc_res[self.groups == i], exp_res)

            for j in range(self.y.shape[1]):
                exp_explained = curr_x * model.coef_.T[:, j]
                np.testing.assert_array_almost_equal(
                    other_stats['explained'][self.groups == i, :, j], exp_explained)
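
The grouped test fits one regression per group, so the grouped variant amounts to applying the same least-squares residual group by group. A sketch under that assumption (hypothetical helper name):

import numpy as np

def neutralize_by_group_sketch(x, y, groups):
    y2d = y.reshape(-1, 1) if y.ndim == 1 else y
    res = np.empty_like(y2d, dtype=float)
    for g in np.unique(groups):
        mask = groups == g
        beta, *_ = np.linalg.lstsq(x[mask], y2d[mask], rcond=None)
        res[mask] = y2d[mask] - x[mask] @ beta
    return res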
Example No. 3
def process_data(total_data, factor_cols, risk_cols):
    risk_values = total_data[risk_cols].values
    factor_values = total_data[factor_cols].values
    processed_values = np.zeros(factor_values.shape)

    for i in range(processed_values.shape[1]):
        try:
            processed_values[:, i] = neutralize(risk_values,
                                                standardize(winsorize_normal(factor_values[:, [i]]))).flatten()
        except np.linalg.LinAlgError:
            processed_values[:, i] = neutralize(risk_values,
                                                winsorize_normal(factor_values[:, [i]])).flatten()
    return processed_values
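
Hypothetical usage, assuming total_data is a DataFrame holding both the risk exposures and the raw factor columns (all names below are made up):

import numpy as np
import pandas as pd

total_data = pd.DataFrame(np.random.randn(500, 5),
                          columns=['risk_a', 'risk_b', 'f1', 'f2', 'f3'])
clean = process_data(total_data,
                     factor_cols=['f1', 'f2', 'f3'],
                     risk_cols=['risk_a', 'risk_b'])   # -> (500, 3) ndarray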
Example No. 4
    def calc_decay(self, factors, decay_interval=5):
        interval = decay_interval + 1
        decay_dict = {}
        for column in self._columns:
            factors_names = []
            factors_list = []
            values = {}
            grouped = factors.groupby(by='code')
            for k, group in grouped:
                group = group.sort_values(by='trade_date', ascending=True)
                for i in range(1, interval):
                    group[str(i) + '_' + column] = group[column].shift(i)
                factors_list += group[-interval:].to_dict(orient='records')
            new_factors_sets = pd.DataFrame(factors_list)
            for i in range(1, interval):
                factors_names.append(str(i) + '_' + column)
            industry_dummy = pd.get_dummies(new_factors_sets.indexSymbol)
            neutralized_factors = neutralize(industry_dummy.values.astype(float),
                                             new_factors_sets[factors_names].values,
                                             groups=new_factors_sets['trade_date'])
            new_factors_sets[factors_names] = neutralized_factors
            for f in factors_names:
                ic_series = new_factors_sets.groupby('trade_date').apply(
                    lambda x: np.corrcoef(x[f].fillna(0), x['chgPct'])[0, 1])
                values[f] = ic_series.mean()
            values = pd.DataFrame([values])
            values.columns = ['q' + str(i) for i in range(1, decay_interval + 1)]
            decay_dict[column] = values
        return decay_dict
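
Note the mechanics here: within each code, group[column].shift(i) lags the factor by i bars, so the column later renamed q-i correlates an i-bar-old signal with the current return chgPct. The mean of the per-trade-date correlations is the IC at horizon i, and reading q1 through q5 side by side shows how quickly the signal decays.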
Example No. 5
def benchmark_neutralize_with_groups(n_samples: int, n_features: int,
                                     n_loops: int, n_groups: int) -> None:
    print("-" * 60)
    print("Starting least square fitting with group benchmarking")
    print(
        "Parameters(n_samples: {0}, n_features: {1}, n_loops: {2}, n_groups: {3})"
        .format(n_samples, n_features, n_loops, n_groups))
    y = np.random.randn(n_samples, 5)
    x = np.random.randn(n_samples, n_features)
    groups = np.random.randint(n_groups, size=n_samples)

    start = dt.datetime.now()
    for _ in range(n_loops):
        _ = neutralize(x, y, groups)
    impl_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Implemented model', impl_model_time))

    start = dt.datetime.now()

    model = LinearRegression(fit_intercept=False)
    for _ in range(n_loops):
        for i in range(n_groups):
            curr_x = x[groups == i]
            curr_y = y[groups == i]
            model.fit(curr_x, curr_y)
            _ = curr_y - curr_x @ model.coef_.T
    benchmark_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
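
A typical invocation, with illustrative parameter values:

benchmark_neutralize_with_groups(n_samples=3000, n_features=10, n_loops=100, n_groups=30)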
Example No. 6
def benchmark_neutralize(n_samples: int, n_features: int,
                         n_loops: int) -> None:
    print("-" * 60)
    print("Starting least square fitting benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(
        n_samples, n_features, n_loops))

    y = np.random.randn(n_samples, 5)
    x = np.random.randn(n_samples, n_features)

    start = dt.datetime.now()
    for _ in range(n_loops):
        calc_res = neutralize(x, y)
    impl_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Implemented model', impl_model_time))

    start = dt.datetime.now()
    for _ in range(n_loops):
        benchmark_model = LinearRegression(fit_intercept=False)
        benchmark_model.fit(x, y)
        exp_res = y - x @ benchmark_model.coef_.T
    benchmark_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))

    np.testing.assert_array_almost_equal(calc_res, exp_res)
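
Besides timing, the final assertion doubles as a correctness check: the implemented neutralize and the explicit sklearn fit must produce the same residuals (to almost_equal precision) on the last loop iteration.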
Example No. 7
def risk_analysis(
        net_weight_series: pd.Series, next_bar_return_series: pd.Series,
        risk_table: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    group_idx = net_weight_series.index.values.astype(int)
    net_pos = net_weight_series.values.reshape((-1, 1))
    risk_factor_cols = risk_table.columns

    idiosyncratic, other_stats = neutralize(risk_table.values,
                                            next_bar_return_series.values,
                                            group_idx,
                                            detail=True)

    systematic = other_stats['explained']
    exposure = other_stats['exposure']

    explained_table = np.hstack((idiosyncratic, systematic[:, :, 0]))
    cols = ['idiosyncratic']
    cols.extend(risk_factor_cols)

    explained_table = pd.DataFrame(explained_table * net_pos,
                                   columns=cols,
                                   index=net_weight_series.index)
    exposure_table = pd.DataFrame(exposure[:, :, 0] * net_pos,
                                  columns=risk_factor_cols,
                                  index=net_weight_series.index)
    return explained_table, exposure_table.groupby(level=0).first()
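
A toy invocation, assuming the series index doubles as the neutralization group (e.g. an integer date code); all values below are synthetic:

import numpy as np
import pandas as pd

idx = pd.Index(np.repeat(np.arange(10), 20))   # 10 groups of 20 positions
weights = pd.Series(np.random.randn(200) / 200, index=idx)
returns = pd.Series(np.random.randn(200), index=idx)
risks = pd.DataFrame(np.random.randn(200, 3),
                     columns=['size', 'beta', 'momentum'], index=idx)
explained_table, exposure_table = risk_analysis(weights, returns, risks)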
Example No. 8
    def test_neutralize(self):
        calc_res = neutralize(self.x, self.y)

        model = LinearRegression(fit_intercept=False)
        model.fit(self.x, self.y)

        exp_res = self.y - self.x @ model.coef_.T

        np.testing.assert_array_almost_equal(calc_res, exp_res)
Example No. 9
    def test_neutralize_with_group(self):
        calc_res = neutralize(self.x, self.y, self.groups)

        model = LinearRegression(fit_intercept=False)
        for i in range(30):
            curr_x = self.x[self.groups == i]
            curr_y = self.y[self.groups == i]
            model.fit(curr_x, curr_y)
            exp_res = curr_y - curr_x @ model.coef_.T
            np.testing.assert_array_almost_equal(calc_res[self.groups == i], exp_res)
Example No. 10
def factor_processing(raw_factor: np.ndarray,
                      pre_process: Optional[List] = None,
                      risk_factors: Optional[np.ndarray] = None) -> np.ndarray:

    new_factor = raw_factor

    if pre_process:
        for p in pre_process:
            new_factor = p(new_factor)

    if risk_factors is not None:
        new_factor = neutralize(risk_factors, new_factor)

    return new_factor
Example No. 11
    def test_factor_processing(self):
        new_factor = factor_processing(self.raw_factor)
        np.testing.assert_array_almost_equal(new_factor, self.raw_factor)

        new_factor = factor_processing(self.raw_factor,
                                       pre_process=[standardize, winsorize_normal])

        np.testing.assert_array_almost_equal(new_factor, winsorize_normal(standardize(self.raw_factor)))

        new_factor = factor_processing(self.raw_factor,
                                       pre_process=[standardize, winsorize_normal],
                                       risk_factors=self.risk_factor)

        np.testing.assert_array_almost_equal(new_factor, neutralize(self.risk_factor,
                                                                    winsorize_normal(standardize(self.raw_factor))))
Example No. 12
def factor_processing(raw_factors: np.ndarray,
                      pre_process: Optional[List] = None,
                      risk_factors: Optional[np.ndarray] = None,
                      post_process: Optional[List] = None,
                      groups=None) -> np.ndarray:

    new_factors = raw_factors

    if pre_process:
        for p in pre_process:
            new_factors = p(new_factors, groups=groups)

    if risk_factors is not None:
        risk_factors = risk_factors[:, risk_factors.sum(axis=0) != 0]
        new_factors = neutralize(risk_factors, new_factors, groups=groups)

    if post_process:
        for p in post_process:
            new_factors = p(new_factors, groups=groups)

    return new_factors
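
The line risk_factors[:, risk_factors.sum(axis=0) != 0] drops risk columns that are identically zero, which typically happens when industry dummies are built on a universe where some industry has no members; keeping such columns would make the regression design matrix rank-deficient. A small illustration:

import numpy as np

dummies = np.array([[1., 0., 0.],
                    [1., 0., 0.],
                    [0., 0., 1.]])
kept = dummies[:, dummies.sum(axis=0) != 0]   # the empty middle column is dropped
print(kept.shape)   # (3, 2)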
Example No. 13
    def on_factor_processing(self, new_factors_sets, columns=None):
        calc_columns = columns if columns else self._columns
        # NaN handling should differ by factor type: fundamental factors (growth,
        # value, quality) would use the industry median, the rest 0;
        # for now everything is filled with 0
        for column in calc_columns:
            new_factors_sets[column] = new_factors_sets[column].fillna(0)
        # winsorize (clip extreme values)
        for column in calc_columns:
            new_factors_sets['winsorize_' + column] = winsorize_normal(
                new_factors_sets[column].values.reshape(-1, 1),
                num_stds=1).flatten()
        # industry risk neutralization
        for column in calc_columns:
            new_factors_sets['neutralize_' + column] = neutralize(
                new_factors_sets[self._risk_columns].values.astype(float),
                new_factors_sets['winsorize_' + column].values).flatten()
        # standardize
        for column in calc_columns:
            new_factors_sets['standardize_' + column] = standardize(
                new_factors_sets['neutralize_' + column].values.reshape(-1, 1))
        # fill remaining NaNs with 0 for now
        for column in calc_columns:
            new_factors_sets[column] = new_factors_sets[column].fillna(0)
        return new_factors_sets
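
Note that the pipeline order here, winsorize then neutralize then standardize, mirrors the pre_process / risk_factors / post_process stages of factor_processing in Examples No. 12 and No. 14.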
Example No. 14
def factor_processing(raw_factors: np.ndarray,
                      pre_process: Optional[List] = None,
                      risk_factors: Optional[np.ndarray] = None,
                      post_process: Optional[List] = None,
                      groups=None) -> np.ndarray:
    new_factors = raw_factors

    if pre_process:
        for p in pre_process:
            new_factors = p(new_factors, groups=groups)

    if risk_factors is not None:
        risk_factors = risk_factors[:, risk_factors.sum(axis=0) != 0]
        new_factors = neutralize(risk_factors, new_factors, groups=groups)

    if post_process:
        for p in post_process:
            if p.__name__ == 'winsorize_normal':
                alpha_logger.warning(
                    "winsorize_normal "
                    "normally should not be done after neutralize")
            new_factors = p(new_factors, groups=groups)

    return new_factors
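
The warning is presumably there because clipping outliers in the residuals re-introduces a dependence on the risk factors that the neutralization step has just removed, so winsorizing belongs in pre_process rather than post_process.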
Example No. 15
def update_daily_portfolio(ds, **kwargs):
    execution_date = kwargs['next_execution_date']

    if not isBizDay('china.sse', execution_date):
        logger.info("{0} is not a business day".format(execution_date))
        return 0

    prev_date = advanceDateByCalendar('china.sse', execution_date, '-1b')

    logger.info("factor data is loading for {0}".format(prev_date))
    logger.info("Current running date is {0}".format(execution_date))

    common_factors = ['EPSAfterNonRecurring', 'DivP']
    prod_factors = ['CFinc1', 'BDTO', 'RVOL']
    uqer_factors = ['CoppockCurve', 'EPS']

    factor_weights = np.array([-1.0, 2.0])
    factor_weights = factor_weights / factor_weights.sum()

    engine = sqlalchemy.create_engine('mysql+mysqldb://sa:[email protected]/multifactor?charset=utf8')
    engine2 = sqlalchemy.create_engine(
        'mysql+pymysql://sa:[email protected]:3306/multifactor?charset=utf8')

    common_factors_df = pd.read_sql("select Code, 申万一级行业, {0} from factor_data where Date = '{1}'"
                                    .format(','.join(common_factors), prev_date), engine)

    prod_factors_df = pd.read_sql("select Code, {0} from prod_500 where Date = '{1}'"
                                  .format(','.join(prod_factors), prev_date), engine)

    uqer_factor_df = pd.read_sql(
        "select Code, {0} from factor_uqer where Date = '{1}'".format(','.join(uqer_factors), prev_date), engine2)

    risk_factor_df = pd.read_sql("select Code, {0} from risk_factor_500 where Date = '{1}'"
                                 .format(','.join(risk_factors_500), prev_date), engine)

    index_components_df = get_etf_index_weight.get_nffund_idx_etf_component(prev_date.strftime('%Y%m%d'), index='zz500')
    index_industry_weights = get_etf_index_weight.get_sw_industry_weight(index_components_df)
    index_components_df.rename(columns={'weight': 'benchmark'}, inplace=True)

    total_data = pd.merge(common_factors_df, uqer_factor_df, on=['Code'])
    total_data = pd.merge(total_data, risk_factor_df, on=['Code'])
    total_data = pd.merge(total_data, index_components_df, on=['Code'])
    total_data = total_data[total_data['benchmark'] != 0]

    null_flags = np.any(np.isnan(total_data[uqer_factors]), axis=1)
    total_data.fillna(0, inplace=True)

    total_factors = uqer_factors
    risk_factors_names = risk_factors_500 + ['Market']
    total_data['Market'] = 1.

    all_factors = total_data[total_factors]
    risk_factors = total_data[risk_factors_names]

    factor_processed = neutralize(risk_factors.values,
                                  standardize(winsorize_normal(all_factors.values)))
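    # the all-ones 'Market' column acts as an intercept in the cross-sectional
    # regression, so neutralizing against it also demeans the factors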

    normed_factor = pd.DataFrame(factor_processed, columns=total_factors, index=[prev_date] * len(factor_processed))

    er = normed_factor @ factor_weights

    # portfolio construction

    bm = total_data['benchmark'].values
    lbound = np.zeros(len(total_data))
    ubound = 0.01 + bm
    risk_exposure = total_data[risk_factors_names].values

    ubound[null_flags] = 0.

    if len(bm) != 500:

        total_weight = index_industry_weights['weight'].sum()
        filtered = index_industry_weights[index_industry_weights.industry.isin(risk_factors_500)]

        ind_weights = filtered['weight'].values

        risk_lbound = np.concatenate([ind_weights / total_weight,
                                      [bm @ total_data['Size'].values / total_weight],
                                      [1.]], axis=0)
        risk_ubound = np.concatenate([ind_weights / total_weight,
                                      [bm @ total_data['Size'].values / total_weight],
                                      [1.]], axis=0)
    else:
        risk_lbound = bm @ risk_exposure
        risk_ubound = bm @ risk_exposure

    # get black list 1
    engine = sqlalchemy.create_engine('mssql+pymssql://sa:[email protected]/WindDB')
    black_list = pd.read_sql("select S_INFO_WINDCODE, S_INFO_LISTDATE, sum(S_SHARE_RATIO) as s_ratio "
                             "from ASHARECOMPRESTRICTED "
                             "where S_INFO_LISTDATE BETWEEN '{0}' and '{1}' "
                             "GROUP BY S_INFO_WINDCODE, S_INFO_LISTDATE ORDER BY s_ratio DESC;"
                             .format((execution_date - dt.timedelta(days=7)).strftime('%Y%m%d'),
                                     (execution_date + dt.timedelta(days=14)).strftime('%Y%m%d')), engine)

    black_list = black_list[black_list['s_ratio'] >= 3.]
    black_list.S_INFO_WINDCODE = black_list.S_INFO_WINDCODE.str.split('.').apply(lambda x: int(x[0]))

    mask_array = total_data.Code.isin(black_list.S_INFO_WINDCODE)
    ubound[mask_array.values] = 0.

    # get black list 2
    black_list2 = pd.read_sql("select S_INFO_WINDCODE, AVG(S_WQ_AMOUNT) as avg_amount from ASHAREWEEKLYYIELD "
                              "where TRADE_DT < {1} and TRADE_DT >= {0} GROUP BY S_INFO_WINDCODE;"
                              .format((execution_date - dt.timedelta(days=30)).strftime('%Y%m%d'),
                                      execution_date.strftime('%Y%m%d')), engine)
    black_list2 = black_list2[black_list2['avg_amount'] <= 15000.]
    black_list2.S_INFO_WINDCODE = black_list2.S_INFO_WINDCODE.str.split('.').apply(lambda x: int(x[0]))

    mask_array2 = total_data.Code.isin(black_list2.S_INFO_WINDCODE)
    ubound[mask_array2.values] = 0.

    # get black list 3
    black_list3 = pd.read_sql("SELECT S_INFO_WINDCODE, S_DQ_SUSPENDDATE FROM ASHARETRADINGSUSPENSION AS a "
                              "WHERE a.S_DQ_SUSPENDDATE = (SELECT top 1 S_DQ_SUSPENDDATE FROM ASHARETRADINGSUSPENSION AS b "
                              "WHERE a.S_INFO_WINDCODE=b.S_INFO_WINDCODE and cast(floor(cast(b.OPDATE as float)) as datetime) <= '{0}' ORDER BY b.S_DQ_SUSPENDDATE DESC) "
                              "AND a.S_INFO_WINDCODE IN (SELECT S_INFO_WINDCODE FROM ASHAREDESCRIPTION AS c "
                              "WHERE c.S_INFO_DELISTDATE IS NULL) AND (a.S_DQ_SUSPENDDATE>='{1}' OR (a.S_DQ_RESUMPDATE IS NULL AND a.S_DQ_SUSPENDTYPE=444003000))"
                              .format(execution_date, execution_date.strftime('%Y%m%d')),
                              engine)
    black_list3.S_INFO_WINDCODE = black_list3.S_INFO_WINDCODE.str.split('.').apply(lambda x: int(x[0]))
    mask_array3 = total_data.Code.isin(black_list3.S_INFO_WINDCODE)
    ubound[mask_array3.values] = 0.

    # manual black list
    try:
        bk_list = pd.read_csv('~/mnt/sharespace/personal/licheng/portfolio/zz500_black_list/{0}.csv'.format(
            prev_date.strftime('%Y-%m-%d')),
                              encoding='gbk',
                              names=['code'])
        logger.info('Manual black list exists for the date: {0}'.format(prev_date.strftime('%Y-%m-%d')))
        for code in bk_list['code']:
            ubound[total_data.Code == int(code)] = 0.
    except FileNotFoundError:
        logger.info('No manual black list exists for the date: {0}'.format(prev_date.strftime('%Y-%m-%d')))

    weights = build_portfolio(er,
                              builder='linear',
                              risk_exposure=risk_exposure,
                              lbound=lbound,
                              ubound=ubound,
                              risk_target=(risk_lbound, risk_ubound),
                              solver='GLPK')

    portfolio = pd.DataFrame({'weight': weights,
                              'industry': total_data['申万一级行业'].values,
                              'zz500': total_data['benchmark'].values,
                              'er': er}, index=total_data.Code)

    client = pymongo.MongoClient('mongodb://10.63.6.176:27017')
    db = client.multifactor
    portfolio_collection = db.portfolio

    detail_info = {}
    for code, w, bm_w, ind, r in zip(total_data.Code.values, weights, total_data['benchmark'].values,
                                     total_data['申万一级行业'].values, er):
        detail_info[str(code)] = {
            'weight': w,
            'industry': ind,
            'zz500': bm_w,
            'er': r
        }

    portfolio_dict = {'Date': prev_date,
                      'portfolio': detail_info}

    portfolio_collection.delete_many({'Date': prev_date})
    portfolio_collection.insert_one(portfolio_dict)

    portfolio.to_csv('~/mnt/sharespace/personal/licheng/portfolio/zz500/{0}.csv'.format(prev_date.strftime('%Y-%m-%d')),
                     encoding='gbk')

    return 0
Example No. 16
        index_components, ref_date), engine)

total_data = pd.merge(common_factors_df, prod_factors_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, risk_factor_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, index_components_df, on=['Date', 'Code'])
total_data = total_data[total_data[index_components] != 0]
total_data[index_components] = total_data[index_components] / 100.0

total_factors = common_factors + prod_factors
risk_factors_names = risk_factors_500 + ['Market']
total_data['Market'] = 1.

all_factors = total_data[total_factors]
risk_factors = total_data[risk_factors_names]

factor_processed = neutralize(
    risk_factors.values, standardize(winsorize_normal(all_factors.values)))

normed_factor = pd.DataFrame(factor_processed,
                             columns=total_factors,
                             index=total_data.Date)

er = normed_factor @ factor_weights

# portfolio construction

bm = total_data[index_components].values
lbound = 0.
ubound = 0.01 + bm
lbound_exposure = -0.01
ubound_exposure = 0.01
risk_exposure = total_data[risk_factors_names].values