def batch_processing(x_values, y_values, groups, group_label, batch, risk_exp,
                     pre_process, post_process):
    train_x_buckets = {}
    train_y_buckets = {}
    predict_x_buckets = {}
    predict_y_buckets = {}

    for i, start in enumerate(groups[:-batch]):
        end = groups[i + batch]

        # training window: all rows whose label falls in [start, end)
        left_index = bisect.bisect_left(group_label, start)
        right_index = bisect.bisect_left(group_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        train_x_buckets[end] = factor_processing(this_raw_x,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)
        train_y_buckets[end] = factor_processing(this_raw_y,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)

        # prediction window: rows in (start, end] are processed together,
        # then sliced down to the rows stamped with the end date only
        left_index = bisect.bisect_right(group_label, start)
        right_index = bisect.bisect_right(group_label, end)
        sub_dates = group_label[left_index:right_index]
        this_raw_x = x_values[left_index:right_index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]

        this_raw_y = y_values[left_index:right_index]
        if len(this_raw_y) > 0:
            ne_y = factor_processing(this_raw_y,
                                     pre_process=pre_process,
                                     risk_factors=this_risk_exp,
                                     post_process=post_process)
            predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]

    return train_x_buckets, train_y_buckets, predict_x_buckets, predict_y_buckets
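# --- Usage sketch (added for illustration, not from the original source): how
# batch_processing buckets rolling train/predict windows. It assumes
# batch_processing and alpha-mind's factor_processing are importable; passing
# pre_process=None / post_process=None makes factor_processing a pass-through,
# as the test_factor_processing snippet below confirms for the default call.
import numpy as np
import pandas as pd

dates = pd.date_range('2020-01-01', periods=6, freq='B').to_pydatetime()
group_label = np.repeat(dates, 3)          # three stocks per date, sorted
x = np.random.randn(len(group_label), 2)   # two alpha factors
y = np.random.randn(len(group_label), 1)   # forward returns

train_x, train_y, pred_x, pred_y = batch_processing(
    x, y, groups=dates, group_label=group_label, batch=2,
    risk_exp=None, pre_process=None, post_process=None)

# every bucket is keyed by its window-end date
assert sorted(train_x.keys()) == list(dates[2:])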
def cs_impl(ref_date, factor_data, factor_name, risk_exposure, constraint_risk,
            industry_matrix, dx_returns):
    total_data = pd.merge(factor_data, risk_exposure, on='code')
    total_data = pd.merge(total_data, industry_matrix, on='code')
    total_data = total_data.replace([np.inf, -np.inf], np.nan).dropna()

    if len(total_data) < 0.33 * len(factor_data):
        alpha_logger.warning(
            f"valid data points ({len(total_data)}) are fewer than 33% "
            f"of the total sample ({len(factor_data)}). Omitting this run")
        return np.nan, np.nan, np.nan

    total_risk_exp = total_data[constraint_risk]

    er = total_data[[factor_name]].values.astype(float)
    er = factor_processing(er, [winsorize_normal, standardize],
                           total_risk_exp.values, [standardize]).flatten()
    industry = total_data.industry_name.values

    codes = total_data.code.tolist()
    target_pos = pd.DataFrame({'code': codes,
                               'weight': er,
                               'industry': industry})
    target_pos['weight'] = target_pos['weight'] / target_pos['weight'].abs().sum()
    target_pos = pd.merge(target_pos, dx_returns, on=['code'])
    target_pos = pd.merge(target_pos, total_data[['code'] + constraint_risk],
                          on=['code'])
    total_risk_exp = target_pos[constraint_risk]

    activate_weight = target_pos['weight'].values
    excess_return = np.exp(target_pos[['dx']].values) - 1.
    excess_return = factor_processing(excess_return,
                                      [winsorize_normal, standardize],
                                      total_risk_exp.values,
                                      [winsorize_normal, standardize]).flatten()
    port_ret = np.log(activate_weight @ excess_return + 1.)
    ic = np.corrcoef(excess_return, activate_weight)[0, 1]
    x = sm.add_constant(activate_weight)
    results = sm.OLS(excess_return, x).fit()
    t_stats = results.tvalues[1]

    alpha_logger.info(
        f"{ref_date} is finished with {len(target_pos)} stocks for {factor_name}")
    alpha_logger.info(
        f"{ref_date} risk_exposure: "
        f"{np.sum(np.square(target_pos.weight.values @ target_pos[constraint_risk].values))}")
    return port_ret, ic, t_stats
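# --- Worked illustration (added): the IC / t-stat math used in cs_impl, on
# synthetic data only. np.corrcoef gives the information coefficient between
# active weights and realized excess returns; the slope t-stat from statsmodels
# matches results.tvalues[1] above.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(42)
active_weight = rng.randn(500)
excess_return = 0.05 * active_weight + rng.randn(500)   # weak planted signal

ic = np.corrcoef(excess_return, active_weight)[0, 1]
x = sm.add_constant(active_weight)
t_stat = sm.OLS(excess_return, x).fit().tvalues[1]
print(f"IC={ic:.4f}, t-stat={t_stat:.2f}")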
def test_factor_processing(self):
    new_factor = factor_processing(self.raw_factor)
    np.testing.assert_array_almost_equal(new_factor, self.raw_factor)

    new_factor = factor_processing(self.raw_factor,
                                   pre_process=[standardize, winsorize_normal])
    np.testing.assert_array_almost_equal(new_factor,
                                         winsorize_normal(standardize(self.raw_factor)))

    new_factor = factor_processing(self.raw_factor,
                                   pre_process=[standardize, winsorize_normal],
                                   risk_factors=self.risk_factor)
    np.testing.assert_array_almost_equal(new_factor,
                                         neutralize(self.risk_factor,
                                                    winsorize_normal(standardize(self.raw_factor))))
def cs_impl(ref_date, factor_data, factor_name, risk_exposure, constraint_risk,
            industry_matrix, dx_returns):
    total_data = pd.merge(factor_data, risk_exposure, on='code')
    total_data = pd.merge(total_data, industry_matrix, on='code').dropna()

    total_risk_exp = total_data[constraint_risk]

    er = total_data[factor_name].values.astype(float)
    er = factor_processing(er, [], total_risk_exp.values, []).flatten()
    industry = total_data.industry_name.values

    codes = total_data.code.tolist()
    target_pos = pd.DataFrame({'code': codes,
                               'weight': er,
                               'industry': industry})
    target_pos['weight'] = target_pos['weight'] / target_pos['weight'].abs().sum()
    target_pos = pd.merge(target_pos, dx_returns, on=['code'])
    target_pos = pd.merge(target_pos, total_data[['code'] + constraint_risk],
                          on=['code'])

    activate_weight = target_pos.weight.values
    excess_return = np.exp(target_pos.dx.values) - 1.
    port_ret = np.log(activate_weight @ excess_return + 1.)
    ic = np.corrcoef(excess_return, activate_weight)[0, 1]
    x = sm.add_constant(activate_weight)
    results = sm.OLS(excess_return, x).fit()
    t_stats = results.tvalues[1]

    alpha_logger.info(
        f"{ref_date} is finished with {len(target_pos)} stocks for {factor_name}")
    alpha_logger.info(
        f"{ref_date} risk_exposure: "
        f"{np.sum(np.square(target_pos.weight.values @ target_pos[constraint_risk].values))}")
    return port_ret, ic, t_stats
def factor_analysis(factors: pd.DataFrame,
                    factor_weights: np.ndarray,
                    industry: np.ndarray,
                    d1returns: np.ndarray = None,
                    detail_analysis=True,
                    benchmark: Optional[np.ndarray] = None,
                    risk_exp: Optional[np.ndarray] = None,
                    is_tradable: Optional[np.ndarray] = None,
                    constraints: Optional[Constraints] = None,
                    method='risk_neutral',
                    **kwargs) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    # callers may override the default processing pipelines via kwargs
    pre_process = kwargs.pop('pre_process', [winsorize_normal, standardize])
    post_process = kwargs.pop('post_process', [winsorize_normal, standardize])

    er = factor_processing(factors.values, pre_process, risk_exp, post_process) @ factor_weights
    return er_portfolio_analysis(er,
                                 industry,
                                 d1returns,
                                 constraints,
                                 detail_analysis,
                                 benchmark,
                                 is_tradable,
                                 method,
                                 **kwargs)
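# --- Minimal sketch (added; assumes alpha-mind's factor_processing,
# winsorize_normal and standardize are in scope) of the scoring step inside
# factor_analysis: process the raw factor matrix, then collapse it into a
# single expected-return vector with the factor weights.
import numpy as np
import pandas as pd

factors = pd.DataFrame(np.random.randn(100, 3), columns=['f1', 'f2', 'f3'])
factor_weights = np.array([0.5, 0.3, 0.2])

er = factor_processing(factors.values,
                       pre_process=[winsorize_normal, standardize],
                       risk_factors=None,   # no neutralization in this sketch
                       post_process=[winsorize_normal, standardize]) @ factor_weights
assert er.shape == (100,)                   # one combined score per stock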
def websim_weighted(factor_df, factor_list):
    total_data = factor_df.copy()
    risk_data = total_data[['code', 'trade_date'] + industry_styles + ['SIZE']]
    forward_returns = total_data[['code', 'trade_date', 'ret']]
    factor_data = total_data[factor_list]

    # equal-weight combination
    ndiff_field = [i for i in list(set(total_data.columns))
                   if i not in factor_list]

    # pre-process the data before combining
    alpha_res = []
    grouped = total_data.groupby(['trade_date'])
    for k, g in grouped:
        ret_preprocess = factor_processing(
            g[factor_list].fillna(0).values,
            pre_process=[winsorize_normal, standardize])
        f = pd.DataFrame(ret_preprocess, columns=factor_list)
        for col in ndiff_field:
            f[col] = g[col].values
        alpha_res.append(f)
    alpha_data = pd.concat(alpha_res)
    alpha_data['combine'] = alpha_data[factor_list].mean(axis=1).values

    weight = Weighted()
    stats = weight.run(alpha_data, risk_data, forward_returns, 'combine')
    if abs(stats['fitness']) > 0.554246 and stats['sharpe'] > 1.243449:
        score = abs(stats['fitness'])
    else:
        score = abs(stats['fitness']) / 100
    return abs(score)
def prc_win_std(params):
    # params is a (DataFrame, factor_name) tuple so the function can be
    # mapped over a pool of workers
    df, factor_name = params
    ret_preprocess = factor_processing(
        df[[factor_name]].values,
        pre_process=[winsorize_normal, standardize],
    )
    df["prc_factor"] = ret_preprocess
    return df
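# --- Usage sketch (added): the (df, factor_name) tuple argument suggests
# prc_win_std is meant to be mapped over per-date groups with a process pool.
# The grouping, pool size, and column names below are assumptions, not from
# the original source.
import numpy as np
import pandas as pd
from multiprocessing import Pool

if __name__ == '__main__':
    factor_df = pd.DataFrame({
        'trade_date': np.repeat(pd.date_range('2020-01-01', periods=4), 50),
        'alpha_1': np.random.randn(200),
    })
    params = [(g, 'alpha_1') for _, g in factor_df.groupby('trade_date')]
    with Pool(2) as pool:   # spawn-safe thanks to the __main__ guard
        processed = pd.concat(pool.map(prc_win_std, params))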
def fetch_dx_return(self,
                    ref_date: str,
                    codes: Iterable[int],
                    expiry_date: str = None,
                    horizon: int = 0,
                    offset: int = 0,
                    neutralized_risks: list = None,
                    pre_process=None,
                    post_process=None,
                    benchmark: int = None) -> pd.DataFrame:
    start_date = ref_date

    if not expiry_date:
        end_date = advanceDateByCalendar(
            'china.sse', ref_date,
            str(1 + horizon + offset + DAILY_RETURN_OFFSET) + 'b').strftime('%Y-%m-%d')
    else:
        end_date = expiry_date

    query = select([Market.trade_date,
                    Market.code.label("code"),
                    Market.chgPct.label("chgPct")]).where(
        and_(Market.trade_date.between(start_date, end_date),
             Market.code.in_(codes),
             Market.flag == 1)).order_by(Market.trade_date, Market.code)

    df = pd.read_sql(query, self.session.bind).dropna()
    df = self._create_stats(df, horizon, offset)
    df = df[df.trade_date == ref_date]

    if benchmark:
        benchmark = _map_index_codes[benchmark]
        query = select([IndexMarket.trade_date,
                        IndexMarket.chgPct.label("chgPct")]).where(
            and_(IndexMarket.trade_date.between(start_date, end_date),
                 IndexMarket.indexCode == benchmark,
                 IndexMarket.flag == 1))
        df2 = pd.read_sql(query, self.session.bind).dropna()
        df2 = self._create_stats(df2, horizon, offset, no_code=True)
        ind_ret = df2[df2.trade_date == ref_date]['dx'].values[0]
        df['dx'] = df['dx'] - ind_ret

    if neutralized_risks:
        _, risk_exp = self.fetch_risk_model(ref_date, codes)
        df = pd.merge(df, risk_exp, on='code').dropna()
        df[['dx']] = factor_processing(df[['dx']].values,
                                       pre_process=pre_process,
                                       risk_factors=df[neutralized_risks].values,
                                       post_process=post_process)
    return df[['code', 'dx']]
def test_quantile_analysis_with_factor_processing(self):
    f_df = pd.DataFrame(self.x)
    calculated = quantile_analysis(f_df,
                                   self.x_w,
                                   self.r,
                                   n_bins=self.n_bins,
                                   risk_exp=self.risk_exp,
                                   pre_process=[winsorize_normal, standardize],
                                   post_process=[standardize])

    er = self.x_w @ factor_processing(self.x,
                                      [winsorize_normal, standardize],
                                      self.risk_exp,
                                      [standardize]).T
    expected = er_quantile_analysis(er, self.n_bins, self.r)
    np.testing.assert_array_almost_equal(calculated, expected)
def test_quantile_analysis_with_benchmark(self):
    f_df = pd.DataFrame(self.x)
    calculated = quantile_analysis(f_df,
                                   self.x_w,
                                   self.r,
                                   n_bins=self.n_bins,
                                   do_neutralize=True,
                                   benchmark=self.b_w,
                                   risk_exp=self.risk_exp,
                                   pre_process=[winsorize_normal, standardize],
                                   post_process=[standardize])

    er = self.x_w @ factor_processing(self.x,
                                      [winsorize_normal, standardize],
                                      self.risk_exp,
                                      [standardize]).T
    raw_er = er_quantile_analysis(er, self.n_bins, self.r)
    expected = raw_er * self.b_w.sum() - np.dot(self.b_w, self.r)
    np.testing.assert_array_almost_equal(calculated, expected)
def quantile_analysis(factors: pd.DataFrame,
                      factor_weights: np.ndarray,
                      dx_return: np.ndarray,
                      n_bins: int = 5,
                      risk_exp: Optional[np.ndarray] = None,
                      **kwargs):
    # callers may override the default processing pipelines via kwargs
    pre_process = kwargs.pop('pre_process', [winsorize_normal, standardize])
    post_process = kwargs.pop('post_process', [standardize])

    er = factor_processing(factors.values, pre_process, risk_exp, post_process) @ factor_weights
    return er_quantile_analysis(er, n_bins, dx_return, **kwargs)
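# --- Usage sketch (added): quantile_analysis on synthetic data, mirroring the
# call pattern in the tests above. The combined score is bucketed into n_bins
# quantiles by er_quantile_analysis; with risk_exp=None no neutralization is
# applied.
import numpy as np
import pandas as pd

n = 300
factors = pd.DataFrame(np.random.randn(n, 2))
factor_weights = np.array([0.7, 0.3])
dx_return = np.random.randn(n) * 0.02

q_ret = quantile_analysis(factors, factor_weights, dx_return,
                          n_bins=5, risk_exp=None)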
def fetch_dx_return(self,
                    ref_date: str,
                    codes: Iterable[int],
                    expiry_date: str = None,
                    horizon: int = 0,
                    offset: int = 0,
                    neutralized_risks: list = None,
                    pre_process=None,
                    post_process=None) -> pd.DataFrame:
    start_date = ref_date

    if not expiry_date:
        end_date = advanceDateByCalendar(
            'china.sse', ref_date,
            str(1 + horizon + offset + DAILY_RETURN_OFFSET) + 'b').strftime('%Y%m%d')
    else:
        end_date = expiry_date

    stats = self._create_stats(Market, horizon, offset)

    query = select([Market.trade_date, Market.code, stats]).where(
        and_(
            Market.trade_date.between(start_date, end_date),
            Market.code.in_(codes)
        )
    )

    df = pd.read_sql(query, self.session.bind).dropna()
    df = df[df.trade_date == ref_date]

    if neutralized_risks:
        _, risk_exp = self.fetch_risk_model(ref_date, codes)
        df = pd.merge(df, risk_exp, on='code').dropna()
        df[['dx']] = factor_processing(df[['dx']].values,
                                       pre_process=pre_process,
                                       risk_factors=df[neutralized_risks].values,
                                       post_process=post_process)
    return df[['code', 'dx']].drop_duplicates(['code'])
def equal_combine(factor_df, factor_list):
    factor_df = factor_df.copy()
    ndiff_field = [i for i in list(set(factor_df.columns))
                   if i not in factor_list]

    # pre-process the data before combining
    alpha_res = []
    grouped = factor_df.groupby(['trade_date'])
    for k, g in grouped:
        ret_preprocess = factor_processing(
            g[factor_list].fillna(0).values,
            pre_process=[winsorize_normal, standardize])
        f = pd.DataFrame(ret_preprocess, columns=factor_list)
        for col in ndiff_field:
            f[col] = g[col].values
        alpha_res.append(f)
    total_data = pd.concat(alpha_res)

    total_data['combine'] = total_data[factor_list].mean(axis=1).values
    score = np.corrcoef(total_data['combine'].fillna(0).values,
                        total_data['ret'].fillna(0).values)[0, 1]
    # score = abs(total_data['combine'].mean()) / 100
    return abs(score)
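# --- Usage sketch (added): equal_combine expects a per-date factor panel with
# a realized-return column named 'ret'; it returns the absolute IC of the
# equal-weight composite. All column names below are illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'trade_date': np.repeat(pd.date_range('2020-01-01', periods=5), 40),
    'code': np.tile(np.arange(40), 5),
    'f1': np.random.randn(200),
    'f2': np.random.randn(200),
    'ret': 0.02 * np.random.randn(200),
})
score = equal_combine(df, ['f1', 'f2'])
print(score)   # |corr(composite, ret)| over the pooled panel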
def run(self, running_setting):
    alpha_logger.info("starting backtesting ...")

    total_data_groups = self.total_data.groupby('trade_date')

    rets = []
    turn_overs = []
    leverages = []
    previous_pos = pd.DataFrame()
    executor = copy.deepcopy(running_setting.executor)
    positions = pd.DataFrame()

    if self.dask_client is None:
        models = {}
        for ref_date, _ in total_data_groups:
            models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                           self.alpha_model, self.data_meta)
    else:
        def worker(parameters):
            new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                    parameters[1], parameters[2])
            return parameters[0], new_model

        l = self.dask_client.map(worker, [(d[0], self.alpha_model, self.data_meta)
                                          for d in total_data_groups])
        results = self.dask_client.gather(l)
        models = dict(results)

    for ref_date, this_data in total_data_groups:
        new_model = models[ref_date]
        codes = this_data.code.values.tolist()

        if previous_pos.empty:
            current_position = None
        else:
            previous_pos.set_index('code', inplace=True)
            # reindex keeps codes absent from the previous position as NaN,
            # which the fillna below zeroes out
            remained_pos = previous_pos.reindex(codes)
            remained_pos.fillna(0., inplace=True)
            current_position = remained_pos.weight.values

        if running_setting.rebalance_method == 'tv':
            risk_cov = self.total_risk_cov[self.total_risk_cov.trade_date == ref_date]
            sec_cov = self._generate_sec_cov(this_data, risk_cov)
        else:
            sec_cov = None

        benchmark_w = this_data.weight.values
        constraints = LinearConstraints(running_setting.bounds,
                                        this_data,
                                        benchmark_w)

        lbound, ubound = self._create_lu_bounds(running_setting, codes, benchmark_w)

        # process each feature separately so per-feature NaNs only drop the
        # affected rows, then re-align on the full code list
        features = new_model.features
        dfs = []
        for name in features:
            data_cleaned = this_data.dropna(subset=[name])
            raw_factors = data_cleaned[[name]].values
            new_factors = factor_processing(
                raw_factors,
                pre_process=self.data_meta.pre_process,
                risk_factors=data_cleaned[self.data_meta.neutralized_risk].values.astype(float)
                if self.data_meta.neutralized_risk else None,
                post_process=self.data_meta.post_process)
            df = pd.DataFrame(new_factors, columns=[name], index=data_cleaned.code)
            dfs.append(df)

        new_factors = pd.concat(dfs, axis=1)
        new_factors = new_factors.loc[codes].fillna(new_factors.median())
        er = new_model.predict(new_factors).astype(float)

        alpha_logger.info('{0} re-balance: {1} codes'.format(ref_date, len(er)))
        target_pos = self._calculate_pos(running_setting,
                                         er,
                                         this_data,
                                         constraints,
                                         benchmark_w,
                                         lbound,
                                         ubound,
                                         sec_cov=sec_cov,
                                         current_position=current_position)

        target_pos['code'] = codes
        target_pos['trade_date'] = ref_date

        turn_over, executed_pos = executor.execute(target_pos=target_pos)

        leverage = executed_pos.weight.abs().sum()

        ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) - 1.)
        rets.append(np.log(1. + ret))
        executor.set_current(executed_pos)
        turn_overs.append(turn_over)
        leverages.append(leverage)

        positions = positions.append(executed_pos)
        previous_pos = executed_pos

    positions['benchmark_weight'] = self.total_data['weight'].values
    positions['dx'] = self.total_data.dx.values

    trade_dates = positions.trade_date.unique()
    ret_df = pd.DataFrame({'returns': rets,
                           'turn_over': turn_overs,
                           'leverage': leverages},
                          index=trade_dates)

    ret_df['benchmark_returns'] = self.index_return['dx']
    ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1], self.freq)] = 0.
    ret_df = ret_df.shift(1)
    ret_df.iloc[0] = 0.
    ret_df['excess_return'] = ret_df['returns'] - ret_df['benchmark_returns'] * ret_df['leverage']

    return ret_df, positions
def fetch_train_phase(engine,
                      alpha_factors: Iterable[object],
                      ref_date,
                      frequency,
                      universe,
                      batch,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0) -> dict:
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = _map_horizon(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()

    return_df, factor_df = df[['trade_date', 'code', 'dx']], \
        df[['trade_date', 'code', 'isOpen'] + transformer.names]

    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe,
                  dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-2]
        start = dates[-batch - 1]
    else:
        end = dates[-1]
        start = dates[-batch]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': ne_x, 'y': ne_y}

    return ret
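# --- Illustration (added) of the training-window selection above: when
# ref_date is the last schedule date it has no realized return yet, so the
# window is shifted back one period and ends at dates[-2].
import datetime as dt

dates = [dt.datetime(2020, 1, d) for d in (6, 13, 20, 27)]
ref_date, batch = '2020-01-27', 2

if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
    end, start = dates[-2], dates[-batch - 1]
else:
    end, start = dates[-1], dates[-batch]

print(start.date(), end.date())   # 2020-01-13 2020-01-20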
def fetch_predict_phase(engine,
                        alpha_factors: Iterable[object],
                        ref_date,
                        frequency,
                        universe,
                        batch,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0):
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    factor_df = engine.fetch_factor_range(universe, factors=transformer,
                                          dates=dates).dropna()

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates,
                                                risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = factor_df.copy()
        risk_exp = None

    # factor matrix for the whole window, needed in both branches
    x_values = train_x[names].values.astype(float)

    date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch]

        # index = (date_label >= start) & (date_label <= end)
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        ne_x = ne_x[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)
        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}

    return ret
total_data = pickle.load(file2)
total_data = total_data.sort_values(by=['trade_date', 'code'], ascending=True)
diff_filed = ['trade_date', 'code', 'ret']
factor_columns = [i for i in list(set(total_data.columns))
                  if i not in ['trade_date', 'code', 'ret']]

# winsorize and standardize all factor data
alpha_res = []
grouped = total_data.groupby(['trade_date'])
for k, g in grouped:
    ret_preprocess = factor_processing(
        g[factor_columns].fillna(0).values,
        pre_process=[winsorize_normal, standardize])
    f = pd.DataFrame(ret_preprocess, columns=factor_columns)
    for col in diff_filed:
        f[col] = g[col].values
    alpha_res.append(f)
total_data = pd.concat(alpha_res)

point = int(np.random.uniform(0, len(factor_columns)) / 2)
ori_field = factor_columns[:point]
add_field = factor_columns[point:]

# best_code, best_field
best_code, best_field = mutation_factors.genetic_run(total_data,
                                                     diff_filed=diff_filed,
                                                     strong_field=ori_field,
def fetch_predict_phase(engine,
                        alpha_factors: Union[Transformer, Iterable[object]],
                        ref_date,
                        frequency,
                        universe,
                        batch=1,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0,
                        fillna: str = None,
                        fit_target: Union[Transformer, object] = None):
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)

    if fillna:
        factor_df = factor_df.groupby('trade_date').apply(
            lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
    else:
        factor_df = factor_df.dropna()

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates,
                                                risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        train_x = pd.merge(train_x, target_df, on=['trade_date', 'code'], how='left')
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = pd.merge(factor_df, target_df, on=['trade_date', 'code'], how='left')
        risk_exp = None

    train_x.dropna(inplace=True, subset=train_x.columns[:-1])
    x_values = train_x[names].values.astype(float)
    y_values = train_x[['dx']].values.astype(float)

    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        ne_y = factor_processing(this_raw_y,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        ne_x = ne_x[inner_left_index:inner_right_index]
        ne_y = ne_y[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)
        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        ne_y = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': pd.DataFrame(ne_x, columns=transformer.names),
                      'code': codes,
                      'y': ne_y.flatten() if ne_y is not None else None}

    return ret
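# --- Illustration (added) of the bisect slicing used in the predict phase:
# the whole window is processed together (so standardization sees the full
# history), then only the rows stamped with the window-end date are kept.
import bisect
import datetime as dt

sub_dates = [dt.datetime(2020, 1, 6)] * 2 + [dt.datetime(2020, 1, 13)] * 3
end = dt.datetime(2020, 1, 13)

lo = bisect.bisect_left(sub_dates, end)
hi = bisect.bisect_right(sub_dates, end)
print(lo, hi)   # 2 5 -> the three rows belonging to the end date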
def fetch_train_phase(engine,
                      alpha_factors: Union[Transformer, Iterable[object]],
                      ref_date,
                      frequency,
                      universe,
                      batch=1,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0,
                      fit_target: Union[Transformer, object] = None) -> dict:
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()

    target_df, factor_df = df[['trade_date', 'code', 'dx']], \
        df[['trade_date', 'code'] + transformer.names]

    target_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
        _merge_df(engine, transformer.names, factor_df, target_df, universe,
                  dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        pyFinAssert(
            len(dates) >= 2, ValueError,
            "No previous data for training for the date {0}".format(ref_date))
        end = dates[-2]
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    this_code = codes[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': pd.DataFrame(ne_x, columns=transformer.names),
                    'y': ne_y,
                    'code': this_code}

    return ret
def run(self, running_setting):
    alpha_logger.info("starting backtesting ...")

    total_data_groups = self.total_data.groupby('trade_date')

    rets = []
    turn_overs = []
    leverages = []
    previous_pos = pd.DataFrame()
    executor = copy.deepcopy(running_setting.executor)
    positions = pd.DataFrame()

    if self.alpha_models is None:
        self.prepare_backtest_models()

    for ref_date, this_data in total_data_groups:
        risk_model = self.risk_models[ref_date]
        new_model = self.alpha_models[ref_date]
        codes = this_data.code.values.tolist()

        if previous_pos.empty:
            current_position = None
        else:
            previous_pos.set_index('code', inplace=True)
            remained_pos = previous_pos.reindex(codes)
            remained_pos.fillna(0., inplace=True)
            current_position = remained_pos.weight.values

        benchmark_w = this_data.weight.values
        constraints = LinearConstraints(running_setting.bounds,
                                        this_data,
                                        benchmark_w)

        lbound, ubound = self._create_lu_bounds(running_setting, codes, benchmark_w)

        this_data.fillna(0, inplace=True)
        new_factors = factor_processing(
            this_data[new_model.features].values,
            pre_process=self.data_meta.pre_process,
            risk_factors=this_data[self.data_meta.neutralized_risk].values.astype(float)
            if self.data_meta.neutralized_risk else None,
            post_process=self.data_meta.post_process)
        new_factors = pd.DataFrame(new_factors,
                                   columns=new_model.features,
                                   index=codes)
        er = new_model.predict(new_factors).astype(float)

        alpha_logger.info('{0} re-balance: {1} codes'.format(ref_date, len(er)))
        target_pos = self._calculate_pos(running_setting,
                                         er,
                                         this_data,
                                         constraints,
                                         benchmark_w,
                                         lbound,
                                         ubound,
                                         risk_model=risk_model.get_risk_profile(codes),
                                         current_position=current_position)

        target_pos['code'] = codes
        target_pos['trade_date'] = ref_date

        turn_over, executed_pos = executor.execute(target_pos=target_pos)

        leverage = executed_pos.weight.abs().sum()

        ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) - 1.)
        rets.append(np.log(1. + ret))
        executor.set_current(executed_pos)
        turn_overs.append(turn_over)
        leverages.append(leverage)

        positions = positions.append(executed_pos)
        previous_pos = executed_pos

    positions['benchmark_weight'] = self.total_data['weight'].values
    positions['dx'] = self.total_data.dx.values

    trade_dates = positions.trade_date.unique()
    ret_df = pd.DataFrame({'returns': rets,
                           'turn_over': turn_overs,
                           'leverage': leverages},
                          index=trade_dates)

    ret_df['benchmark_returns'] = self.index_return['dx']
    ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1], self.freq)] = 0.
    ret_df = ret_df.shift(1)
    ret_df.iloc[0] = 0.
    ret_df['excess_return'] = ret_df['returns'] - ret_df['benchmark_returns'] * ret_df['leverage']

    return ret_df, positions
def run(self):
    alpha_logger.info("starting backtesting ...")

    total_factors = self.engine.fetch_factor_range(
        self.running_setting.universe,
        self.alpha_model.formulas,
        dates=self.running_setting.dates)
    alpha_logger.info("alpha factor data loading finished ...")

    total_industry = self.engine.fetch_industry_matrix_range(
        self.running_setting.universe,
        dates=self.running_setting.dates,
        category=self.running_setting.industry_cat,
        level=self.running_setting.industry_level)
    alpha_logger.info("industry data loading finished ...")

    total_benchmark = self.engine.fetch_benchmark_range(
        dates=self.running_setting.dates,
        benchmark=self.running_setting.benchmark)
    alpha_logger.info("benchmark data loading finished ...")

    total_risk_cov, total_risk_exposure = self.engine.fetch_risk_model_range(
        self.running_setting.universe,
        dates=self.running_setting.dates,
        risk_model=self.data_meta.risk_model)
    alpha_logger.info("risk_model data loading finished ...")

    total_returns = self.engine.fetch_dx_return_range(
        self.running_setting.universe,
        dates=self.running_setting.dates,
        horizon=self.running_setting.horizon,
        offset=1)
    alpha_logger.info("returns data loading finished ...")

    total_data = pd.merge(total_factors, total_industry, on=['trade_date', 'code'])
    total_data = pd.merge(total_data, total_benchmark,
                          on=['trade_date', 'code'], how='left')
    total_data.fillna({'weight': 0.}, inplace=True)
    total_data = pd.merge(total_data, total_returns, on=['trade_date', 'code'])
    total_data = pd.merge(total_data, total_risk_exposure, on=['trade_date', 'code'])

    is_in_benchmark = (total_data.weight > 0.).astype(float).values
    total_data.loc[:, 'benchmark'] = is_in_benchmark
    total_data.loc[:, 'total'] = np.ones_like(is_in_benchmark)
    total_data.sort_values(['trade_date', 'code'], inplace=True)
    total_data_groups = total_data.groupby('trade_date')

    rets = []
    turn_overs = []
    leverages = []
    previous_pos = pd.DataFrame()
    executor = copy.deepcopy(self.running_setting.executor)
    positions = pd.DataFrame()

    if self.dask_client is None:
        models = {}
        for ref_date, _ in total_data_groups:
            models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                           self.alpha_model, self.data_meta)
    else:
        def worker(parameters):
            new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                    parameters[1], parameters[2])
            return parameters[0], new_model

        l = self.dask_client.map(worker, [(d[0], self.alpha_model, self.data_meta)
                                          for d in total_data_groups])
        results = self.dask_client.gather(l)
        models = dict(results)

    for ref_date, this_data in total_data_groups:
        new_model = models[ref_date]
        this_data = this_data.fillna(this_data[new_model.features].median())
        codes = this_data.code.values.tolist()

        if self.running_setting.rebalance_method == 'tv':
            risk_cov = total_risk_cov[total_risk_cov.trade_date == ref_date]
            sec_cov = self._generate_sec_cov(this_data, risk_cov)
        else:
            sec_cov = None

        benchmark_w = this_data.weight.values
        constraints = LinearConstraints(self.running_setting.bounds,
                                        this_data,
                                        benchmark_w)

        lbound = np.maximum(0., benchmark_w - self.running_setting.weights_bandwidth)
        ubound = self.running_setting.weights_bandwidth + benchmark_w

        if previous_pos.empty:
            current_position = None
        else:
            previous_pos.set_index('code', inplace=True)
            # reindex keeps codes absent from the previous position as NaN,
            # which the fillna below zeroes out
            remained_pos = previous_pos.reindex(codes)
            remained_pos.fillna(0., inplace=True)
            current_position = remained_pos.weight.values

        features = new_model.features
        raw_factors = this_data[features].values
        new_factors = factor_processing(
            raw_factors,
            pre_process=self.data_meta.pre_process,
            risk_factors=this_data[self.data_meta.neutralized_risk].values.astype(float)
            if self.data_meta.neutralized_risk else None,
            post_process=self.data_meta.post_process)

        er = new_model.predict(pd.DataFrame(new_factors, columns=features)).astype(float)

        alpha_logger.info('{0} re-balance: {1} codes'.format(ref_date, len(er)))
        target_pos = self._calculate_pos(er,
                                         this_data,
                                         constraints,
                                         benchmark_w,
                                         lbound,
                                         ubound,
                                         sec_cov=sec_cov,
                                         current_position=current_position,
                                         **self.running_setting.more_opts)

        target_pos['code'] = codes
        target_pos['trade_date'] = ref_date

        turn_over, executed_pos = executor.execute(target_pos=target_pos)

        leverage = executed_pos.weight.abs().sum()

        ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) - 1.)
        rets.append(np.log(1. + ret))
        executor.set_current(executed_pos)
        turn_overs.append(turn_over)
        leverages.append(leverage)

        positions = positions.append(executed_pos)
        previous_pos = executed_pos

    positions['benchmark_weight'] = total_data['weight'].values
    positions['dx'] = total_data.dx.values

    trade_dates = positions.trade_date.unique()
    ret_df = pd.DataFrame({'returns': rets,
                           'turn_over': turn_overs,
                           'leverage': leverages},
                          index=trade_dates)

    index_return = self.engine.fetch_dx_return_index_range(
        self.running_setting.benchmark,
        dates=self.running_setting.dates,
        horizon=self.running_setting.horizon,
        offset=1).set_index('trade_date')
    ret_df['benchmark_returns'] = index_return['dx']
    ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                     self.running_setting.freq)] = 0.
    ret_df = ret_df.shift(1)
    ret_df.iloc[0] = 0.
    ret_df['excess_return'] = ret_df['returns'] - ret_df['benchmark_returns'] * ret_df['leverage']

    return ret_df, positions
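# --- Worked illustration (added) of the return accounting in run(): the dx
# column holds log returns, so np.exp(dx) - 1 converts them to simple returns,
# the weighted sum gives the period portfolio simple return, and np.log(1 + ret)
# turns it back into a log return for clean compounding across periods.
import numpy as np

weights = np.array([0.5, 0.3, 0.2])          # executed portfolio weights
dx = np.array([0.01, -0.02, 0.005])          # per-stock log returns
ret = weights @ (np.exp(dx) - 1.)            # portfolio simple return
log_ret = np.log(1. + ret)
print(ret, log_ret)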