    def __init__(self,
                 alpha_model,
                 data_meta,
                 universe,
                 start_date,
                 end_date,
                 freq,
                 benchmark=905,
                 industry_cat='sw_adj',
                 industry_level=1,
                 dask_client=None):
        self.alpha_model = alpha_model
        self.data_meta = data_meta
        self.universe = universe
        self.benchmark = benchmark
        self.dates = makeSchedule(start_date, end_date, freq, 'china.sse')
        self.dates = [d.strftime('%Y-%m-%d') for d in self.dates]
        self.industry_cat = industry_cat
        self.industry_level = industry_level
        self.freq = freq
        self.horizon = map_freq(freq)
        self.engine = SqlEngine(self.data_meta.data_source)
        self.dask_client = dask_client
        self.total_data = None
        self.index_return = None
        self.risk_models = None
        self.alpha_models = None
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)
    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)
    alpha_logger.info("return data loading finished")
    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)

    return dates, df[['trade_date', 'code', 'dx']], df[[
        'trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'
    ] + transformer.names]
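
# Minimal usage sketch of prepare_data. The connection URI, factor names and
# dates below are placeholders, not values from this project; warm_start=1
# shifts the effective start date one frequency period earlier.
engine = SqlEngine('postgresql+psycopg2://user:pwd@host/alpha')

dates, return_df, feature_df = prepare_data(engine,
                                            factors=['EPS', 'CFinc1'],
                                            start_date='2017-01-01',
                                            end_date='2017-06-30',
                                            frequency='10b',
                                            universe=Universe('zz500'),
                                            benchmark=905,
                                            warm_start=1)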
    def fetch_train_data(self, ref_date, alpha_model: ModelBase):
        return fetch_train_phase(SqlEngine(self.data_source),
                                 alpha_model.formulas,
                                 ref_date,
                                 self.freq,
                                 self.universe,
                                 self.batch,
                                 self.neutralized_risk,
                                 self.risk_model,
                                 self.pre_process,
                                 self.post_process,
                                 self.warm_start,
                                 fit_target=alpha_model.fit_target)
    def __init__(self,
                 freq: str,
                 universe: Universe,
                 batch: int,
                 neutralized_risk: Iterable[str] = None,
                 risk_model: str = 'short',
                 pre_process: Iterable[object] = None,
                 post_process: Iterable[object] = None,
                 warm_start: int = 0,
                 data_source: str = None):
        self.data_source = data_source
        self.engine = SqlEngine(self.data_source)
        self.freq = freq
        self.universe = universe
        self.batch = batch
        self.neutralized_risk = neutralized_risk
        self.risk_model = risk_model
        self.pre_process = _map_process(pre_process)
        self.post_process = _map_process(post_process)
        self.warm_start = warm_start
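
# Illustrative construction sketch, assuming the constructor above belongs to
# the DataMeta class used elsewhere in this section. The particular settings
# are not from this project; winsorize_normal/standardize are the processing
# helpers referenced in the configuration snippet further down.
data_meta = DataMeta(freq='10b',
                     universe=Universe('zz500'),
                     batch=4,
                     neutralized_risk=['SIZE'],
                     risk_model='short',
                     pre_process=[winsorize_normal, standardize],
                     post_process=[standardize],
                     warm_start=1,
                     data_source=None)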
class TestSqlEngine(unittest.TestCase):
    def setUp(self):
        self.engine = SqlEngine(DATA_ENGINE_URI)
        dates_list = bizDatesList('china.sse', '2010-10-01', '2018-04-27')
        self.ref_date = random.choice(dates_list).strftime('%Y-%m-%d')
        alpha_logger.info("Test date: {0}".format(self.ref_date))

    def test_sql_engine_fetch_codes(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        query = select([UniverseTable.code]).where(
            and_(UniverseTable.trade_date == ref_date,
                 or_(UniverseTable.zz500 == 1, UniverseTable.zz1000 == 1)))
        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')
        self.assertListEqual(codes, list(df.code.values))

    def test_sql_engine_fetch_codes_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)

        query = select([UniverseTable.trade_date, UniverseTable.code]).where(
            and_(UniverseTable.trade_date.in_(ref_dates),
                 or_(UniverseTable.zz500 == 1, UniverseTable.zz1000 == 1)))
        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')

        for ref_date in ref_dates:
            calculated_codes = list(
                sorted(codes[codes.trade_date == ref_date].code.values))
            expected_codes = list(
                sorted(df[df.trade_date == ref_date].code.values))
            self.assertListEqual(calculated_codes, expected_codes)

    def test_sql_engine_fetch_codes_with_exclude_universe(self):
        ref_date = self.ref_date
        universe = Universe('zz500') - Universe('cyb')
        codes = self.engine.fetch_codes(ref_date, universe)

        query = select([UniverseTable.code]).where(
            and_(UniverseTable.trade_date == ref_date,
                 UniverseTable.zz500 == 1,
                 UniverseTable.cyb == 0))
        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')
        self.assertListEqual(codes, list(df.code.values))

    def test_sql_engine_fetch_dx_return(self):
        horizon = 4
        offset = 1
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)
        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset)

        start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '6b')
        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))
        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(dx_return.dx.values,
                                             res.chgPct.values)

        horizon = 4
        offset = 0
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)
        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset)

        start_date = advanceDateByCalendar('china.sse', ref_date, '1b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '5b')
        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))
        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(dx_return.dx.values,
                                             res.chgPct.values)

    def test_sql_engine_fetch_dx_return_with_benchmark(self):
        horizon = 4
        offset = 1
        benchmark = 300
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)
        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset,
                                                benchmark=benchmark)

        start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '6b')
        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))
        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

        query = select(
            [IndexMarket.indexCode.label('code'), IndexMarket.chgPct]).where(
                and_(IndexMarket.trade_date.between(start_date, end_date),
                     IndexMarket.indexCode == benchmark))
        df = pd.read_sql(query, con=self.engine.engine)
        b_res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(
            dx_return.dx.values, res.chgPct.values - b_res.chgPct.values)

        horizon = 4
        offset = 0
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)
        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset,
                                                benchmark=benchmark)

        start_date = advanceDateByCalendar('china.sse', ref_date, '1b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '5b')
        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))
        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

        query = select(
            [IndexMarket.indexCode.label('code'), IndexMarket.chgPct]).where(
                and_(IndexMarket.trade_date.between(start_date, end_date),
                     IndexMarket.indexCode == benchmark))
        df = pd.read_sql(query, con=self.engine.engine)
        b_res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(
            dx_return.dx.values, res.chgPct.values - b_res.chgPct.values)

    def test_sql_engine_fetch_dx_return_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        dx_return = self.engine.fetch_dx_return_range(universe,
                                                      dates=ref_dates,
                                                      horizon=4,
                                                      offset=1)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')
            query = select([Market.code, Market.chgPct]).where(
                and_(Market.trade_date.between(start_date, end_date),
                     Market.code.in_(g.code.unique().tolist())))
            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_return.dx.values,
                                                 res.chgPct.values)

    def test_sql_engine_fetch_dx_return_range_with_benchmark(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        benchmark = 300
        dx_return = self.engine.fetch_dx_return_range(universe,
                                                      dates=ref_dates,
                                                      horizon=4,
                                                      offset=1,
                                                      benchmark=benchmark)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')
            query = select([Market.code, Market.chgPct]).where(
                and_(Market.trade_date.between(start_date, end_date),
                     Market.code.in_(g.code.unique().tolist())))
            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

            query = select(
                [IndexMarket.indexCode.label('code'),
                 IndexMarket.chgPct]).where(
                     and_(IndexMarket.trade_date.between(start_date, end_date),
                          IndexMarket.indexCode == benchmark))
            df = pd.read_sql(query, con=self.engine.engine)
            b_res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(
                calculated_return.dx.values,
                res.chgPct.values - b_res.chgPct.values)

    def test_sql_engine_fetch_dx_return_with_universe_adjustment(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', '2017-01-26', '-6m'),
            '2017-01-26', '60b', 'china.sse')
        universe = Universe('zz500')
        dx_return = self.engine.fetch_dx_return_range(universe,
                                                      dates=ref_dates,
                                                      horizon=4,
                                                      offset=1)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')
            query = select([Market.code, Market.chgPct]).where(
                and_(Market.trade_date.between(start_date, end_date),
                     Market.code.in_(g.code.unique().tolist())))
            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_return.dx.values,
                                                 res.chgPct.values)

    def test_sql_engine_fetch_dx_return_index(self):
        horizon = 4
        offset = 1
        ref_date = self.ref_date
        dx_return = self.engine.fetch_dx_return_index(ref_date,
                                                      905,
                                                      horizon=horizon,
                                                      offset=offset)

        start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '6b')
        query = select([IndexMarket.indexCode, IndexMarket.chgPct]).where(
            and_(IndexMarket.trade_date.between(start_date, end_date),
                 IndexMarket.indexCode == 905))
        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('indexCode').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(dx_return.dx.values,
                                             res.chgPct.values)

    def test_sql_engine_fetch_dx_return_index_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        index_code = 906
        dx_return = self.engine.fetch_dx_return_index_range(index_code,
                                                            dates=ref_dates,
                                                            horizon=4,
                                                            offset=1)

        for ref_date in ref_dates:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')
            query = select([IndexMarket.indexCode, IndexMarket.chgPct]).where(
                and_(IndexMarket.trade_date.between(start_date, end_date),
                     IndexMarket.indexCode == index_code))
            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('indexCode').apply(
                lambda x: np.log(1. + x).sum())

            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_return.dx.values,
                                                 res.chgPct.values)

    def test_sql_engine_fetch_factor(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        factor = 'ROE'
        factor_data = self.engine.fetch_factor(ref_date, factor, codes)

        query = select([Uqer.code, Uqer.ROE]).where(
            and_(Uqer.trade_date == ref_date, Uqer.code.in_(codes)))
        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')
        np.testing.assert_array_almost_equal(factor_data.ROE.values,
                                             df.ROE.values)

    def test_sql_engine_fetch_factor_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        factor = 'ROE'
        factor_data = self.engine.fetch_factor_range(universe,
                                                     factor,
                                                     dates=ref_dates)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            query = select([Uqer.code, Uqer.ROE]).where(
                and_(Uqer.trade_date == ref_date,
                     Uqer.code.in_(g.code.unique().tolist())))
            df = pd.read_sql(query, con=self.engine.engine)

            calculated_factor = factor_data[factor_data.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_factor.ROE.values,
                                                 df.ROE.values)

    def test_sql_engine_fetch_factor_range_forward(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        ref_dates = ref_dates + [
            advanceDateByCalendar('china.sse', ref_dates[-1],
                                  '60b').strftime('%Y-%m-%d')
        ]
        universe = Universe('zz500') + Universe('zz1000')
        factor = 'ROE'
        factor_data = self.engine.fetch_factor_range_forward(universe,
                                                             factor,
                                                             dates=ref_dates)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates[:-1])
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            forward_ref_date = advanceDateByCalendar(
                'china.sse', ref_date, '60b').strftime('%Y-%m-%d')
            query = select([Uqer.code, Uqer.ROE]).where(
                and_(Uqer.trade_date == forward_ref_date,
                     Uqer.code.in_(g.code.unique().tolist())))
            df = pd.read_sql(query, con=self.engine.engine)

            calculated_factor = factor_data[factor_data.trade_date == ref_date]
            calculated_factor.set_index('code', inplace=True)
            calculated_factor = calculated_factor.loc[df.code]
            np.testing.assert_array_almost_equal(calculated_factor.dx.values,
                                                 df.ROE.values)

    def test_sql_engine_fetch_benchmark(self):
        ref_date = self.ref_date
        benchmark = 906
        index_data = self.engine.fetch_benchmark(ref_date, benchmark)

        query = select([
            IndexComponent.code,
            (IndexComponent.weight / 100.).label('weight')
        ]).where(
            and_(IndexComponent.trade_date == ref_date,
                 IndexComponent.indexCode == benchmark))
        df = pd.read_sql(query, con=self.engine.engine)
        np.testing.assert_array_almost_equal(df.weight.values,
                                             index_data.weight.values)

    def test_sql_engine_fetch_benchmark_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-9m'),
            self.ref_date, '60b', 'china.sse')
        benchmark = 906
        index_data = self.engine.fetch_benchmark_range(benchmark,
                                                       dates=ref_dates)

        query = select([
            IndexComponent.trade_date, IndexComponent.code,
            (IndexComponent.weight / 100.).label('weight')
        ]).where(
            and_(IndexComponent.trade_date.in_(ref_dates),
                 IndexComponent.indexCode == benchmark))
        df = pd.read_sql(query, con=self.engine.engine)

        for ref_date in ref_dates:
            calculated_data = index_data[index_data.trade_date == ref_date]
            expected_data = df[df.trade_date == ref_date]
            np.testing.assert_array_almost_equal(
                calculated_data.weight.values, expected_data.weight.values)

    def test_sql_engine_fetch_risk_model(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        risk_cov, risk_exp = self.engine.fetch_risk_model(ref_date,
                                                          codes,
                                                          risk_model='short')
        self.assertListEqual(risk_cov.Factor.tolist(),
                             risk_cov.columns[2:].tolist())

        query = select([RiskCovShort]).where(
            RiskCovShort.trade_date == ref_date)
        cov_df = pd.read_sql(query,
                             con=self.engine.engine).sort_values('FactorID')
        factors = cov_df.Factor.tolist()
        np.testing.assert_array_almost_equal(risk_cov[factors].values,
                                             cov_df[factors].values)

        query = select([RiskExposure]).where(
            and_(RiskExposure.trade_date == ref_date,
                 RiskExposure.code.in_(codes)))
        exp_df = pd.read_sql(query, con=self.engine.engine)
        np.testing.assert_array_almost_equal(exp_df[factors].values,
                                             risk_exp[factors].values)

    def test_sql_engine_fetch_industry_matrix(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        ind_matrix = self.engine.fetch_industry_matrix(ref_date, codes,
                                                       'sw', 1)

        query = select([Industry.code, Industry.industryName1]).where(
            and_(Industry.trade_date == ref_date,
                 Industry.industry == '申万行业分类',
                 Industry.code.in_(codes)))
        df = pd.read_sql(query, con=self.engine.engine)
        df = pd.get_dummies(df, prefix="", prefix_sep="")

        self.assertEqual(len(ind_matrix), len(df))
        np.testing.assert_array_almost_equal(
            df[ind_matrix.columns[2:]].values, ind_matrix.iloc[:, 2:].values)

    def test_sql_engine_fetch_factor_by_categories(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        factor1 = {'f': CSRank('ROE', groups='sw1')}
        factor2 = {'f': CSQuantiles('ROE', groups='sw1')}
        raw_factor = 'ROE'

        df1 = self.engine.fetch_factor(ref_date, factor1, codes)
        df2 = self.engine.fetch_factor(ref_date, factor2, codes)
        df3 = self.engine.fetch_factor(ref_date, raw_factor, codes)

        ind_matrix = self.engine.fetch_industry_matrix(ref_date, codes,
                                                       'sw', 1)
        cols = sorted(ind_matrix.columns[2:].tolist())
        series = (ind_matrix[cols] * np.array(range(1, len(cols) + 1))).sum(
            axis=1)
        df3['cat'] = series.values

        expected_rank = df3[['ROE', 'cat']].groupby('cat').transform(
            lambda x: rankdata(x.values))
        expected_rank[np.isnan(df3.ROE)] = np.nan
        df3['rank'] = expected_rank['ROE'].values
        np.testing.assert_array_almost_equal(df3['rank'].values,
                                             df1['f'].values)

        expected_quantile = df3[['ROE', 'cat']].groupby('cat').transform(
            lambda x: rankdata(x.values) / (len(x) + 1))
        expected_quantile[np.isnan(df3.ROE)] = np.nan
        df3['quantile'] = expected_quantile['ROE'].values
        np.testing.assert_array_almost_equal(df3['quantile'].values,
                                             df2['f'].values)
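
import numpy as np

# Toy illustration of the dx convention the return tests above verify: with
# offset o and horizon h, the engine aggregates the h + 1 daily log returns
# of business days t+o+1 .. t+o+h+1 — hence the '2b'..'6b' bounds for
# h=4, o=1 and '1b'..'5b' for h=4, o=0. The return numbers are made up.
chg_pct = np.array([0.010, -0.005, 0.020, 0.003, 0.007, -0.002])  # day t+1 on


def dx(chg, horizon, offset):
    window = chg[offset:offset + horizon + 1]
    return np.log(1. + window).sum()


print(dx(chg_pct, horizon=4, offset=1))  # aggregates days t+2 .. t+6
print(dx(chg_pct, horizon=4, offset=0))  # aggregates days t+1 .. t+5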
                                            fetch_data_package, map_freq)
from PyFin.api import LAST, SHIFT

freq = '60b'
universe = Universe('custom', ['ashare_ex'])
batch = 1
neutralized_risk = industry_styles
risk_model = 'short'
pre_process = [winsorize_normal, standardize]
post_process = [standardize]
warm_start = 3
data_source = None
horizon = map_freq(freq)

engine = SqlEngine(data_source)

fit_intercept = True
kernal_feature = 'roe_q'
regress_features = {kernal_feature: LAST(kernal_feature),
                    kernal_feature + '_l1': SHIFT(kernal_feature, 1),
                    kernal_feature + '_l2': SHIFT(kernal_feature, 2),
                    kernal_feature + '_l3': SHIFT(kernal_feature, 3)}
const_features = {kernal_feature: LAST(kernal_feature)}
fit_target = [kernal_feature]

data_meta = DataMeta(freq=freq,
                     universe=universe,
                     batch=batch,
                     neutralized_risk=neutralized_risk,
                     risk_model=risk_model,
                     pre_process=pre_process,
                     post_process=post_process,
                     warm_start=warm_start,
                     data_source=data_source)
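
from alphamind.model.linearmodel import ConstLinearModel, LinearRegression

# Sketch of how the settings above would plausibly be wired into models. The
# constructor keywords (features, fit_intercept, weights, fit_target) are
# assumed from alpha-mind's linear model wrappers, not taken from this
# snippet.
regression_model = LinearRegression(features=regress_features,
                                    fit_intercept=fit_intercept,
                                    fit_target=fit_target)
const_model = ConstLinearModel(features=const_features,
                               weights={kernal_feature: 1.},
                               fit_target=fit_target)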
                    base_universe=self.base_universe,
                    exclude_universe=self.exclude_universe,
                    special_codes=self.special_codes,
                    filter_cond=encode(self.filter_cond))

    @classmethod
    def load(cls, universe_desc: dict):
        name = universe_desc['name']
        base_universe = universe_desc['base_universe']
        exclude_universe = universe_desc['exclude_universe']
        special_codes = universe_desc['special_codes']
        filter_cond = decode(universe_desc['filter_cond'])
        return cls(name=name,
                   base_universe=base_universe,
                   exclude_universe=exclude_universe,
                   special_codes=special_codes,
                   filter_cond=filter_cond)


if __name__ == '__main__':
    from PyFin.api import *
    from alphamind.data.engines.sqlengine import SqlEngine

    engine = SqlEngine()
    universe = Universe('custom', ['zz800'], exclude_universe=['Bank'])
    print(
        universe.query(engine,
                       start_date='2018-04-26',
                       end_date='2018-04-26'))
    print(universe.query(engine, dates=['2017-12-21', '2017-12-25']))
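
# Round-trip sketch for the (de)serialization pair above, assuming the dict
# whose tail is shown is returned by a `save` counterpart to `load`:
universe = Universe('custom', ['zz800'], exclude_universe=['Bank'])
desc = universe.save()          # plain dict with the keys `load` reads
restored = Universe.load(desc)  # rebuilds an equivalent Universe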
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0,
                 fit_target: Union[Transformer, object] = None):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)
    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))
    alpha_logger.info("fit target data loading finished")

    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    df.dropna(inplace=True)

    return dates, df[[
        'trade_date', 'code', 'dx'
    ]], df[['trade_date', 'code', 'weight', 'industry_code', 'industry'] +
           transformer.names]
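
# Sketch of the fit_target variant: fit against a forward factor value
# instead of realized dx returns. Factor names, dates and the engine are
# illustrative placeholders; LAST comes from PyFin.api. Either way the
# returned target column is named 'dx'.
from PyFin.api import LAST

dates, target_df, feature_df = prepare_data(engine,
                                            factors=['EPS', 'ROE'],
                                            start_date='2017-01-01',
                                            end_date='2017-06-30',
                                            frequency='10b',
                                            universe=Universe('zz500'),
                                            benchmark=905,
                                            warm_start=1,
                                            fit_target=LAST('ROE'))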
class Strategy(object):
    def __init__(self,
                 alpha_model,
                 data_meta,
                 universe,
                 start_date,
                 end_date,
                 freq,
                 benchmark=905,
                 industry_cat='sw_adj',
                 industry_level=1,
                 dask_client=None):
        self.alpha_model = alpha_model
        self.data_meta = data_meta
        self.universe = universe
        self.benchmark = benchmark
        self.dates = makeSchedule(start_date, end_date, freq, 'china.sse')
        self.dates = [d.strftime('%Y-%m-%d') for d in self.dates]
        self.industry_cat = industry_cat
        self.industry_level = industry_level
        self.freq = freq
        self.horizon = map_freq(freq)
        self.engine = SqlEngine(self.data_meta.data_source)
        self.dask_client = dask_client

    def prepare_backtest_data(self):
        total_factors = self.engine.fetch_factor_range(
            self.universe, self.alpha_model.formulas, dates=self.dates)
        alpha_logger.info("alpha factor data loading finished ...")

        total_industry = self.engine.fetch_industry_matrix_range(
            self.universe,
            dates=self.dates,
            category=self.industry_cat,
            level=self.industry_level)
        alpha_logger.info("industry data loading finished ...")

        total_benchmark = self.engine.fetch_benchmark_range(
            dates=self.dates, benchmark=self.benchmark)
        alpha_logger.info("benchmark data loading finished ...")

        total_risk_cov, total_risk_exposure = self.engine.fetch_risk_model_range(
            self.universe,
            dates=self.dates,
            risk_model=self.data_meta.risk_model)
        alpha_logger.info("risk_model data loading finished ...")

        total_returns = self.engine.fetch_dx_return_range(
            self.universe,
            dates=self.dates,
            horizon=self.horizon,
            offset=1)
        alpha_logger.info("returns data loading finished ...")

        total_data = pd.merge(total_factors,
                              total_industry,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_benchmark,
                              on=['trade_date', 'code'],
                              how='left')
        total_data.fillna({'weight': 0.}, inplace=True)
        total_data = pd.merge(total_data,
                              total_returns,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_risk_exposure,
                              on=['trade_date', 'code'])

        is_in_benchmark = (total_data.weight >
                           0.).astype(float).values.reshape((-1, 1))
        total_data.loc[:, 'benchmark'] = is_in_benchmark
        total_data.loc[:, 'total'] = np.ones_like(is_in_benchmark)
        total_data.sort_values(['trade_date', 'code'], inplace=True)

        self.index_return = self.engine.fetch_dx_return_index_range(
            self.benchmark,
            dates=self.dates,
            horizon=self.horizon,
            offset=1).set_index('trade_date')
        self.total_data = total_data
        self.total_risk_cov = total_risk_cov

    def _create_lu_bounds(self, running_setting, codes, benchmark_w):
        codes = np.array(codes)

        if running_setting.weights_bandwidth:
            lbound = np.maximum(
                0., benchmark_w - running_setting.weights_bandwidth)
            ubound = running_setting.weights_bandwidth + benchmark_w

        lb = running_setting.lbound
        ub = running_setting.ubound

        if lb or ub:
            if not isinstance(lb, dict):
                lbound = np.ones_like(benchmark_w) * lb
            else:
                lbound = np.zeros_like(benchmark_w)
                for c in lb:
                    lbound[codes == c] = lb[c]

                if 'other' in lb:
                    for i, c in enumerate(codes):
                        if c not in lb:
                            lbound[i] = lb['other']

            if not isinstance(ub, dict):
                ubound = np.ones_like(benchmark_w) * ub
            else:
                ubound = np.ones_like(benchmark_w)
                for c in ub:
                    ubound[codes == c] = ub[c]

                if 'other' in ub:
                    for i, c in enumerate(codes):
                        if c not in ub:
                            ubound[i] = ub['other']
        return lbound, ubound

    def run(self, running_setting):
        alpha_logger.info("starting backtesting ...")

        total_data_groups = self.total_data.groupby('trade_date')

        rets = []
        turn_overs = []
        leverages = []
        previous_pos = pd.DataFrame()
        executor = copy.deepcopy(running_setting.executor)
        positions = pd.DataFrame()

        if self.dask_client is None:
            models = {}
            for ref_date, _ in total_data_groups:
                models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                               self.alpha_model,
                                               self.data_meta)
        else:
            def worker(parameters):
                new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                        parameters[1], parameters[2])
                return parameters[0], new_model

            futures = self.dask_client.map(
                worker, [(d[0], self.alpha_model, self.data_meta)
                         for d in total_data_groups])
            results = self.dask_client.gather(futures)
            models = dict(results)

        for ref_date, this_data in total_data_groups:
            new_model = models[ref_date]
            codes = this_data.code.values.tolist()

            if previous_pos.empty:
                current_position = None
            else:
                previous_pos.set_index('code', inplace=True)
                remained_pos = previous_pos.loc[codes]
                remained_pos.fillna(0., inplace=True)
                current_position = remained_pos.weight.values

            if running_setting.rebalance_method == 'tv':
                risk_cov = self.total_risk_cov[
                    self.total_risk_cov.trade_date == ref_date]
                sec_cov = self._generate_sec_cov(this_data, risk_cov)
            else:
                sec_cov = None

            benchmark_w = this_data.weight.values
            constraints = LinearConstraints(running_setting.bounds,
                                            this_data, benchmark_w)
            lbound, ubound = self._create_lu_bounds(running_setting, codes,
                                                    benchmark_w)

            features = new_model.features
            dfs = []
            for name in features:
                data_cleaned = this_data.dropna(subset=[name])
                raw_factors = data_cleaned[[name]].values
                new_factors = factor_processing(
                    raw_factors,
                    pre_process=self.data_meta.pre_process,
                    risk_factors=data_cleaned[
                        self.data_meta.neutralized_risk].values.astype(float)
                    if self.data_meta.neutralized_risk else None,
                    post_process=self.data_meta.post_process)
                df = pd.DataFrame(new_factors,
                                  columns=[name],
                                  index=data_cleaned.code)
                dfs.append(df)

            new_factors = pd.concat(dfs, axis=1)
            new_factors = new_factors.loc[codes].fillna(new_factors.median())
            er = new_model.predict(new_factors).astype(float)

            alpha_logger.info('{0} re-balance: {1} codes'.format(
                ref_date, len(er)))
            target_pos = self._calculate_pos(running_setting,
                                             er,
                                             this_data,
                                             constraints,
                                             benchmark_w,
                                             lbound,
                                             ubound,
                                             sec_cov=sec_cov,
                                             current_position=current_position)

            target_pos['code'] = codes
            target_pos['trade_date'] = ref_date

            turn_over, executed_pos = executor.execute(target_pos=target_pos)
            leverage = executed_pos.weight.abs().sum()

            ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) -
                                                1.)
            rets.append(np.log(1. + ret))
            executor.set_current(executed_pos)
            turn_overs.append(turn_over)
            leverages.append(leverage)
            positions = positions.append(executed_pos)
            previous_pos = executed_pos

        positions['benchmark_weight'] = self.total_data['weight'].values
        positions['dx'] = self.total_data.dx.values

        trade_dates = positions.trade_date.unique()
        ret_df = pd.DataFrame(
            {
                'returns': rets,
                'turn_over': turn_overs,
                'leverage': leverages
            },
            index=trade_dates)

        ret_df['benchmark_returns'] = self.index_return['dx']
        ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                         self.freq)] = 0.
        ret_df = ret_df.shift(1)
        ret_df.iloc[0] = 0.
        ret_df['excess_return'] = ret_df['returns'] - ret_df[
            'benchmark_returns'] * ret_df['leverage']

        return ret_df, positions

    @staticmethod
    def _generate_sec_cov(current_data, risk_cov):
        risk_exposure = current_data[all_styles].values
        risk_cov = risk_cov[all_styles].values
        special_risk = current_data['srisk'].values
        sec_cov = risk_exposure @ risk_cov @ risk_exposure.T / 10000 + np.diag(
            special_risk ** 2) / 10000
        return sec_cov

    def _calculate_pos(self, running_setting, er, data, constraints,
                       benchmark_w, lbound, ubound, sec_cov,
                       current_position):
        more_opts = running_setting.more_opts
        target_pos, _ = er_portfolio_analysis(
            er,
            industry=data.industry_name.values,
            dx_return=None,
            constraints=constraints,
            detail_analysis=False,
            benchmark=benchmark_w,
            method=running_setting.rebalance_method,
            lbound=lbound,
            ubound=ubound,
            current_position=current_position,
            target_vol=more_opts.get('target_vol'),
            cov=sec_cov,
            turn_over_target=more_opts.get('turn_over_target'))
        return target_pos
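
from types import SimpleNamespace

# Usage sketch for the class above. `run` only reads plain attributes off
# `running_setting`, so SimpleNamespace stands in for whatever settings
# object the project provides; `my_alpha_model`, `my_data_meta` and
# `my_executor` are placeholders built elsewhere, and the parameter values
# are illustrative.
running_setting = SimpleNamespace(executor=my_executor,
                                  bounds={},
                                  weights_bandwidth=0.01,
                                  lbound=None,
                                  ubound=None,
                                  rebalance_method='risk_neutral',
                                  more_opts={})

strategy = Strategy(my_alpha_model,
                    my_data_meta,
                    universe=Universe('zz500'),
                    start_date='2017-01-01',
                    end_date='2017-12-29',
                    freq='10b',
                    benchmark=905)
strategy.prepare_backtest_data()
ret_df, positions = strategy.run(running_setting)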
class Strategy(object):
    def __init__(self,
                 alpha_model,
                 data_meta,
                 running_setting,
                 dask_client=None):
        self.alpha_model = alpha_model
        self.data_meta = data_meta
        self.running_setting = running_setting
        self.engine = SqlEngine(self.data_meta.data_source)
        self.dask_client = dask_client

    def run(self):
        alpha_logger.info("starting backtesting ...")

        total_factors = self.engine.fetch_factor_range(
            self.running_setting.universe,
            self.alpha_model.formulas,
            dates=self.running_setting.dates)
        alpha_logger.info("alpha factor data loading finished ...")

        total_industry = self.engine.fetch_industry_matrix_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            category=self.running_setting.industry_cat,
            level=self.running_setting.industry_level)
        alpha_logger.info("industry data loading finished ...")

        total_benchmark = self.engine.fetch_benchmark_range(
            dates=self.running_setting.dates,
            benchmark=self.running_setting.benchmark)
        alpha_logger.info("benchmark data loading finished ...")

        total_risk_cov, total_risk_exposure = self.engine.fetch_risk_model_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            risk_model=self.data_meta.risk_model)
        alpha_logger.info("risk_model data loading finished ...")

        total_returns = self.engine.fetch_dx_return_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            horizon=self.running_setting.horizon,
            offset=1)
        alpha_logger.info("returns data loading finished ...")

        total_data = pd.merge(total_factors,
                              total_industry,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_benchmark,
                              on=['trade_date', 'code'],
                              how='left')
        total_data.fillna({'weight': 0.}, inplace=True)
        total_data = pd.merge(total_data,
                              total_returns,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_risk_exposure,
                              on=['trade_date', 'code'])

        is_in_benchmark = (total_data.weight >
                           0.).astype(float).values.reshape((-1, 1))
        total_data.loc[:, 'benchmark'] = is_in_benchmark
        total_data.loc[:, 'total'] = np.ones_like(is_in_benchmark)
        total_data.sort_values(['trade_date', 'code'], inplace=True)
        total_data_groups = total_data.groupby('trade_date')

        rets = []
        turn_overs = []
        leverages = []
        previous_pos = pd.DataFrame()
        executor = copy.deepcopy(self.running_setting.executor)
        positions = pd.DataFrame()

        if self.dask_client is None:
            models = {}
            for ref_date, _ in total_data_groups:
                models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                               self.alpha_model,
                                               self.data_meta)
        else:
            def worker(parameters):
                new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                        parameters[1], parameters[2])
                return parameters[0], new_model

            futures = self.dask_client.map(
                worker, [(d[0], self.alpha_model, self.data_meta)
                         for d in total_data_groups])
            results = self.dask_client.gather(futures)
            models = dict(results)

        for ref_date, this_data in total_data_groups:
            new_model = models[ref_date]
            this_data = this_data.fillna(
                this_data[new_model.features].median())
            codes = this_data.code.values.tolist()

            if self.running_setting.rebalance_method == 'tv':
                risk_cov = total_risk_cov[
                    total_risk_cov.trade_date == ref_date]
                sec_cov = self._generate_sec_cov(this_data, risk_cov)
            else:
                sec_cov = None

            benchmark_w = this_data.weight.values
            constraints = LinearConstraints(self.running_setting.bounds,
                                            this_data, benchmark_w)

            lbound = np.maximum(
                0., benchmark_w - self.running_setting.weights_bandwidth)
            ubound = self.running_setting.weights_bandwidth + benchmark_w

            if previous_pos.empty:
                current_position = None
            else:
                previous_pos.set_index('code', inplace=True)
                remained_pos = previous_pos.loc[codes]
                remained_pos.fillna(0., inplace=True)
                current_position = remained_pos.weight.values

            features = new_model.features
            raw_factors = this_data[features].values
            new_factors = factor_processing(
                raw_factors,
                pre_process=self.data_meta.pre_process,
                risk_factors=this_data[
                    self.data_meta.neutralized_risk].values.astype(float)
                if self.data_meta.neutralized_risk else None,
                post_process=self.data_meta.post_process)

            er = new_model.predict(
                pd.DataFrame(new_factors, columns=features)).astype(float)

            alpha_logger.info('{0} re-balance: {1} codes'.format(
                ref_date, len(er)))
            target_pos = self._calculate_pos(
                er,
                this_data,
                constraints,
                benchmark_w,
                lbound,
                ubound,
                sec_cov=sec_cov,
                current_position=current_position,
                **self.running_setting.more_opts)

            target_pos['code'] = codes
            target_pos['trade_date'] = ref_date

            turn_over, executed_pos = executor.execute(target_pos=target_pos)
            leverage = executed_pos.weight.abs().sum()

            ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) -
                                                1.)
            rets.append(np.log(1. + ret))
            executor.set_current(executed_pos)
            turn_overs.append(turn_over)
            leverages.append(leverage)
            positions = positions.append(executed_pos)
            previous_pos = executed_pos

        positions['benchmark_weight'] = total_data['weight'].values
        positions['dx'] = total_data.dx.values

        trade_dates = positions.trade_date.unique()
        ret_df = pd.DataFrame(
            {
                'returns': rets,
                'turn_over': turn_overs,
                'leverage': leverages
            },
            index=trade_dates)

        index_return = self.engine.fetch_dx_return_index_range(
            self.running_setting.benchmark,
            dates=self.running_setting.dates,
            horizon=self.running_setting.horizon,
            offset=1).set_index('trade_date')
        ret_df['benchmark_returns'] = index_return['dx']
        ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                         self.running_setting.freq)] = 0.
        ret_df = ret_df.shift(1)
        ret_df.iloc[0] = 0.
        ret_df['excess_return'] = ret_df['returns'] - ret_df[
            'benchmark_returns'] * ret_df['leverage']

        return ret_df, positions

    @staticmethod
    def _generate_sec_cov(current_data, risk_cov):
        risk_exposure = current_data[all_styles].values
        risk_cov = risk_cov[all_styles].values
        special_risk = current_data['srisk'].values
        sec_cov = risk_exposure @ risk_cov @ risk_exposure.T / 10000 + np.diag(
            special_risk ** 2) / 10000
        return sec_cov

    def _calculate_pos(self, er, data, constraints, benchmark_w, lbound,
                       ubound, **kwargs):
        target_pos, _ = er_portfolio_analysis(
            er,
            industry=data.industry_name.values,
            dx_return=None,
            constraints=constraints,
            detail_analysis=False,
            benchmark=benchmark_w,
            method=self.running_setting.rebalance_method,
            lbound=lbound,
            ubound=ubound,
            current_position=kwargs.get('current_position'),
            target_vol=kwargs.get('target_vol'),
            cov=kwargs.get('sec_cov'),
            turn_over_target=kwargs.get('turn_over_target'))
        return target_pos
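
import numpy as np

# Toy check of the assembly in _generate_sec_cov above: a structural risk
# model sec_cov = X @ F @ X.T / 1e4 + diag(srisk ** 2) / 1e4, where X holds
# style exposures, F the factor covariance and srisk the specific risk; the
# 1e4 divisors convert percent-squared units to decimals. Values are made up.
X = np.array([[1.0, 0.2],
              [0.8, -0.1],
              [1.1, 0.4]])           # 3 assets x 2 style factors
F = np.array([[4.0, 0.5],
              [0.5, 9.0]])           # factor covariance (pct^2)
srisk = np.array([2.0, 3.0, 1.5])    # specific risk (pct)

sec_cov = X @ F @ X.T / 10000 + np.diag(srisk ** 2) / 10000
assert sec_cov.shape == (3, 3)
assert np.allclose(sec_cov, sec_cov.T)  # symmetric by construction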
        right_index = bisect.bisect_right(date_label, end)
        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}
    return ret


if __name__ == '__main__':
    engine = SqlEngine(
        'postgresql+psycopg2://postgres:[email protected]/alpha')
    universe = Universe('zz500', ['ashare_ex'])
    neutralized_risk = ['SIZE']
    res = fetch_train_phase(engine, ['EPS', 'CFinc1'],
                            '2017-09-04',
                            '2w',
                            universe,
                            4,
                            warm_start=1,
                            neutralized_risk=neutralized_risk)
    print(res)

    res = fetch_predict_phase(engine, ['EPS', 'CFinc1'],
                              '2017-09-04',
                              '2w',
                              universe,
                              4,
                              warm_start=1,
                              neutralized_risk=neutralized_risk)
    print(res)