def __init__(self, **kwargs):
    """Initialise the data center.

    Builds a chunked load schedule, pulls minute bars chunk by chunk until a
    chunk yields valid symbols, and optionally loads benchmark data.

    Expected kwargs: ``logger``, ``symbolList``, ``startDate``, ``endDate``,
    ``freq`` and ``benchmark``.

    Raises:
        ValueError: if no chunk of the whole range yields any valid data.
    """
    super(DXDataCenter, self).__init__(kwargs['logger'], kwargs['symbolList'])
    # Bar fields requested from the data provider.
    self.fields = [
        'productID', 'instrumentID', 'tradingDate', 'tradingTime',
        'openPrice', 'highPrice', 'lowPrice', 'closePrice', 'volume',
        'multiplier', 'openInterest'
    ]
    self.startDate = kwargs['startDate']
    self.endDate = kwargs['endDate']
    self._freq = kwargs['freq']
    # Roll the end date back to the preceding SSE business day.
    self.baseDate = adjustDateByCalendar('China.SSE', self.endDate,
                                         BizDayConventions.Preceding)
    # Without a cache, every load must hit the data source again.
    if not Settings.usingCache:
        self.forceUpdate = True
    else:
        self.forceUpdate = False
    # Split the load window into chunks by frequency ('1m' tenor for 5-minute
    # bars, '7d' tenor for 1-minute bars); otherwise load the range in one go.
    if self._freq == FreqType.MIN5:
        self.loadSchedule = makeSchedule(self.startDate, self.endDate, '1m')
    elif self._freq == FreqType.MIN1:
        self.loadSchedule = makeSchedule(self.startDate, self.endDate, '7d')
    else:
        self.loadSchedule = [self.startDate, self.endDate]
    # Index of the current chunk's end date within loadSchedule.
    self.schCurrEnd = 1
    # Advance through the schedule until some chunk produces valid symbols.
    while True:
        self._getMinutesBars(
            startDate=self.loadSchedule[self.schCurrEnd - 1].strftime("%Y-%m-%d"),
            endDate=self.loadSchedule[self.schCurrEnd].strftime(
                "%Y-%m-%d"),
            freq=self._freq)
        if self.symbolList:
            break
        self.logger.warning('There is no any valid data in the back-testing data range ({0} - {1})' \
                            .format(self.loadSchedule[self.schCurrEnd - 1],
                                    self.loadSchedule[self.schCurrEnd]))
        if self.schCurrEnd == len(self.loadSchedule) - 1:
            break
        self.schCurrEnd += 1
    if not self.symbolList:
        raise ValueError(
            'There is no any valid data in the back-testing whole data range'
        )
    # Benchmark data always spans the full (unchunked) back-test range.
    if kwargs['benchmark']:
        self._getBenchmarkData(kwargs['benchmark'],
                               self.startDate.strftime("%Y-%m-%d"),
                               self.endDate.strftime("%Y-%m-%d"))
def __init__(self, **kwargs):
    """Initialise the data center.

    Builds a chunked load schedule, pulls minute bars chunk by chunk until a
    chunk yields valid symbols, and optionally loads benchmark data.

    Expected kwargs: ``logger``, ``symbolList``, ``startDate``, ``endDate``,
    ``freq`` and ``benchmark``.

    Raises:
        ValueError: if no chunk of the whole range yields any valid data.
    """
    super(DXDataCenter, self).__init__(kwargs['logger'], kwargs['symbolList'])
    # Bar fields requested from the data provider.
    self.fields = ['productID', 'instrumentID', 'tradingDate', 'tradingTime',
                   'openPrice', 'highPrice', 'lowPrice', 'closePrice',
                   'volume', 'multiplier', 'openInterest']
    self.startDate = kwargs['startDate']
    self.endDate = kwargs['endDate']
    self._freq = kwargs['freq']
    # Roll the end date back to the preceding SSE business day.
    self.baseDate = adjustDateByCalendar('China.SSE', self.endDate,
                                         BizDayConventions.Preceding)
    # Without a cache, every load must hit the data source again.
    if not Settings.usingCache:
        self.forceUpdate = True
    else:
        self.forceUpdate = False
    # Split the load window into chunks by frequency ('1m' tenor for 5-minute
    # bars, '7d' tenor for 1-minute bars); otherwise load the range in one go.
    if self._freq == FreqType.MIN5:
        self.loadSchedule = makeSchedule(self.startDate, self.endDate, '1m')
    elif self._freq == FreqType.MIN1:
        self.loadSchedule = makeSchedule(self.startDate, self.endDate, '7d')
    else:
        self.loadSchedule = [self.startDate, self.endDate]
    # Index of the current chunk's end date within loadSchedule.
    self.schCurrEnd = 1
    # Advance through the schedule until some chunk produces valid symbols.
    while True:
        self._getMinutesBars(startDate=self.loadSchedule[self.schCurrEnd - 1].strftime("%Y-%m-%d"),
                             endDate=self.loadSchedule[self.schCurrEnd].strftime("%Y-%m-%d"),
                             freq=self._freq)
        if self.symbolList:
            break
        self.logger.warning('There is no any valid data in the back-testing data range ({0} - {1})' \
                            .format(self.loadSchedule[self.schCurrEnd - 1],
                                    self.loadSchedule[self.schCurrEnd]))
        if self.schCurrEnd == len(self.loadSchedule) - 1:
            break
        self.schCurrEnd += 1
    if not self.symbolList:
        raise ValueError('There is no any valid data in the back-testing whole data range')
    # Benchmark data always spans the full (unchunked) back-test range.
    if kwargs['benchmark']:
        self._getBenchmarkData(kwargs['benchmark'],
                               self.startDate.strftime("%Y-%m-%d"),
                               self.endDate.strftime("%Y-%m-%d"))
def __init__(self, alpha_model, data_meta, universe, start_date, end_date,
             freq, benchmark=905, industry_cat='sw_adj', industry_level=1,
             dask_client=None):
    """Configure the run: model, universe, rebalance schedule and SQL engine.

    The schedule is built on the 'china.sse' calendar and stored as ISO date
    strings; result holders are initialised to None and filled later.
    """
    self.alpha_model = alpha_model
    self.data_meta = data_meta
    self.universe = universe
    self.benchmark = benchmark
    self.industry_cat = industry_cat
    self.industry_level = industry_level
    self.freq = freq
    self.horizon = map_freq(freq)
    # Rebalance dates on the SSE calendar, kept as '%Y-%m-%d' strings.
    schedule = makeSchedule(start_date, end_date, freq, 'china.sse')
    self.dates = [day.strftime('%Y-%m-%d') for day in schedule]
    self.engine = SqlEngine(self.data_meta.data_source)
    self.dask_client = dask_client
    # Lazily-populated result holders.
    self.total_data = None
    self.index_return = None
    self.risk_models = None
    self.alpha_models = None
def __init__(self, universe, start_date, end_date, freq, benchmark=905,
             weights_bandwidth=0.02, industry_cat='sw_adj', industry_level=1,
             rebalance_method='risk_neutral', bounds=None, **kwargs):
    """Store the running settings: universe, schedule, benchmark and
    rebalancing configuration.

    Extra keyword arguments are kept in ``more_opts`` for later consumers.
    """
    self.universe = universe
    self.benchmark = benchmark
    self.weights_bandwidth = weights_bandwidth
    self.freq = freq
    self.horizon = map_freq(freq)
    # Rebalance dates on the SSE calendar, kept as '%Y-%m-%d' strings.
    schedule = makeSchedule(start_date, end_date, freq, 'china.sse')
    self.dates = [day.strftime('%Y-%m-%d') for day in schedule]
    self.executor = NaiveExecutor()
    self.industry_cat = industry_cat
    self.industry_level = industry_level
    self.rebalance_method = rebalance_method
    self.bounds = bounds
    # Remaining options are forwarded untouched.
    self.more_opts = kwargs
def factor(self, factor_category, begin_date, end_date, factor_name=None, freq=None):
    """Fetch factor rows for *factor_category* between two dates.

    When *factor_name* is None the whole table is delegated to ``self.base``;
    otherwise only ``id``/``security_code``/``trade_date`` plus the requested
    factor columns are selected.  When *freq* is given, rows are restricted
    to the rebalance schedule on the SSE calendar.

    Fix: the module/table lookup was duplicated in both branches and used the
    non-idiomatic ``__getattribute__``; it is hoisted and uses ``getattr``.
    """
    # Resolve the ORM table once for both branches.
    table = getattr(importlib.import_module('data.factor_model'), factor_category)
    if factor_name is None:
        return self.base(table, begin_date, end_date, freq)
    key_sets = ['id', 'security_code', 'trade_date'] + factor_name
    db_columns = [table.__dict__[key] for key in key_sets]
    if freq is None:
        query = select(db_columns).where(
            and_(
                table.trade_date >= begin_date,
                table.trade_date <= end_date,
            ))
    else:
        rebalance_dates = makeSchedule(begin_date, end_date, freq,
                                       'china.sse',
                                       BizDayConventions.Preceding)
        query = select(db_columns).where(
            table.trade_date.in_(rebalance_dates))
    # 'id' is only a surrogate key; drop it from the result.
    return pd.read_sql(query, self._engine.sql_engine()).drop(['id'], axis=1)
def factor_combination(engine, factors, universe_name_list, start_date, end_date, freq):
    """Merge factor data with risk factors, negotiable market value, forward
    returns and industry labels over a rebalance schedule.

    Fix: the original dropped NaN rows twice on the final frame — the chained
    ``.dropna()`` already removed them, making the following in-place
    ``dropna`` a no-op; the redundant call is removed.
    """
    # Union of the named universes.
    universe = None
    for name in universe_name_list:
        if universe is None:
            universe = Universe(name)
        else:
            universe += Universe(name)
    dates = makeSchedule(start_date, end_date, freq, calendar='china.sse')
    factor_negMkt = engine.fetch_factor_range(universe, "negMarketValue", dates=dates)
    risk_cov, risk_factors = engine.fetch_risk_model_range(universe, dates=dates)
    dx_returns = engine.fetch_dx_return_range(universe, dates=dates, horizon=map_freq(freq))
    # Data combination: inner-join everything on (trade_date, code).
    total_data = pd.merge(factors, risk_factors, on=['trade_date', 'code'])
    total_data = pd.merge(total_data, factor_negMkt, on=['trade_date', 'code'])
    total_data = pd.merge(total_data, dx_returns, on=['trade_date', 'code'])
    industry_category = engine.fetch_industry_range(universe, dates=dates)
    total_data = pd.merge(total_data, industry_category,
                          on=['trade_date', 'code']).dropna()
    return total_data
def test_sql_engine_fetch_dx_return_with_universe_adjustment(self):
    """Ranged dx returns must equal log-returns recomputed from raw market rows."""
    ref_dates = makeSchedule(
        advanceDateByCalendar('china.sse', '2017-01-26', '-6m'),
        '2017-01-26', '60b', 'china.sse')
    universe = Universe('zz500')
    dx_return = self.engine.fetch_dx_return_range(universe,
                                                  dates=ref_dates,
                                                  horizon=4,
                                                  offset=1)
    codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
    # Re-derive each date's return from the raw chgPct over the same window.
    for current_date, members in codes.groupby('trade_date'):
        window_start = advanceDateByCalendar('china.sse', current_date, '2b')
        window_end = advanceDateByCalendar('china.sse', current_date, '6b')
        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(window_start, window_end),
                 Market.code.in_(members.code.unique().tolist())))
        raw = pd.read_sql(query, con=self.engine.engine)
        expected = raw.groupby('code').apply(lambda x: np.log(1. + x).sum())
        actual = dx_return[dx_return.trade_date == current_date]
        np.testing.assert_array_almost_equal(actual.dx.values,
                                             expected.chgPct.values)
def test_sql_engine_fetch_factor_range_forward(self):
    """Forward factor values at date d must match raw ROE fetched 60 business days later."""
    ref_dates = makeSchedule(
        advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
        self.ref_date, '60b', 'china.sse')
    # Append one extra period so the last in-range date has a forward value.
    ref_dates = ref_dates + [
        advanceDateByCalendar('china.sse', ref_dates[-1],
                              '60b').strftime('%Y-%m-%d')
    ]
    universe = Universe('zz500') + Universe('zz1000')
    factor_data = self.engine.fetch_factor_range_forward(universe,
                                                         'ROE',
                                                         dates=ref_dates)
    codes = self.engine.fetch_codes_range(universe, dates=ref_dates[:-1])
    for current_date, members in codes.groupby('trade_date'):
        shifted_date = advanceDateByCalendar(
            'china.sse', current_date, '60b').strftime('%Y-%m-%d')
        query = select([Uqer.code, Uqer.ROE]).where(
            and_(Uqer.trade_date == shifted_date,
                 Uqer.code.in_(members.code.unique().tolist())))
        expected = pd.read_sql(query, con=self.engine.engine)
        actual = factor_data[factor_data.trade_date == current_date]
        actual.set_index('code', inplace=True)
        # Align on the raw query's code ordering before comparing.
        actual = actual.loc[expected.code]
        np.testing.assert_array_almost_equal(actual.dx.values,
                                             expected.ROE.values)
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    """Load and merge factor, return, industry and benchmark data over a schedule.

    Args:
        engine: SQL engine used for all range fetches.
        factors: a Transformer or factor expressions to wrap in one.
        start_date / end_date: '%Y-%m-%d' bounds of the schedule.
        frequency: rebalance frequency (also defines the return horizon).
        universe: stock universe to load.
        benchmark: benchmark index code.
        warm_start: number of extra periods to prepend before start_date.

    Returns:
        (dates, return frame ['trade_date','code','dx'],
         feature frame ['trade_date','code','weight','isOpen',
         'industry_code','industry'] + factor columns).
    """
    # Shift the start back by warm_start periods of the given frequency.
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')
    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)
    dates = [d.strftime('%Y-%m-%d') for d in dates]
    horizon = map_freq(frequency)
    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)
    alpha_logger.info("return data loading finished")
    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")
    # Inner-join factors with returns; benchmark weights joined left so
    # non-index members survive with NaN weight (filled with 0 below).
    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    return dates, df[['trade_date', 'code', 'dx']], df[[
        'trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'
    ] + transformer.names]
def base(self, table_name, begin_date, end_date, freq=None):
    """Load every column of *table_name* between two dates.

    When *freq* is given, only rebalance dates on the SSE calendar
    (preceding-business-day convention) are selected.
    """
    if freq is None:
        # Continuous date-range query.
        query = select([table_name]).where(
            and_(table_name.trade_date >= begin_date,
                 table_name.trade_date <= end_date))
    else:
        # Restrict to the scheduled rebalance dates only.
        schedule = makeSchedule(begin_date, end_date, freq, 'china.sse',
                                BizDayConventions.Preceding)
        query = select([table_name]).where(table_name.trade_date.in_(schedule))
    return pd.read_sql(query, self._engine.sql_engine())
def index(self, benchmark, begin_date, end_date, freq=None):
    """Fetch index rows for the given benchmark symbols.

    The surrogate 'id' column is dropped from the result.  With *freq*,
    rows are limited to rebalance dates on the SSE calendar.
    """
    table = importlib.import_module('data.rl_model').Index
    if freq is None:
        conditions = and_(table.trade_date >= begin_date,
                          table.trade_date <= end_date,
                          table.isymbol.in_(benchmark))
    else:
        schedule = makeSchedule(begin_date, end_date, freq, 'china.sse',
                                BizDayConventions.Preceding)
        conditions = and_(table.trade_date.in_(schedule),
                          table.isymbol.in_(benchmark))
    query = select([table]).where(conditions)
    return pd.read_sql(query, self._engine.sql_engine()).drop(['id'], axis=1)
def test_sql_engine_fetch_codes_range(self):
    """fetch_codes_range must return exactly the zz500 ∪ zz1000 members per date."""
    ref_dates = makeSchedule(
        advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
        self.ref_date, '60b', 'china.sse')
    universe = Universe('zz500') + Universe('zz1000')
    codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
    # Ground truth straight from the universe table.
    query = select([UniverseTable.trade_date, UniverseTable.code]).where(
        and_(UniverseTable.trade_date.in_(ref_dates),
             or_(UniverseTable.zz500 == 1, UniverseTable.zz1000 == 1)))
    expected_df = pd.read_sql(query, con=self.engine.engine).sort_values('code')
    for current_date in ref_dates:
        got = sorted(codes[codes.trade_date == current_date].code.values)
        want = sorted(expected_df[expected_df.trade_date == current_date].code.values)
        self.assertListEqual(got, want)
def test_sql_engine_fetch_benchmark_range(self):
    """Benchmark weights per date must match index components rescaled to fractions."""
    ref_dates = makeSchedule(
        advanceDateByCalendar('china.sse', self.ref_date, '-9m'),
        self.ref_date, '60b', 'china.sse')
    benchmark = 906
    index_data = self.engine.fetch_benchmark_range(benchmark, dates=ref_dates)
    # IndexComponent stores weights in percent; divide by 100 to compare.
    query = select([
        IndexComponent.trade_date, IndexComponent.code,
        (IndexComponent.weight / 100.).label('weight')
    ]).where(
        and_(IndexComponent.trade_date.in_(ref_dates),
             IndexComponent.indexCode == benchmark))
    expected_df = pd.read_sql(query, con=self.engine.engine)
    for current_date in ref_dates:
        got = index_data[index_data.trade_date == current_date]
        want = expected_df[expected_df.trade_date == current_date]
        np.testing.assert_array_almost_equal(got.weight.values,
                                             want.weight.values)
def fetch_factors(self, begin_date, end_date, freq=None):
    """Fetch all factor columns between two dates.

    With *freq*, rows are restricted to rebalance dates on the SSE calendar.
    Bookkeeping columns ('id', 'creat_time', 'update_time') are dropped when
    present.

    Fixes: removed a redundant single-clause ``and_()`` wrapper, and replaced
    the manual column-drop loop with ``drop(columns=..., errors='ignore')``.
    """
    if freq is None:
        query = select([self._table]).where(
            and_(
                self._table.trade_date >= begin_date,
                self._table.trade_date <= end_date,
            ))
    else:
        rebalance_dates = makeSchedule(begin_date, end_date, freq,
                                       'china.sse',
                                       BizDayConventions.Preceding)
        # Single condition — no and_() needed.
        query = select([self._table]).where(
            self._table.trade_date.in_(rebalance_dates))
    data = pd.read_sql(query, self._engine)
    # errors='ignore' skips columns that are absent, matching the old loop.
    return data.drop(columns=['id', 'creat_time', 'update_time'],
                     errors='ignore')
def test_sql_engine_fetch_factor_range(self):
    """Ranged factor fetch must agree with direct per-date Uqer.ROE queries."""
    ref_dates = makeSchedule(
        advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
        self.ref_date, '60b', 'china.sse')
    universe = Universe('zz500') + Universe('zz1000')
    factor_data = self.engine.fetch_factor_range(universe,
                                                 'ROE',
                                                 dates=ref_dates)
    codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
    for current_date, members in codes.groupby('trade_date'):
        query = select([Uqer.code, Uqer.ROE]).where(
            and_(Uqer.trade_date == current_date,
                 Uqer.code.in_(members.code.unique().tolist())))
        expected = pd.read_sql(query, con=self.engine.engine)
        actual = factor_data[factor_data.trade_date == current_date]
        np.testing.assert_array_almost_equal(actual.ROE.values,
                                             expected.ROE.values)
def test_sql_engine_fetch_dx_return_index_range(self):
    """Index dx returns must equal summed log-returns from raw index market rows."""
    ref_dates = makeSchedule(
        advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
        self.ref_date, '60b', 'china.sse')
    index_code = 906
    dx_return = self.engine.fetch_dx_return_index_range(index_code,
                                                        dates=ref_dates,
                                                        horizon=4,
                                                        offset=1)
    for current_date in ref_dates:
        window_start = advanceDateByCalendar('china.sse', current_date, '2b')
        window_end = advanceDateByCalendar('china.sse', current_date, '6b')
        query = select([IndexMarket.indexCode, IndexMarket.chgPct]).where(
            and_(IndexMarket.trade_date.between(window_start, window_end),
                 IndexMarket.indexCode == index_code))
        raw = pd.read_sql(query, con=self.engine.engine)
        expected = raw.groupby('indexCode').apply(lambda x: np.log(1. + x).sum())
        actual = dx_return[dx_return.trade_date == current_date]
        np.testing.assert_array_almost_equal(actual.dx.values,
                                             expected.chgPct.values)
def fetch_predict_phase(engine, alpha_factors: Iterable[object], ref_date, frequency,
                        universe, batch,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0):
    """Assemble the prediction-phase feature matrix ending at *ref_date*.

    Returns a dict with 'x_names' and 'predict' ({'x': features for ref_date,
    'code': codes at ref_date}); 'x'/'code' are None when ref_date is not the
    last available date.

    Fixes vs. the original:
      * ``x_values`` was only assigned inside the ``neutralized_risk`` branch,
        raising NameError when no risk neutralisation was requested — it is now
        computed from ``train_x`` for both branches.
      * ``date_label`` was built from ``factor_df`` while its indices were
        applied to ``train_x``-derived arrays; after the risk merge these can
        differ in length.  It is now built from ``train_x`` (matching the
        newer fetch_predict_phase variant).
    """
    transformer = Transformer(alpha_factors)
    # Extend the window back far enough for warm start plus the batch.
    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())
    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).dropna()
    names = transformer.names
    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        # Only risks that are not already alpha factors need to be merged in.
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = factor_df.copy()
        risk_exp = None
    # Computed after the branch so it exists in both cases (bug fix).
    x_values = train_x[names].values.astype(float)
    # Labels must track train_x, the frame actually behind x_values (bug fix).
    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)
    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch]
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None
        # Neutralise/normalise over the whole batch window, then keep only
        # the rows belonging to ref_date itself.
        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        ne_x = ne_x[inner_left_index:inner_right_index]
        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)
        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None
    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}
    return ret
def fetch_train_phase(engine, alpha_factors: Iterable[object], ref_date, frequency,
                      universe, batch,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0) -> dict:
    """Assemble the training-phase (x, y) pair ending just before *ref_date*.

    Returns a dict with 'x_names' and 'train' ({'x': processed features,
    'y': processed returns}).

    Robustness fix: ``dates[-batch]`` / ``dates[-batch - 1]`` raised
    IndexError when *batch* exceeded the available history; the window now
    clamps to the earliest date, mirroring the newer fetch_train_phase
    variant.
    """
    transformer = Transformer(alpha_factors)
    # One extra period beyond the batch so a full y horizon is available.
    p = Period(frequency)
    p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())
    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)
    horizon = _map_horizon(frequency)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)
    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
    return_df, factor_df = df[['trade_date', 'code', 'dx']], \
        df[['trade_date', 'code', 'isOpen'] + transformer.names]
    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe,
                  dates, risk_model, neutralized_risk)
    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        # ref_date itself has no realised return yet; train up to the
        # previous date only.
        end = dates[-2]
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]
    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None
    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': ne_x, 'y': ne_y}
    return ret
if __name__ == "__main__": from PyFin.api import makeSchedule # db_url = "mysql+mysqldb://reader:Reader#[email protected]:13317/vision?charset=utf8" db_url = "mysql+mysqldb://dxrw:[email protected]:13317/dxtest?charset=utf8" sql_engine = SqlEngine(db_url=db_url, factor_tables=["factor_momentum"]) universe = Universe("hs300") start_date = '2020-01-02' end_date = '2020-02-21' frequency = "10b" benchmark = 300 factors = ["EMA5D", "EMV6D"] ref_dates = makeSchedule(start_date, end_date, frequency, 'china.sse') print(ref_dates) df = sql_engine.fetch_factor("2020-02-21", factors=factors, codes=["2010031963"]) print(df) df = sql_engine.fetch_factor_range(universe=universe, dates=ref_dates, factors=factors) print(df) df = sql_engine.fetch_codes_range(start_date=start_date, end_date=end_date, universe=Universe("hs300")) print(df) df = sql_engine.fetch_dx_return("2020-10-09", codes=["2010031963"],
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0,
                 fit_target: Union[Transformer, object] = None):
    """Load and merge factor, fit-target, industry and benchmark data.

    Args:
        engine: SQL engine used for all range fetches.
        factors: a Transformer or factor expressions to wrap in one.
        start_date / end_date: '%Y-%m-%d' bounds of the schedule.
        frequency: rebalance frequency (also defines the return horizon).
        universe: stock universe to load.
        benchmark: benchmark index code.
        warm_start: number of extra periods to prepend before start_date.
        fit_target: optional expression fetched one period forward and used
            as the regression target instead of dx returns.

    Returns:
        (dates, target frame ['trade_date','code','dx'],
         feature frame ['trade_date','code','weight','industry_code',
         'industry'] + factor columns).
    """
    # Shift the start back by warm_start periods of the given frequency.
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')
    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)
    dates = [d.strftime('%Y-%m-%d') for d in dates]
    horizon = map_freq(frequency)
    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")
    if fit_target is None:
        # Default target: forward dx returns over the frequency horizon.
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        # Custom target: fetch one extra period ahead so the last date has a
        # forward value, then pad missing values per code.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))
    alpha_logger.info("fit target data loading finished")
    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")
    # Inner-join factors with the target; benchmark weights joined left so
    # non-index members survive with NaN weight (filled with 0 below).
    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    df.dropna(inplace=True)
    return dates, df[[
        'trade_date', 'code', 'dx'
    ]], df[['trade_date', 'code', 'weight', 'industry_code', 'industry'] +
           transformer.names]
def fetch_predict_phase(engine,
                        alpha_factors: Union[Transformer, Iterable[object]],
                        ref_date, frequency, universe, batch=1,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0,
                        fillna: str = None,
                        fit_target: Union[Transformer, object] = None):
    """Assemble the prediction-phase features (and realised y) at *ref_date*.

    Returns a dict with 'x_names' and 'predict' ({'x': DataFrame of features,
    'code': codes, 'y': realised target or None}).

    Fix: when *ref_date* is not the last available date the original left
    ``ne_y = None`` and then unconditionally called ``ne_y.flatten()``,
    raising AttributeError — the flatten is now guarded.
    """
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)
    # Window reaches back warm_start + batch - 1 periods from ref_date.
    p = Period(frequency)
    p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())
    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)
    horizon = map_freq(frequency)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    if fillna:
        # Impute missing factor values with the per-date median.
        factor_df = factor_df.groupby('trade_date').apply(
            lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
    else:
        factor_df = factor_df.dropna()
    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        # Custom target: fetch one extra period ahead, then pad per code.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))
    names = transformer.names
    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        # Only risks that are not already alpha factors need to be merged in.
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        train_x = pd.merge(train_x, target_df, on=['trade_date', 'code'], how='left')
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = pd.merge(factor_df, target_df, on=['trade_date', 'code'], how='left')
        risk_exp = None
    # Allow NaN only in the last (target) column — left join keeps it optional.
    train_x.dropna(inplace=True, subset=train_x.columns[:-1])
    x_values = train_x[names].values.astype(float)
    y_values = train_x[['dx']].values.astype(float)
    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)
    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        # Clamp the window when batch exceeds the available history.
        start = dates[-batch] if batch <= len(dates) else dates[0]
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]
        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None
        # Process over the whole batch window, then keep only ref_date rows.
        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        ne_y = factor_processing(this_raw_y,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        ne_x = ne_x[inner_left_index:inner_right_index]
        ne_y = ne_y[inner_left_index:inner_right_index]
        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)
        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        ne_y = None
        codes = None
    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'code': codes,
        # Guarded flatten (bug fix): ne_y may legitimately be None.
        'y': ne_y.flatten() if ne_y is not None else None
    }
    return ret
def fetch_train_phase(engine,
                      alpha_factors: Union[Transformer, Iterable[object]],
                      ref_date, frequency, universe, batch=1,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0,
                      fit_target: Union[Transformer, object] = None) -> dict:
    """Assemble the training-phase (x, y, code) triple ending before *ref_date*.

    Returns a dict with 'x_names' and 'train' ({'x': DataFrame of processed
    features, 'y': processed targets, 'code': codes in the window}).

    Raises (via pyFinAssert) ValueError when ref_date is the last schedule
    date and fewer than two dates are available for training.
    """
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)
    # Window reaches back warm_start + batch periods from ref_date.
    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())
    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)
    horizon = map_freq(frequency)
    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    if fit_target is None:
        # Default target: forward dx returns over the frequency horizon.
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        # Custom target: fetch one extra period ahead, then pad per code.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))
    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    target_df, factor_df = df[['trade_date', 'code', 'dx']], \
        df[['trade_date', 'code'] + transformer.names]
    target_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
        _merge_df(engine, transformer.names, factor_df, target_df, universe,
                  dates, risk_model, neutralized_risk)
    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        # ref_date itself has no realised target yet; train up to the
        # previous date only.
        pyFinAssert(
            len(dates) >= 2, ValueError,
            "No previous data for training for the date {0}".format(ref_date))
        end = dates[-2]
        # Clamp the window when batch exceeds the available history.
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]
    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    this_code = codes[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None
    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'y': ne_y,
        'code': this_code
    }
    return ret