def fetch_factor_range(self,
                       universe: Universe,
                       factors: Union[Transformer, Iterable[object]],
                       start_date: str = None,
                       end_date: str = None,
                       dates: Iterable[str] = None,
                       external_data: pd.DataFrame = None,
                       used_factor_tables=None) -> pd.DataFrame:
    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    dependency = transformer.dependency

    if used_factor_tables:
        factor_cols = _map_factors(dependency, used_factor_tables)
    else:
        factor_cols = _map_factors(dependency, factor_tables)

    big_table = Market
    joined_tables = set()
    joined_tables.add(Market.__table__.name)

    for t in set(factor_cols.values()):
        if t.__table__.name not in joined_tables:
            if dates is not None:
                big_table = outerjoin(big_table, t,
                                      and_(Market.trade_date == t.trade_date,
                                           Market.code == t.code,
                                           Market.trade_date.in_(dates)))
            else:
                big_table = outerjoin(big_table, t,
                                      and_(Market.trade_date == t.trade_date,
                                           Market.code == t.code,
                                           Market.trade_date.between(start_date, end_date)))
            joined_tables.add(t.__table__.name)

    universe_df = universe.query(self, start_date, end_date, dates)

    query = select(
        [Market.trade_date, Market.code, Market.chgPct] + list(factor_cols.keys())) \
        .select_from(big_table).where(
            and_(
                Market.code.in_(universe_df.code.unique().tolist()),
                Market.trade_date.in_(dates) if dates is not None else Market.trade_date.between(start_date, end_date)
            )
        ).distinct()

    df = pd.read_sql(query, self.engine).replace([-np.inf, np.inf], np.nan)

    if external_data is not None:
        df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()

    df.sort_values(['trade_date', 'code'], inplace=True)
    df.set_index('trade_date', inplace=True)
    res = transformer.transform('code', df).replace([-np.inf, np.inf], np.nan)

    res['chgPct'] = df.chgPct
    res = res.reset_index()
    return pd.merge(res, universe_df[['trade_date', 'code']], how='inner') \
        .drop_duplicates(['trade_date', 'code'])
def fetch_factor(self,
                 ref_date: str,
                 factors: Iterable[object],
                 codes: Iterable[int],
                 warm_start: int = 0,
                 used_factor_tables=None) -> pd.DataFrame:
    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    dependency = transformer.dependency

    if used_factor_tables:
        factor_cols = _map_factors(dependency, used_factor_tables)
    else:
        factor_cols = _map_factors(dependency, factor_tables)

    start_date = advanceDateByCalendar('china.sse', ref_date,
                                       str(-warm_start) + 'b').strftime('%Y-%m-%d')
    end_date = ref_date

    big_table = Market
    joined_tables = set()
    joined_tables.add(Market.__table__.name)

    for t in set(factor_cols.values()):
        if t.__table__.name not in joined_tables:
            big_table = outerjoin(big_table, t,
                                  and_(Market.trade_date == t.trade_date,
                                       Market.code == t.code))
            joined_tables.add(t.__table__.name)

    query = select(
        [Market.trade_date, Market.code, Market.chgPct, Market.secShortName] + list(factor_cols.keys())) \
        .select_from(big_table).where(and_(Market.trade_date.between(start_date, end_date),
                                           Market.code.in_(codes)))

    df = pd.read_sql(query, self.engine) \
        .replace([-np.inf, np.inf], np.nan) \
        .sort_values(['trade_date', 'code']) \
        .set_index('trade_date')
    res = transformer.transform('code', df).replace([-np.inf, np.inf], np.nan)

    res['chgPct'] = df.chgPct
    res['secShortName'] = df['secShortName']
    res = res.loc[ref_date:ref_date, :]
    res.index = list(range(len(res)))
    return res
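# --------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): pulling a one-date
# factor snapshot with a warm-up window. The connection string, factor name
# and security codes below are placeholders; the import path assumes the
# usual alpha-mind package layout.
from alphamind.data.engines.sqlengine import SqlEngine

engine = SqlEngine('postgresql+psycopg2://user:pwd@host/alpha')  # hypothetical DSN
snapshot = engine.fetch_factor('2018-01-08',
                               factors=['EPS'],       # placeholder factor column
                               codes=[1, 5, 600000],  # placeholder security codes
                               warm_start=5)          # 5 extra business days of history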
def query(self, engine, start_date: str = None, end_date: str = None, dates=None) -> pd.DataFrame:
    universe_cond = self._query_statements(start_date, end_date, dates)

    if self.filter_cond is None and self.exclude_universe is None:
        # simple case
        query = select([UniverseTable.trade_date, UniverseTable.code]) \
            .where(universe_cond).distinct()
        return pd.read_sql(query, engine.engine)
    else:
        if self.filter_cond is not None:
            if isinstance(self.filter_cond, Transformer):
                transformer = self.filter_cond
            else:
                transformer = Transformer(self.filter_cond)

            dependency = transformer.dependency
            factor_cols = _map_factors(dependency, factor_tables)
            big_table = Market

            for t in set(factor_cols.values()):
                if t.__table__.name != Market.__table__.name:
                    big_table = outerjoin(big_table, t,
                                          and_(Market.trade_date == t.trade_date,
                                               Market.code == t.code,
                                               Market.trade_date.in_(dates) if dates else
                                               Market.trade_date.between(start_date, end_date)))

            big_table = join(big_table, UniverseTable,
                             and_(Market.trade_date == UniverseTable.trade_date,
                                  Market.code == UniverseTable.code,
                                  universe_cond))

            query = select(
                [Market.trade_date, Market.code] + list(factor_cols.keys())) \
                .select_from(big_table).distinct()

            df = pd.read_sql(query, engine.engine).sort_values(['trade_date', 'code']).dropna()
            df.set_index('trade_date', inplace=True)
            filter_fields = transformer.names
            pyFinAssert(len(filter_fields) == 1, ValueError, "filter fields can only be 1")
            df = transformer.transform('code', df)
            df = df[df[filter_fields[0]] == 1].reset_index()[['trade_date', 'code']]
        return df
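# Usage sketch (illustrative only, reusing the `engine` built in the sketch
# above): the filter branch is exercised when a universe carries a
# `filter_cond` expression. The constructor arguments assume the alpha-mind
# Universe signature with a base-universe list and an optional filter
# condition; `LAST` is from the PyFin expression API and 'EPS' is a
# placeholder factor.
from PyFin.api import LAST
from alphamind.data.engines.universe import Universe

plain = Universe('custom', ['zz500'])                                   # "simple case" branch
screened = Universe('custom', ['zz500'], filter_cond=LAST('EPS') > 0.)  # filter branch
codes_df = screened.query(engine, start_date='2018-01-02', end_date='2018-03-30')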
def __init__(self, features=None, fit_target=None):
    if features is not None:
        self.formulas = Transformer(features)
        self.features = self.formulas.names
    else:
        self.features = None

    if fit_target is not None:
        self.fit_target = Transformer(fit_target)
    else:
        self.fit_target = None

    self.impl = None
    self.trained_time = None
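# Sketch of how a concrete model built on this initializer might be
# constructed. LinearRegression and its import path are taken from alpha-mind
# but used here for illustration only; the feature names and target are
# placeholders.
from alphamind.model.linearmodel import LinearRegression

model = LinearRegression(features=['EPS', 'ROE'], fit_target='dx')
print(model.features)  # ['EPS', 'ROE'] -- names parsed by the Transformer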
def fetch_factor(self,
                 ref_date: str,
                 factors: Iterable[object],
                 codes: Iterable[int],
                 warm_start: int = 0,
                 used_factor_tables=None) -> pd.DataFrame:
    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    dependency = transformer.dependency

    if used_factor_tables:
        factor_cols = _map_factors(dependency, used_factor_tables)
    else:
        factor_cols = _map_factors(dependency, factor_tables)

    start_date = advanceDateByCalendar('china.sse', ref_date,
                                       str(-warm_start) + 'b').strftime('%Y-%m-%d')
    end_date = ref_date

    big_table = FullFactor

    for t in set(factor_cols.values()):
        if t.__table__.name != FullFactor.__table__.name:
            big_table = outerjoin(big_table, t,
                                  and_(FullFactor.trade_date == t.trade_date,
                                       FullFactor.code == t.code))

    query = select(
        [FullFactor.trade_date, FullFactor.code, FullFactor.isOpen] + list(factor_cols.keys())) \
        .select_from(big_table).where(and_(FullFactor.trade_date.between(start_date, end_date),
                                           FullFactor.code.in_(codes)))

    df = pd.read_sql(query, self.engine).sort_values(['trade_date', 'code']).set_index('trade_date')
    res = transformer.transform('code', df)

    for col in res.columns:
        if col not in set(['code', 'isOpen']) and col not in df.columns:
            df[col] = res[col].values

    df['isOpen'] = df.isOpen.astype(bool)
    df = df.loc[ref_date]
    df.index = list(range(len(df)))
    return df
def fetch_data_experimental(self,
                            ref_date: str,
                            factors: Iterable[str],
                            codes: Iterable[int],
                            benchmark: int = None,
                            risk_model: str = 'short',
                            industry: str = 'sw') -> Dict[str, pd.DataFrame]:
    total_data = {}
    transformer = Transformer(factors)
    factor_data = self.fetch_factor(ref_date, transformer, codes,
                                    used_factor_tables=[Experimental])

    if benchmark:
        benchmark_data = self.fetch_benchmark(ref_date, benchmark)
        total_data['benchmark'] = benchmark_data
        factor_data = pd.merge(factor_data, benchmark_data, how='left', on=['code'])
        factor_data['weight'] = factor_data['weight'].fillna(0.)

    if risk_model:
        excluded = list(set(total_risk_factors).intersection(transformer.dependency))
        risk_cov, risk_exp = self.fetch_risk_model(ref_date, codes, risk_model, excluded)
        factor_data = pd.merge(factor_data, risk_exp, how='left', on=['code'])
        total_data['risk_cov'] = risk_cov

    industry_info = self.fetch_industry(ref_date=ref_date, codes=codes, category=industry)
    factor_data = pd.merge(factor_data, industry_info, on=['code'])
    total_data['factor'] = factor_data
    return total_data
def fetch_data_range(self,
                     universe: Universe,
                     factors: Iterable[str],
                     start_date: str = None,
                     end_date: str = None,
                     dates: Iterable[str] = None,
                     benchmark: int = None,
                     risk_model: str = 'short',
                     industry: str = 'sw',
                     external_data: pd.DataFrame = None) -> Dict[str, pd.DataFrame]:
    total_data = {}
    transformer = Transformer(factors)
    factor_data = self.fetch_factor_range(universe,
                                          transformer,
                                          start_date,
                                          end_date,
                                          dates,
                                          external_data=external_data)

    if benchmark:
        benchmark_data = self.fetch_benchmark_range(benchmark, start_date, end_date, dates)
        total_data['benchmark'] = benchmark_data
        factor_data = pd.merge(factor_data, benchmark_data, how='left', on=['trade_date', 'code'])
        factor_data['weight'] = factor_data['weight'].fillna(0.)

    if risk_model:
        excluded = list(set(total_risk_factors).intersection(transformer.dependency))
        risk_cov, risk_exp = self.fetch_risk_model_range(universe, start_date, end_date,
                                                         dates, risk_model, excluded)
        factor_data = pd.merge(factor_data, risk_exp, how='left', on=['trade_date', 'code'])
        total_data['risk_cov'] = risk_cov

    industry_info = self.fetch_industry_range(universe,
                                              start_date=start_date,
                                              end_date=end_date,
                                              dates=dates,
                                              category=industry)
    factor_data = pd.merge(factor_data, industry_info, on=['trade_date', 'code'])
    total_data['factor'] = factor_data
    return total_data
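# Usage sketch (illustrative only, reusing the `engine` from the earlier
# sketch): assembling the cross-sectional package over a date range. The
# universe, factor name and benchmark index code are placeholders.
universe = Universe('custom', ['zz500'])
data = engine.fetch_data_range(universe,
                               factors=['EPS'],
                               start_date='2018-01-02',
                               end_date='2018-03-30',
                               benchmark=905,      # placeholder benchmark code
                               risk_model='short')
factor_data = data['factor']   # factors merged with weight, risk exposures and industry
risk_cov = data['risk_cov']    # factor covariance, one block per trade_date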
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date, p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    alpha_logger.info("return data loading finished")
    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)

    return dates, df[['trade_date', 'code', 'dx']], df[
        ['trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'] + transformer.names]
def fetch_factor_range_forward(self,
                               universe: Universe,
                               factors: Union[Transformer, object],
                               start_date: str = None,
                               end_date: str = None,
                               dates: Iterable[str] = None):
    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    dependency = transformer.dependency
    factor_cols = _map_factors(dependency, factor_tables)

    codes = universe.query(self, start_date, end_date, dates)
    total_codes = codes.code.unique().tolist()
    total_dates = codes.trade_date.astype(str).unique().tolist()

    big_table = Market
    joined_tables = set()
    joined_tables.add(Market.__table__.name)

    for t in set(factor_cols.values()):
        if t.__table__.name not in joined_tables:
            if dates is not None:
                big_table = outerjoin(big_table, t,
                                      and_(Market.trade_date == t.trade_date,
                                           Market.code == t.code,
                                           Market.trade_date.in_(dates)))
            else:
                big_table = outerjoin(big_table, t,
                                      and_(Market.trade_date == t.trade_date,
                                           Market.code == t.code,
                                           Market.trade_date.between(start_date, end_date)))
            joined_tables.add(t.__table__.name)

    stats = func.lag(list(factor_cols.keys())[0], -1).over(
        partition_by=Market.code, order_by=Market.trade_date).label('dx')

    query = select([Market.trade_date, Market.code, Market.chgPct, stats]) \
        .select_from(big_table).where(and_(Market.trade_date.in_(total_dates),
                                           Market.code.in_(total_codes)))

    df = pd.read_sql(query, self.engine) \
        .replace([-np.inf, np.inf], np.nan) \
        .sort_values(['trade_date', 'code'])

    return pd.merge(df, codes[['trade_date', 'code']], how='inner') \
        .drop_duplicates(['trade_date', 'code'])
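# The `func.lag(col, -1)` window above labels next period's factor value as
# `dx` on the current row (a negative lag offset acts as a lead; PostgreSQL
# accepts this). A minimal standalone sketch of the same construct, assuming
# the SQLAlchemy 1.x select API used throughout this module; the table and
# column names are hypothetical:
from sqlalchemy import Column, Date, Float, Integer, MetaData, Table, func, select

metadata = MetaData()
prices = Table('prices', metadata,
               Column('trade_date', Date),
               Column('code', Integer),
               Column('close', Float))

next_close = func.lag(prices.c.close, -1).over(
    partition_by=prices.c.code, order_by=prices.c.trade_date).label('dx')
stmt = select([prices.c.trade_date, prices.c.code, next_close])
print(stmt)  # renders the windowed SELECT without touching a database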
def fetch_data_package(engine: SqlEngine,
                       alpha_factors: Iterable[object],
                       start_date: str,
                       end_date: str,
                       frequency: str,
                       universe: Universe,
                       benchmark: int,
                       warm_start: int = 0,
                       batch: int = 1,
                       neutralized_risk: Iterable[str] = None,
                       risk_model: str = 'short',
                       pre_process: Iterable[object] = None,
                       post_process: Iterable[object] = None) -> dict:
    alpha_logger.info("Starting data package fetching ...")
    transformer = Transformer(alpha_factors)
    dates, return_df, factor_df = prepare_data(engine,
                                               transformer,
                                               start_date,
                                               end_date,
                                               frequency,
                                               universe,
                                               benchmark,
                                               warm_start)

    return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe,
                  dates, risk_model, neutralized_risk)

    return_df['weight'] = train_x['weight']
    return_df['industry'] = train_x['industry']
    return_df['industry_code'] = train_x['industry_code']
    return_df['isOpen'] = train_x['isOpen']

    if neutralized_risk:
        for i, name in enumerate(neutralized_risk):
            return_df.loc[:, name] = risk_exp[:, i]

    alpha_logger.info("Loading data is finished")

    train_x_buckets, train_y_buckets, predict_x_buckets, predict_y_buckets = batch_processing(
        x_values, y_values, dates, date_label, batch, risk_exp, pre_process, post_process)

    alpha_logger.info("Data processing is finished")

    ret = dict()
    ret['x_names'] = transformer.names
    ret['settlement'] = return_df
    ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets}
    ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets}
    return ret
def fetch_factor_range(self,
                       universe: Universe,
                       factors: Union[Transformer, Iterable[object]],
                       start_date: str = None,
                       end_date: str = None,
                       dates: Iterable[str] = None,
                       external_data: pd.DataFrame = None,
                       used_factor_tables=None) -> pd.DataFrame:
    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    dependency = transformer.dependency

    if used_factor_tables:
        factor_cols = _map_factors(dependency, used_factor_tables)
    else:
        factor_cols = _map_factors(dependency, factor_tables)

    cond = universe.query_range(start_date, end_date, dates)

    big_table = FullFactor

    for t in set(factor_cols.values()):
        if t.__table__.name != FullFactor.__table__.name:
            if dates is not None:
                big_table = outerjoin(big_table, t,
                                      and_(FullFactor.trade_date == t.trade_date,
                                           FullFactor.code == t.code,
                                           FullFactor.trade_date.in_(dates)))
            else:
                big_table = outerjoin(big_table, t,
                                      and_(FullFactor.trade_date == t.trade_date,
                                           FullFactor.code == t.code,
                                           FullFactor.trade_date.between(start_date, end_date)))

    big_table = join(big_table, UniverseTable,
                     and_(FullFactor.trade_date == UniverseTable.trade_date,
                          FullFactor.code == UniverseTable.code,
                          cond))

    query = select(
        [FullFactor.trade_date, FullFactor.code, FullFactor.isOpen] + list(factor_cols.keys())) \
        .select_from(big_table).distinct()

    df = pd.read_sql(query, self.engine).sort_values(['trade_date', 'code'])

    if external_data is not None:
        df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()

    df.set_index('trade_date', inplace=True)
    res = transformer.transform('code', df)

    for col in res.columns:
        if col not in set(['code', 'isOpen']) and col not in df.columns:
            df[col] = res[col].values

    df['isOpen'] = df.isOpen.astype(bool)
    return df.reset_index()
def fetch_train_phase(engine,
                      alpha_factors: Iterable[object],
                      ref_date,
                      frequency,
                      universe,
                      batch,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0) -> dict:
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = _map_horizon(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()

    return_df, factor_df = df[['trade_date', 'code', 'dx']], \
        df[['trade_date', 'code', 'isOpen'] + transformer.names]

    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe,
                  dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-2]
        start = dates[-batch - 1]
    else:
        end = dates[-1]
        start = dates[-batch]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]

    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': ne_x, 'y': ne_y}
    return ret
def fetch_predict_phase(engine,
                        alpha_factors: Iterable[object],
                        ref_date,
                        frequency,
                        universe,
                        batch,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0):
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates).dropna()

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = factor_df.copy()
        risk_exp = None

    # computed after the branch so that it is defined in both cases
    x_values = train_x[names].values.astype(float)

    # aligned with train_x (which may be a strict subset of factor_df after
    # the risk merge) rather than with factor_df
    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch]

        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        ne_x = ne_x[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)
        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}
    return ret
def fetch_data_package(engine: SqlEngine,
                       alpha_factors: Iterable[object],
                       start_date: str,
                       end_date: str,
                       frequency: str,
                       universe: Universe,
                       benchmark: int,
                       warm_start: int = 0,
                       batch: int = 1,
                       neutralized_risk: Iterable[str] = None,
                       risk_model: str = 'short',
                       pre_process: Iterable[object] = None,
                       post_process: Iterable[object] = None,
                       fit_target: Union[Transformer, object] = None) -> dict:
    alpha_logger.info("Starting data package fetching ...")
    transformer = Transformer(alpha_factors)
    names = transformer.names

    dates, target_df, factor_df = prepare_data(engine,
                                               transformer,
                                               start_date,
                                               end_date,
                                               frequency,
                                               universe,
                                               benchmark,
                                               warm_start + batch,
                                               fit_target=fit_target)

    target_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
        _merge_df(engine, names, factor_df, target_df, universe, dates,
                  risk_model, neutralized_risk)

    alpha_logger.info("data merging finished")

    target_df['weight'] = train_x['weight']
    target_df['industry'] = train_x['industry']
    target_df['industry_code'] = train_x['industry_code']

    if neutralized_risk:
        for i, name in enumerate(neutralized_risk):
            target_df.loc[:, name] = risk_exp[:, i]

    alpha_logger.info("Loading data is finished")

    train_x_buckets, train_y_buckets, train_risk_buckets, \
        predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket = \
        batch_processing(names, x_values, y_values, dates, date_label,
                         batch, risk_exp, pre_process, post_process, codes)

    alpha_logger.info("Data processing is finished")

    ret = dict()
    ret['x_names'] = names
    ret['settlement'] = target_df[target_df.trade_date >= start_date]

    train_x_buckets = {k: train_x_buckets[k] for k in train_x_buckets
                       if k.strftime('%Y-%m-%d') >= start_date}
    train_y_buckets = {k: train_y_buckets[k] for k in train_y_buckets
                       if k.strftime('%Y-%m-%d') >= start_date}
    train_risk_buckets = {k: train_risk_buckets[k] for k in train_risk_buckets
                          if k.strftime('%Y-%m-%d') >= start_date}
    predict_x_buckets = {k: predict_x_buckets[k] for k in predict_x_buckets
                         if k.strftime('%Y-%m-%d') >= start_date}
    predict_y_buckets = {k: predict_y_buckets[k] for k in predict_y_buckets
                         if k.strftime('%Y-%m-%d') >= start_date}

    if neutralized_risk:
        predict_risk_buckets = {k: predict_risk_buckets[k] for k in predict_risk_buckets
                                if k.strftime('%Y-%m-%d') >= start_date}
    else:
        predict_risk_buckets = None

    predict_codes_bucket = {k: predict_codes_bucket[k] for k in predict_codes_bucket
                            if k.strftime('%Y-%m-%d') >= start_date}

    ret['train'] = {'x': train_x_buckets,
                    'y': train_y_buckets,
                    'risk': train_risk_buckets}
    ret['predict'] = {'x': predict_x_buckets,
                      'y': predict_y_buckets,
                      'risk': predict_risk_buckets,
                      'code': predict_codes_bucket}
    return ret
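# Usage sketch (illustrative only): the returned package is a nested dict of
# per-date buckets. All concrete values below are placeholders.
package = fetch_data_package(engine,
                             alpha_factors=['EPS'],
                             start_date='2018-01-02',
                             end_date='2018-06-29',
                             frequency='1w',
                             universe=Universe('custom', ['zz500']),
                             benchmark=905,
                             batch=4,
                             neutralized_risk=['SIZE'])  # placeholder risk factor
x_names = package['x_names']        # factor column names
train_x = package['train']['x']     # {trade_date -> ndarray} buckets
settlement = package['settlement']  # per-date targets with weight/industry/risk columns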
def fetch_train_phase(engine,
                      alpha_factors: Union[Transformer, Iterable[object]],
                      ref_date,
                      frequency,
                      universe,
                      batch=1,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0,
                      fit_target: Union[Transformer, object] = None) -> dict:
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe, factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()

    target_df, factor_df = df[['trade_date', 'code', 'dx']], \
        df[['trade_date', 'code'] + transformer.names]

    target_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
        _merge_df(engine, transformer.names, factor_df, target_df, universe,
                  dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        pyFinAssert(len(dates) >= 2, ValueError,
                    "No previous data for training for the date {0}".format(ref_date))
        end = dates[-2]
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    this_code = codes[index]

    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)
    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': pd.DataFrame(ne_x, columns=transformer.names),
                    'y': ne_y,
                    'code': this_code}
    return ret
def fetch_predict_phase(engine,
                        alpha_factors: Union[Transformer, Iterable[object]],
                        ref_date,
                        frequency,
                        universe,
                        batch=1,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0,
                        fillna: str = None,
                        fit_target: Union[Transformer, object] = None):
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)

    if fillna:
        factor_df = factor_df.groupby('trade_date').apply(
            lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
    else:
        factor_df = factor_df.dropna()

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe, factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        train_x = pd.merge(train_x, target_df, on=['trade_date', 'code'], how='left')
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = pd.merge(factor_df, target_df, on=['trade_date', 'code'], how='left')
        risk_exp = None

    train_x.dropna(inplace=True, subset=train_x.columns[:-1])

    x_values = train_x[names].values.astype(float)
    y_values = train_x[['dx']].values.astype(float)

    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)
        ne_y = factor_processing(this_raw_y,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        ne_x = ne_x[inner_left_index:inner_right_index]
        ne_y = ne_y[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)
        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        ne_y = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    # guard the flatten: ne_y is None when ref_date is not a scheduled date
    ret['predict'] = {'x': pd.DataFrame(ne_x, columns=transformer.names),
                      'code': codes,
                      'y': ne_y.flatten() if ne_y is not None else None}
    return ret
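# Usage sketch (illustrative only): building the prediction inputs for a single
# reference date; the resulting frame and codes feed whatever estimator exposes
# a predict() on the processed factor matrix. All values are placeholders.
predict_data = fetch_predict_phase(engine,
                                   alpha_factors=['EPS'],
                                   ref_date='2018-06-29',
                                   frequency='1w',
                                   universe=Universe('custom', ['zz500']),
                                   batch=1,
                                   neutralized_risk=['SIZE'])
x = predict_data['predict']['x']         # processed factors as a pd.DataFrame
codes = predict_data['predict']['code']  # security codes aligned with the rows of x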
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0,
                 fit_target: Union[Transformer, object] = None):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date, p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe, factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(lambda x: x.fillna(method='pad'))
    alpha_logger.info("fit target data loading finished")

    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    df.dropna(inplace=True)

    return dates, df[['trade_date', 'code', 'dx']], df[
        ['trade_date', 'code', 'weight', 'industry_code', 'industry'] + transformer.names]
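# Usage sketch (illustrative only): preparing data against a custom fit target
# instead of the default forward return; passing a factor expression (here the
# placeholder column 'ROE') routes through fetch_factor_range_forward above.
dates, target_df, factor_df = prepare_data(engine,
                                           factors=['EPS'],
                                           start_date='2018-01-02',
                                           end_date='2018-06-29',
                                           frequency='1w',
                                           universe=Universe('custom', ['zz500']),
                                           benchmark=905,
                                           fit_target='ROE')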