Пример #1
0
    def fetch_factor_range(self,
                           universe: Universe,
                           factors: Union[Transformer, Iterable[object]],
                           start_date: str = None,
                           end_date: str = None,
                           dates: Iterable[str] = None,
                           external_data: pd.DataFrame = None,
                           used_factor_tables=None) -> pd.DataFrame:

        if isinstance(factors, Transformer):
            transformer = factors
        else:
            transformer = Transformer(factors)

        dependency = transformer.dependency

        if used_factor_tables:
            factor_cols = _map_factors(dependency, used_factor_tables)
        else:
            factor_cols = _map_factors(dependency, factor_tables)

        big_table = Market
        joined_tables = set()
        joined_tables.add(Market.__table__.name)

        for t in set(factor_cols.values()):
            if t.__table__.name not in joined_tables:
                if dates is not None:
                    big_table = outerjoin(big_table, t, and_(Market.trade_date == t.trade_date,
                                                             Market.code == t.code,
                                                             Market.trade_date.in_(dates)))
                else:
                    big_table = outerjoin(big_table, t, and_(Market.trade_date == t.trade_date,
                                                             Market.code == t.code,
                                                             Market.trade_date.between(start_date, end_date)))
                joined_tables.add(t.__table__.name)

        universe_df = universe.query(self, start_date, end_date, dates)

        query = select(
            [Market.trade_date, Market.code, Market.chgPct] + list(factor_cols.keys())) \
            .select_from(big_table).where(
                and_(
                    Market.code.in_(universe_df.code.unique().tolist()),
                    Market.trade_date.in_(dates) if dates is not None else Market.trade_date.between(start_date, end_date)
                )
        ).distinct()

        df = pd.read_sql(query, self.engine).replace([-np.inf, np.inf], np.nan)

        if external_data is not None:
            df = pd.merge(df, external_data, on=['trade_date', 'code']).dropna()

        df.sort_values(['trade_date', 'code'], inplace=True)
        df.set_index('trade_date', inplace=True)
        res = transformer.transform('code', df).replace([-np.inf, np.inf], np.nan)

        res['chgPct'] = df.chgPct
        res = res.reset_index()
        return pd.merge(res, universe_df[['trade_date', 'code']], how='inner').drop_duplicates(['trade_date', 'code'])
Пример #2
0
    def fetch_factor(self,
                     ref_date: str,
                     factors: Iterable[object],
                     codes: Iterable[int],
                     warm_start: int = 0,
                     used_factor_tables=None) -> pd.DataFrame:

        if isinstance(factors, Transformer):
            transformer = factors
        else:
            transformer = Transformer(factors)

        dependency = transformer.dependency

        if used_factor_tables:
            factor_cols = _map_factors(dependency, used_factor_tables)
        else:
            factor_cols = _map_factors(dependency, factor_tables)

        start_date = advanceDateByCalendar('china.sse', ref_date,
                                           str(-warm_start) +
                                           'b').strftime('%Y-%m-%d')
        end_date = ref_date

        big_table = Market
        joined_tables = set()
        joined_tables.add(Market.__table__.name)

        for t in set(factor_cols.values()):
            if t.__table__.name not in joined_tables:
                big_table = outerjoin(
                    big_table, t,
                    and_(Market.trade_date == t.trade_date,
                         Market.code == t.code))

                joined_tables.add(t.__table__.name)

        query = select(
            [Market.trade_date, Market.code, Market.chgPct, Market.secShortName] + list(
                factor_cols.keys())) \
            .select_from(big_table).where(and_(Market.trade_date.between(start_date, end_date),
                                               Market.code.in_(codes)))

        df = pd.read_sql(query, self.engine) \
            .replace([-np.inf, np.inf], np.nan) \
            .sort_values(['trade_date', 'code']) \
            .set_index('trade_date')
        res = transformer.transform('code', df).replace([-np.inf, np.inf],
                                                        np.nan)

        res['chgPct'] = df.chgPct
        res['secShortName'] = df['secShortName']
        res = res.loc[ref_date:ref_date, :]
        res.index = list(range(len(res)))
        return res
Пример #3
0
    def query(self,
              engine,
              start_date: str = None,
              end_date: str = None,
              dates=None) -> pd.DataFrame:

        universe_cond = self._query_statements(start_date, end_date, dates)

        if self.filter_cond is None and self.exclude_universe is None:
            # simple case
            query = select([UniverseTable.trade_date, UniverseTable.code
                            ]).where(universe_cond).distinct()
            return pd.read_sql(query, engine.engine)
        else:
            if self.filter_cond is not None:
                if isinstance(self.filter_cond, Transformer):
                    transformer = self.filter_cond
                else:
                    transformer = Transformer(self.filter_cond)

                dependency = transformer.dependency
                factor_cols = _map_factors(dependency, factor_tables)
                big_table = Market

                for t in set(factor_cols.values()):
                    if t.__table__.name != Market.__table__.name:
                        big_table = outerjoin(
                            big_table, t,
                            and_(
                                Market.trade_date == t.trade_date,
                                Market.code == t.code,
                                Market.trade_date.in_(dates)
                                if dates else Market.trade_date.between(
                                    start_date, end_date)))
                big_table = join(
                    big_table, UniverseTable,
                    and_(Market.trade_date == UniverseTable.trade_date,
                         Market.code == UniverseTable.code, universe_cond))

                query = select(
                    [Market.trade_date, Market.code] + list(factor_cols.keys())) \
                    .select_from(big_table).distinct()

                df = pd.read_sql(query, engine.engine).sort_values(
                    ['trade_date', 'code']).dropna()
                df.set_index('trade_date', inplace=True)
                filter_fields = transformer.names
                pyFinAssert(
                    len(filter_fields) == 1, ValueError,
                    "filter fields can only be 1")
                df = transformer.transform('code', df)
                df = df[df[filter_fields[0]] == 1].reset_index()[[
                    'trade_date', 'code'
                ]]
            return df
Пример #4
0
    def __init__(self, features=None, fit_target=None):
        if features is not None:
            self.formulas = Transformer(features)
            self.features = self.formulas.names
        else:
            self.features = None

        if fit_target is not None:
            self.fit_target = Transformer(fit_target)
        else:
            self.fit_target = None
        self.impl = None
        self.trained_time = None
Пример #5
0
    def fetch_factor(self,
                     ref_date: str,
                     factors: Iterable[object],
                     codes: Iterable[int],
                     warm_start: int = 0,
                     used_factor_tables=None) -> pd.DataFrame:

        if isinstance(factors, Transformer):
            transformer = factors
        else:
            transformer = Transformer(factors)

        dependency = transformer.dependency

        if used_factor_tables:
            factor_cols = _map_factors(dependency, used_factor_tables)
        else:
            factor_cols = _map_factors(dependency, factor_tables)

        start_date = advanceDateByCalendar('china.sse', ref_date,
                                           str(-warm_start) +
                                           'b').strftime('%Y-%m-%d')
        end_date = ref_date

        big_table = FullFactor

        for t in set(factor_cols.values()):
            if t.__table__.name != FullFactor.__table__.name:
                big_table = outerjoin(
                    big_table, t,
                    and_(FullFactor.trade_date == t.trade_date,
                         FullFactor.code == t.code))

        query = select(
            [FullFactor.trade_date, FullFactor.code, FullFactor.isOpen] + list(factor_cols.keys())) \
            .select_from(big_table).where(and_(FullFactor.trade_date.between(start_date, end_date),
                                               FullFactor.code.in_(codes)))

        df = pd.read_sql(query,
                         self.engine).sort_values(['trade_date', 'code'
                                                   ]).set_index('trade_date')
        res = transformer.transform('code', df)

        for col in res.columns:
            if col not in set(['code', 'isOpen']) and col not in df.columns:
                df[col] = res[col].values

        df['isOpen'] = df.isOpen.astype(bool)
        df = df.loc[ref_date]
        df.index = list(range(len(df)))
        return df
Пример #6
0
    def fetch_data_experimental(self, ref_date: str,
                                factors: Iterable[str],
                                codes: Iterable[int],
                                benchmark: int = None,
                                risk_model: str = 'short',
                                industry: str = 'sw') -> Dict[str, pd.DataFrame]:

        total_data = {}

        transformer = Transformer(factors)
        factor_data = self.fetch_factor(ref_date, transformer, codes, used_factor_tables=[Experimental])

        if benchmark:
            benchmark_data = self.fetch_benchmark(ref_date, benchmark)
            total_data['benchmark'] = benchmark_data
            factor_data = pd.merge(factor_data, benchmark_data, how='left', on=['code'])
            factor_data['weight'] = factor_data['weight'].fillna(0.)

        if risk_model:
            excluded = list(set(total_risk_factors).intersection(transformer.dependency))
            risk_cov, risk_exp = self.fetch_risk_model(ref_date, codes, risk_model, excluded)
            factor_data = pd.merge(factor_data, risk_exp, how='left', on=['code'])
            total_data['risk_cov'] = risk_cov

        industry_info = self.fetch_industry(ref_date=ref_date,
                                            codes=codes,
                                            category=industry)

        factor_data = pd.merge(factor_data, industry_info, on=['code'])

        total_data['factor'] = factor_data
        return total_data
Пример #7
0
    def fetch_data_range(
            self,
            universe: Universe,
            factors: Iterable[str],
            start_date: str = None,
            end_date: str = None,
            dates: Iterable[str] = None,
            benchmark: int = None,
            risk_model: str = 'short',
            industry: str = 'sw',
            external_data: pd.DataFrame = None) -> Dict[str, pd.DataFrame]:

        total_data = {}
        transformer = Transformer(factors)
        factor_data = self.fetch_factor_range(universe,
                                              transformer,
                                              start_date,
                                              end_date,
                                              dates,
                                              external_data=external_data)

        if benchmark:
            benchmark_data = self.fetch_benchmark_range(
                benchmark, start_date, end_date, dates)
            total_data['benchmark'] = benchmark_data
            factor_data = pd.merge(factor_data,
                                   benchmark_data,
                                   how='left',
                                   on=['trade_date', 'code'])
            factor_data['weight'] = factor_data['weight'].fillna(0.)

        if risk_model:
            excluded = list(
                set(total_risk_factors).intersection(transformer.dependency))
            risk_cov, risk_exp = self.fetch_risk_model_range(
                universe, start_date, end_date, dates, risk_model, excluded)
            factor_data = pd.merge(factor_data,
                                   risk_exp,
                                   how='left',
                                   on=['trade_date', 'code'])
            total_data['risk_cov'] = risk_cov

        industry_info = self.fetch_industry_range(universe,
                                                  start_date=start_date,
                                                  end_date=end_date,
                                                  dates=dates,
                                                  category=industry)

        factor_data = pd.merge(factor_data,
                               industry_info,
                               on=['trade_date', 'code'])
        total_data['factor'] = factor_data
        return total_data
Пример #8
0
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)
    alpha_logger.info("return data loading finished")
    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)

    return dates, df[['trade_date', 'code', 'dx']], df[[
        'trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'
    ] + transformer.names]
Пример #9
0
    def fetch_factor_range_forward(self,
                                   universe: Universe,
                                   factors: Union[Transformer, object],
                                   start_date: str = None,
                                   end_date: str = None,
                                   dates: Iterable[str] = None):
        if isinstance(factors, Transformer):
            transformer = factors
        else:
            transformer = Transformer(factors)

        dependency = transformer.dependency
        factor_cols = _map_factors(dependency, factor_tables)

        codes = universe.query(self, start_date, end_date, dates)
        total_codes = codes.code.unique().tolist()
        total_dates = codes.trade_date.astype(str).unique().tolist()

        big_table = Market
        joined_tables = set()
        joined_tables.add(Market.__table__.name)

        for t in set(factor_cols.values()):
            if t.__table__.name not in joined_tables:
                if dates is not None:
                    big_table = outerjoin(
                        big_table, t,
                        and_(Market.trade_date == t.trade_date,
                             Market.code == t.code,
                             Market.trade_date.in_(dates)))
                else:
                    big_table = outerjoin(
                        big_table, t,
                        and_(Market.trade_date == t.trade_date,
                             Market.code == t.code,
                             Market.trade_date.between(start_date, end_date)))
                joined_tables.add(t.__table__.name)

        stats = func.lag(list(factor_cols.keys())[0],
                         -1).over(partition_by=Market.code,
                                  order_by=Market.trade_date).label('dx')

        query = select([Market.trade_date, Market.code, Market.chgPct,
                        stats]).select_from(big_table).where(
                            and_(Market.trade_date.in_(total_dates),
                                 Market.code.in_(total_codes)))

        df = pd.read_sql(query, self.engine) \
            .replace([-np.inf, np.inf], np.nan) \
            .sort_values(['trade_date', 'code'])
        return pd.merge(df, codes[['trade_date', 'code']],
                        how='inner').drop_duplicates(['trade_date', 'code'])
Пример #10
0
def fetch_data_package(engine: SqlEngine,
                       alpha_factors: Iterable[object],
                       start_date: str,
                       end_date: str,
                       frequency: str,
                       universe: Universe,
                       benchmark: int,
                       warm_start: int = 0,
                       batch: int = 1,
                       neutralized_risk: Iterable[str] = None,
                       risk_model: str = 'short',
                       pre_process: Iterable[object] = None,
                       post_process: Iterable[object] = None) -> dict:
    alpha_logger.info("Starting data package fetching ...")

    transformer = Transformer(alpha_factors)
    dates, return_df, factor_df = prepare_data(engine, transformer, start_date,
                                               end_date, frequency, universe,
                                               benchmark, warm_start)

    return_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)

    return_df['weight'] = train_x['weight']
    return_df['industry'] = train_x['industry']
    return_df['industry_code'] = train_x['industry_code']
    return_df['isOpen'] = train_x['isOpen']

    if neutralized_risk:
        for i, name in enumerate(neutralized_risk):
            return_df.loc[:, name] = risk_exp[:, i]

    alpha_logger.info("Loading data is finished")

    train_x_buckets, train_y_buckets, predict_x_buckets, predict_y_buckets = batch_processing(
        x_values, y_values, dates, date_label, batch, risk_exp, pre_process,
        post_process)

    alpha_logger.info("Data processing is finished")

    ret = dict()
    ret['x_names'] = transformer.names
    ret['settlement'] = return_df
    ret['train'] = {'x': train_x_buckets, 'y': train_y_buckets}
    ret['predict'] = {'x': predict_x_buckets, 'y': predict_y_buckets}
    return ret
Пример #11
0
    def fetch_factor_range(self,
                           universe: Universe,
                           factors: Union[Transformer, Iterable[object]],
                           start_date: str = None,
                           end_date: str = None,
                           dates: Iterable[str] = None,
                           external_data: pd.DataFrame = None,
                           used_factor_tables=None) -> pd.DataFrame:

        if isinstance(factors, Transformer):
            transformer = factors
        else:
            transformer = Transformer(factors)

        dependency = transformer.dependency

        if used_factor_tables:
            factor_cols = _map_factors(dependency, used_factor_tables)
        else:
            factor_cols = _map_factors(dependency, factor_tables)

        cond = universe.query_range(start_date, end_date, dates)

        big_table = FullFactor

        for t in set(factor_cols.values()):
            if t.__table__.name != FullFactor.__table__.name:
                if dates is not None:
                    big_table = outerjoin(
                        big_table, t,
                        and_(FullFactor.trade_date == t.trade_date,
                             FullFactor.code == t.code,
                             FullFactor.trade_date.in_(dates)))
                else:
                    big_table = outerjoin(
                        big_table, t,
                        and_(
                            FullFactor.trade_date == t.trade_date,
                            FullFactor.code == t.code,
                            FullFactor.trade_date.between(
                                start_date, end_date)))

        big_table = join(
            big_table, UniverseTable,
            and_(FullFactor.trade_date == UniverseTable.trade_date,
                 FullFactor.code == UniverseTable.code, cond))

        query = select(
            [FullFactor.trade_date, FullFactor.code, FullFactor.isOpen] + list(factor_cols.keys())) \
            .select_from(big_table).distinct()

        df = pd.read_sql(query,
                         self.engine).sort_values(['trade_date', 'code'])

        if external_data is not None:
            df = pd.merge(df, external_data, on=['trade_date',
                                                 'code']).dropna()

        df.set_index('trade_date', inplace=True)
        res = transformer.transform('code', df)

        for col in res.columns:
            if col not in set(['code', 'isOpen']) and col not in df.columns:
                df[col] = res[col].values

        df['isOpen'] = df.isOpen.astype(bool)
        return df.reset_index()
Пример #12
0
def fetch_train_phase(engine,
                      alpha_factors: Iterable[object],
                      ref_date,
                      frequency,
                      universe,
                      batch,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0) -> dict:
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = _map_horizon(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()

    return_df, factor_df = df[['trade_date', 'code',
                               'dx']], df[['trade_date', 'code', 'isOpen'] +
                                          transformer.names]

    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-2]
        start = dates[-batch - 1]
    else:
        end = dates[-1]
        start = dates[-batch]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': ne_x, 'y': ne_y}

    return ret
Пример #13
0
def fetch_predict_phase(engine,
                        alpha_factors: Iterable[object],
                        ref_date,
                        frequency,
                        universe,
                        batch,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0):
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).dropna()

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] +
                          used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
        x_values = train_x[names].values.astype(float)
    else:
        train_x = factor_df.copy()
        risk_exp = None

    date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch]

        # index = (date_label >= start) & (date_label <= end)
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}

    return ret
Пример #14
0
def fetch_data_package(engine: SqlEngine,
                       alpha_factors: Iterable[object],
                       start_date: str,
                       end_date: str,
                       frequency: str,
                       universe: Universe,
                       benchmark: int,
                       warm_start: int = 0,
                       batch: int = 1,
                       neutralized_risk: Iterable[str] = None,
                       risk_model: str = 'short',
                       pre_process: Iterable[object] = None,
                       post_process: Iterable[object] = None,
                       fit_target: Union[Transformer, object] = None) -> dict:
    alpha_logger.info("Starting data package fetching ...")
    transformer = Transformer(alpha_factors)
    names = transformer.names
    dates, target_df, factor_df = prepare_data(engine,
                                               transformer,
                                               start_date,
                                               end_date,
                                               frequency,
                                               universe,
                                               benchmark,
                                               warm_start + batch,
                                               fit_target=fit_target)

    target_df, dates, date_label, risk_exp, x_values, y_values, train_x, train_y, codes = \
        _merge_df(engine, names, factor_df, target_df, universe, dates, risk_model,
                  neutralized_risk)

    alpha_logger.info("data merging finished")

    target_df['weight'] = train_x['weight']
    target_df['industry'] = train_x['industry']
    target_df['industry_code'] = train_x['industry_code']

    if neutralized_risk:
        for i, name in enumerate(neutralized_risk):
            target_df.loc[:, name] = risk_exp[:, i]

    alpha_logger.info("Loading data is finished")

    train_x_buckets, train_y_buckets, train_risk_buckets, predict_x_buckets, predict_y_buckets, predict_risk_buckets, predict_codes_bucket \
        = batch_processing(names,
                           x_values,
                           y_values,
                           dates,
                           date_label,
                           batch,
                           risk_exp,
                           pre_process,
                           post_process,
                           codes)

    alpha_logger.info("Data processing is finished")

    ret = dict()
    ret['x_names'] = names
    ret['settlement'] = target_df[target_df.trade_date >= start_date]

    train_x_buckets = {
        k: train_x_buckets[k]
        for k in train_x_buckets if k.strftime('%Y-%m-%d') >= start_date
    }
    train_y_buckets = {
        k: train_y_buckets[k]
        for k in train_y_buckets if k.strftime('%Y-%m-%d') >= start_date
    }
    train_risk_buckets = {
        k: train_risk_buckets[k]
        for k in train_risk_buckets if k.strftime('%Y-%m-%d') >= start_date
    }

    predict_x_buckets = {
        k: predict_x_buckets[k]
        for k in predict_x_buckets if k.strftime('%Y-%m-%d') >= start_date
    }
    predict_y_buckets = {
        k: predict_y_buckets[k]
        for k in predict_y_buckets if k.strftime('%Y-%m-%d') >= start_date
    }
    if neutralized_risk:
        predict_risk_buckets = {
            k: predict_risk_buckets[k]
            for k in predict_risk_buckets
            if k.strftime('%Y-%m-%d') >= start_date
        }
    else:
        predict_risk_buckets = None
    predict_codes_bucket = {
        k: predict_codes_bucket[k]
        for k in predict_codes_bucket if k.strftime('%Y-%m-%d') >= start_date
    }

    ret['train'] = {
        'x': train_x_buckets,
        'y': train_y_buckets,
        'risk': train_risk_buckets
    }
    ret['predict'] = {
        'x': predict_x_buckets,
        'y': predict_y_buckets,
        'risk': predict_risk_buckets,
        'code': predict_codes_bucket
    }
    return ret
Пример #15
0
def fetch_train_phase(engine,
                      alpha_factors: Union[Transformer, Iterable[object]],
                      ref_date,
                      frequency,
                      universe,
                      batch=1,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0,
                      fit_target: Union[Transformer, object] = None) -> dict:
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()

    target_df, factor_df = df[['trade_date', 'code',
                               'dx']], df[['trade_date', 'code'] +
                                          transformer.names]

    target_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
        _merge_df(engine, transformer.names, factor_df, target_df, universe, dates, risk_model,
                  neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        pyFinAssert(
            len(dates) >= 2, ValueError,
            "No previous data for training for the date {0}".format(ref_date))
        end = dates[-2]
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    this_code = codes[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'y': ne_y,
        'code': this_code
    }

    return ret
Пример #16
0
def fetch_predict_phase(engine,
                        alpha_factors: Union[Transformer, Iterable[object]],
                        ref_date,
                        frequency,
                        universe,
                        batch=1,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0,
                        fillna: str = None,
                        fit_target: Union[Transformer, object] = None):
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)

    if fillna:
        factor_df = factor_df.groupby('trade_date').apply(
            lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
    else:
        factor_df = factor_df.dropna()

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] +
                          used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        train_x = pd.merge(train_x,
                           target_df,
                           on=['trade_date', 'code'],
                           how='left')
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = pd.merge(factor_df,
                           target_df,
                           on=['trade_date', 'code'],
                           how='left')
        risk_exp = None

    train_x.dropna(inplace=True, subset=train_x.columns[:-1])
    x_values = train_x[names].values.astype(float)
    y_values = train_x[['dx']].values.astype(float)

    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        ne_y = factor_processing(this_raw_y,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]
        ne_y = ne_y[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        ne_y = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'code': codes,
        'y': ne_y.flatten()
    }

    return ret
Пример #17
0
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0,
                 fit_target: Union[Transformer, object] = None):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))
    alpha_logger.info("fit target data loading finished")

    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    df.dropna(inplace=True)

    return dates, df[[
        'trade_date', 'code', 'dx'
    ]], df[['trade_date', 'code', 'weight', 'industry_code', 'industry'] +
           transformer.names]