Exemplo n.º 1
0
def st_stocks_filter(stocks,date):
    """
    Remove the ST-flagged stocks from a list for one day.

    :param stocks: list of order_book_ids to screen
    :param date: trading day, passed as both start and end of the ST query
    :return: list of the ids whose ST flag is False in the last row of the
        query result (i.e. the ST stocks are dropped, not returned)
    """
    st_flags = rqdatac.is_st_stock(stocks, start_date=date, end_date=date)
    # The query spans a single day; its last row holds the flags we filter on.
    day_flags = st_flags.iloc[-1]
    keep_mask = ~day_flags
    return day_flags[keep_mask].index.tolist()
Exemplo n.º 2
0
def get_implicit_factor_return(date):
    """
    Estimate factor returns for ``date`` over a filtered whole-market universe.

    The universe is every 'CS' instrument reported for the previous trading
    day, minus (in this order) stocks whose ``listed_date`` is after the 21st
    previous trading day, stocks flagged ST on the previous trading day, and
    stocks whose volume on the previous trading day is not positive.

    :param date: str in "%Y-%m-%d" format.
    :return: the result of ``factor_return_estimation`` for the latest trading
        date and the exposures of the filtered universe.
    """
    # Presumably this resolves `date` to itself when it is a trading day and
    # to the most recent trading day before it otherwise -- confirm against
    # the rqdatac documentation.
    day_after = datetime.strptime(date, "%Y-%m-%d") + timedelta(days=1)
    latest_trading_date = str(rqdatac.get_previous_trading_date(day_after))
    previous_trading_date = str(
        rqdatac.get_previous_trading_date(latest_trading_date))

    # Start from everything already listed on the previous trading day so a
    # daily return can be computed for each stock.
    listed_instruments = rqdatac.all_instruments(type='CS',
                                                 date=previous_trading_date)
    candidates = listed_instruments['order_book_id'].tolist()

    # Drop stocks listed more recently than 21 trading days ago.
    listing_cutoff = str(
        rqdatac.get_previous_trading_date(latest_trading_date,
                                          country='cn',
                                          n=21))
    seasoned = []
    for order_book_id in candidates:
        if rqdatac.instruments(order_book_id).listed_date <= listing_cutoff:
            seasoned.append(order_book_id)

    # Drop ST stocks. The index is cast to str so the row can be looked up
    # with the string date.
    st_flags = rqdatac.is_st_stock(seasoned,
                                   start_date=previous_trading_date,
                                   end_date=previous_trading_date)
    st_flags.index = st_flags.index.astype(str)
    st_row = st_flags.loc[previous_trading_date]
    non_st = st_row[st_row.values == False].index.tolist()

    # Drop suspended stocks, identified by non-positive volume.
    volumes = rqdatac.get_price(non_st,
                                start_date=previous_trading_date,
                                end_date=previous_trading_date,
                                frequency='1d',
                                fields='volume',
                                country='cn')
    volume_row = volumes.loc[previous_trading_date]
    tradable = volume_row[volume_row.values > 0].index.tolist()

    # Exposures are taken on the previous trading day; returns are estimated
    # for the latest trading day.
    factor_exposure = get_exposure(tradable, str(previous_trading_date))
    return factor_return_estimation(latest_trading_date, factor_exposure)
Exemplo n.º 3
0
    def data_preprocessing(self, equity_list, start_date, end_date):
        """
        Download price/volume history and screen out unusable assets.

        Nothing is returned; the results are stored on the instance as
        ``period_prices``, ``clean_period_prices``, ``clean_equity_list``,
        ``kickout_list``, ``st_list``, ``suspended_list``, ``reset_start_d``
        and ``reset_end_d``.

        :param equity_list: list of order_book_ids to screen.
        :param start_date: first day of the sample period.
        :param end_date: last day of the sample period.
        :raises ValueError: if ``self.et`` is neither 'funds' nor 'stocks'.
        """
        # Compare by value: `is` against a string literal tests object
        # identity and only succeeds when the interpreter happens to intern
        # both strings.
        if self.et == 'funds':
            # NOTE(review): only 'acc_net_value' is requested here, yet the
            # code below reads 'close' and 'volume' -- confirm this branch
            # works with the shape get_nav returns.
            period_data = rqdatac.fund.get_nav(equity_list, start_date, end_date, fields='acc_net_value')
        elif self.et == 'stocks':
            period_data = rqdatac.get_price(equity_list, start_date, end_date, frequency='1d', fields=['close', 'volume'])
        else:
            raise ValueError("unsupported equity type: %r" % (self.et,))
        period_prices = period_data['close']
        period_volume = period_data['volume']
        self.period_prices = period_prices

        # Elimination threshold: half of the number of rows in the sample.
        out_threshold = ceil(period_prices.shape[0] / 2)
        reset_end_date = pd.to_datetime(end_date)
        reset_start_date = pd.to_datetime(start_date)
        kickout_list = []
        suspended_list = []

        for i in equity_list:
            volume = period_volume.loc[:, i]
            volume_counts = volume.value_counts()
            if volume_counts.empty:
                # No volume observations at all for this asset.
                kickout_list.append(i)
                continue

            first_valid = period_prices.loc[:, i].first_valid_index()
            days_available = (reset_end_date - first_valid) / np.timedelta64(1, 'D')
            if days_available < out_threshold:
                # Price history starts too late to be usable.
                kickout_list.append(i)
            elif first_valid < reset_start_date:
                # NOTE(review): the original comment speaks of moving the start
                # to a *later* date, but this condition only fires for an
                # earlier one -- confirm the intended direction.
                reset_start_date = first_valid
            elif volume.last_valid_index() < reset_end_date or \
                    volume_counts.iloc[0] >= out_threshold:
                # Volume series ends before the period does, or its most
                # frequent value occurs at least `out_threshold` times
                # (presumably zero volume, i.e. a long suspension).
                suspended_list.append(i)

        # Any asset flagged ST on at least one day of the (possibly reset) period.
        st_flags = rqdatac.is_st_stock(equity_list, reset_start_date, reset_end_date)
        st_list = list(period_prices.columns.values[st_flags.sum(axis=0) > 0])

        # Union of every elimination reason.
        final_kickout = set().union(kickout_list, st_list, suspended_list)

        clean_equity_list = list(set(equity_list) - final_kickout)
        clean_period_prices = period_prices.loc[reset_start_date:reset_end_date, clean_equity_list]
        self.clean_period_prices = clean_period_prices
        self.clean_equity_list = list(clean_period_prices.columns.values)
        self.kickout_list = kickout_list
        self.st_list = st_list
        self.suspended_list = suspended_list
        self.reset_start_d = reset_start_date
        self.reset_end_d = reset_end_date
def data_process(order_book_ids, equity_type, start_date):
    """
    Fetch a 132-trading-day sample ending before ``start_date`` and clean it.

    :param order_book_ids: list of asset ids to screen.
    :param equity_type: 'funds' or 'stocks'.
    :param start_date: the sample ends on the trading day before this date
        and starts ``windows + 1`` trading days before it.
    :return: tuple ``(clean_period_prices, final_kickout_list)`` -- the close
        prices of the surviving assets over the sample, and the list of
        eliminated ids (too little history, empty data, suspended/delisted,
        or ST at any point in the sample).
    :raises ValueError: if ``equity_type`` is neither 'funds' nor 'stocks'.
    """
    # Function-scope import: this block did not reference pandas before.
    import pandas as pd

    # Validate before any data request; compare by value, not identity.
    if equity_type not in ('funds', 'stocks'):
        raise ValueError("unsupported equity type: %r" % (equity_type,))

    windows = 132
    # Timestamps so the dates can be subtracted from / compared with the
    # values returned by first_valid_index() and last_valid_index().
    end_date = pd.to_datetime(rqdatac.get_previous_trading_date(start_date))
    # One request for the (windows + 1)-th previous trading day instead of
    # stepping back one day at a time.
    start_date = rqdatac.get_previous_trading_date(start_date, n=windows + 1)
    # Bound up front: previously this name was only assigned inside one loop
    # branch, so it was usually undefined when used after the loop.
    reset_start_date = pd.to_datetime(start_date)

    if equity_type == 'funds':
        # NOTE(review): only 'acc_net_value' is requested here, yet the code
        # below reads 'close' and 'volume' -- confirm this branch works with
        # the shape get_nav returns.
        period_data = rqdatac.fund.get_nav(order_book_ids,
                                           start_date,
                                           end_date,
                                           fields='acc_net_value')
    else:
        period_data = rqdatac.get_price(order_book_ids,
                                        start_date,
                                        end_date,
                                        frequency='1d',
                                        fields=['close', 'volume'])
    period_prices = period_data['close']
    period_volume = period_data['volume']

    # Elimination threshold: half of the number of rows in the sample.
    out_threshold = ceil(period_prices.shape[0] / 2)
    kickout_list = []
    suspended_list = []

    for i in order_book_ids:
        volume = period_volume.loc[:, i]
        volume_counts = volume.value_counts()
        if volume_counts.empty:
            # No volume observations at all for this asset.
            kickout_list.append(i)
            continue

        first_valid = period_prices.loc[:, i].first_valid_index()
        days_available = (end_date - first_valid) / np.timedelta64(1, 'D')
        if days_available < out_threshold:
            # Price history starts too late to be usable.
            kickout_list.append(i)
        elif first_valid < reset_start_date:
            # NOTE(review): the original comment speaks of moving the start to
            # a *later* date, but this condition only fires for an earlier
            # one -- confirm the intended direction.
            reset_start_date = first_valid
        elif volume.last_valid_index() < end_date or \
                volume_counts.iloc[0] >= out_threshold:
            # Volume series ends before the period does, or its most frequent
            # value occurs at least `out_threshold` times (presumably zero
            # volume, i.e. a long suspension).
            suspended_list.append(i)

    # Any asset flagged ST on at least one day of the sample.
    st_flags = rqdatac.is_st_stock(order_book_ids, reset_start_date, end_date)
    st_list = list(period_prices.columns.values[st_flags.sum(axis=0) > 0])

    # Union of every elimination reason.
    final_kickout_list = list(set().union(kickout_list, st_list,
                                          suspended_list))

    clean_order_book_ids = list(set(order_book_ids) - set(final_kickout_list))
    clean_period_prices = period_prices.loc[reset_start_date:end_date,
                                            clean_order_book_ids]
    return clean_period_prices, final_kickout_list
Exemplo n.º 5
0
def drop_st(universe, date):
    """
    Return the members of ``universe`` that are not flagged ST on ``date``.

    :param universe: order_book_ids to screen.
    :param date: the day queried; passed as both start and end of the range.
    :return: pandas Index holding the ids whose ST flag is False.
    :raises AssertionError: if the query result does not reduce to a Series
        (for example when it does not contain exactly one row).
    """
    # Squeeze the row axis only. A bare squeeze() also collapses the column
    # axis, so a one-stock universe became a scalar and tripped the assert.
    is_st = rqd.is_st_stock(universe, date, date).squeeze(axis=0)
    assert isinstance(is_st, pd.Series), 'is_st is not series'
    return is_st.index[~is_st]
Exemplo n.º 6
0
def get_customized_factor_return(universe,
                                 date,
                                 skip_suspended=True,
                                 skip_st_stocks=True,
                                 method='implicit'):
    """
    Compute factor returns over a user-supplied stock universe.

    PARAMETERS
    ----------
    universe: list
              User-defined whitelist of order_book_ids, for example
              ['600705.XSHG', '601555.XSHG']. The stocks should already be
              listed on ``date``.

    date: str
          Calculation date in "%Y-%m-%d" format (for example '2017-03-03').

    skip_suspended: boolean
                    Whether to drop whitelist stocks whose volume on ``date``
                    is not positive. Defaults to True.

    skip_st_stocks: boolean
                    Whether to drop whitelist stocks flagged ST on ``date``.
                    Defaults to True.

    method: str
            'implicit' (default) estimates implicit factor returns; any other
            value takes the explicit style-factor path.

    RETURN
    ----------

    factor_return: the value produced by the selected estimation routine for
                   the filtered universe (per the original documentation, a
                   Series indexed by factor name).

    """
    day_after = datetime.strptime(date, "%Y-%m-%d") + timedelta(days=1)
    latest_trading_date = str(rqdatac.get_previous_trading_date(day_after))
    previous_trading_date = str(
        rqdatac.get_previous_trading_date(latest_trading_date))

    # Optional filter 1: ST stocks. The index is cast to str so the row can
    # be looked up with the string date.
    if skip_st_stocks == True:
        st_flags = rqdatac.is_st_stock(universe,
                                       start_date=date,
                                       end_date=date)
        st_flags.index = st_flags.index.astype(str)
        st_row = st_flags.loc[date]
        universe = st_row[st_row.values == False].index.tolist()

    # Optional filter 2: suspended stocks, identified by non-positive volume.
    if skip_suspended == True:
        volumes = rqdatac.get_price(universe,
                                    start_date=date,
                                    end_date=date,
                                    frequency='1d',
                                    fields='volume',
                                    country='cn')
        volume_row = volumes.loc[date]
        universe = volume_row[volume_row.values > 0].index.tolist()

    # Exposures of the remaining stocks on the previous trading day.
    factor_exposure = get_exposure(universe, str(previous_trading_date))

    # Pick the estimation routine requested by the caller.
    if method == 'implicit':
        return customized_factor_return_estimation(date, factor_exposure,
                                                   universe)
    return get_explicit_factor_returns(date, universe)
Exemplo n.º 7
0
def data_process(order_book_ids,
                 asset_type,
                 start_date,
                 windows,
                 out_threshold_coefficient=None):
    """
    Clean data for covariance matrix calculation.

    :param order_book_ids: str list. A group of assets.
    :param asset_type: str. "fund" or "stock".
    :param start_date: str. The first day for backtest; the sample ends on
        the trading day before it.
    :param windows: int. Interval length of sample, in trading days.
    :param out_threshold_coefficient: float, optional. Fraction of ``windows``
        used as the elimination threshold. Default: 0.5
        (out_threshold = ceil(0.5 * windows)).
    :return: tuple ``(clean_period_prices, kickout_assets, reset_start_date)``.
        ``clean_period_prices`` holds the forward-filled prices of the
        surviving assets, columns in the order of ``order_book_ids``.
        ``kickout_assets`` is a DataFrame indexed by eliminated id with an
        "Elimination Reason" column (an id may appear more than once).
        ``reset_start_date`` is the start of the returned interval.
    :raises ValueError: if ``asset_type`` is neither "fund" nor "stock".
    """
    # Validate before any data request; compare by value, not identity.
    if asset_type not in ("fund", "stock"):
        raise ValueError("unsupported asset type: %r" % (asset_type,))
    is_stock = asset_type == "stock"

    end_date = pd.to_datetime(rqdatac.get_previous_trading_date(start_date))
    # Choose the start date from the window length. This cannot work when the
    # backtest start date is earlier than "2005-07-01".
    start_date = rqdatac.get_trading_dates("2005-01-01",
                                           end_date)[-windows - 1]
    reset_start_date = pd.to_datetime(start_date)

    period_volume = None
    if is_stock:
        period_data = rqdatac.get_price(order_book_ids,
                                        reset_start_date,
                                        end_date,
                                        frequency='1d',
                                        fields=['close', 'volume'])
        period_prices = period_data['close']
        period_volume = period_data['volume']
    else:
        period_prices = rqdatac.fund.get_nav(order_book_ids,
                                             reset_start_date,
                                             end_date,
                                             fields='adjusted_net_value')

    # Set up the threshold of elimination.
    if out_threshold_coefficient is None:
        out_threshold_coefficient = 0.5
    out_threshold = ceil(windows * out_threshold_coefficient)

    # (order_book_id, reason) pairs, turned into a DataFrame once at the end
    # instead of growing a DataFrame row by row.
    eliminated = []

    for i in order_book_ids:
        prices = period_prices.loc[:, i]
        volume = period_volume.loc[:, i] if is_stock else None

        # Stocks count as empty when they have no volume observations, funds
        # when they have no valid price.
        if is_stock:
            has_data = not volume.value_counts().empty
        else:
            has_data = prices.first_valid_index() is not None
        if not has_data:
            eliminated.append((i, "Empty data"))
            continue

        first_valid = prices.first_valid_index()
        days_available = (end_date - first_valid) / np.timedelta64(1, 'D')
        if days_available < out_threshold:
            eliminated.append((i, "Late beginning date"))
        elif prices.isnull().sum() >= out_threshold:
            eliminated.append((i, "Missing values over threshold"))
        elif first_valid < reset_start_date:
            # NOTE(review): the original comment speaks of moving the start to
            # a *later* date, but this condition only fires for an earlier
            # one -- confirm the intended direction.
            reset_start_date = first_valid
        elif is_stock:
            if volume.last_valid_index() < end_date:
                eliminated.append((i, "Delisted"))
            elif volume.value_counts().iloc[0] >= out_threshold:
                # The most frequent volume value occurs at least
                # `out_threshold` times (presumably zero volume).
                eliminated.append((i, "Suspended days over threshold"))

    if is_stock:
        # Any stock flagged ST on at least one day of the interval.
        st_flags = rqdatac.is_st_stock(order_book_ids, reset_start_date,
                                       end_date)
        st_list = list(
            period_prices.columns.values[st_flags.sum(axis=0) > 0])
        # Indexed by plain ids: wrapping the list in another list produced a
        # MultiIndex, so ST ids never matched the input ids when removed.
        eliminated.extend((asset, "ST stocks") for asset in st_list)

    kickout_assets = pd.DataFrame(
        {"Elimination Reason": [reason for _, reason in eliminated]},
        index=[asset for asset, _ in eliminated],
        columns=["Elimination Reason"])

    period_prices = period_prices.ffill()

    # Drop everything eliminated above while keeping the input id order.
    kicked_out = set(kickout_assets.index)
    clean_order_book_ids = [x for x in order_book_ids if x not in kicked_out]

    clean_period_prices = period_prices.loc[reset_start_date:end_date,
                                            clean_order_book_ids]
    return clean_period_prices, kickout_assets, reset_start_date
Exemplo n.º 8
0
def get_customized_factor_return(date, universe, options, method):
    """
    Compute factor returns over a user-supplied stock universe.

    PARAMETERS
    ----------
    date: str
         Analysis date in "%Y-%m-%d" format.

    universe: list
         Stock pool chosen by the user (order_book_ids).

    options: dict
         Optional filters, read with ``options.get``:
         drop_st_stock: boolean, drop stocks flagged ST on ``date``;
         drop_suspended_stock: boolean, drop stocks whose volume on ``date``
         is not positive;
         drop_new_stock: int, minimum number of calendar days since listing,
         counted back from the latest trading date. When absent, stocks
         listed on or before the latest trading date are kept.

    method: str
         'implicit' estimates implicit factor returns; any other value takes
         the explicit path.

    RETURN
    ----------

    factor_return: the value produced by the selected estimation routine for
                   the filtered stock pool (per the original documentation,
                   a Series of factor returns).

    """
    day_after = datetime.strptime(date, "%Y-%m-%d") + timedelta(days=1)
    latest_trading_date = str(rqdatac.get_previous_trading_date(day_after))
    previous_trading_date = str(
        rqdatac.get_previous_trading_date(latest_trading_date))

    # Start from the full universe. This name used to be bound only inside
    # the ST branch, so every call without drop_st_stock raised NameError.
    stock_list = universe

    # Optional filter: ST stocks. The index is cast to str so the row can be
    # looked up with the string date.
    if options.get('drop_st_stock'):
        is_st_df = rqdatac.is_st_stock(stock_list,
                                       start_date=date,
                                       end_date=date)
        is_st_df.index = is_st_df.index.astype(str)
        st_row = is_st_df.loc[date]
        stock_list = st_row[st_row.values == False].index.tolist()

    # Optional filter: suspended stocks, identified by non-positive volume.
    if options.get('drop_suspended_stock'):
        trading_volume = rqdatac.get_price(stock_list,
                                           start_date=date,
                                           end_date=date,
                                           frequency='1d',
                                           fields='volume',
                                           country='cn')
        volume_row = trading_volume.loc[date]
        stock_list = volume_row[volume_row.values > 0].index.tolist()

    # Listing-date filter: keep stocks listed on or before the threshold.
    # listed_date is compared as a string, as in the original code.
    min_listed_days = options.get('drop_new_stock')
    if min_listed_days is None:
        threshold = latest_trading_date
    else:
        threshold = str(
            datetime.strptime(latest_trading_date, "%Y-%m-%d") -
            timedelta(days=min_listed_days))

    stock_list = [
        stock for stock in stock_list
        if rqdatac.instruments(stock).listed_date <= threshold
    ]

    # Exposures of the remaining stocks on the previous trading day.
    factor_exposure = get_exposure(stock_list, str(previous_trading_date))

    # Pick the estimation routine requested by the caller.
    if method == 'implicit':
        factor_return = customized_factor_return_estimation(
            date, factor_exposure, stock_list)
    else:
        factor_return = get_explicit_factor_returns(date, stock_list)

    return factor_return
Exemplo n.º 9
0
def data_processing(order_book_ids, asset_type, date, window):
    """
    Preprocess data for covariance matrix estimation.

    Invalid arguments print an error message and terminate the process via
    ``sys.exit(0)``.

    :param order_book_ids: str list. Selected list of assets.
    :param asset_type: str. "fund" or "stock".
    :param date: str. The date for estimation. It is compared as a string
        against "2005-07-01", so it is expected in "YYYY-MM-DD" form.
    :param window: int. Length of the sample in trading days; at least 66.
    :return: tuple ``(clean_period_prices, final_kickout_list,
        reset_start_date)`` -- the forward-filled prices of the surviving
        assets in input order, the list of eliminated ids, and the start of
        the returned interval.
    """
    if date < "2005-07-01":
        print('‘错误:优化日期不能早于2005年7月1日!')
        sys.exit(0)
    # Membership test: the previous `!= 'fund' and 'stock'` reduced to
    # `!= 'fund'` and therefore rejected "stock".
    elif asset_type not in ('fund', 'stock'):
        print('错误:资产类型必须为 stock 或 fund !')
        sys.exit(0)
    elif window < 66:
        print('错误:window 必须大于66 (不少于66个交易日) !')
        sys.exit(0)

    # A stray early return used to sit here, leaving everything below
    # unreachable and the documented result never produced.

    # The previous trading day is the end date of the estimation.
    estimation_end_date = pd.to_datetime(
        rqdatac.get_previous_trading_date(date))

    # Choose the start date from the window length.
    estimation_start_date = pd.to_datetime(
        rqdatac.get_trading_dates("2005-01-01",
                                  estimation_end_date)[-window - 1])
    reset_start_date = estimation_start_date

    is_stock = asset_type == 'stock'
    period_volume = None
    if is_stock:
        period_data = rqdatac.get_price(order_book_ids, estimation_start_date, estimation_end_date,
                                        frequency='1d', fields=['close', 'volume'])
        period_prices = period_data['close']
        period_volume = period_data['volume']
    else:
        period_prices = rqdatac.fund.get_nav(order_book_ids, estimation_start_date, estimation_end_date,
                                             fields='adjusted_net_value')

    # Elimination threshold: half of the number of rows in the sample.
    out_threshold = ceil(period_prices.shape[0] / 2)
    kickout_list = []
    suspended_list = []
    st_list = []

    for i in order_book_ids:
        prices = period_prices.loc[:, i]
        volume = period_volume.loc[:, i] if is_stock else None

        # Stocks count as empty when they have no volume observations, funds
        # when they have no valid price.
        if is_stock:
            has_data = not volume.value_counts().empty
        else:
            has_data = prices.first_valid_index() is not None
        if not has_data:
            kickout_list.append(i)
            continue

        first_valid = prices.first_valid_index()
        days_available = (estimation_end_date - first_valid) / np.timedelta64(1, 'D')
        if days_available < out_threshold:
            # Price history starts too late to be usable.
            kickout_list.append(i)
        elif prices.isnull().sum() >= out_threshold:
            # Too many missing prices.
            kickout_list.append(i)
        elif first_valid < reset_start_date:
            # NOTE(review): the original comment speaks of moving the start to
            # a *later* date, but this condition only fires for an earlier
            # one -- confirm the intended direction.
            reset_start_date = first_valid
        elif is_stock and (
                volume.last_valid_index() < estimation_end_date
                or volume.value_counts().iloc[0] >= out_threshold):
            # Volume series ends before the period does, or its most frequent
            # value occurs at least `out_threshold` times (presumably zero
            # volume, i.e. a long suspension).
            suspended_list.append(i)

    if is_stock:
        # Any stock flagged ST on at least one day of the interval.
        st_flags = rqdatac.is_st_stock(order_book_ids, reset_start_date,
                                       estimation_end_date)
        st_list = list(period_prices.columns.values[st_flags.sum(axis=0) > 0])

    period_prices = period_prices.ffill()

    # Union of every elimination reason.
    final_kickout_list = list(set().union(kickout_list, st_list, suspended_list))
    final_kickout_set = set(final_kickout_list)
    # Keep the original input id order.
    clean_order_book_ids = [x for x in order_book_ids if x not in final_kickout_set]

    clean_period_prices = period_prices.loc[reset_start_date:estimation_end_date, clean_order_book_ids]
    return clean_period_prices, final_kickout_list, reset_start_date