def st_stocks_filter(stocks, date):
    """Filter the ST-flagged names out of ``stocks`` for a single day.

    Despite the name, the returned list holds the identifiers whose ST flag
    is False, i.e. the stocks that survive the filter.

    :param stocks: list of stock identifiers to check
    :param date: the day to query; passed as both start and end date
    :return: list of identifiers from ``stocks`` that are not flagged ST on
        the last row returned by ``rqdatac.is_st_stock``
    """
    st_flags = rqdatac.is_st_stock(stocks, start_date=date, end_date=date)
    # Only the last returned row is considered.
    last_row = st_flags.iloc[-1]
    keep_mask = ~last_row
    survivors = last_row.loc[keep_mask]
    return survivors.index.tolist()
def get_implicit_factor_return(date):
    """Estimate factor returns from a filtered whole-market stock universe.

    The universe is built as of the trading day preceding the latest trading
    day on or before ``date``, then narrowed by listing age, ST status and
    trading volume before exposures and factor returns are computed.

    :param date: str in ``"%Y-%m-%d"`` format
    :return: whatever ``factor_return_estimation`` returns for the latest
        trading date and the computed exposures
    """
    day_after = datetime.strptime(date, "%Y-%m-%d") + timedelta(days=1)
    latest_trading_date = str(rqdatac.get_previous_trading_date(day_after))
    previous_trading_date = str(
        rqdatac.get_previous_trading_date(latest_trading_date))

    # Start from every 'CS' instrument known on the previous trading day so
    # that a daily return can be computed for each of them.
    instruments = rqdatac.all_instruments(type='CS',
                                          date=previous_trading_date)
    candidates = instruments['order_book_id'].tolist()

    # Drop stocks listed after the 21st previous trading day.
    listing_cutoff = str(
        rqdatac.get_previous_trading_date(latest_trading_date,
                                          country='cn', n=21))
    seasoned = []
    for order_book_id in candidates:
        if rqdatac.instruments(order_book_id).listed_date <= listing_cutoff:
            seasoned.append(order_book_id)

    # Drop ST stocks.
    st_flags = rqdatac.is_st_stock(seasoned,
                                   start_date=previous_trading_date,
                                   end_date=previous_trading_date)
    st_flags.index = st_flags.index.astype(str)
    st_row = st_flags.loc[previous_trading_date]
    non_st = st_row[st_row.values == False].index.tolist()

    # Drop suspended stocks, identified by a non-positive volume.
    volumes = rqdatac.get_price(non_st,
                                start_date=previous_trading_date,
                                end_date=previous_trading_date,
                                frequency='1d',
                                fields='volume',
                                country='cn')
    volume_row = volumes.loc[previous_trading_date]
    tradable = volume_row[volume_row.values > 0].index.tolist()

    # Exposures are taken on the previous trading day; factor returns are
    # then estimated for the latest trading day.
    factor_exposure = get_exposure(tradable, str(previous_trading_date))
    return factor_return_estimation(latest_trading_date, factor_exposure)
def data_preprocessing(self, equity_list, start_date, end_date):
    """Download prices for ``equity_list`` and drop assets unfit for analysis.

    Depending on ``self.et`` the data comes from ``rqdatac.fund.get_nav``
    ('funds', field ``acc_net_value``) or ``rqdatac.get_price`` ('stocks',
    daily ``close`` and ``volume``).  Assets are then sorted into:

    * ``kickout_list``   - no usable data, or data starting too late;
    * ``suspended_list`` - stocks whose volume series ends before the end
      date or whose most frequent volume value occurs at least
      ``out_threshold`` times;
    * ``st_list``        - stocks flagged ST at least once in the period.

    Nothing is returned; results are stored on the instance as
    ``period_prices``, ``clean_period_prices``, ``clean_equity_list``,
    ``kickout_list``, ``st_list``, ``suspended_list``, ``reset_start_d`` and
    ``reset_end_d``.

    :param equity_list: list of order_book_ids to examine
    :param start_date: first day of the sample period
    :param end_date: last day of the sample period
    :raises ValueError: if ``self.et`` is neither 'funds' nor 'stocks'
    """
    # Strings are compared with ==; `is` tests object identity and only
    # works by accident of interning.
    if self.et == 'funds':
        # NOTE(review): assumes get_nav with a single field returns a
        # date x fund frame, as the other fund code paths in this project
        # do - confirm.
        period_prices = rqdatac.fund.get_nav(equity_list, start_date,
                                             end_date,
                                             fields='acc_net_value')
        period_volume = None
    elif self.et == 'stocks':
        period_data = rqdatac.get_price(equity_list, start_date, end_date,
                                        frequency='1d',
                                        fields=['close', 'volume'])
        period_prices = period_data['close']
        period_volume = period_data['volume']
    else:
        raise ValueError(
            "self.et must be 'funds' or 'stocks', got %r" % (self.et,))
    is_stock = period_volume is not None
    self.period_prices = period_prices

    # Elimination threshold: half of the number of rows in the sample.
    out_threshold = ceil(period_prices.shape[0] / 2)
    reset_end_date = pd.to_datetime(end_date)
    reset_start_date = pd.to_datetime(start_date)
    kickout_list = []
    suspended_list = []

    for asset in equity_list:
        first_valid = period_prices.loc[:, asset].first_valid_index()
        if is_stock:
            volume = period_volume.loc[:, asset]
            volume_counts = volume.value_counts()
            has_data = first_valid is not None and not volume_counts.empty
        else:
            has_data = first_valid is not None

        if not has_data:
            kickout_list.append(asset)
        # NOTE(review): calendar days are compared with a threshold counted
        # in rows (trading days) - confirm this is intended.
        elif ((reset_end_date - first_valid)
              / np.timedelta64(1, 'D')) < out_threshold:
            kickout_list.append(asset)
        # NOTE(review): the accompanying design comment spoke of moving the
        # start date to a *later* date, yet the test is `<`; kept as is.
        elif first_valid < reset_start_date:
            reset_start_date = first_valid
        elif is_stock and (volume.last_valid_index() < reset_end_date
                           or volume_counts.iloc[0] >= out_threshold):
            # presumably the most frequent volume value is 0 (suspension)
            suspended_list.append(asset)

    # ST screening only applies to stocks.  Ids are taken from the labels of
    # the ST table itself rather than by positionally masking the price
    # columns, so a differing column order cannot mislabel stocks.
    if is_stock:
        st_days = rqdatac.is_st_stock(equity_list, reset_start_date,
                                      reset_end_date).sum(axis=0)
        st_list = st_days[st_days > 0].index.tolist()
    else:
        st_list = []

    removed = set(kickout_list) | set(st_list) | set(suspended_list)
    # De-duplicate while keeping the caller's ordering.
    clean_equity_list = [asset for asset in dict.fromkeys(equity_list)
                         if asset not in removed]
    clean_period_prices = period_prices.loc[reset_start_date:reset_end_date,
                                            clean_equity_list]

    self.clean_period_prices = clean_period_prices
    self.clean_equity_list = list(clean_period_prices.columns.values)
    self.kickout_list = kickout_list
    self.st_list = st_list
    self.suspended_list = suspended_list
    self.reset_start_d = reset_start_date
    self.reset_end_d = reset_end_date
def data_process(order_book_ids, equity_type, start_date):
    """Fetch a 132-trading-day price sample and drop unusable assets.

    The sample ends on the trading day before ``start_date`` and starts
    133 trading days before ``start_date``.

    :param order_book_ids: list of asset identifiers
    :param equity_type: 'funds' or 'stocks'
    :param start_date: date the sample has to precede
    :return: tuple ``(clean_period_prices, final_kickout_list)`` - the price
        frame restricted to the retained assets, and the list of removed
        identifiers (late/empty data, suspended or delisted, ST)
    :raises ValueError: if ``equity_type`` is neither 'funds' nor 'stocks'
    """
    # Function-scope import: this block did not previously reference pandas
    # by name.
    import pandas as pd

    windows = 132
    # Normalise both bounds to Timestamps so they can be subtracted from and
    # compared with the index labels of the price frame.
    end_date = pd.to_datetime(rqdatac.get_previous_trading_date(start_date))
    # A single call with n= replaces windows + 1 sequential look-ups.
    start_date = pd.to_datetime(
        rqdatac.get_previous_trading_date(start_date, n=windows + 1))
    # Previously read without ever being assigned on the common path.
    reset_start_date = start_date

    # Strings are compared with ==; `is` tests identity, not equality.
    if equity_type == 'funds':
        # NOTE(review): assumes get_nav with a single field returns a
        # date x fund frame - confirm.
        period_prices = rqdatac.fund.get_nav(order_book_ids, start_date,
                                             end_date,
                                             fields='acc_net_value')
        period_volume = None
    elif equity_type == 'stocks':
        period_data = rqdatac.get_price(order_book_ids, start_date, end_date,
                                        frequency='1d',
                                        fields=['close', 'volume'])
        period_prices = period_data['close']
        period_volume = period_data['volume']
    else:
        raise ValueError(
            "equity_type must be 'funds' or 'stocks', got %r"
            % (equity_type,))
    is_stock = period_volume is not None

    # Elimination threshold: half of the number of rows in the sample.
    out_threshold = ceil(period_prices.shape[0] / 2)
    kickout_list = []
    suspended_list = []

    for asset in order_book_ids:
        first_valid = period_prices.loc[:, asset].first_valid_index()
        if is_stock:
            volume = period_volume.loc[:, asset]
            volume_counts = volume.value_counts()
            has_data = first_valid is not None and not volume_counts.empty
        else:
            has_data = first_valid is not None

        if not has_data:
            kickout_list.append(asset)
        # NOTE(review): calendar days are compared with a row-count
        # threshold - confirm this is intended.
        elif ((end_date - first_valid)
              / np.timedelta64(1, 'D')) < out_threshold:
            kickout_list.append(asset)
        # NOTE(review): the design comment spoke of moving the start to a
        # *later* date, yet the test is `<`; kept as is.
        elif first_valid < reset_start_date:
            reset_start_date = first_valid
        elif is_stock and (volume.last_valid_index() < end_date
                           or volume_counts.iloc[0] >= out_threshold):
            # presumably the most frequent volume value is 0 (suspension)
            suspended_list.append(asset)

    # ST screening only applies to stocks; ids come from the ST table's own
    # labels instead of a positional mask over the price columns.
    if is_stock:
        st_days = rqdatac.is_st_stock(order_book_ids, reset_start_date,
                                      end_date).sum(axis=0)
        st_list = st_days[st_days > 0].index.tolist()
    else:
        st_list = []

    removed = set(kickout_list) | set(st_list) | set(suspended_list)
    final_kickout_list = list(removed)

    # De-duplicate while keeping the caller's ordering.
    clean_order_book_ids = [asset for asset in dict.fromkeys(order_book_ids)
                            if asset not in removed]
    clean_period_prices = period_prices.loc[reset_start_date:end_date,
                                            clean_order_book_ids]
    return clean_period_prices, final_kickout_list
def drop_st(universe, date):
    """Return the members of ``universe`` that are not flagged ST on ``date``.

    :param universe: stock identifiers passed to ``rqd.is_st_stock``
    :param date: day to query; used as both start and end date
    :return: pandas Index holding the identifiers whose ST flag is False
    :raises AssertionError: if the ST flags cannot be reduced to a Series
        (for example when no row is returned for ``date``)
    """
    flags = rqd.is_st_stock(universe, date, date)
    if isinstance(flags, pd.DataFrame) and flags.shape[0] == 1:
        # Select the single row by position: `squeeze()` would collapse a
        # 1x1 frame (one-stock universe) to a scalar and trip the check
        # below.
        is_st = flags.iloc[0]
    else:
        is_st = flags.squeeze()
    assert isinstance(is_st, pd.Series), 'is_st is not series'
    return is_st.index[~is_st]
def get_customized_factor_return(universe, date, skip_suspended=True,
                                 skip_st_stocks=True, method='implicit'):
    """
    Compute factor returns for a caller-supplied stock universe.

    PARAMETERS
    ----------
    universe: list
        order_book_ids of the user-defined stock whitelist, for example
        ['600705.XSHG', '601555.XSHG'].

    date: str
        Calculation date in "%Y-%m-%d" format (e.g. '2017-03-03'). The ST
        and volume look-ups are made for exactly this date.

    skip_suspended: boolean
        Whether to drop whitelist stocks without positive volume on
        ``date``. Defaults to True.

    skip_st_stocks: boolean
        Whether to drop whitelist stocks flagged ST on ``date``. Defaults
        to True.

    method: str
        'implicit' (default) delegates to
        ``customized_factor_return_estimation``; any other value delegates
        to ``get_explicit_factor_returns``.

    RETURN
    ----------
    factor_return:
        The value returned by the selected estimation helper for the
        filtered universe.
    """
    day_after = datetime.strptime(date, "%Y-%m-%d") + timedelta(days=1)
    latest_trading_date = str(rqdatac.get_previous_trading_date(day_after))
    previous_trading_date = str(
        rqdatac.get_previous_trading_date(latest_trading_date))

    # Both switches are equality tests against True rather than truthiness
    # tests, so only True (or a value equal to it) enables a filter.
    if skip_st_stocks == True:
        st_flags = rqdatac.is_st_stock(universe, start_date=date,
                                       end_date=date)
        st_flags.index = st_flags.index.astype(str)
        st_row = st_flags.loc[date]
        universe = st_row[st_row.values == False].index.tolist()

    if skip_suspended == True:
        volumes = rqdatac.get_price(universe,
                                    start_date=date,
                                    end_date=date,
                                    frequency='1d',
                                    fields='volume',
                                    country='cn')
        volume_row = volumes.loc[date]
        universe = volume_row[volume_row.values > 0].index.tolist()

    # Exposures of the filtered universe on the previous trading day; they
    # are computed for either method.
    factor_exposure = get_exposure(universe, str(previous_trading_date))

    if method == 'implicit':
        return customized_factor_return_estimation(date, factor_exposure,
                                                   universe)
    return get_explicit_factor_returns(date, universe)
def data_process(order_book_ids, asset_type, start_date, windows,
                 out_threshold_coefficient=None):
    """
    Clean data for covariance matrix calculation.

    :param order_book_ids: str list. A group of assets.
    :param asset_type: str. "fund" or "stock".
    :param start_date: str. The first day for backtest.
    :param windows: int. Interval length of sample.
    :param out_threshold_coefficient: float, optional. Determines the
        threshold used to filter out assets whose data is too short for the
        covariance calculation. Default: 0.5
        (out_threshold = ceil(0.5 * windows)).
    :return: DataFrame, DataFrame, Timestamp. The first DataFrame contains
        the forward-filled prices of the retained assets; the second has one
        row per elimination (index: order_book_id, column
        "Elimination Reason"); the last value is the start of the covariance
        interval, which may differ from the default.
    :raises ValueError: if ``asset_type`` is neither "fund" nor "stock".
    """
    # Strings are compared by value; `is` tests identity, not equality.
    if asset_type not in ('fund', 'stock'):
        raise ValueError(
            "asset_type must be 'fund' or 'stock', got %r" % (asset_type,))
    is_stock = asset_type == 'stock'

    end_date = pd.to_datetime(rqdatac.get_previous_trading_date(start_date))
    # Choose the start date from the window length; this cannot work if the
    # backtest start date is earlier than "2005-07-01".
    start_date = rqdatac.get_trading_dates("2005-01-01",
                                           end_date)[-windows - 1]
    reset_start_date = pd.to_datetime(start_date)

    if is_stock:
        period_data = rqdatac.get_price(order_book_ids, reset_start_date,
                                        end_date, frequency='1d',
                                        fields=['close', 'volume'])
        period_prices = period_data['close']
        period_volume = period_data['volume']
    else:
        period_prices = rqdatac.fund.get_nav(order_book_ids,
                                             reset_start_date, end_date,
                                             fields='adjusted_net_value')
        period_volume = None

    coefficient = (0.5 if out_threshold_coefficient is None
                   else out_threshold_coefficient)
    out_threshold = ceil(windows * coefficient)

    # (order_book_id, reason) pairs; the result frame is built once at the
    # end because DataFrame.append no longer exists in pandas 2.x.
    eliminated = []

    for asset in order_book_ids:
        prices = period_prices.loc[:, asset]
        first_valid = prices.first_valid_index()
        if is_stock:
            volume = period_volume.loc[:, asset]
            volume_counts = volume.value_counts()
            has_data = first_valid is not None and not volume_counts.empty
        else:
            has_data = first_valid is not None

        if not has_data:
            eliminated.append((asset, "Empty data"))
        # NOTE(review): calendar days are compared with a threshold derived
        # from a trading-day window - confirm this is intended.
        elif ((end_date - first_valid)
              / np.timedelta64(1, 'D')) < out_threshold:
            eliminated.append((asset, "Late beginning date"))
        elif prices.isnull().sum() >= out_threshold:
            eliminated.append((asset, "Missing values over threshold"))
        # NOTE(review): the design comment spoke of moving the start to a
        # *later* date, yet the test is `<`; kept as is.
        elif first_valid < reset_start_date:
            reset_start_date = first_valid
        elif is_stock and volume.last_valid_index() < end_date:
            eliminated.append((asset, "Delisted"))
        elif is_stock and volume_counts.iloc[0] >= out_threshold:
            # presumably the most frequent volume value is 0 (suspension)
            eliminated.append((asset, "Suspended days over threshold"))

    if is_stock:
        # Stocks flagged ST at least once in the (possibly reset) interval.
        # Plain ids are recorded: the former `index=[st_list]` produced a
        # one-level MultiIndex of tuples that never matched an id, so ST
        # stocks stayed in the clean list.
        st_days = rqdatac.is_st_stock(order_book_ids, reset_start_date,
                                      end_date).sum(axis=0)
        for asset in st_days[st_days > 0].index:
            eliminated.append((asset, "ST stocks"))

    removed_ids = [asset for asset, _ in eliminated]
    kickout_assets = pd.DataFrame([reason for _, reason in eliminated],
                                  index=removed_ids,
                                  columns=["Elimination Reason"])

    # Forward-fill remaining gaps (same effect as fillna(method="pad")).
    period_prices = period_prices.ffill()

    # Generate clean data and keep the original input id order.
    removed = set(removed_ids)
    clean_order_book_ids = [asset for asset in dict.fromkeys(order_book_ids)
                            if asset not in removed]
    clean_period_prices = period_prices.loc[reset_start_date:end_date,
                                            clean_order_book_ids]
    return clean_period_prices, kickout_assets, reset_start_date
def get_customized_factor_return(date, universe, options, method):
    """
    Compute factor returns for a caller-supplied stock universe.

    PARAMETERS
    ----------
    date: str
        Analysis date in "%Y-%m-%d" format.

    universe: list
        The stock pool specified by the user.

    options: dict
        Optional filters:
        'drop_st_stock' (boolean) - drop stocks flagged ST on ``date``;
        'drop_suspended_stock' (boolean) - drop stocks without positive
        volume on ``date``;
        'drop_new_stock' (int) - minimum listing age in calendar days,
        counted back from the latest trading date; when missing or None the
        latest trading date itself is the listing cut-off.

    method: str
        'implicit' delegates to ``customized_factor_return_estimation``;
        any other value (e.g. 'explicit') delegates to
        ``get_explicit_factor_returns``.

    RETURN
    ----------
    factor_return:
        The value returned by the selected estimation helper for the
        filtered stock pool.
    """
    day_after = datetime.strptime(date, "%Y-%m-%d") + timedelta(days=1)
    latest_trading_date = str(rqdatac.get_previous_trading_date(day_after))
    previous_trading_date = str(
        rqdatac.get_previous_trading_date(latest_trading_date))

    # Start from the full universe so that the later steps work even when
    # the ST filter is switched off (stock_list used to be unbound then).
    stock_list = list(universe)

    # Drop ST stocks if requested.
    if options.get('drop_st_stock'):
        is_st_df = rqdatac.is_st_stock(stock_list, start_date=date,
                                       end_date=date)
        is_st_df.index = is_st_df.index.astype(str)
        st_row = is_st_df.loc[date]
        stock_list = st_row[st_row.values == False].index.tolist()

    # Drop suspended stocks (non-positive volume) if requested.
    if options.get('drop_suspended_stock'):
        trading_volume = rqdatac.get_price(stock_list,
                                           start_date=date,
                                           end_date=date,
                                           frequency='1d',
                                           fields='volume',
                                           country='cn')
        volume_row = trading_volume.loc[date]
        stock_list = volume_row[volume_row.values > 0].index.tolist()

    # Drop newly listed stocks according to the requested listing age.
    min_listed_days = options.get('drop_new_stock')
    if min_listed_days is None:
        threshold = latest_trading_date
    else:
        cutoff = (datetime.strptime(latest_trading_date, "%Y-%m-%d")
                  - timedelta(days=int(min_listed_days)))
        # Same "%Y-%m-%d" layout as listed_date so the string comparison
        # below is a like-for-like date comparison.
        threshold = cutoff.strftime("%Y-%m-%d")
    stock_list = [
        stock for stock in stock_list
        if rqdatac.instruments(stock).listed_date <= threshold
    ]

    # Exposures of the filtered pool on the previous trading day; they are
    # computed for either method.
    factor_exposure = get_exposure(stock_list, str(previous_trading_date))

    if method == 'implicit':
        return customized_factor_return_estimation(date, factor_exposure,
                                                   stock_list)
    return get_explicit_factor_returns(date, stock_list)
def data_processing(order_book_ids, asset_type, date, window):
    """
    Preprocess data for covariance matrix estimation.

    Invalid arguments are reported on stdout and terminate the process via
    ``sys.exit(0)``.

    :param order_book_ids: str list. Selected list of assets.
    :param asset_type: str. "fund" or "stock".
    :param date: str. The date for estimation; must not be earlier than
        "2005-07-01".
    :param window: int. Length of the sample in trading days; values below
        66 are rejected.
    :return: DataFrame, str list, Timestamp.
        (1) The forward-filled prices of the retained assets;
        (2) the order_book_ids that were filtered out;
        (3) the start of the estimation interval, which may differ from the
        default start if it was reset while scanning the assets.
    """
    if date < "2005-07-01":
        print('‘错误:优化日期不能早于2005年7月1日!')
        sys.exit(0)
    # `asset_type != 'fund' and 'stock'` was true for every value other
    # than 'fund', so 'stock' was rejected as well.
    elif asset_type not in ('fund', 'stock'):
        print('错误:资产类型必须为 stock 或 fund !')
        sys.exit(0)
    elif window < 66:
        print('错误:window 必须大于66 (不少于66个交易日) !')
        sys.exit(0)
    is_stock = asset_type == 'stock'

    # The previous trading day is the end date of the estimation.
    estimation_end_date = pd.to_datetime(
        rqdatac.get_previous_trading_date(date))
    # Choose the start date from the window length; this cannot work if the
    # date is earlier than "2005-07-01".
    estimation_start_date = pd.to_datetime(
        rqdatac.get_trading_dates("2005-01-01",
                                  estimation_end_date)[-window - 1])
    # May be changed while scanning the assets below.
    reset_start_date = estimation_start_date

    if is_stock:
        period_data = rqdatac.get_price(order_book_ids,
                                        estimation_start_date,
                                        estimation_end_date,
                                        frequency='1d',
                                        fields=['close', 'volume'])
        period_prices = period_data['close']
        period_volume = period_data['volume']
    else:
        period_prices = rqdatac.fund.get_nav(order_book_ids,
                                             estimation_start_date,
                                             estimation_end_date,
                                             fields='adjusted_net_value')
        period_volume = None

    # Elimination threshold: half of the number of rows in the sample.
    out_threshold = ceil(period_prices.shape[0] / 2)
    kickout_list = []
    suspended_list = []
    st_list = []

    for asset in order_book_ids:
        prices = period_prices.loc[:, asset]
        first_valid = prices.first_valid_index()
        if is_stock:
            volume = period_volume.loc[:, asset]
            volume_counts = volume.value_counts()
            has_data = first_valid is not None and not volume_counts.empty
        else:
            has_data = first_valid is not None

        if not has_data:
            kickout_list.append(asset)
        # NOTE(review): calendar days are compared with a row-count
        # threshold - confirm this is intended.
        elif ((estimation_end_date - first_valid)
              / np.timedelta64(1, 'D')) < out_threshold:
            kickout_list.append(asset)
        elif prices.isnull().sum() >= out_threshold:
            kickout_list.append(asset)
        # NOTE(review): the design comment spoke of moving the start to a
        # *later* date, yet the test is `<`; kept as is.
        elif first_valid < reset_start_date:
            reset_start_date = first_valid
        elif is_stock and (volume.last_valid_index() < estimation_end_date
                           or volume_counts.iloc[0] >= out_threshold):
            # presumably the most frequent volume value is 0 (suspension)
            suspended_list.append(asset)

    if is_stock:
        # Stocks flagged ST at least once in the (possibly reset) interval.
        st_days = rqdatac.is_st_stock(order_book_ids, reset_start_date,
                                      estimation_end_date).sum(axis=0)
        st_list = st_days[st_days > 0].index.tolist()

    # Forward-fill remaining gaps (same effect as fillna(method="pad")).
    period_prices = period_prices.ffill()

    removed = set(kickout_list) | set(st_list) | set(suspended_list)
    final_kickout_list = list(removed)

    # Keep the original input id order.
    clean_order_book_ids = [x for x in order_book_ids if x not in removed]
    clean_period_prices = period_prices.loc[
        reset_start_date:estimation_end_date, clean_order_book_ids]
    return clean_period_prices, final_kickout_list, reset_start_date