def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
    """Pivot long-format raw data into a Panel aligned to ``dts`` and ``ids``.

    ``raw_data`` is re-indexed by its "日期" and "ID" columns and every
    remaining column is unstacked into its own frame.  Columns whose
    "DataType" metadata equals "double" are cast to float.  The resulting
    major-axis labels are parsed from "%Y%m%d" strings into datetimes.

    Args:
        raw_data: Long-format frame containing "日期" and "ID" columns plus
            one column per factor.
        factor_names: Factor names used as the Panel items.
        ids: Labels selected on the minor axis.
        dts: Labels selected on the major axis.
        args: Optional overrides; "回溯天数" replaces ``self.LookBack``.

    Returns:
        A ``pd.Panel`` restricted to ``dts`` on the major axis.  An all-empty
        Panel is returned when ``raw_data`` has no rows or none of ``ids``
        appear in the data.
    """
    if raw_data.shape[0] == 0:
        return pd.Panel(items=factor_names, major_axis=dts, minor_axis=ids)

    indexed = raw_data.set_index(["日期", "ID"])
    data_types = self.getFactorMetaData(factor_names=factor_names, key="DataType")

    frames = {}
    for name in indexed.columns:
        frame = indexed[name].unstack()
        frames[name] = frame.astype("float") if data_types[name] == "double" else frame

    panel = pd.Panel(frames).loc[factor_names]
    panel.major_axis = [
        dt.datetime.strptime(label, "%Y%m%d") for label in panel.major_axis
    ]

    # None of the requested IDs are present: hand back an empty Panel.
    if panel.minor_axis.intersection(ids).shape[0] == 0:
        return pd.Panel(items=factor_names, major_axis=dts, minor_axis=ids)

    look_back = args.get("回溯天数", self.LookBack)
    if look_back == 0:
        return panel.loc[:, dts, ids]

    # Add the requested time points to the axis so they can receive fills.
    merged_dts = panel.major_axis.union(dts).sort_values()
    panel = panel.loc[:, merged_dts, ids]

    # Look-back scaled by 24 * 3600 (presumably days -> seconds; confirm
    # against fillNaByLookback).
    max_gap = look_back * 24.0 * 3600
    for pos, _ in enumerate(panel.items):
        panel.iloc[pos] = fillNaByLookback(panel.iloc[pos], lookback=max_gap)
    return panel.loc[:, dts]
def _adjustData(data, look_back, factor_names, ids, dts):
    """Build a Panel from ``data`` and fill gaps according to ``look_back``.

    Args:
        data: Input accepted by ``pd.Panel``.
        look_back: ``0`` disables filling, an infinite value pads forward
            without limit, any other value is scaled by 24 * 3600 and passed
            to ``fillNaByLookback`` as the ``lookback`` limit.
        factor_names: Items to select from the Panel.
        ids: Minor-axis labels to select, or ``None`` to keep all of them.
        dts: Major-axis labels to return, or ``None`` to keep the axis as is.

    Returns:
        The adjusted ``pd.Panel``.
    """
    minor_selector = slice(None) if ids is None else ids
    panel = pd.Panel(data).loc[factor_names, :, minor_selector]

    if look_back == 0:
        return panel if dts is None else panel.loc[:, dts]

    if dts is not None:
        # Add the requested time points to the axis so they can receive fills.
        merged_dts = panel.major_axis.union(dts).sort_values()
        panel = panel.loc[:, merged_dts, :]

    if np.isinf(look_back):
        for pos, _ in enumerate(panel.items):
            panel.iloc[pos].fillna(method="pad", inplace=True)
    else:
        max_gap = look_back * 24.0 * 3600
        filled = {
            name: fillNaByLookback(frame, lookback=max_gap)
            for name, frame in dict(panel).items()
        }
        panel = pd.Panel(filled).loc[factor_names]

    if dts is None:
        return panel
    return panel.loc[:, dts]
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
    """Aggregate long-format raw data per (date, ID) into a Panel.

    ``raw_data`` is re-indexed by its "日期" and "ID" columns.  For each
    factor the values are grouped on both index levels, reduced with the
    operator, and unstacked into a frame.  Major-axis labels are parsed from
    "%Y%m%d" strings into datetimes.

    Args:
        raw_data: Long-format frame containing "日期" and "ID" columns plus
            one column per factor.
        factor_names: Factor names used as the Panel items.
        ids: Labels selected on the minor axis.
        dts: Labels selected on the major axis.
        args: Optional overrides; "算子" replaces ``self.Operator`` and
            "回溯天数" replaces ``self.LookBack``.

    Returns:
        A ``pd.Panel`` restricted to ``dts`` on the major axis, or an
        all-empty Panel when ``raw_data`` has no rows.
    """
    if raw_data.shape[0] == 0:
        return pd.Panel(items=factor_names, major_axis=dts, minor_axis=ids)

    indexed = raw_data.set_index(["日期", "ID"])

    reducer = args.get("算子", self.Operator)
    if reducer is None:
        # Without an operator, each group's values are collected into a list.
        def reducer(group):
            return group.tolist()

    frames = {
        name: indexed[name].groupby(axis=0, level=[0, 1]).apply(reducer).unstack()
        for name in factor_names
    }
    panel = pd.Panel(frames).loc[factor_names, :, ids]
    panel.major_axis = [
        dt.datetime.strptime(label, "%Y%m%d") for label in panel.major_axis
    ]

    look_back = args.get("回溯天数", self.LookBack)
    if look_back == 0:
        return panel.loc[:, dts, ids]

    # Add the requested time points to the axis so they can receive fills.
    merged_dts = panel.major_axis.union(dts).sort_values()
    panel = panel.loc[:, merged_dts, ids]

    # Look-back scaled by 24 * 3600 (presumably days -> seconds; confirm
    # against fillNaByLookback).
    max_gap = look_back * 24.0 * 3600
    for pos, _ in enumerate(panel.items):
        panel.iloc[pos] = fillNaByLookback(panel.iloc[pos], lookback=max_gap)
    return panel.loc[:, dts]
def adjustDataDTID(data, look_back, factor_names, ids, dts, only_start_lookback=False, only_lookback_nontarget=False, only_lookback_dt=False, logger=None):
    """Align a Panel to ``dts`` / ``ids`` and fill gaps by looking back.

    Args:
        data: ``pd.Panel`` of factor data (items x time points x IDs).
        look_back: Look-back window.  ``0`` disables filling; an infinite
            value pads forward without limit when neither
            ``only_lookback_nontarget`` nor ``only_lookback_dt`` is set;
            otherwise the value is scaled by 24 * 3600 before being passed
            to ``fillNaByLookback`` (presumably days -> seconds; confirm
            against that helper).
        factor_names: Items of the returned Panel.
        ids: Minor-axis labels to select.
        dts: Major-axis labels to return.
        only_start_lookback: Fill only at the first requested time point,
            using data up to and including ``dts[0]``.
        only_lookback_nontarget: Build per-row limits so that target rows
            are filled only from rows that are not themselves targets.
        only_lookback_dt: Restrict the targets to requested time points
            that are absent from ``data.major_axis``.
        logger: Optional logger; receives a warning when the requested
            labels are not found while ``look_back`` is ``0``.

    Returns:
        A ``pd.Panel`` restricted to ``dts`` on the major axis.
    """
    if look_back == 0:
        try:
            return data.loc[:, dts, ids]
        except KeyError:
            if logger is not None:
                logger.warning("待提取的因子 %s 数据超出了原始数据的时点或 ID 范围, 将填充缺失值!" % (str(list(data.items)), ))
            return pd.Panel(items=factor_names, major_axis=dts, minor_axis=ids)

    # Add the requested time points to the axis so they can receive fills.
    AllDTs = data.major_axis.union(dts).sort_values()
    AdjData = data.loc[:, AllDTs, ids]

    if only_start_lookback:
        # Only back-fill missing values at the starting time point.
        AllAdjData = AdjData
        AdjData = AllAdjData.loc[:, :dts[0], :]
        TargetDTs = dts[:1]
    else:
        TargetDTs = dts
    if only_lookback_dt:
        TargetDTs = sorted(set(TargetDTs).difference(data.major_axis))

    # A length check (rather than truthiness) also accepts array-like dts.
    if len(TargetDTs) > 0:
        Limits = look_back * 24.0 * 3600
        if only_lookback_nontarget:
            # Only use data from non-target time points for back-filling.
            # Mask flags the target rows.
            Mask = pd.Series(np.full(shape=(AdjData.shape[1], ), fill_value=False, dtype=bool), index=AdjData.major_axis)
            Mask[TargetDTs] = True
            # FillMask keeps only targets that directly follow a non-target row.
            FillMask = Mask.copy()
            FillMask[Mask.astype("int").diff() != 1] = False
            # Gap in days between consecutive rows (0 for the first row).
            TimeDelta = pd.Series(np.r_[0, np.diff(Mask.index.values) / np.timedelta64(1, "D")], index=Mask.index)
            # Zero the gap on targets that follow another target and on
            # non-target rows that directly follow a target.
            TimeDelta[(Mask & (~FillMask)) | (Mask.astype("int").diff() == -1)] = 0
            # Turn the running total at each target into per-target increments,
            # keeping the first target's total as its own value.
            TimeDelta = TimeDelta.cumsum().loc[TargetDTs]
            FirstDelta = TimeDelta.iloc[0]
            TimeDelta = TimeDelta.diff().fillna(value=0)
            TimeDelta.iloc[0] = FirstDelta
            # Per-target limit, capped by the global limit and repeated for every ID.
            NewLimits = np.minimum(TimeDelta.values * 24.0 * 3600, Limits).reshape((TimeDelta.shape[0], 1)).repeat(AdjData.shape[2], axis=1)
            # Rows that are not targets keep a limit of 0.
            Limits = pd.DataFrame(0, index=AdjData.major_axis, columns=AdjData.minor_axis)
            Limits.loc[TargetDTs, :] = NewLimits
        if only_lookback_dt:
            Mask = pd.Series(np.full(shape=(AdjData.shape[1], ), fill_value=False, dtype=bool), index=AdjData.major_axis)
            Mask[TargetDTs] = True
            FillMask = Mask.copy()
            FillMask[Mask.astype("int").diff() != 1] = False
            FillMask = FillMask.loc[TargetDTs]
            # Gap in days between each target and the row before it.
            TimeDelta = pd.Series(np.r_[0, np.diff(Mask.index.values) / np.timedelta64(1, "D")], index=Mask.index).loc[TargetDTs]
            NewLimits = TimeDelta.cumsum().loc[TargetDTs]
            # Temp carries forward the running total from the most recent
            # target that directly follows a non-target row.
            Temp = NewLimits.copy()
            Temp[~FillMask] = np.nan
            Temp = Temp.fillna(method="pad")
            TimeDelta[~FillMask] = np.nan
            NewLimits = NewLimits - Temp + TimeDelta.fillna(method="pad")
            if isinstance(Limits, pd.DataFrame):
                # NewLimits is in days while the Limits frame was built from
                # values scaled by 24 * 3600, so scale before comparing.
                Limits.loc[TargetDTs, :] = np.minimum(
                    (NewLimits.values * 24.0 * 3600).reshape((NewLimits.shape[0], 1)).repeat(AdjData.shape[2], axis=1),
                    Limits.loc[TargetDTs].values)
            else:
                NewLimits = np.minimum(NewLimits.values * 24.0 * 3600, Limits).reshape((NewLimits.shape[0], 1)).repeat(AdjData.shape[2], axis=1)
                Limits = pd.DataFrame(0, index=AdjData.major_axis, columns=AdjData.minor_axis)
                Limits.loc[TargetDTs, :] = NewLimits
        if np.isinf(look_back) and (not only_lookback_nontarget) and (not only_lookback_dt):
            # Unlimited look-back with no per-row limits: plain forward fill.
            for i, _ in enumerate(AdjData.items):
                AdjData.iloc[i].fillna(method="pad", inplace=True)
        else:
            AdjData = dict(AdjData)
            for iFactorName in AdjData:
                AdjData[iFactorName] = fillNaByLookback(AdjData[iFactorName], lookback=Limits)
            AdjData = pd.Panel(AdjData).loc[factor_names]

    if only_start_lookback:
        # Write the filled starting row back into the full-range Panel.
        AllAdjData.loc[:, dts[0], :] = AdjData.loc[:, dts[0], :]
        return AllAdjData.loc[:, dts]
    else:
        return AdjData.loc[:, dts]