def _dump_bin(self, file_or_data: [Path, pd.DataFrame], calendar_list: List[pd.Timestamp]):
    """Dump the data of a single instrument into its features directory.

    Parameters
    ----------
    file_or_data : Path or pd.DataFrame
        Either an already loaded frame, or the path of a source file that is
        read through ``self._get_source_data``.
    calendar_list : List[pd.Timestamp]
        Calendar forwarded unchanged to ``self._data_to_bin``. When it is
        empty, a warning is logged and nothing is written.

    Raises
    ------
    ValueError
        If ``file_or_data`` is neither a ``pd.DataFrame`` nor a ``Path``.
    """
    if not calendar_list:
        logger.warning("calendar_list is empty")
        return

    if isinstance(file_or_data, pd.DataFrame):
        # An empty frame has no first row to read the symbol from.
        if file_or_data.empty:
            return
        first_symbol = file_or_data.iloc[0][self.symbol_field_name]
        code = fname_to_code(str(first_symbol).lower())
        frame = file_or_data
    elif isinstance(file_or_data, Path):
        code = self.get_symbol_from_file(file_or_data)
        frame = self._get_source_data(file_or_data)
    else:
        raise ValueError(f"not support {type(file_or_data)}")

    if frame is None or frame.empty:
        logger.warning(f"{code} data is None or empty")
        return

    # Duplicated dates would make the later reindex raise, so drop them first.
    deduped = frame.drop_duplicates(self.date_field_name)

    # Per-instrument output directory, named after the lower-cased file name.
    target_dir = self._features_dir.joinpath(code_to_fname(code).lower())
    target_dir.mkdir(parents=True, exist_ok=True)
    self._data_to_bin(deduped, calendar_list, target_dir)
def save_instrument(self, symbol, df: pd.DataFrame):
    """Merge newly collected records for ``symbol`` into its CSV file.

    The target file is ``<save_dir>/<symbol>.csv`` where ``symbol`` has been
    passed through ``self.normalize_symbol`` and ``code_to_fname``.

    * If the file does not exist, ``df`` (minus fully duplicated rows) is
      written as a new file.
    * If the file exists, the rows of ``df`` are compared with the stored
      rows on every column of ``df`` except ``date`` (after rounding to 6
      decimals). Rows that are not already present are appended; when one
      of those rows carries a ``date`` that already occurs in the file,
      its ``date`` is replaced by ``self.updated_date``.
    * If nothing is new, the file is left untouched.

    The result is indexed by ``['date', 'period']``, sorted, and written
    with ``DataFrame.to_csv``.

    Parameters
    ----------
    symbol : str
        Instrument code.
    df : pd.DataFrame
        New records. Must contain the ``date`` and ``period`` columns. No
        emptiness check is performed here. The caller's frame is not
        modified.
    """
    symbol = self.normalize_symbol(symbol)
    symbol = code_to_fname(symbol)
    instrument_path = self.save_dir.joinpath(f"{symbol}.csv")

    if instrument_path.exists():
        _old_df = pd.read_csv(instrument_path)
        # Work on a positionally indexed copy so that the caller's frame is
        # not mutated and the labels of the new rows are 0..len(df)-1.
        df = df.reset_index(drop=True)

        # Keep only the rows that are really new: stack old and new rows and
        # drop everything that duplicates an earlier row (ignoring `date`).
        df_merge = pd.concat([_old_df, df], sort=False).round(6)
        duplicate_cols = df.columns.drop('date')
        df_merge = df_merge.drop_duplicates(subset=duplicate_cols)

        if len(df_merge) == len(_old_df):
            # Nothing new for this instrument.
            return
        elif len(df_merge) < len(_old_df):
            # The stored file itself contains duplicated records: keep a
            # backup of it and rewrite the de-duplicated old data.
            logger.error(f'please check {symbol} data!')
            instrument_path.replace(
                instrument_path.with_suffix('.bak.csv'))
            df = _old_df.drop_duplicates()
        else:
            # The data needs updating. When updating several times on the
            # same day, `date` and `period` may repeat, and data published
            # on the same day may also change. Feature reading takes, in
            # principle, the latest data, so usage is not affected.
            # Update before making decisions; otherwise updating once a day
            # at a fixed time (e.g. 6 pm) is enough.
            logger.info(f'update {symbol} new recorder ......')
            # NOTE(review): this slicing assumes every old row survived the
            # de-duplication above, so that all entries past len(_old_df)
            # are labels of new rows -- confirm for files with duplicates.
            updated_index = df_merge.index[len(_old_df):]
            updated_df = df.reindex(updated_index)

            # Keep the record's own publication date unless that date is
            # already present in the stored data; in that case stamp the
            # row with `self.updated_date`. The lookup set is built once
            # instead of rebuilding a list for every row.
            old_dates = set(_old_df['date'].to_list())
            updated_df['date'] = updated_df['date'].apply(
                lambda x: self.updated_date if x in old_dates else x)
            df = pd.concat([_old_df, updated_df])
    else:
        logger.info(f'update new stock {symbol} ......')
        df = df.drop_duplicates()

    df = df.set_index(['date', 'period']).sort_index()
    df.to_csv(instrument_path)
def save_instrument(self, symbol, df: pd.DataFrame):
    """Save stock data to file, appending to an existing file if present.

    The target file is ``<save_dir>/<symbol>.csv`` where ``symbol`` has been
    passed through ``self.normalize_symbol`` and ``code_to_fname``.

    Parameters
    ----------
    symbol : str
        stock code
    df : pd.DataFrame
        Records to store. The original contract states that ``df.columns``
        must contain "symbol" and "datetime"; this method itself only
        (over)writes the "symbol" column, in place, with the normalized
        symbol. When ``df`` is ``None`` or empty a warning is logged and
        nothing is written.
    """
    if df is None or df.empty:
        logger.warning(f"{symbol} is empty")
        return

    symbol = self.normalize_symbol(symbol)
    symbol = code_to_fname(symbol)
    stock_path = self.save_dir.joinpath(f"{symbol}.csv")
    df["symbol"] = symbol
    if stock_path.exists():
        _old_df = pd.read_csv(stock_path)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent (old rows first, original index labels kept).
        df = pd.concat([_old_df, df], sort=False)
    df.to_csv(stock_path, index=False)
def normalize_symbol(self, symbol):
    """Return ``code_to_fname(symbol)`` converted to upper case."""
    file_name = code_to_fname(symbol)
    return file_name.upper()