def fetch_peindex(**kwargs):
    """Fetch PE index values, sorted by index id and descending date.

    Args:
        **kwargs:
            statistic_date:
            iids:
            freq:
            use_tmstmp: when True, convert `statistic_date` to unix timestamps
            conn:

    Returns:
        pandas.DataFrame
    """
    keys = ("statistic_date", "iids", "freq", "conn")
    statistic_date, iids, freq, conn = meta.get_kwargs_used(keys, **kwargs)

    frame = pd.read_sql(SQL_USED.pe_index(statistic_date, iids, freq), conn)
    if kwargs.get("use_tmstmp") is True:
        frame["statistic_date"] = frame["statistic_date"].apply(
            lambda d: dt.datetime(d.year, d.month, d.day).timestamp())
    frame.sort_values(by=["index_id", "statistic_date"],
                      ascending=[True, False],
                      inplace=True)
    frame.index = range(len(frame))
    return frame
def fetch_found_date(**kwargs):
    """Fetch the foundation date for each fund/org id.

    Args:
        **kwargs:
            iids:
            id_type: "fund" or "org"
            use_tmstmp: when True, convert dates to unix timestamps
            conn:

    Returns:
        dict {id: foundation date (or timestamp when use_tmstmp is True)}

    Raises:
        ValueError: if `id_type` is neither "fund" nor "org".
    """
    kw_used = ("iids", "id_type", "conn")
    iids, id_type, conn = meta.get_kwargs_used(kw_used, **kwargs)

    # Fail fast on an unsupported id_type: the original fell through both
    # branches and later raised an obscure NameError on the undefined `k1`.
    if id_type == "fund":
        k1 = "fund_id"
    elif id_type == "org":
        k1 = "org_id"
    else:
        raise ValueError(
            "id_type must be 'fund' or 'org', got %r" % (id_type,))

    sql_foundation_date = SQL_USED.foundation_date(ids=iids, id_type=id_type)
    d_foundation_date = pd.read_sql(sql_foundation_date, conn)
    if kwargs.get("use_tmstmp") is True:
        d_foundation_date["t_min"] = d_foundation_date["t_min"].apply(
            lambda x: dt.datetime(x.year, x.month, x.day).timestamp())
    return dict(zip(d_foundation_date[k1], d_foundation_date["t_min"]))
def fetch_fundname(**kwargs):
    """Fetch fund display names.

    Args:
        **kwargs:
            iids:
            level:
            conn:

    Returns:
        dict {fund_id: fund_name}
    """
    iids, level, conn = meta.get_kwargs_used(("iids", "level", "conn"),
                                             **kwargs)
    frame = pd.read_sql(SQL_USED.fund_name(iids, level=level), con=conn)
    return dict(zip(frame["fund_id"], frame["fund_name"]))
def fetch_marketindex(**kwargs):
    """Fetch market benchmark indexes and melt them into long (tidy) format.

    Args:
        **kwargs:
            statistic_date:
            iids: one index id (str) or an iterable of ids
            transform: optional dict {column: callable} applied element-wise
            use_tmstmp: when True, convert `statistic_date` to unix timestamps
            conn:

    Returns:
        pandas.DataFrame with columns ["index_id", "index_value",
        "statistic_date"], sorted by index id and descending date.
    """
    kw_used = ("statistic_date", "iids", "transform", "conn")
    # NOTE: renamed the local from `apply` to `transform` — the old name
    # shadowed the builtin `apply` (and matched the kwarg name anyway).
    statistic_date, iids, transform, conn = meta.get_kwargs_used(
        kw_used, **kwargs)
    if isinstance(iids, str):  # allow a single bare id
        iids = [iids]

    sql_bms = SQL_USED.market_index(statistic_date, iids, whole=True)
    df_bms = pd.read_sql(sql_bms, conn)
    if "y1_treasury_rate" in iids:
        # Sparse series: backfill gaps. `.bfill()` replaces the deprecated
        # fillna(method="backfill") (removed in pandas 2.x).
        df_bms.loc[:, "y1_treasury_rate"] = df_bms["y1_treasury_rate"].bfill()
    if transform is not None:
        for col, func in transform.items():
            df_bms[col] = df_bms[col].apply(func)

    # Melt the wide frame: every column except the trailing statistic_date
    # becomes one (index_id, index_value, statistic_date) slice.
    parts = []
    for col in df_bms.columns[:-1]:
        tmp = df_bms.loc[:, [col, "statistic_date"]]
        tmp.loc[:, "index_id"] = col
        tmp.columns = ["index_value", "statistic_date", "index_id"]
        parts.append(tmp.dropna())
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # appending in a loop (also avoids quadratic copying).
    if parts:
        result = pd.concat(parts, ignore_index=True)
    else:
        result = pd.DataFrame(
            columns=["index_value", "statistic_date", "index_id"])
    result = result[["index_id", "index_value", "statistic_date"]]

    if kwargs.get("use_tmstmp") is True:
        result.loc[:, "statistic_date"] = result["statistic_date"].apply(
            lambda x: dt.datetime(x.year, x.month, x.day).timestamp())
    result.sort_values(by=["index_id", "statistic_date"],
                       ascending=[True, False],
                       inplace=True)
    result.index = range(len(result))
    return result
def fetch_firstdate_used(**kwargs):
    """Resolve the effective first date per id, repairing bad foundation dates.

    Foundation dates that are missing, or implausibly later than `check_date`,
    are replaced by the fund's first net-value date.

    Args:
        **kwargs:
            iids:
            id_type:
            use_tmstmp:
            check_date:
            conn:

    Returns:
        dict {id: first date (or timestamp when use_tmstmp is True)}
    """
    kw_used = ("iids", "id_type", "use_tmstmp", "check_date", "conn")
    iids, id_type, use_tmstmp, check_date, conn = meta.get_kwargs_used(
        kw_used, **kwargs)
    check_date = tu.universal_time(check_date)[1]
    error_list = []
    first_date = fetch_found_date(iids=iids,
                                  id_type=id_type,
                                  use_tmstmp=use_tmstmp,
                                  conn=conn)
    # A foundation date later than check_date is treated as wrong data.
    for iid, date in first_date.items():
        if date > check_date:
            error_list.append(iid)
    if len(error_list) > 0:
        print(
            "{num_error} of {num_total} funds ({err_funds}) foundation date may be wrong, using first nv date instead..."
            .format(num_error=len(error_list),
                    num_total=len(iids),
                    err_funds=error_list))
    # if len(first_date) != len(iids):
    # Erroneous first dates are the union of two cases:
    # 1) the first date is missing; 2) the first date is wrong (> check_date).
    if (len(first_date) - len(error_list)) != len(iids):
        if id_type == "fund":
            print(
                "{num_missing} of {num_total} funds' foundation date missing, using first nv date instead..."
                .format(num_missing=len(set(iids) - first_date.keys()),
                        num_total=len(iids)))
            # NOTE(review): `-` binds tighter than `|`, so this is
            # (missing ids) | (wrong-date ids). The fallback fetch is assumed
            # to apply only when id_type == "fund" (fetch_firstnv_date queries
            # fund net values) — confirm against the original layout.
            firstnv_date = fetch_firstnv_date(
                iids=(set(iids) - first_date.keys() | set(error_list)),
                use_tmstmp=use_tmstmp,
                conn=conn)
            first_date.update(firstnv_date)
    return first_date
def fetch_fids_used(**kwargs):
    """Fetch ids of funds updated at the given statistic date / frequency.

    Args:
        **kwargs:
            statistic_date:
            freq:
            conn:

    Returns:
        list
    """
    keys = ("statistic_date", "freq", "conn")
    statistic_date, freq, conn = meta.get_kwargs_used(keys, **kwargs)
    query = SQL_USED.ids_updated_sd(statistic_date, freq)
    return [row[0] for row in conn.execute(query)]
def gen_stdseries(**kwargs):
    """Generate standard time series of various intervals for aligning.

    Args:
        **kwargs:
            date_s:
            intervals:
            freq:
            extend: None, or dict {interval: extra sample number}
            weekday: optional, default False

    Returns:
        dict {interval: standard time series} ("whole" is skipped)
    """
    kw_used = ("date_s", "intervals", "freq", "extend")
    date_s, intervals, freq, extend = meta.get_kwargs_used(kw_used, **kwargs)

    weekday = kwargs.get("weekday", False)
    periods = _periods_of_freq[freq]

    series = {}
    for interval in intervals:
        if interval == "whole":
            continue
        if extend is None:
            # extend=1 * int(freq == "d" and interval != "m"),
            # Daily frequency takes one extra point for the month-to-date
            # interval; hence the inverted condition here.
            n_extend = 1 + 1 * int(not (freq == "d" and interval == "m"))
        else:
            # "Year-to-date" intervals and funds with >= 1 year of history
            # need an extra month of data for the out-of-interval search.
            n_extend = (1 + extend.get(interval, 0) +
                        1 * int(freq == "d" and interval != "m"))
        series[interval] = tu.timeseries_std(date_s,
                                             interval,
                                             periods,
                                             extend=n_extend,
                                             use_lastday=True,
                                             keep_firstday=True,
                                             weekday=weekday)
    return series
def gen_stdseries_longest(**kwargs):
    """Build, per id, the longest standard time series from start to end date.

    Args:
        **kwargs:
            iids: collections.Iterable or dict {id: attributes}
            key_tm: str, key of the time series inside each id's attributes
            date_s: datetime.date/datetime/float, or None (per-id max of key_tm)
            date_e: datetime.date/datetime/float, or None (per-id min of key_tm)
            freq: str
            weekday: optional, default False

    Returns:
        dict {id: standard time series}
    """
    kw_used = ("iids", "key_tm", "date_s", "date_e", "freq")
    iids, key_tm, date_s, date_e, freq = meta.get_kwargs_used(
        kw_used, **kwargs)

    def _as_mapping(value, reducer):
        # A scalar date applies to every id; None derives it per id from the
        # id's own time series; a dict passes through unchanged.
        if isinstance(value, (dt.date, dt.datetime, float)):
            return dict.fromkeys(iids, value)
        if value is None:
            return {
                iid: reducer(attrs[key_tm])
                for iid, attrs in iids.items()
            }
        return value

    date_s = _as_mapping(date_s, max)
    date_e = _as_mapping(date_e, min)

    weekday = kwargs.get("weekday", False)
    periods = _periods_of_freq.get(freq)
    return {
        iid: tu.timeseries_std_se(date_s.get(iid),
                                  date_e.get(iid),
                                  periods_y=periods,
                                  weekday=weekday)
        for iid in iids
    }
def fetch_fundnv(**kwargs):
    """Fetch fund net values, optionally querying id chunks concurrently.

    Args:
        **kwargs:
            iids:
            processes: optional chunk count; presence enables threaded fetch
            use_tmstmp: when True, convert `statistic_date` to unix timestamps
            conn:

    Returns:
        pandas.DataFrame sorted by fund id and descending statistic date.
    """
    kw_used = ("iids", "conn")
    iids, conn = meta.get_kwargs_used(kw_used, **kwargs)

    if "processes" in kwargs:
        # Split ids into chunks and run the queries on a thread pool.
        _threadpool = ThreadPool(processes=_poolsize.get("thread"))
        try:
            tasks = [
                SQL_USED.nav(x)
                for x in _get_chunk(iids, kwargs.get("processes"))
            ]
            my_func = functools.partial(pd.read_sql, con=conn)
            results = _threadpool.map(my_func, tasks)
        finally:
            # Close the pool even if a query raises (the original leaked it).
            _threadpool.close()
        # DataFrame.append was removed in pandas 2.0; concatenate once
        # instead of appending in a loop (also avoids quadratic copying).
        result = pd.concat(results, ignore_index=True)
    else:
        sql_fund_nv = SQL_USED.nav(iids)
        result = pd.read_sql(sql_fund_nv, con=conn)

    if kwargs.get("use_tmstmp") is True:
        result["statistic_date"] = result["statistic_date"].apply(
            lambda x: dt.datetime(x.year, x.month, x.day).timestamp())
    result.sort_values(by=["fund_id", "statistic_date"],
                       ascending=[True, False],
                       inplace=True)
    result.index = range(len(result))
    return result
def fetch_fundtype(**kwargs):
    """Fetch the type code of each fund along one classification dimension.

    Args:
        **kwargs:
            iids:
            dimension: {"strategy", "structure", "target", "issuance"}
            level: 1
            conn:

    Returns:
        dict {fund_id: code}; empty dict when no SQL is generated.
    """
    kw_used = ("iids", "dimension", "level", "conn")
    iids, dimension, level, conn = meta.get_kwargs_used(kw_used, **kwargs)

    query = SQL_USED.fund_type(iids, dimension, level)
    if query is None:  # dimension/level combination not supported
        return {}
    frame = pd.read_sql(query, con=conn)
    return dict(zip(frame["fund_id"], frame["code"]))
def fetch_firstnv_date(**kwargs):
    """Fetch each fund's earliest net-value date.

    Args:
        **kwargs:
            iids:
            use_tmstmp: when True, convert dates to unix timestamps
            conn:

    Returns:
        dict {fund_id: first nv date (or timestamp when use_tmstmp is True)}
    """
    iids, conn = meta.get_kwargs_used(("iids", "conn"), **kwargs)
    frame = pd.read_sql(SQL_USED.firstnv_date(iids), con=conn)
    if kwargs.get("use_tmstmp") is True:
        frame["t_min"] = frame["t_min"].apply(
            lambda d: dt.datetime(d.year, d.month, d.day).timestamp())
    return dict(zip(frame["fund_id"], frame["t_min"]))
def match_by_std(obj, **kwargs):
    """
    Match objects with time series to standard time series, and apply the
    strategy to its other attributes.

    Args:
        obj: dict
            Dict like {id: {key1: Iterable, key2: Iterable, ...}};
        **kwargs:
            key_tm: str
                Key of the time series;
            key_used: Iterable<str>
                keys to match to standard time series;
            date_s: datetime.date, datetime.datetime, or float
                Statistic date(or the start date) of the standard time series;
            date_e: datetime.date, datetime.datetime, float, or dict
                Earliest date(or the end date) of the standard time series. If
                a dict is passed, then it should be {id: date}_like and its ids
                should be the same as the `obj` length;
            intervals: Iterable
                Interval of the standard time series to match, optional
                {1, 3, 6, 12, 24, 36, 60, "w", "m", "q", "a", "whole"};
            freq: str
                Frequency of the standard time series, optional {"w", "m"};
            extend: int, or dict
                Extra sample number of `interval` to use. If an int is parsed,
                then all intervals in `interval` will use this int as the extra
                sample number, else if an dict like {interval: extra_num} is
                parsed, then the specified interval will use the given extra
                number. Default None;
            shift: dict
                Dict like {id: shift_num} to specified ids which need to be
                shifted on its match case;
            apply: dict
                Dict like {id: func}

    Returns:
        dict like {id: {key_used:{interval: Iterable}}}
    """
    kw_used = ("key_tm", "key_used", "date_s", "date_e", "intervals", "freq",
               "shift", "extend", "apply")
    key_tm, key_used, date_s, date_e, intervals, freq, shift, extend, apply = meta.get_kwargs_used(
        kw_used, **kwargs)

    # Normalize date_e to a per-id mapping of unix timestamps.
    if isinstance(date_e, (dt.date, dt.datetime, float)):
        date_e = dict.fromkeys(obj.keys(), tu.universal_time(date_e)[1])
    elif date_e is None:
        date_e = {
            iid: tu.universal_time(min(attributes[key_tm]))[1]
            for iid, attributes in obj.items()
        }

    # Per-id longest standard series, plus one standard series per interval.
    tm_series_std_alls = gen_stdseries_longest(iids=obj,
                                               key_tm=key_tm,
                                               date_s=date_s,
                                               date_e=date_e,
                                               freq=freq,
                                               weekday=kwargs.get(
                                                   "weekday", False))
    tm_series_std = gen_stdseries(date_s=date_s,
                                  freq=freq,
                                  intervals=intervals,
                                  extend=extend,
                                  weekday=kwargs.get("weekday", False))
    sample_nums = {
        interval: len(tm_serie_std)
        for interval, tm_serie_std in tm_series_std.items()
    }

    # Index-match each object's own time series to the standard series.
    # NOTE(review): `matchs_w` is only defined for freq "w"/"d"; its use
    # below is guarded by the same condition.
    if freq == "w" or freq == "d":
        matchs_whole = {
            iid: tu.outer_match4indicator_w(attributes[key_tm],
                                            tm_series_std_alls[iid], False)[1]
            for iid, attributes in obj.items()
        }
        matchs_w = {
            iid: tu.outer_match4indicator_w(attributes[key_tm],
                                            tm_series_std["w"])[1]
            for iid, attributes in obj.items()
        }
    elif freq == "m":
        matchs_whole = {
            iid: tu.outer_match4indicator_m(attributes[key_tm],
                                            tm_series_std_alls[iid], False)[1]
            for iid, attributes in obj.items()
        }

    # The shared {} placeholder is replaced per id in the loop below.
    result = dict.fromkeys(obj.keys(), {})
    if shift is None:
        shift = {}

    # match for each object
    date_s_dt = dt.date.fromtimestamp(tu.universal_time(date_s)[1])
    # intervals_regular = [interval for interval in intervals if interval not in ("w", "a", "whole")]
    intervals_regular = [
        interval for interval in intervals if interval not in ("w", "whole")
    ]
    for iid, attributes in obj.items():
        shift_iid = shift.get(iid, 0)
        result[iid] = dict.fromkeys(key_used, {})
        date_e_iid = dt.date.fromtimestamp(date_e[iid])
        for key in result[iid].keys():
            freq_of_key = {}
            # "whole": attribute values aligned to the longest standard
            # series; unmatched slots become None.
            freq_of_key["whole"] = [
                attributes[key][idx] if idx is not None else None
                for idx in matchs_whole[iid].values()
            ]
            if apply is not None and iid in apply:
                freq_of_key["whole"] = apply[iid](freq_of_key["whole"])
            if shift_iid > 0:
                freq_of_key["whole"] = freq_of_key["whole"][shift_iid:]

            # Decide, from each fund's foundation date, which intervals can
            # be computed at all.
            interval_used = _check_intervals(date_s_dt, date_e_iid, intervals)
            length_max = len(freq_of_key["whole"])
            for interval in intervals_regular:
                if interval_used[interval]:
                    sp_num = sample_nums[interval] - shift_iid - 1
                    # sp_num = sample_nums[interval] - 1
                    freq_of_key[interval] = freq_of_key["whole"][:sp_num]
                    if sp_num > length_max:
                        # Pad short series with None up to the sample number.
                        freq_of_key[interval].extend(
                            [None] * (sp_num - length_max))
                else:
                    freq_of_key[interval] = None
            # freq_of_key["a"] = [attributes[key][idx] if idx is not None else None for idx in matchs_a[iid].values()]
            if freq == "w" or freq == "d":
                freq_of_key["w"] = [
                    attributes[key][idx] if idx is not None else None
                    for idx in matchs_w[iid].values()
                ]
            result[iid][key] = freq_of_key
    return result