import statistics as st
import sys
from abc import ABC, abstractmethod
from multiprocessing import Pool
from os import getpid
from typing import Dict

import numpy as np
import pandas as pd
import powerlaw
import scipy.stats
from powerlaw import Fit

# NOTE: the project-local names used below -- Dist (distribution-name enum),
#       Returns, Results, and plpva (KS p-value routine) -- are assumed to be
#       importable from elsewhere in this repo; their module paths are not
#       shown in this section.


def distribution_compare_dict(fit: powerlaw.Fit) -> Dict[str, float]:
    """Compose a dict of length distribution fit comparisons."""
    compare_dict = dict()
    for dist_enum_pairs in [
        (Dist.POWERLAW, Dist.LOGNORMAL),
        (Dist.POWERLAW, Dist.EXPONENTIAL),
        (Dist.LOGNORMAL, Dist.EXPONENTIAL),
        (Dist.POWERLAW, Dist.TRUNCATED_POWERLAW),
    ]:
        first, second = dist_enum_pairs[0].value, dist_enum_pairs[1].value
        r, p = fit.distribution_compare(first, second, normalized_ratio=True)
        compare_dict[f"{first} vs. {second} R"] = r
        compare_dict[f"{first} vs. {second} p"] = p
    return compare_dict
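
# Usage sketch (illustrative only, not executed on import). Assumes the Dist
# enum members carry the distribution-name strings that powerlaw expects
# (e.g. Dist.POWERLAW.value == "power_law"):
#
#   lengths = np.random.default_rng(0).pareto(1.8, 500) + 1.0
#   fit = powerlaw.Fit(lengths)
#   distribution_compare_dict(fit)
#   # -> {"power_law vs. lognormal R": ...,
#   #     "power_law vs. lognormal p": ..., ...}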
class _Analyzer(ABC):

    def __init__(self, settings):
        self.sc = settings.ctrl
        self.sd = settings.data
        self.sa = settings.anal

        # TODO: factor setting of these boolean flags into own method
        self._use_pct_file = False  # default when no xmin-by-file map given
        if self.sa.txmin_map:
            self._use_pct_file = any('PCT' in col_hdr for col_hdr
                                     in self.sa.txmin_map.values())

        self.rtn = Returns(settings)
        self.res = Results(settings)

        self._distros_to_compare = {'tpl': 'truncated_power_law',
                                    'exp': 'exponential',
                                    'lgn': 'lognormal'}

    # # # iteration state DEPENDENT (or aware) methods # # #

    def _log_curr_iter(self):
        # TODO: factor out repetitive log? (static: date, dynamic: group_label)
        gtyp, *date, tail = self.curr_iter_id
        grp_tail_log = (f"Analyzing {tail.name.upper()} tail of time series "
                        f"for {self.sd.grouping_type.title()} '{gtyp}' ")
        if date:  # dynamic approach
            df = date[0]
            di = self.sa.get_dyn_lbd(df)
            # NOTE: di above is 1st date w/ price, not 1st date w/ return
        else:  # static approach
            di, df = self.sd.date_i, self.sd.date_f
        date_log = f"b/w [{di}, {df}]"
        print(grp_tail_log + date_log)

    @abstractmethod
    def _set_curr_input_array(self):
        # NOTE: storage posn into results_df (curr_df_pos) also set here
        pass

    def __get_xmin(self):
        rule, qnty = self.sa.xmin_rule, self.sa.xmin_qnty
        if rule in {"clauset", "manual"}:
            xmin = qnty  # i.e. {None, user-input-ℝ} respectively
        elif rule == "percent":
            xmin = np.percentile(self.curr_signed_returns, qnty)
        elif rule == "std-dev":
            xmin = self.__calc_stdv_xmin(qnty)
        elif rule in {"file", "average"}:
            assert self.sa.use_dynamic, \
                ("static approach does NOT currently support passing "
                 "xmin data by file")  # TODO: add file support for -a static?
            grp, date, tail = self.curr_iter_id
            txmin = self.sa.txmin_map[tail]
            xmin = qnty.loc[date, f"{txmin} {grp}"]
            percent = None
            if isinstance(xmin, str) and xmin.endswith("%"):
                # b/c values containing '%' in xmins_df must be str
                percent = float(xmin[:-1])
            elif isinstance(xmin, (int, float)) and self._use_pct_file:
                if not 0 <= xmin <= 1:
                    raise ValueError("xmin percentile threshold value for "
                                     f"{self.curr_iter_id} is outside of "
                                     "[0, 1]")
                percent = xmin * 100
            if percent is not None:
                xmin = np.percentile(self.curr_signed_returns, percent)
            else:  # numerical (non-percentile) xmin data reaches this branch
                xmin = float(xmin)
        else:
            raise AttributeError("this should never be reached!")
        return xmin

    def __calc_stdv_xmin(self, factor):
        mean = st.fmean(self.curr_returns_array)
        stdv = st.stdev(self.curr_returns_array)
        *_, tail = self.curr_iter_id
        assert mean < factor * stdv
        return abs(mean + tail.value * factor * stdv)  # tail.value ∈ {1, -1}

    def _fit_curr_data(self):
        data = self.curr_signed_returns
        data = data[np.nonzero(data)]  # only use non-zero elements to do Fit
        xmin = self.__get_xmin()
        self.curr_fit = Fit(data=data, xmin=xmin,
                            discrete=self.sa.fit_discretely)

    @staticmethod
    def gen_rmsf(mmt_func):
        # rmsf: Returns Moments Statistics Functions
        def mf_wrapped(mmt_func, rtrn_vec):
            try:
                return mmt_func(rtrn_vec)
            except st.StatisticsError:
                return np.nan
        return (mmt_func,
                lambda rv: mf_wrapped(mmt_func, rv[rv > 0]),
                lambda rv: mf_wrapped(mmt_func, rv[rv < 0]))

    def __get_curr_rtrn_stats(self):
        # NOTE: functions in below list must match order in output_columns.yaml
        rs_fns = (len,
                  lambda r: np.count_nonzero(r == 0),
                  np.count_nonzero,
                  *_Analyzer.gen_rmsf(st.fmean),
                  *_Analyzer.gen_rmsf(st.stdev),
                  *_Analyzer.gen_rmsf(scipy.stats.skew),
                  *_Analyzer.gen_rmsf(scipy.stats.kurtosis))
        rstats_fmap = dict(zip(self.sd.rstats_collabs, rs_fns))
        return {rstat: rstats_fmap[rstat](self.curr_returns_array)
                for rstat in self.sd.rstats_collabs}
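
    # Illustration of gen_rmsf (hedged sketch; values worked by hand, not
    # part of the pipeline): each triple computes a moment over all,
    # positive-only, and negative-only returns, with empty signed slices
    # mapped to NaN by the StatisticsError guard in mf_wrapped.
    #
    #   f_all, f_pos, f_neg = _Analyzer.gen_rmsf(st.fmean)
    #   rv = np.array([-1.0, 2.0, 3.0])
    #   f_all(rv)  # fmean([-1.0, 2.0, 3.0]) -> 1.333...
    #   f_pos(rv)  # fmean([2.0, 3.0])       -> 2.5
    #   f_neg(rv)  # fmean([-1.0])           -> -1.0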
    def __get_curr_tail_stats(self):
        alpha, xmin, sigma = (getattr(self.curr_fit.power_law, prop)
                              for prop in ('alpha', 'xmin', 'sigma'))
        elm_in_fit = self.curr_signed_returns >= xmin
        fitted_vec = self.curr_signed_returns[elm_in_fit]
        xmax = max(fitted_vec)
        xmean = fitted_vec.mean()
        xstdv = fitted_vec.std()
        abs_len = len(fitted_vec)
        if self.sa.run_ks_test is True:
            # TODO: try compute ks_pv using MATLAB engine & module, and time
            ks_pv, _ = plpva(self.curr_signed_returns, xmin,
                             'reps', self.sa.ks_iter, 'silent')
        locs = locals()
        return {('tail-statistics', stat): locs.get(stat)
                for st_type, stat in self.sd.tstats_collabs
                if stat in locs}

    def __get_curr_logl_stats(self):
        # compute (R, p)-pairs (x3) using powerlaw.Fit.distribution_compare
        logl_stats = {key: {stat: val for stat, val in
                            zip(('R', 'p'),
                                self.curr_fit.distribution_compare(
                                    'power_law', distro,
                                    normalized_ratio=True))}
                      for key, distro in self._distros_to_compare.items()}
        return {('log-likelihoods', f"{dist}_{stat}"): val
                for dist, stats in logl_stats.items()
                for stat, val in stats.items()}
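
    # Shape sketch of the mapping __get_curr_logl_stats returns (the R and p
    # values here are invented for illustration):
    #
    #   {('log-likelihoods', 'tpl_R'): -0.42,
    #    ('log-likelihoods', 'tpl_p'): 0.31,
    #    ('log-likelihoods', 'exp_R'): 1.87,
    #    ('log-likelihoods', 'exp_p'): 0.06,
    #    ('log-likelihoods', 'lgn_R'): -0.95,
    #    ('log-likelihoods', 'lgn_p'): 0.12}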
    def __get_curr_plfit_stats(self):
        tail_stats = self.__get_curr_tail_stats()
        logl_stats = (self.__get_curr_logl_stats()
                      if self.sa.compare_distros else {})
        return {**tail_stats, **logl_stats}

    def __get_calcd_substats_map(self, sstype):
        idx, col = self.curr_df_pos  # type(idx)==str; type(col)==tuple
        if sstype == 'plfit':
            stcalc_fn = self.__get_curr_plfit_stats
            top_grp = col if self.sa.use_dynamic else (col,)
            need_ss = self.sa.analyze_tails
        elif sstype == 'returns':
            stcalc_fn = self.__get_curr_rtrn_stats
            top_grp = ((col,) if not self.sa.analyze_tails else
                       (col[0],) if self.sa.use_dynamic else ())
            # NOTE: the hasnans check below removes redundant calculation, but
            #       only works in 1-proc mode b/c multiproc only updates
            #       res_df at the end
            rstat_uncalcd = self.res.df.loc[
                idx, top_grp + ('returns-statistics',)].hasnans
            need_ss = self.sa.calc_rtrn_stats and rstat_uncalcd
        return ({top_grp + tuple(ss_key): ss_val
                 for ss_key, ss_val in stcalc_fn().items()}
                if need_ss else {})

    def _gset_curr_partial_results(self, action):
        fstats_map = self.__get_calcd_substats_map('plfit')
        rstats_map = self.__get_calcd_substats_map('returns')
        # TODO: use np.ndarray instead of pd.Series (wasteful) --> order later
        curr_part_res_series = pd.Series({**fstats_map, **rstats_map})
        idx, _ = self.curr_df_pos
        if action == 'store':
            self.res.df.loc[idx].update(curr_part_res_series)
            # TODO: consider using pd.DataFrame.replace(, inplace=True) instead
            # TODO: can also order stats results first, then assign to DF row
        elif action == 'return':
            return idx, curr_part_res_series

    # # # orchestration / driver methods # # #

    # convenience wrapper to keep things tidy
    def _run_curr_iter_fitting(self):
        self._log_curr_iter()
        self._set_curr_input_array()
        self._fit_curr_data()

    # runs analysis on data ID'd by the next iteration of the stateful iterator
    def _analyze_next(self):  # TODO: combine _analyze_next & _analyze_iter??
        self.curr_iter_id = next(self.iter_id_keys)  # set in subclasses
        self._run_curr_iter_fitting()
        self._gset_curr_partial_results('store')

    # runs analysis from start to finish, in 1-process + single-threaded mode
    def analyze_sequential(self):
        while True:
            try:
                self._analyze_next()
            except StopIteration:
                break

    # runs analysis for one iteration of analysis given arbitrary iter_id
    def _analyze_iter(self, iter_id):
        # NOTE: use this to resume computation
        print(f"### DEBUG: PID {getpid()} analyzing iter {iter_id}",
              file=sys.stderr)
        self.curr_iter_id = iter_id
        self._run_curr_iter_fitting()
        return self._gset_curr_partial_results('return')

    # runs analysis in multiprocessing mode
    def analyze_multiproc(self):
        # TODO: https://stackoverflow.com/a/52596590/5437918 (use shared DBDFs)
        iter_id_keys = tuple(self.iter_id_keys)
        # TODO: look into Queue & Pipe for sharing data
        with Pool(processes=self.sc.nproc) as pool:
            # TODO: checkout .map alternatives (.imap, .map_async, etc.) and
            #       optimize chunksize
            restup_ls = pool.map(self._analyze_iter, iter_id_keys)
        # TODO: update res_df more efficiently, ex. pd.df.replace(), np.ndarray
        for idx, res in restup_ls:  # if use '+' NOTE that DFs init'd w/ NaNs
            self.res.df.loc[idx].update(res)

    # top-level convenience method that autodetects how to run tail analysis
    def analyze(self):
        nproc = self.sc.nproc
        # TODO: add other conditions for analyze_sequential (ex. -a static)
        if nproc == 1:
            self.analyze_sequential()
        elif nproc > 1:
            self.analyze_multiproc()
        else:  # if 0 or a negative number of processes got through to here
            raise ValueError(f"Cannot perform analysis with {nproc} processes")

    def get_resdf(self):
        # TODO: final clean ups of DF for presentation:
        #       - use .title() on all index labels, then write to file
        self.res.prettify_df()
        return self.res.df
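
# Driver usage sketch (hypothetical; `DynamicAnalyzer` stands in for a
# concrete subclass elsewhere in the repo that implements
# _set_curr_input_array and populates iter_id_keys, and `settings` is the
# parsed configuration object expected by __init__):
#
#   analyzer = DynamicAnalyzer(settings)
#   analyzer.analyze()   # sequential if settings.ctrl.nproc == 1,
#                        # multiprocessing via Pool otherwise
#   results_df = analyzer.get_resdf()
#   results_df.to_csv('tail_analysis_results.csv')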