def test_create_flat_names_2d():
    """Check flat-name creation and shape recovery for a 2-d variable."""
    dims = (2, 3)
    names = ttab.create_flat_names('x', dims)
    # Names are expected in row-major order (last index varies fastest).
    assert names == [
        'x__0_0', 'x__0_1', 'x__0_2',
        'x__1_0', 'x__1_1', 'x__1_2',
    ]
    assert ttab._create_shape(names) == dims
def extract_bounds_from_summary(summary, varname, shape, roundto=None):
    """
    Extract lower and upper bound of random variable.

    Parameters
    ----------
    summary : mapping
        Summary statistics indexable as ``summary[quantile][flat_name]``;
        the 'hpd_2.5' and 'hpd_97.5' entries are read.
    varname : str
        Name of the random variable.
    shape : tuple of int
        Shape of the random variable.
    roundto : int, optional
        Decimal precision to round outwards to: the lower bound is floored
        and the upper bound is ceiled at that precision. If None (default)
        the bounds are returned unrounded.

    Returns
    -------
    list of num.Ndarray
        ``[lower, upper]``, each of shape `shape`.
    """
    indexes = ttab.create_flat_names(varname, shape)
    lower_quant = 'hpd_2.5'
    upper_quant = 'hpd_97.5'

    # The scale factor is loop invariant. It is only computed when rounding
    # is requested, so the default ``roundto=None`` never evaluates
    # ``10. ** None``.
    adjust = None if roundto is None else 10. ** roundto

    bounds = []
    for quant, operation in ((lower_quant, num.floor),
                             (upper_quant, num.ceil)):
        values = num.empty(shape, 'float64')
        for i, idx in enumerate(indexes):
            value = summary[quant][idx]
            if adjust is not None:
                value = operation(value * adjust) / adjust

            # One flat name exists per element, so write through the flat
            # iterator; plain ``values[i]`` would index only the first axis.
            values.flat[i] = value

        bounds.append(values)

    return bounds
def load(con_str, model=None):
    """Load ODBC database.

    Parameters
    ----------
    con_str : str
        ODBC Connection string including database
    model : Model
        If None, the model is taken from the `with` context.

    Returns
    -------
    A MultiTrace instance

    Raises
    ------
    ValueError
        If the table list obtained from the database is empty.
    """
    db = _ODBCDB(con_str)
    db.connect()
    name = _get_db_name(con_str)
    varnames = _get_table_list(db.cursor)
    if len(varnames) == 0:
        raise ValueError(
            'Can not get variable list for database `{}`'.format(name))

    # The chain list is read from the table of the first variable.
    chains = _get_chain_list(db.cursor, varnames[0])

    straces = []
    for chain in chains:
        strace = ODBC(con_str, model=model)
        strace.chain = chain
        strace._var_cols = {
            varname: ttab.create_flat_names('v', shape)
            for varname, shape in strace.var_shapes.items()
        }
        # Mark the trace as set up so that a later setup() call does not
        # try to create the tables again.
        strace._is_setup = True
        strace.db = db  # Share the db with all traces.
        straces.append(strace)
    return base.MultiTrace(straces)
def dump(name, trace, chains=None):
    """
    Write the values of an NDArray trace to CSV files, one file per chain.

    Parameters
    ----------
    name : str
        Name of directory to store CSV files in
    trace : :class:`pymc3.backend.base.MultiTrace` of NDArray traces
        Result of MCMC run with default NDArray backend
    chains : list
        Chains to dump. If None, all chains are dumped.
    """
    if not os.path.exists(name):
        os.mkdir(name)

    if chains is None:
        chains = trace.chains

    # Column names are derived from the variable shapes of the first chain
    # and reused for every chain that is written.
    first_strace = trace._straces[chains[0]]
    flat_names = {}
    for var, shape in first_strace.var_shapes.items():
        flat_names[var] = ttab.create_flat_names(var, shape)

    for chain in chains:
        csv_path = os.path.join(name, 'chain-{}.csv'.format(chain))
        frame = ttab.trace_to_dataframe(
            trace, chains=chain, flat_names=flat_names)
        frame.to_csv(csv_path, index=False)
def __init__(self, name, model=None, vars=None):
    """Create the output directory if needed and initialise the chain.

    Parameters
    ----------
    name : str
        Directory path; created when it does not exist yet.
    model : optional
        Forwarded to the parent initialiser.
    vars : optional
        Forwarded to the parent initialiser.
    """
    if not os.path.exists(name):
        os.mkdir(name)
    super(TextChain, self).__init__(name, model, vars)

    # One list of flattened column names per variable.
    names_by_var = {}
    for var, shape in self.var_shapes.items():
        names_by_var[var] = ttab.create_flat_names(var, shape)
    self.flat_names = names_by_var

    self.filename = None
    self.df = None
    self.corrupted_flag = False
def __init__(self, dir_path='', model=None, vars=None, buffer_size=5000,
             buffer_thinning=1, progressbar=False, k=None):
    """Initialise the chain and make sure its output directory exists.

    Parameters
    ----------
    dir_path : str
        Directory path; created when it does not exist yet.
    model, vars, buffer_size, buffer_thinning
        Forwarded to the parent initialiser.
    progressbar : bool
        Stored on the instance.
    k : int, optional
        If given, variables listed in `transd_vars_dist` get the shape
        ``(k, )`` instead of the shape found in `var_shapes`.
    """
    super(FileChain, self).__init__(
        model=model,
        vars=vars,
        buffer_size=buffer_size,
        buffer_thinning=buffer_thinning)

    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    self.dir_path = dir_path

    self.flat_names = OrderedDict()
    if self.var_shapes is not None:
        override_shape = k is not None
        for var, shape in self.var_shapes.items():
            # The membership test only runs when k was supplied.
            if override_shape and var in transd_vars_dist:
                shape = (k, )
            self.flat_names[var] = ttab.create_flat_names(var, shape)

    self.k = k

    self.corrupted_flag = False
    self.progressbar = progressbar

    self.stored_samples = 0
    self.draws = 0
    self._df = None
    self.filename = None
def dict2pd(statdict, labelname):
    """Flatten a diagnostics output dict into one named pandas Series.

    Each value is flattened and labelled with the flat names generated from
    its key and shape; the per-variable pieces are concatenated and the
    result is renamed to `labelname`.
    """
    pieces = []
    for varname, stat in statdict.items():
        piece = pd.Series(stat.flatten())
        piece.index = ttab.create_flat_names(varname, stat.shape)
        pieces.append(piece)

    return pd.concat(pieces, axis=0).rename(labelname)
def __init__(self, name, model=None, vars=None, buffer_size=5000,
             progressbar=False, k=None):
    """Create the output directory if needed and initialise the chain.

    Parameters
    ----------
    name : str
        Directory path; created when it does not exist yet.
    model, vars
        Forwarded to the parent initialiser.
    buffer_size : int
        Stored on the instance.
    progressbar : bool
        Stored on the instance.
    k : int, optional
        If given, variables listed in `transd_vars_dist` get the shape
        ``(k, )`` instead of the shape found in `var_shapes`.
    """
    if not os.path.exists(name):
        os.mkdir(name)

    super(TextChain, self).__init__(name, model, vars)

    # flat_names stays None when the parent did not provide var_shapes.
    self.flat_names = None
    if self.var_shapes is not None:
        self.flat_names = {}
        for var, shape in self.var_shapes.items():
            # The membership test only runs when k was supplied.
            if k is not None and var in transd_vars_dist:
                shape = (k, )
            self.flat_names[var] = ttab.create_flat_names(var, shape)

    self.k = k

    self.filename = None
    self.df = None
    self.corrupted_flag = False
    self.progressbar = progressbar
    self.buffer_size = buffer_size
    self.stored_samples = 0
    self.buffer = []
def add_derived_variables(self, source_type, n_sources=1):
    """Register the derived variables of a source type on this chain.

    Parameters
    ----------
    source_type
        Key looked up in `derived_variables_mapping`; when it is missing
        no variables are added.
    n_sources : int
        Every derived variable is registered with shape ``(n_sources, )``.
    """
    try:
        derived = derived_variables_mapping[source_type]
        logger.info('Adding derived variables %s to '
                    'trace.' % list2string(derived))
    except KeyError:
        logger.info('No derived variables for %s' % source_type)
        derived = []

    # All derived variables share the same shape and dtype.
    shape = (n_sources, )
    for derived_name in derived:
        self.flat_names[derived_name] = ttab.create_flat_names(
            derived_name, shape)
        self.var_shapes[derived_name] = shape
        self.var_dtypes[derived_name] = 'float64'
        self.varnames.append(derived_name)
def setup(self, draws, chain):
    """Prepare the backend for sampling one chain.

    Parameters
    ----------
    draws : int
        Expected number of draws
    chain : int
        Chain number
    """
    self.db.connect()
    self.chain = chain

    if not self._is_setup:
        # Table has not been created.
        column_names = {}
        for varname, shape in self.var_shapes.items():
            column_names[varname] = ttab.create_flat_names('v', shape)
        self._var_cols = column_names
        self._create_table()
        self._is_setup = True
    else:
        # Continue after the last draw already recorded for this chain.
        self.draw_idx = self._get_max_draw(chain) + 1
        self._len = None

    self._create_insert_queries()
def test_create_flat_names_1d():
    """Check flat-name creation and shape recovery for a 1-d variable."""
    dims = (2, )
    names = ttab.create_flat_names("x", dims)
    assert names == ["x__0", "x__1"]
    assert ttab._create_shape(names) == dims
def summary(trace, varnames=None, transform=lambda x: x, stat_funcs=None,
            extend=False, include_transformed=False,
            alpha=0.05, start=0, batches=None):
    r"""Create a data frame with summary statistics.

    Parameters
    ----------
    trace : MultiTrace instance
    varnames : list
        Names of variables to include in summary
    transform : callable
        Function to transform data (defaults to identity)
    stat_funcs : None or iterable of callables
        Functions used to calculate statistics. By default, the mean,
        standard deviation, simulation standard error, and highest
        posterior density intervals are included.

        The functions will be given one argument, the samples for a
        variable as a 2 dimensional array, where the first axis
        corresponds to sampling iterations and the second axis
        represents the flattened variable (e.g., x__0, x__1,...). Each
        function should return either

        1) A `pandas.Series` instance containing the result of
           calculating the statistic along the first axis. The name
           attribute will be taken as the name of the statistic.
        2) A `pandas.DataFrame` where each column contains the
           result of calculating the statistic along the first axis.
           The column names will be taken as the names of the
           statistics.
    extend : boolean
        If True, use the statistics returned by `stat_funcs` in
        addition to, rather than in place of, the default statistics.
        This is only meaningful when `stat_funcs` is not None.
    include_transformed : bool
        Flag for reporting automatically transformed variables in addition
        to original variables (defaults to False).
    alpha : float
        The alpha level for generating posterior intervals. Defaults
        to 0.05. This is only meaningful when `stat_funcs` is None.
    start : int
        The starting index from which to summarize (each) chain.
        Defaults to zero.
    batches : None or int
        Batch size for calculating standard deviation for non-independent
        samples. Defaults to the smaller of 100 or the number of samples.
        This is only meaningful when `stat_funcs` is None.

    Returns
    -------
    `pandas.DataFrame` with summary statistics for each variable
    Defaults one are: `mean`, `sd`, `mc_error`, `hpd_2.5`, `hpd_97.5`,
    `n_eff` and `Rhat`. Last two are only computed for traces with 2 or
    more chains.

    Examples
    --------
    .. code:: ipython

        >>> import pymc3 as pm
        >>> trace.mu.shape
        (1000, 2)
        >>> pm.summary(trace, ['mu'])
                   mean        sd  mc_error   hpd_2.5  hpd_97.5
        mu__0  0.106897  0.066473  0.001818 -0.020612  0.231626
        mu__1 -0.046597  0.067513  0.002048 -0.174753  0.081924

                  n_eff      Rhat
        mu__0     487.0   1.00001
        mu__1     379.0   1.00203

    Other statistics can be calculated by passing a list of functions.

    .. code:: ipython

        >>> import pandas as pd
        >>> def trace_sd(x):
        ...     return pd.Series(np.std(x, 0), name='sd')
        ...
        >>> def trace_quantiles(x):
        ...     return pd.DataFrame(pm.quantiles(x, [5, 50, 95]))
        ...
        >>> pm.summary(trace, ['mu'], stat_funcs=[trace_sd, trace_quantiles])
                     sd         5        50        95
        mu__0  0.066473  0.000312  0.105039  0.214242
        mu__1  0.067513 -0.159097 -0.045637  0.062912
    """
    if varnames is None:
        varnames = get_default_varnames(
            trace.varnames, include_transformed=include_transformed)

    if batches is None:
        batches = min([100, len(trace)])

    funcs = [
        lambda x: pd.Series(np.mean(x, 0), name='mean'),
        lambda x: pd.Series(np.std(x, 0), name='sd'),
        lambda x: pd.Series(mc_error(x, batches), name='mc_error'),
        lambda x: _hpd_df(x, alpha),
    ]

    if stat_funcs is not None:
        # Materialise once: the functions are iterated again for every
        # variable, and a tuple cannot be concatenated to the default list.
        stat_funcs = list(stat_funcs)
        funcs = funcs + stat_funcs if extend else stat_funcs

    var_dfs = []
    for var in varnames:
        vals = transform(trace.get_values(var, burn=start, combine=True))
        # Collapse all non-sample axes so that every statistic sees a
        # (samples, flattened variable) array.
        flat_vals = vals.reshape(vals.shape[0], -1)
        var_df = pd.concat([f(flat_vals) for f in funcs], axis=1)
        var_df.index = ttab.create_flat_names(var, vals.shape[1:])
        var_dfs.append(var_df)
    dforg = pd.concat(var_dfs, axis=0)

    # Convergence diagnostics are appended only to the default statistics
    # (or their extension) and only when there are at least two chains.
    if (stat_funcs is not None) and (not extend):
        return dforg
    elif trace.nchains < 2:
        return dforg
    else:
        n_eff = pm.effective_n(trace,
                               varnames=varnames,
                               include_transformed=include_transformed)
        n_eff_pd = dict2pd(n_eff, 'n_eff')
        rhat = pm.gelman_rubin(trace,
                               varnames=varnames,
                               include_transformed=include_transformed)
        rhat_pd = dict2pd(rhat, 'Rhat')
        # Keep the row order of the statistics frame.
        return pd.concat([dforg, n_eff_pd, rhat_pd],
                         axis=1).reindex(dforg.index)
def test_create_flat_names_3d():
    """Round-trip a 3-d shape through flat names and back."""
    dims = (2, 3, 4)
    names = ttab.create_flat_names('x', dims)
    assert ttab._create_shape(names) == dims
def test_create_flat_names_0d():
    """Check flat-name creation and shape recovery for a scalar variable."""
    dims = ()
    names = ttab.create_flat_names('x', dims)
    # With an empty shape the expected name carries no index suffix.
    assert names == ['x']
    assert ttab._create_shape(names) == dims
def test_create_flat_names_2d():
    """Check flat-name creation and shape recovery for a 2-d variable."""
    n_rows, n_cols = 2, 3
    dims = (n_rows, n_cols)
    names = ttab.create_flat_names("x", dims)
    # Expected names enumerate the indices with the last one varying fastest.
    wanted = [
        "x__{}_{}".format(row, col)
        for row in range(n_rows)
        for col in range(n_cols)
    ]
    assert names == wanted
    assert ttab._create_shape(names) == dims