def get_sample_conf(self, conf=0.95, process=None):
    r"""Return the confidence interval of the sampled timescales.

    The samples are restricted to the successfully estimated lag times
    (``self._successful_lag_indexes``) before being handed to
    ``confidence_interval``.

    Parameters
    ----------
    conf : float, default = 0.95
        Size of the confidence interval. Use:

        * conf = 0.6827 for 1-sigma confidence interval
        * conf = 0.9545 for 2-sigma confidence interval
        * conf = 0.9973 for 3-sigma confidence interval

    process : int or None, default = None
        Index used on the last axis of the sample array. None keeps the
        whole axis.

    Returns
    -------
    (L,R) : (float[],float[]) or (float[][],float[][])
        lower and upper timescales bounding the confidence interval

        * if process is None, will return two (l x k) arrays, where l is the
          number of lag times and k is the number of computed timescales.
        * if process is an integer, will return two (l)-arrays with the
          selected process time scale for every lag time

    Raises
    ------
    RuntimeError
        If ``self._its_samples`` is None, i.e. no samples are available.
    """
    samples = self._its_samples
    if samples is None:
        raise RuntimeError('Cannot compute sample conf, because no samples were generated. '
                           'Use an estimator, which provides samples')
    # A full slice on the last axis keeps every process; an integer picks one.
    last_axis = slice(None) if process is None else process
    selected = samples[:, self._successful_lag_indexes, last_axis]
    return confidence_interval(selected, conf=conf)
def _compute_observables_conf(self, model, estimator, mlag=1): # for lag time 0 we return an identity matrix if mlag == 0 or model is None: return np.eye(self.nsets), np.eye(self.nsets) # otherwise compute or predict them by model.propagate subset = self._full2active[ estimator.active_set] # find subset we are now working on l = np.zeros((self.nsets, self.nsets)) r = np.zeros((self.nsets, self.nsets)) for i in range(self.nsets): p0 = self.P0[:, i] # starting distribution p0sub = p0[subset] # map distribution to new active set p0sub /= p0sub.sum() # renormalize pksub_samples = model.sample_f('propagate', p0sub, mlag) for j in range(self.nsets): pk_on_set_samples = np.fromiter( (np.dot(pksub, self.memberships[subset, j]) for pksub in pksub_samples), dtype=np.float, count=len(pksub_samples)) l[i, j], r[i, j] = confidence_interval(pk_on_set_samples, conf=self.conf) return l, r # TODO: conf is better added to function sample_conf() and not made a model parameter # TODO: should Estimator really have a model parameter? This is not consistent with sklearn # TODO: estimate_param_scan without return_estimators=True doesn't work at all!
def sample_conf(self, f, *args, **kwargs):
    r"""Confidence interval of a model method evaluated over all samples.

    Evaluates ``f`` on every sample through ``self.sample_f`` and returns the
    confidence interval of the collected values. The size of the interval is
    taken from ``self.conf``. f must return a numerical value or an ndarray.

    Parameters
    ----------
    f : method reference or name (str)
        Model method to be evaluated for each model sample
    args : arguments
        Non-keyword arguments to be passed to the method in each call
    kwargs : keyword-arguments
        Keyword arguments to be passed to the method in each call

    Returns
    -------
    L : float or ndarray
        lower value or array of confidence interval
    R : float or ndarray
        upper value or array of confidence interval
    """
    return confidence_interval(self.sample_f(f, *args, **kwargs), conf=self.conf)
def assertConfidence(self, sample, alpha, precision):
    """Assert that the confidence interval of ``sample`` covers enough of it.

    Computes ``statistics.confidence_interval(sample, alpha)``, counts the
    sample values lying strictly between entries 1 and 2 of the result, and
    asserts that the covered fraction falls short of ``alpha`` by less than
    ``precision``.

    The check is one-sided: an interval covering more than ``alpha`` of the
    sample always passes.

    Parameters
    ----------
    sample : sequence of float
        Sample values.
    alpha : float
        Confidence level passed to ``confidence_interval``. It was previously
        overwritten with a hard-coded 0.5, so the argument had no effect.
    precision : float
        Allowed shortfall of the covered fraction relative to ``alpha``.
    """
    conf = statistics.confidence_interval(sample, alpha)
    # NOTE(review): entries 1 and 2 are used as the lower/upper bounds,
    # presumably from a (mean, lower, upper) return value. Confirm against
    # confidence_interval.
    lower, upper = conf[1], conf[2]
    # Float start value keeps the division below a true division.
    n_in = sum((1.0 for value in sample if lower < value < upper), 0.0)
    assert alpha - (n_in / len(sample)) < precision
def assertConfidence(self, sample, alpha, precision):
    """Assert that the confidence interval of ``sample`` covers enough of it.

    Computes ``statistics.confidence_interval(sample, alpha)``, counts the
    sample values lying strictly between entries 1 and 2 of the result, and
    asserts that the covered fraction falls short of ``alpha`` by less than
    ``precision``.

    The check is one-sided: an interval covering more than ``alpha`` of the
    sample always passes.

    Parameters
    ----------
    sample : sequence of float
        Sample values.
    alpha : float
        Confidence level passed to ``confidence_interval``. It was previously
        overwritten with a hard-coded 0.5, so the argument had no effect.
    precision : float
        Allowed shortfall of the covered fraction relative to ``alpha``.
    """
    conf = statistics.confidence_interval(sample, alpha)
    # NOTE(review): entries 1 and 2 are used as the lower/upper bounds,
    # presumably from a (mean, lower, upper) return value. Confirm against
    # confidence_interval.
    lower, upper = conf[1], conf[2]
    # Float start value keeps the division below a true division.
    n_in = sum((1.0 for value in sample if lower < value < upper), 0.0)
    assert alpha - (n_in / len(sample)) < precision
def get_sample_conf(self, alpha=0.6827, process=None):
    r"""Returns the confidence interval that contains alpha % of the sample data

    Use:
    alpha = 0.6827 for 1-sigma confidence interval
    alpha = 0.9545 for 2-sigma confidence interval
    alpha = 0.9973 for 3-sigma confidence interval
    etc.

    Parameters
    ----------
    alpha : float, default = 0.6827
        Confidence level passed to ``confidence_interval``.
    process : int or None, default = None
        Index of a single process. None evaluates all ``self._nits`` processes.

    Returns
    -------
    (L,R) : (float[],float[]) or (float[][],float[][])
        lower and upper timescales bounding the confidence interval
        if process is None, will return two (l x k) arrays, where l is the number of lag times
        and k is the number of computed timescales.
        if process is an integer, will return two (l)-arrays with the
        selected process time scale for every lag time

    Raises
    ------
    RuntimeError
        If ``self._its_samples`` is None, i.e. no samples were generated.
    """
    if self._its_samples is None:
        # The message used to say "sample mean" (copied from a sibling method)
        # and contained a doubled space.
        raise RuntimeError('Cannot compute sample confidence interval, because no samples '
                           'were generated; try calling bootstrap() before')
    nlags = len(self._lags)
    # NOTE(review): entries 1 and 2 of the confidence_interval result are used
    # as the lower/upper bounds, presumably from a (mean, lower, upper) return
    # value. Confirm against confidence_interval.
    if process is None:
        L = np.zeros((nlags, self._nits))
        R = np.zeros((nlags, self._nits))
        for i in range(nlags):
            for j in range(self._nits):
                conf = confidence_interval(self._its_samples[i, j], alpha)
                L[i, j] = conf[1]
                R[i, j] = conf[2]
        return (L, R)
    L = np.zeros(nlags)
    R = np.zeros(nlags)
    for i in range(nlags):
        conf = confidence_interval(self._its_samples[i, process], alpha)
        L[i] = conf[1]
        R[i] = conf[2]
    return (L, R)
def get_sample_conf(self, alpha=0.6827, process=None):
    r"""Returns the confidence interval that contains alpha % of the sample data

    Use:
    alpha = 0.6827 for 1-sigma confidence interval
    alpha = 0.9545 for 2-sigma confidence interval
    alpha = 0.9973 for 3-sigma confidence interval
    etc.

    Parameters
    ----------
    alpha : float, default = 0.6827
        Confidence level passed to ``confidence_interval``.
    process : int or None, default = None
        Index of a single process. None evaluates all ``self._nits`` processes.

    Returns
    -------
    (L,R) : (float[],float[]) or (float[][],float[][])
        lower and upper timescales bounding the confidence interval
        if process is None, will return two (l x k) arrays, where l is the number of lag times
        and k is the number of computed timescales.
        if process is an integer, will return two (l)-arrays with the
        selected process time scale for every lag time

    Raises
    ------
    RuntimeError
        If ``self._its_samples`` is None, i.e. no samples were generated.
    """
    if self._its_samples is None:
        # The message used to say "sample mean" (copied from a sibling method)
        # and contained a doubled space.
        raise RuntimeError('Cannot compute sample confidence interval, because no samples '
                           'were generated; try calling bootstrap() before')
    nlags = len(self._lags)
    # NOTE(review): entries 1 and 2 of the confidence_interval result are used
    # as the lower/upper bounds, presumably from a (mean, lower, upper) return
    # value. Confirm against confidence_interval.
    if process is None:
        L = np.zeros((nlags, self._nits))
        R = np.zeros((nlags, self._nits))
        for i in range(nlags):
            for j in range(self._nits):
                conf = confidence_interval(self._its_samples[i, j], alpha)
                L[i, j] = conf[1]
                R[i, j] = conf[2]
        return (L, R)
    L = np.zeros(nlags)
    R = np.zeros(nlags)
    for i in range(nlags):
        conf = confidence_interval(self._its_samples[i, process], alpha)
        L[i] = conf[1]
        R[i] = conf[2]
    return (L, R)
def _compute_observables_conf(self, model, estimator, mlag=1): # for lag time 0 we return all 1's. if mlag == 0 or model is None: return np.ones(self.nits+1), np.ones(self.nits+1) # otherwise compute or predict them from them model samples = self.test_model.sample_f('eigenvalues', self.nits+1) if mlag != 1: for i in range(len(samples)): samples[i] = np.power(samples[i], mlag) l, r = confidence_interval(samples, conf=self.conf) if self.exclude_stat: l = l[1:] r = r[1:] return l, r
def estimate_pi_error(dtrajs, orig_msm, ntrails=10, conf_interval=0.68):
    """
    Estimate bootstrap error for stationary probability

    Trajectories are resampled with replacement ``ntrails`` times; an MSM is
    estimated on every resample at the lag time of ``orig_msm`` and its
    stationary distribution is stored. The standard deviation and the
    confidence interval are then taken over the stored samples.

    :param dtrajs: list of np.array, discrete trajectories
    :param orig_msm: pyemma.msm.MarkovModel
    Only used for reference of lag time and to incorporate ML
    stationary distribution to data frame
    :param ntrails: int, the number of bootstrap samples to draw.
    :param conf_interval: float 0 < conf_interval < 1
    :return: pandas.DataFrame instance containing ML MSM pi and bootstrap error
    """
    from pyemma.util.statistics import confidence_interval

    pi_samples = np.zeros((ntrails, orig_msm.nstates))
    n_trajs = len(dtrajs)

    for trial in tqdm(range(ntrails)):
        try:
            bs_sample = np.random.choice(n_trajs, size=n_trajs, replace=True)
            # Pick trajectories straight from the sequence. Building
            # np.array(dtrajs) first fails for trajectories of unequal length
            # on NumPy >= 1.24, and the except below would hide that.
            dtraj_sample = [np.asarray(dtrajs[i]) for i in bs_sample]
            msm = pyemma.msm.estimate_markov_model(dtraj_sample, lag=orig_msm.lag)
            pi_samples[trial, msm.active_set] = msm.pi
        except Exception as e:
            # Best effort: report and continue.
            # NOTE(review): a failed trial leaves its row of pi_samples at
            # zero, and that row still enters std and the confidence interval.
            print(e)

    std = pi_samples.std(axis=0)
    lower_confidence, upper_confidence = confidence_interval(pi_samples, conf_interval)

    probabilities = pd.DataFrame(
        np.array([
            orig_msm.active_set,
            orig_msm.pi,
            std,
            lower_confidence,
            upper_confidence,
        ]).T,
        columns=['State', 'StatDist', 'Std', 'LowerConf', 'UpperConf'])

    # type cast to int
    probabilities['State'] = probabilities['State'].astype(int)
    return probabilities
def _compute_observables_conf(self, model, estimator, mlag=1): # for lag time 0 we return an identity matrix if mlag == 0 or model is None: return np.eye(self.nsets), np.eye(self.nsets) # otherwise compute or predict them by model.propagate subset = self._full2active[estimator.active_set] # find subset we are now working on l = np.zeros((self.nsets, self.nsets)) r = np.zeros((self.nsets, self.nsets)) for i in range(self.nsets): p0 = self.P0[:, i] # starting distribution p0sub = p0[subset] # map distribution to new active set p0sub /= p0sub.sum() # renormalize pksub_samples = model.sample_f('propagate', p0sub, mlag) for j in range(self.nsets): pk_on_set_samples = np.fromiter((np.dot(pksub, self.memberships[subset, j]) for pksub in pksub_samples), dtype=np.float, count=len(pksub_samples)) l[i, j], r[i, j] = confidence_interval(pk_on_set_samples, conf=self.conf) return l, r
def estimate_pi_error(dtrajs, orig_msm, ntrails=10, conf_interval=0.68, return_samples=False):
    """
    Estimate bootstrap error for stationary probability

    Trajectories are resampled with replacement ``ntrails`` times; an MSM is
    estimated on every resample at the lag time of ``orig_msm``. Its
    stationary distribution is padded with zeros for the states outside the
    first connected set of the resampled count matrix, and stored in a row
    with one column per state of ``orig_msm.count_matrix_full``.

    :param dtrajs: list of np.array, discrete trajectories
    :param orig_msm: pyemma.msm.MarkovModel
    Only used for reference of lag time and to incorporate ML
    stationary distribution to data frame
    :param ntrails: int, the number of bootstrap samples to draw.
    :param conf_interval: float 0 < conf_interval < 1
    :param return_samples: bool, if True return the raw (ntrails x n_states)
    array of bootstrap samples instead of the summary data frame
    :return: pandas.DataFrame instance containing ML MSM pi and bootstrap error,
    or the np.ndarray of samples when return_samples is True
    """
    from pyemma.util.statistics import confidence_interval

    n_full = orig_msm.count_matrix_full.shape[0]
    pi_samples = np.zeros((ntrails, n_full))
    all_states = np.arange(start=0, stop=n_full, step=1)
    n_trajs = len(dtrajs)

    for trial in tqdm(range(ntrails)):
        try:
            bs_sample = np.random.choice(n_trajs, size=n_trajs, replace=True)
            # Pick trajectories straight from the sequence. Building
            # np.array(dtrajs) first fails for trajectories of unequal length
            # on NumPy >= 1.24.
            dtraj_sample = [np.asarray(dtrajs[i]) for i in bs_sample]
            msm = pyemma.msm.estimate_markov_model(dtraj_sample, lag=orig_msm.lag)
            stationary_probs = msm.pi
            # Decompose once per trial instead of once per state.
            components = connected_sets(msm.count_matrix_full)
            if len(components) > 1:
                largest = components[0]
                disconnected_states = [
                    element for element in all_states if element not in largest
                ]
                # all_states is ascending, so each zero lands at its final
                # position in the padded vector.
                for element in disconnected_states:
                    stationary_probs = np.insert(stationary_probs, element, 0)
            pi_samples[trial, all_states] = stationary_probs
        except Exception as e:
            # Best effort: report and continue. The leftover pdb.set_trace()
            # was removed because it blocks non-interactive runs.
            # NOTE(review): a failed trial leaves its row of pi_samples at zero.
            print(e)

    if return_samples:
        return pi_samples

    std = pi_samples.std(axis=0)
    lower_confidence, upper_confidence = confidence_interval(pi_samples, conf_interval)

    # Sample columns are indexed by full state number, while orig_msm.pi only
    # covers the active set. Select the active columns so all table columns
    # have equal length.
    # NOTE(review): assumes orig_msm.active_set holds full-state indices;
    # this is a no-op when every state is active.
    active = np.asarray(orig_msm.active_set)
    probabilities = pd.DataFrame(
        np.array([
            active,
            orig_msm.pi,
            std[active],
            lower_confidence[active],
            upper_confidence[active],
        ]).T,
        columns=['State', 'StatDist', 'Std', 'LowerConf', 'UpperConf'],
    )

    # type cast to int
    probabilities['State'] = probabilities['State'].astype(int)
    return probabilities
else: for n, k in enumerate(metadata_fields_to_agregate): #index = state_indices[k].index(KeywordLabel) try: if 'w' in k.split('_'): leg = 'women' else: leg = 'men' state_samples = samples[k][:, index] #plt.figure(figsize=(28,28)) #change your figure size as per your desire heres y, x, _ = plt.hist(state_samples, bins=20, label=f'{leg}', color=f'C{n}') lower_confidence, upper_confidence = confidence_interval( state_samples, 0.68) #plt.vlines(lower_confidence, 0, 10, color=f'C{n}', linestyle=':', label=f'lower conf {k}') #plt.vlines(upper_confidence, 0, 10, color=f'C{n}', linestyle='--', label=f'upper conf {k}') plt.vlines(msms[k].pi[msms[k]._full2active[index]], 0, 1400, color='k', label='Model estimate' if n == 1 else None, linestyles='dashed') plt.ylabel("Count (R=10000)", size=14) plt.xlabel("Stationary probability", size=14) except: pdb.set_trace()