Example #1
    def get_sample_conf(self, conf=0.95, process=None):
        r"""Returns the confidence interval that contains alpha % of the sample data


        etc.

        Parameters
        ----------
        conf : float, default = 0.95
            confidence level of the interval, i.e. the fraction of samples it should contain. Use:

            * conf = 0.6827 for 1-sigma confidence interval
            * conf = 0.9545 for 2-sigma confidence interval
            * conf = 0.9973 for 3-sigma confidence interval

        process : int or None, default = None
            if given, return the confidence interval only for this single process;
            if None, return intervals for all computed timescales

        Returns
        -------
        (L,R) : (float[],float[]) or (float[][],float[][])
            lower and upper timescales bounding the confidence interval

            * if process is None, will return two (l x k) arrays, where l is the number of lag times
              and k is the number of computed timescales.
            * if process is an integer, will return two (l)-arrays with the
              selected process time scale for every lag time

        """
        if self._its_samples is None:
            raise RuntimeError('Cannot compute sample conf, because no samples were generated. '
                               'Use an estimator that provides samples.')
        # OK, go:
        if process is None:
            return confidence_interval(self._its_samples[:, self._successful_lag_indexes, :], conf=conf)
        else:
            return confidence_interval(self._its_samples[:, self._successful_lag_indexes, process], conf=conf)
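
A minimal usage sketch (assuming PyEMMA's implied-timescales object; `dtrajs` is a hypothetical list of discrete trajectories):

    import pyemma

    its = pyemma.msm.its(dtrajs, lags=[1, 2, 5, 10], errors='bayes')
    # two (n_lags x n_timescales) arrays bounding the 95% interval
    lower, upper = its.get_sample_conf(conf=0.95)
    # interval for the slowest process only, one value per lag time
    l0, r0 = its.get_sample_conf(conf=0.95, process=0)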
Example #2
    def _compute_observables_conf(self, model, estimator, mlag=1):
        # for lag time 0 we return an identity matrix
        if mlag == 0 or model is None:
            return np.eye(self.nsets), np.eye(self.nsets)
        # otherwise compute or predict them by model.propagate
        subset = self._full2active[
            estimator.active_set]  # find subset we are now working on
        l = np.zeros((self.nsets, self.nsets))
        r = np.zeros((self.nsets, self.nsets))
        for i in range(self.nsets):
            p0 = self.P0[:, i]  # starting distribution
            p0sub = p0[subset]  # map distribution to new active set
            p0sub /= p0sub.sum()  # renormalize
            pksub_samples = model.sample_f('propagate', p0sub, mlag)
            for j in range(self.nsets):
                pk_on_set_samples = np.fromiter(
                    (np.dot(pksub, self.memberships[subset, j])
                     for pksub in pksub_samples),
                    dtype=np.float64,  # np.float was removed in modern NumPy
                    count=len(pksub_samples))
                l[i, j], r[i, j] = confidence_interval(pk_on_set_samples,
                                                       conf=self.conf)
        return l, r


# TODO: conf is better added to function sample_conf() and not made a model parameter
# TODO: should Estimator really have a model parameter? This is not consistent with sklearn
# TODO: estimate_param_scan without return_estimators=True doesn't work at all!
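
For reference, the elementwise behavior these snippets rely on from confidence_interval can be sketched in plain NumPy (a simplified stand-in, not PyEMMA's exact implementation):

    import numpy as np

    def confidence_interval_sketch(samples, conf=0.95):
        # samples: array whose first axis enumerates the sampled models;
        # returns elementwise bounds containing a fraction `conf` of the samples
        samples = np.asarray(samples)
        lower = np.percentile(samples, 100.0 * (1.0 - conf) / 2.0, axis=0)
        upper = np.percentile(samples, 100.0 * (1.0 + conf) / 2.0, axis=0)
        return lower, upper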
Example #3
    def sample_conf(self, f, *args, **kwargs):
        r"""Sample confidence interval of numerical method f over all samples

        Calls f(*args, **kwargs) on all samples and computes the confidence interval.
        The size of the confidence interval is set at construction of the
        SampledModel. f must return a numerical value or an ndarray.

        Parameters
        ----------
        f : method reference or name (str)
            Model method to be evaluated for each model sample
        args : arguments
            Non-keyword arguments to be passed to the method in each call
        kwargs : keyword arguments
            Keyword arguments to be passed to the method in each call

        Returns
        -------
        L : float or ndarray
            lower value or array of confidence interval
        R : float or ndarray
            upper value or array of confidence interval

        """
        vals = self.sample_f(f, *args, **kwargs)
        return confidence_interval(vals, conf=self.conf)
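
A usage sketch on a Bayesian MSM, which is a SampledModel (`dtrajs` is a hypothetical list of discrete trajectories):

    import pyemma

    bmsm = pyemma.msm.bayesian_markov_model(dtrajs, lag=10)
    # confidence interval of the three slowest timescales over all samples
    L, R = bmsm.sample_conf('timescales', k=3)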
Example #4
    def assertConfidence(self, sample, alpha, precision):
        conf = statistics.confidence_interval(sample, alpha)

        # count how many sample points fall inside the interval
        # (this statistics API returns the bounds at conf[1] and conf[2])
        n_in = 0.0
        for i in range(len(sample)):
            if conf[1] < sample[i] < conf[2]:
                n_in += 1.0

        # the empirical coverage must not undershoot alpha by more than `precision`
        assert alpha - (n_in / len(sample)) < precision
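
The intent of this test can be reproduced standalone with synthetic data and a percentile-based interval (a sketch independent of the statistics module used above):

    import numpy as np

    rng = np.random.default_rng(42)
    sample = rng.normal(size=10000)
    alpha = 0.5
    lower = np.percentile(sample, 100 * (1 - alpha) / 2)
    upper = np.percentile(sample, 100 * (1 + alpha) / 2)
    coverage = np.mean((sample > lower) & (sample < upper))
    assert alpha - coverage < 0.05  # empirical coverage stays close to alpha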
Example #5
 def get_sample_conf(self, alpha=0.6827, process=None):
     r"""Returns the confidence interval that contains a fraction alpha of the sample data

     Use:
     alpha = 0.6827 for 1-sigma confidence interval
     alpha = 0.9545 for 2-sigma confidence interval
     alpha = 0.9973 for 3-sigma confidence interval
     etc.

     Returns
     -------
     (L,R) : (float[],float[]) or (float[][],float[][])
         lower and upper timescales bounding the confidence interval

         * if process is None, will return two (l x k) arrays, where l is the number of lag times
           and k is the number of computed timescales.
         * if process is an integer, will return two (l)-arrays with the
           selected process time scale for every lag time

     """
     if self._its_samples is None:
         raise RuntimeError(
             'Cannot compute sample conf, because no samples were generated. '
             'Try calling bootstrap() first.')
     # OK, go:
     if process is None:
         L = np.zeros((len(self._lags), self._nits))
         R = np.zeros((len(self._lags), self._nits))
         for i in range(len(self._lags)):
             for j in range(self._nits):
                 conf = confidence_interval(self._its_samples[i, j], alpha)
                 L[i, j] = conf[1]
                 R[i, j] = conf[2]
         return (L, R)
     else:
         L = np.zeros(len(self._lags))
         R = np.zeros(len(self._lags))
         for i in range(len(self._lags)):
             conf = confidence_interval(self._its_samples[i, process],
                                        alpha)
             L[i] = conf[1]
             R[i] = conf[2]
         return (L, R)
Example #6
 def _compute_observables_conf(self, model, estimator, mlag=1):
     # for lag time 0 we return all 1's.
     if mlag == 0 or model is None:
         return np.ones(self.nits+1), np.ones(self.nits+1)
     # otherwise compute or predict them from the model
     samples = self.test_model.sample_f('eigenvalues', self.nits+1)
     if mlag != 1:
         for i in range(len(samples)):
             samples[i] = np.power(samples[i], mlag)
     l, r = confidence_interval(samples, conf=self.conf)
     if self.exclude_stat:
         l = l[1:]
         r = r[1:]
     return l, r
Example #7
import numpy as np
import pandas as pd
import pyemma
from tqdm import tqdm


def estimate_pi_error(dtrajs, orig_msm, ntrails=10, conf_interval=0.68):
    """
    Estimate boostrap error for stationary probability

    :param dtrajs: list of np.array, discrete trajectories
    :param orig_msm: pyemma.msm.MarkovModel
    Only used for reference of lag time and to incorporate ML
    stationary distribution to data frame
    :param ntrails: int, the number of bootstrap samples to draw.
    :param conf_interval: float 0 < conf_interval < 1

    :return:
    pandas.DataFrame instance containing ML MSM pi and bootstrap error
    """
    from pyemma.util.statistics import confidence_interval

    pi_samples = np.zeros((ntrails, orig_msm.nstates))

    for trial in tqdm(range(ntrails)):
        try:
            bs_sample = np.random.choice(len(dtrajs),
                                         size=len(dtrajs),
                                         replace=True)
            dtraj_sample = list(np.array(dtrajs)[bs_sample])

            msm = pyemma.msm.estimate_markov_model(dtraj_sample,
                                                   lag=orig_msm.lag)

            # assumes each bootstrap MSM's active_set indices fit within orig_msm.nstates
            pi_samples[trial, msm.active_set] = msm.pi
        except Exception as e:
            print(e)

    std = pi_samples.std(axis=0)
    lower_confidence, upper_confidence = confidence_interval(
        pi_samples, conf_interval)

    probabilities = pd.DataFrame(
        np.array([
            orig_msm.active_set, orig_msm.pi, std, lower_confidence,
            upper_confidence
        ]).T,
        columns=['State', 'StatDist', 'Std', 'LowerConf', 'UpperConf'])

    # type cast to int
    probabilities['State'] = probabilities['State'].astype(int)

    return probabilities
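
A usage sketch (`dtrajs` and the reference MSM are hypothetical inputs):

    import pyemma

    ml_msm = pyemma.msm.estimate_markov_model(dtrajs, lag=50)
    df = estimate_pi_error(dtrajs, ml_msm, ntrails=100, conf_interval=0.68)
    print(df.head())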
Example #8
import numpy as np
import pandas as pd
import pyemma
from msmtools.estimation import connected_sets  # msmtools is PyEMMA's MSM-tools dependency
from tqdm import tqdm


def estimate_pi_error(dtrajs,
                      orig_msm,
                      ntrails=10,
                      conf_interval=0.68,
                      return_samples=False):
    """
    Estimate boostrap error for stationary probability
    
    :param dtrajs: list of np.array, discrete trajectories
    :param orig_msm: pyemma.msm.MarkovModel
    Only used for reference of lag time and to incorporate ML 
    stationary distribution to data frame
    :param ntrails: int, the number of bootstrap samples to draw. 
    :param conf_interval: float 0 < conf_interval < 1
    
    :return:
    pandas.DataFrame instance containing ML MSM pi and bootstrap error
    """
    from pyemma.util.statistics import confidence_interval

    # allocate over the full state space: bootstrap MSMs may have different active sets
    pi_samples = np.zeros((ntrails, orig_msm.count_matrix_full.shape[0]))
    all_states = np.arange(start=0,
                           stop=orig_msm.count_matrix_full.shape[0],
                           step=1)
    for trial in tqdm(range(ntrails)):
        try:
            bs_sample = np.random.choice(len(dtrajs),
                                         size=len(dtrajs),
                                         replace=True)
            dtraj_sample = list(np.array(dtrajs)[bs_sample])

            msm = pyemma.msm.estimate_markov_model(dtraj_sample,
                                                   lag=orig_msm.lag)
            stationary_probs = msm.pi
            if len(connected_sets(msm.count_matrix_full)) > 1:
                disconnected_states = [
                    element for element in all_states
                    if element not in connected_sets(msm.count_matrix_full)[0]
                ]
                if len(disconnected_states) > 0:
                    for element in disconnected_states:
                        stationary_probs = np.insert(stationary_probs, element,
                                                     0)

            pi_samples[trial, all_states] = stationary_probs
        except Exception as e:
            print(e)
    if return_samples:
        return pi_samples

    std = pi_samples.std(axis=0)
    lower_confidence, upper_confidence = confidence_interval(
        pi_samples, conf_interval)

    # expand the ML stationary distribution to the full state space so that
    # all DataFrame columns have the same length
    pi_full = np.zeros(pi_samples.shape[1])
    pi_full[orig_msm.active_set] = orig_msm.pi

    probabilities = pd.DataFrame(
        np.array([
            all_states, pi_full, std, lower_confidence, upper_confidence
        ]).T,
        columns=['State', 'StatDist', 'Std', 'LowerConf', 'UpperConf'],
    )

    # type cast to int
    probabilities['State'] = probabilities['State'].astype(int)

    return probabilities
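
A usage sketch of this extended version (hypothetical inputs as above):

    # summary table with bootstrap error bars over the full state space
    df = estimate_pi_error(dtrajs, ml_msm, ntrails=100)

    # or the raw bootstrap samples, e.g. for custom plotting
    pi_samples = estimate_pi_error(dtrajs, ml_msm, ntrails=100,
                                   return_samples=True)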

# Standalone fragment: plot the bootstrap samples of one state's stationary
# probability for several models. `samples`, `metadata_fields_to_agregate`,
# `index` and `msms` are defined in surrounding (not shown) analysis code.
import matplotlib.pyplot as plt

for n, k in enumerate(metadata_fields_to_agregate):
    leg = 'women' if 'w' in k.split('_') else 'men'
    state_samples = samples[k][:, index]

    y, x, _ = plt.hist(state_samples,
                       bins=20,
                       label=f'{leg}',
                       color=f'C{n}')
    # 68% confidence interval of the sampled stationary probability
    lower_confidence, upper_confidence = confidence_interval(
        state_samples, 0.68)

    # maximum-likelihood model estimate for reference
    plt.vlines(msms[k].pi[msms[k]._full2active[index]],
               0,
               1400,
               color='k',
               label='Model estimate' if n == 1 else None,
               linestyles='dashed')
    plt.ylabel("Count (R=10000)", size=14)
    plt.xlabel("Stationary probability", size=14)