Example #1
def test_confint_multinomial_proportions_zeros():
    # test when a count is zero or close to zero
    # values from R MultinomialCI
    ci01 = np.array([
        0.09364718, 0.1898413, 0.00000000, 0.0483581, 0.13667426, 0.2328684,
        0.10124019, 0.1974343, 0.10883321, 0.2050273, 0.17210833, 0.2683024,
        0.09870919, 0.1949033
    ]).reshape(-1, 2)

    ci0 = np.array([
        0.09620253, 0.19238867, 0.00000000, 0.05061652, 0.13924051, 0.23542664,
        0.10379747, 0.19998360, 0.11139241, 0.20757854, 0.17468354, 0.27086968,
        0.10126582, 0.19745196
    ]).reshape(-1, 2)

    # the shifts are the differences between "LOWER(SG)"  "UPPER(SG)" and
    # "LOWER(C+1)" "UPPER(C+1)" in verbose printout
    # ci01_shift = np.array([0.002531008, -0.002515122])  # not needed
    ci0_shift = np.array([0.002531642, 0.002515247])

    p = [56, 0.1, 73, 59, 62, 87, 58]
    ci_01 = smprop.multinomial_proportions_confint(p,
                                                   0.05,
                                                   method='sison_glaz')
    p = [56, 0, 73, 59, 62, 87, 58]
    ci_0 = smprop.multinomial_proportions_confint(p, 0.05, method='sison_glaz')

    assert_allclose(ci_01, ci01, atol=1e-5)
    assert_allclose(ci_0, np.maximum(ci0 - ci0_shift, 0), atol=1e-5)
    assert_allclose(ci_01, ci_0, atol=5e-4)
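
A minimal standalone call showing the same API; per the expected values above, the Sison-Glaz lower bound for the unobserved category is clipped at zero:

import numpy as np
from statsmodels.stats import proportion as smprop

counts = [56, 0, 73, 59, 62, 87, 58]  # one category never observed
ci = smprop.multinomial_proportions_confint(counts, 0.05, method='sison_glaz')
print(np.round(ci, 4))  # first column holds the lower bounds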
Example #2
def compute_multinomial_confidence_intervals(trace):
    # Thin the trace, then keep only effectively independent samples as
    # identified by pymbar's autocorrelation analysis.
    indices = pymbar.timeseries.subsampleCorrelatedData(trace[::10, 0])
    confint_trace_USE = trace[indices]

    # Partition the subsampled trace by the model index stored in column 0.
    trace_model_0 = []
    trace_model_1 = []
    trace_model_2 = []
    for i in range(np.size(confint_trace_USE, 0)):
        if confint_trace_USE[i, 0] == 0:
            trace_model_0.append(confint_trace_USE[i])
        elif confint_trace_USE[i, 0] == 1:
            trace_model_1.append(confint_trace_USE[i])
        elif confint_trace_USE[i, 0] == 2:
            trace_model_2.append(confint_trace_USE[i])

    trace_model_0 = np.asarray(trace_model_0)
    trace_model_1 = np.asarray(trace_model_1)
    trace_model_2 = np.asarray(trace_model_2)

    counts = np.asarray([len(trace_model_0), len(trace_model_1),
                         len(trace_model_2)])

    # Simultaneous confidence intervals for the three model probabilities
    prob_conf = multinomial_proportions_confint(counts)

    return prob_conf
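
A usage sketch for the function above, assuming `trace` is a 2-D MCMC trace whose first column holds an integer model index in {0, 1, 2}; the trace here is synthetic:

import numpy as np
import pymbar
from statsmodels.stats.proportion import multinomial_proportions_confint

rng = np.random.default_rng(0)
trace = np.column_stack([rng.integers(0, 3, size=5000),
                         rng.normal(size=5000)]).astype(float)
print(compute_multinomial_confidence_intervals(trace))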
Example #3
    def test_df_loc(self):
        # tests the df subsetting including subsetting the index

        subsample_index = 2
        nc = 4
        nr = subsample_index * 16
        alpha = 0.01

        a = np.arange(0, nr * nc).reshape(nr, nc)
        df_all = pd.DataFrame(a, columns=np.arange(0, nc))

        df_sub = df_all.loc[::subsample_index, :]

        in_values = df_sub.values
        out_values = np.zeros((nr // subsample_index, 2 * nc))

        for nl in range(0, in_values.shape[0]):
            ci = multinomial_proportions_confint(in_values[nl, :], alpha=alpha,
                                                 method='goodman')
            out_values[nl, 0:nc] = ci[:, 0]
            out_values[nl, nc:] = ci[:, 1]

        output_sm = pd.DataFrame(out_values, index=df_sub.index,
                                 columns=gen_ci_label(df_all.columns, 'lb')
                                 + gen_ci_label(df_all.columns, 'ub'))

        out_df = multinomial_proportions_confint_df(df_sub, alpha=alpha)
        assert_allclose(output_sm.values, out_df.values, rtol=1e-07,
                        err_msg="test_df_loc")
        self.assertSequenceEqual(list(output_sm.columns), list(out_df.columns),
                                 'test_df_loc: different columns')
        self.assertSequenceEqual(list(output_sm.index), list(out_df.index),
                                 'test_df_loc: different indices')
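
The helpers `gen_ci_label` and `multinomial_proportions_confint_df` are not shown in this snippet. A plausible minimal sketch, assuming the wrapper applies the statsmodels function row-wise and keeps the (possibly subset) index:

import numpy as np
import pandas as pd
from statsmodels.stats.proportion import multinomial_proportions_confint

def gen_ci_label(columns, suffix):
    # e.g. column 0 with suffix 'lb' becomes '0_lb'
    return ['%s_%s' % (c, suffix) for c in columns]

def multinomial_proportions_confint_df(df, alpha=0.05, method='goodman'):
    # Row-wise confidence intervals, keeping the (possibly subset) index.
    cis = np.array([multinomial_proportions_confint(row, alpha=alpha,
                                                    method=method)
                    for row in df.values])
    data = np.hstack([cis[:, :, 0], cis[:, :, 1]])  # lowers, then uppers
    cols = gen_ci_label(df.columns, 'lb') + gen_ci_label(df.columns, 'ub')
    return pd.DataFrame(data, index=df.index, columns=cols)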
Example #4
    def certification(self, model, data, label, data_adv=None, mask=None):
        if mask is None:
            bone_length = self.preprocess(data.data)
            mask = bone_length.abs().max(2)[0]
            mask = torch.sign(torch.max(mask - 1e-5,
                                        torch.zeros_like(mask))).float()
            mask = mask.view(data.size(0), 1, data.size(2), 1, data.size(4))
            mask = mask.repeat(1, data.size(1), 1, data.size(3), 1).cuda()

        counts = self.counts(data, model, mask).numpy()

        # max_counts = np.max(counts, axis=1)
        pred_label = np.argmax(counts, axis=1)

        p1, p2 = [], []
        for i in range(counts.shape[0]):
            # simultaneous CIs over counts sorted in descending order:
            # p[0, 0] is the lower bound on the top class probability,
            # p[1, 1] the upper bound on the runner-up
            p = multinomial_proportions_confint(np.sort(counts[i, :])[::-1],
                                                alpha=self.fail_prob)
            p1.append(p[0, 0])
            p2.append(p[1, 1])

        radius = np.maximum(
            0.5 * self.sigma *
            (norm.ppf(np.array(p1)) - norm.ppf(np.array(p2))), 0.0)

        if data_adv is not None:
            l2_norm = self.L2_distance(data.data, data_adv.data).cpu().numpy()
            return np.logical_and(
                np.greater(radius, l2_norm),
                np.equal(pred_label,
                         label.data.cpu().numpy()))
        return np.where(np.equal(pred_label,
                                 label.data.cpu().numpy()), radius, -1)
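
The radius computed above is the usual randomized-smoothing certificate R = (sigma / 2) * (ppf(p1) - ppf(p2)), where p1 is a simultaneous lower confidence bound on the top-class probability and p2 an upper bound on the runner-up. A standalone sketch of that core computation; the function name and counts are illustrative:

import numpy as np
from scipy.stats import norm
from statsmodels.stats.proportion import multinomial_proportions_confint

def certified_radius(class_counts, sigma, fail_prob=0.001):
    # Simultaneous CIs over all classes; sort so index 0 is the top class.
    counts = np.sort(np.asarray(class_counts))[::-1]
    ci = multinomial_proportions_confint(counts, alpha=fail_prob)
    p1 = ci[0, 0]  # lower confidence bound on the top class probability
    p2 = ci[1, 1]  # upper confidence bound on the runner-up probability
    return max(0.5 * sigma * (norm.ppf(p1) - norm.ppf(p2)), 0.0)

# e.g. 10000 smoothed predictions: 9500 top class, 400 runner-up, 100 rest
print(certified_radius([9500, 400, 100], sigma=0.5))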
Example #5
def test_confint_multinomial_proportions():
    from .results.results_multinomial_proportions import res_multinomial

    for ((method, description), values) in res_multinomial.items():
        cis = multinomial_proportions_confint(values.proportions, 0.05,
                                              method=method)
        assert_almost_equal(
            values.cis, cis, decimal=values.precision,
            err_msg='"%s" method, %s' % (method, description))
Example #7
def test_confint_multinomial_proportions_zeros():
    # test when a count is zero or close to zero
    # values from R MultinomialCI
    ci01 = np.array([
        0.09364718, 0.1898413,
        0.00000000, 0.0483581,
        0.13667426, 0.2328684,
        0.10124019, 0.1974343,
        0.10883321, 0.2050273,
        0.17210833, 0.2683024,
        0.09870919, 0.1949033]).reshape(-1, 2)

    ci0 = np.array([
        0.09620253, 0.19238867,
        0.00000000, 0.05061652,
        0.13924051, 0.23542664,
        0.10379747, 0.19998360,
        0.11139241, 0.20757854,
        0.17468354, 0.27086968,
        0.10126582, 0.19745196]).reshape(-1, 2)

    # the shifts are the differences between "LOWER(SG)"  "UPPER(SG)" and
    # "LOWER(C+1)" "UPPER(C+1)" in verbose printout
    # ci01_shift = np.array([0.002531008, -0.002515122])  # not needed
    ci0_shift = np.array([0.002531642, 0.002515247])

    p = [56, 0.1, 73, 59, 62, 87, 58]
    ci_01 = smprop.multinomial_proportions_confint(p, 0.05,
                                                   method='sison_glaz')
    p = [56, 0, 73, 59, 62, 87, 58]
    ci_0 = smprop.multinomial_proportions_confint(p, 0.05,
                                                  method='sison_glaz')

    assert_allclose(ci_01, ci01, atol=1e-5)
    assert_allclose(ci_0, np.maximum(ci0 - ci0_shift, 0), atol=1e-5)
    assert_allclose(ci_01, ci_0, atol=5e-4)
Example #8
def calculate_expansion_proportion(tree):
    """ Detects clonal expansion of all children at depth 1.

    :param tree: ete3.TreeNode, the root of the tree
    :return: A mapping from child to confidence interval
    """

    # leaf counts of the child subtrees form one multinomial sample
    props = {c: len(c.get_leaves()) for c in tree.children}

    ci = multinomial_proportions_confint(list(props.values()), alpha=0.05)
    props = {c: interval for c, interval in zip(props.keys(), ci)}
    return props
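
A usage sketch, assuming an ete3 tree whose root carries three child clades; the newick string is illustrative, and Goodman intervals are most reliable when every clade has at least 5 leaves:

from ete3 import Tree
from statsmodels.stats.proportion import multinomial_proportions_confint

tree = Tree("((A,B,C,D,E),(F,G,H,I,J),(K,L,M,N,O,P));")
for child, (lower, upper) in calculate_expansion_proportion(tree).items():
    print(len(child.get_leaves()), lower, upper)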
Example #9
    def fit(self, data):
        '''
        Parameters
        ----------
        data : array-like
            The data to use for the estimation

        Returns
        -------
        matrix_set : list
            List containing the estimated transition matrix; the lower and
            upper confidence bounds are stored in the ``confint_lower`` and
            ``confint_upper`` attributes.
        '''

        # loop over data (id, state_in, state_out)
        # calculate population count N^i_k per state i
        # calculate migrations count N^{ij}_{kl} from i to j
        # calculate transition matrix as ratio T^{ij}_{kl} = N^{ij}_{kl} / N^i_k

        # In the simple estimator all events are part of the same cohort
        state_count = self.states.cardinality
        state_list = self.states.get_states()

        # storage of counts
        tm_count = np.ndarray(state_count)
        tmn_count = np.ndarray((state_count, state_count))
        tm_count.fill(0.0)
        tmn_count.fill(0.0)

        i = 0
        for row in data.itertuples():
            state_in = state_list.index(row[2])
            state_out = state_list.index(row[3])
            # print(row[2], row[3])
            # print(state_in, state_out)
            tm_count[state_in] += 1
            tmn_count[state_in, state_out] += 1
            i += 1

        self.counts = int(tm_count.sum())

        '''Confidence intervals for multinomial proportions. See statsmodels URL
        http://www.statsmodels.org/devel/_modules/statsmodels/stats/proportion.html

        Parameters
        ----------
        counts : array_like of int, 1-D
            Number of observations in each category.
        alpha : float in (0, 1), optional
            Significance level, defaults to 0.05.
        method : {'goodman', 'sison-glaz'}, optional
            Method to use to compute the confidence intervals; available methods
            are:

             - `goodman`: based on a chi-squared approximation, valid if all
               values in `counts` are greater or equal to 5 [2]_
             - `sison-glaz`: less conservative than `goodman`, but only valid if
               `counts` has 7 or more categories (``len(counts) >= 7``) [3]_

        Returns
        -------
        confint : ndarray, 2-D
            Array of [lower, upper] confidence levels for each category, such that
            overall coverage is (approximately) `1-alpha`.
        '''

        confint_lower = np.zeros((state_count, state_count, 1))
        confint_upper = np.zeros((state_count, state_count, 1))
        for s1 in range(state_count):
            intervals = st.multinomial_proportions_confint(tmn_count[s1, :],
                                                           alpha=self.ci_alpha,
                                                           method=self.ci_method)
            for s2 in range(state_count):
                confint_lower[s1, s2, 0] = intervals[s2][0]
                confint_upper[s1, s2, 0] = intervals[s2][1]
        self.confint_lower = confint_lower
        self.confint_upper = confint_upper

        # Normalization of counts to produce family of probability matrices
        for s1 in range(state_count):
            for s2 in range(state_count):
                if tm_count[s1] > 0:
                    tmn_count[(s1, s2)] = tmn_count[(s1, s2)] / tm_count[s1]

        # for s1 in range(state_count):
        #     for s2 in range(state_count):
        #         print(confint_lower[s1, s2], tmn_count[(s1, s2)], confint_upper[s1, s2])

        self.matrix_set.append(tmn_count)

        return self.matrix_set
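
The docstring quoted above describes the two available methods. A standalone call on a single row of transition counts, with illustrative numbers:

import numpy as np
from statsmodels.stats import proportion as st

counts = np.array([60, 25, 10, 5])  # transitions out of one state
print(st.multinomial_proportions_confint(counts, alpha=0.05, method='goodman'))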
Example #10
    def fit(self, data, labels=None):
        '''
        Parameters
        ----------
        data : array-like
            The data to use for the estimation

        labels: a dictionary for relabeling column names

        Returns
        -------
        matrix_set : list
            The list of estimated transition matrices; the lower and upper
            confidence bounds are stored in the ``confint_lower`` and
            ``confint_upper`` attributes.
        '''

        # loop over data (id, period, state)
        # calculate population count N^i_k per state i per period k
        # calculate migrations count N^{ij}_{kl} from i to j from period k to period l
        # calculate transition matrix as ratio T^{ij}_{kl} = N^{ij}_{kl} / N^i_k

        if labels is not None:
            state_label = labels['State']
            timestep_label = labels['Timestamp']
            id_label = labels['ID']
        else:
            state_label = 'State'
            id_label = 'ID'
            timestep_label = 'Timestep'

        state_dim = self.states.cardinality
        cohort_labels = data[timestep_label].unique()
        cohort_dim = len(cohort_labels) - 1
        event_count = data[id_label].count()

        # store data in 1d arrays for fast processing
        # capture nan events for missing observations
        event_exists = np.empty(event_count, int)
        event_id = np.empty(event_count, int)
        event_state = np.empty(event_count, int)
        event_time = np.empty(event_count, int)
        nan_count = 0
        i = 0
        # TODO read data by labels, not column location
        for row in data.itertuples():
            try:
                event_state[i] = int(row[3])
                event_id[i] = row[1]
                event_time[i] = int(row[2])
                event_exists[i] = 1
            except ValueError:
                nan_count += 1
            i += 1
        self.nans = nan_count

        # storage of counts
        # number of entities observed in given state per period
        tm_count = np.ndarray((state_dim, cohort_dim), int)
        # number of entities observed to transition from state to state per period
        tmn_count = np.ndarray((state_dim, state_dim, cohort_dim), int)
        tm_count.fill(0)
        tmn_count.fill(0)
        # normalized frequencies
        tmn_values = np.ndarray((state_dim, state_dim, cohort_dim), float)
        tmn_values.fill(0.0)

        # TODO Capture case if entity with only one observation (hence no transition count)

        for i in range(1, event_count - 1):
            if event_exists[i] == 1:
                # while processing event data from same entity
                if event_id[i + 1] == event_id[i]:
                    tm_count[(event_state[i], event_time[i])] += 1
                    tmn_count[(event_state[i], event_state[i + 1], event_time[i])] += 1
                # last data point from entity data
                # elif event_id[i + 1] != event_id[i] and event_id[i] == event_id[i - 1]:
                #     tm_count[(event_state[i], event_time[i])] += 1
                #     tmn_count[(event_state[i - 1], event_state[i], event_time[i])] += 1
                # elif event_id[i + 1] != event_id[i] and event_id[i] != event_id[i - 1]:
                #     sys.exit("Isolated observation in data")

        # boundary cases
        #
        i = 0
        if event_exists[i] == 1:
            if event_id[i + 1] == event_id[i]:
                tm_count[(event_state[i], event_time[i])] += 1
                tmn_count[(event_state[i], event_state[i + 1], event_time[i])] += 1
        #
        # i = event_count - 1
        # if event_exists[i] == 1:
        #     if event_id[i] == event_id[i - 1]:
        #         tm_count[(event_state[i], event_time[i])] += 1
        #         tmn_count[(event_state[i - 1], event_state[i], event_time[i])] += 1

        self.counts = int(tm_count.sum())

        # Confidence Interval Estimation (Based on Counts)
        confint_lower = np.zeros((state_dim, state_dim, cohort_dim))
        confint_upper = np.zeros((state_dim, state_dim, cohort_dim))
        for k in range(cohort_dim):
            for s1 in range(state_dim):
                intervals = st.multinomial_proportions_confint(tmn_count[s1, :, k],
                                                               alpha=self.ci_alpha,
                                                               method=self.ci_method)
                for s2 in range(state_dim):
                    confint_lower[s1, s2, k] = intervals[s2][0]
                    confint_upper[s1, s2, k] = intervals[s2][1]
        self.confint_lower = confint_lower
        self.confint_upper = confint_upper

        # Normalization of counts to produce family of probability matrices
        for s1 in range(state_dim):
            for s2 in range(state_dim):
                for k in range(cohort_dim):
                    if tm_count[(s1, k)] > 0:
                        tmn_values[(s1, s2, k)] = tmn_count[(s1, s2, k)] / tm_count[(s1, k)]
                        # print(s1, s2, k, tmn_values[(s1, s2, k)], tmn_count[(s1, s2, k)], tm_count[(s1, k)])

        # Return a list of transition matrices
        for k in range(cohort_dim):
            self.matrix_set.append(tmn_values[:, :, k])
            self.count_set.append(tmn_count[:, :, k])
            self.count_normalization.append(tm_count[:, k])

        return self.matrix_set
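
The counting scheme in this estimator reduces to tallying per-period state occupancies and transitions, then dividing. A compact standalone version of that core on toy data; all names are illustrative:

import numpy as np

# (id, time, state) observations: two entities over three timepoints
events = [(0, 0, 0), (0, 1, 1), (0, 2, 1),
          (1, 0, 0), (1, 1, 0), (1, 2, 1)]
state_dim, cohort_dim = 2, 2
tm = np.zeros((state_dim, cohort_dim), int)
tmn = np.zeros((state_dim, state_dim, cohort_dim), int)
for (i1, t1, s1), (i2, t2, s2) in zip(events, events[1:]):
    if i1 == i2:  # consecutive rows from the same entity form a transition
        tm[s1, t1] += 1
        tmn[s1, s2, t1] += 1
matrices = [tmn[:, :, k] / np.maximum(tm[:, k], 1)[:, None]
            for k in range(cohort_dim)]
print(matrices[0])  # transition probabilities for the first period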
Example #11
def simultaneous_confint_from_cdf(alpha, n_samples, x, cdf):
    # `binom` is presumably an alias for statsmodels' proportion module in
    # the source project; the expected bin counts may be non-integer.
    return binom.multinomial_proportions_confint(
        n_samples*np.diff(cdf), alpha=alpha, method='sison-glaz'
    )
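
A usage sketch, assuming `binom` aliases statsmodels' `proportion` module and `cdf` is evaluated on a grid spanning both tails; eight bins satisfy the sison-glaz requirement of at least seven categories:

import numpy as np
from scipy.stats import norm
from statsmodels.stats import proportion as binom

x = np.linspace(-4.0, 4.0, 9)  # nine edges -> eight bins
cdf = norm.cdf(x)
print(simultaneous_confint_from_cdf(0.05, 1000, x, cdf))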
Example #12
    def fit(self, data, labels=None):
        """
        Parameters
        ----------
        data : dataframe - The data to use for the estimation (sorted by ID, in compact format)

        labels: an optional dictionary for relabeling column names

        Returns
        -------
        matrix_set : An estimated transition matrix set

        Notes
        ------

        * loop over data rows (id, timepoint, state)
        * at least two distinct timepoints are required (initial and final)
        * calculate population count N^i_k per state i per timepoint k
        * calculate migrations count N^{ij}_{kl} from i to j from timepoint k to timepoint l
        * calculate transition matrix as ratio T^{ij}_{kl} = N^{ij}_{kl} / N^i_k
        * calculate also count-averaged matrix

        """

        # Allow for flexible labelling for dataframe columns
        if labels is not None:
            state_label = labels['State']
            timestep_label = labels['Time']
            id_label = labels['ID']
        else:
            state_label = 'State'
            id_label = 'ID'
            timestep_label = 'Time'

        # Old way of enumerating cohort intervals was using labels
        # cohort_labels = data[timestep_label].unique()
        # cohort_dim = len(cohort_labels) - 1

        # The size of the state space
        state_dim = self.states.cardinality
        # The number of cohorts is the number of intervals
        # Minimally two (initial and final)
        cohort_dim = len(self.cohort_bounds) - 1
        event_count = data[id_label].count()

        # store data in 1d arrays for faster processing
        # capture nan events for missing observations
        event_exists = np.empty(event_count, int)
        entity_id = np.empty(event_count, int)
        entity_state = np.empty(event_count, int)
        event_time = np.empty(event_count, int)
        nan_count = 0

        i = 0
        for index, row in data.iterrows():
            entity_id[i] = row[id_label]
            try:
                entity_state[i] = row[state_label]
                event_time[i] = row[timestep_label]
                event_exists[i] = 1  # indicates a valid (complete) data row
            except ValueError:
                entity_state[i] = -99999
                event_time[i] = -99999
                event_exists[i] = 0
                nan_count += 1
            i += 1
        self.nans = nan_count

        # store number of entities observed in given state per time step
        tm_count = np.ndarray((state_dim, cohort_dim + 1), int)
        # store number of entities observed to transition from state (From) to state (To) per period
        tmn_count = np.ndarray((state_dim, state_dim, cohort_dim), int)
        # store normalized frequencies
        tmn_values = np.ndarray((state_dim, state_dim, cohort_dim), float)
        # matrix to store average transitions
        tmn_average = np.ndarray((state_dim, state_dim), float)

        # initialize to zero (TODO ?)
        tm_count.fill(0)
        tmn_count.fill(0)
        tmn_values.fill(0)
        tmn_average.fill(0)

        # TODO Capture case if entity with only one observation (hence no transition count)
        # TODO Capture case with stale observations (no transitions)

        for i in range(0, event_count - 1):  # the last point handled separately
            if event_exists[i] == 1:
                # while processing valid event data from same entity
                # increment state count
                tm_count[(entity_state[i], event_time[i])] += 1
                if entity_id[i + 1] == entity_id[i]:
                    # increment migration count if there is subsequent observation
                    # NB: It does not have to be different
                    tmn_count[(entity_state[i], entity_state[i + 1], event_time[i])] += 1

        # handle boundary cases
        # the last event must be evaluated in comparison with its previous one
        i = event_count - 1
        if event_exists[i] == 1:
            # ATTN we must shift the time index of the tm_count, tmn_count
            tm_count[(entity_state[i], event_time[i] - 1)] += 1
            if entity_id[i] == entity_id[i - 1]:
                tmn_count[(entity_state[i - 1], entity_state[i], event_time[i] - 1)] += 1

        # print(tm_count)
        # print(tm_count.sum())
        # print(tmn_count[:, :, 0])

        self.counts = int(tm_count.sum())

        # Normalization of counts to produce a family of probability matrices
        for s1 in range(state_dim):
            for s2 in range(state_dim):
                for k in range(cohort_dim):
                    if tm_count[(s1, k)] > 0:
                        tmn_values[(s1, s2, k)] = tmn_count[(s1, s2, k)] / tm_count[(s1, k)]

        # for k in range(cohort_dim):
        #     m = transitionMatrix.TransitionMatrix(tmn_values[:, :, k])
        #     m.print_matrix(accuracy=3)

        # Average transition matrix (assuming temporal homogeneity)
        for s1 in range(state_dim):
            for s2 in range(state_dim):
                tm_total_count = 0
                for k in range(cohort_dim):
                    tmn_average[(s1, s2)] += tmn_count[(s1, s2, k)]
                    tm_total_count += tm_count[(s1, k)]
                if tm_total_count > 0:
                    tmn_average[(s1, s2)] /= tm_total_count
        self.average_matrix = tmn_average

        # Confidence Interval Estimation (Based on Counts)
        # zero-initialize so the final period (not populated below) is defined
        confint_lower = np.zeros((state_dim, state_dim, cohort_dim))
        confint_upper = np.zeros((state_dim, state_dim, cohort_dim))
        for k in range(cohort_dim - 1):
            for s1 in range(state_dim):
                intervals = st.multinomial_proportions_confint(tmn_count[s1, :, k],
                                                               alpha=self.ci_alpha,
                                                               method=self.ci_method)
                for s2 in range(state_dim):
                    confint_lower[s1, s2, k] = intervals[s2][0]
                    confint_upper[s1, s2, k] = intervals[s2][1]
        self.confint_lower = confint_lower
        self.confint_upper = confint_upper

        # Return a list of transition matrices
        # Both absolute (frequency) and relative (probability) format
        for k in range(cohort_dim):
            self.matrix_set.append(tmn_values[:, :, k])
            self.count_set.append(tmn_count[:, :, k])


        # Return absolute counts at time points
        for k in range(cohort_dim + 1):
            self.count_normalization.append(tm_count[:, k])

        # print(self.count_normalization)

        return self.matrix_set
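
The count-averaged matrix above pools transitions across periods and divides by pooled occupancies. An equivalent vectorized computation on toy counts, under the simplifying assumption that every occupancy has a following observation:

import numpy as np

tmn_count = np.array([[[3, 1], [1, 2]],
                      [[0, 2], [1, 1]]])  # indexed (from, to, period)
tm_count = tmn_count.sum(axis=1)          # occupancy per (from, period)
average = (tmn_count.sum(axis=2)
           / np.maximum(tm_count.sum(axis=1), 1)[:, None])
print(average)  # rows sum to 1 for states that were observed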