Пример #1
0
def bezier_matrix(degree):
    """Build a ``(degree + 1) x (degree + 1)`` matrix of signed binomial products.

    With ``m = degree``, entry ``[i, j]`` is
    ``C(m, j) * C(m - j, m - i - j) * (-1)**(m - i - j)`` whenever
    ``i + j <= m``; every other entry is left at zero.  Judging by the
    function name, this is the coefficient matrix of the Bezier (Bernstein)
    basis of the given degree.

    Args:
        degree: Non-negative integer polynomial degree.

    Returns:
        A float ``numpy`` array of shape ``(degree + 1, degree + 1)``.
    """
    size = degree + 1
    coeffs = np.zeros((size, size))
    for row in range(size):
        # Only columns with row + col <= degree carry a non-zero entry.
        for col in range(size - row):
            power = degree - row - col
            coeffs[row, col] = choose(degree, col) * choose(
                degree - col, power) * ((-1)**power)
    return coeffs
Пример #2
0
def bezier1d(points):
    """Return the x and y coordinate functions of a planar Bezier curve.

    Args:
        points: Sequence of control points; element ``k`` must expose its x
            coordinate at index 0 and its y coordinate at index 1.

    Returns:
        A pair ``(bez_x, bez_y)`` of callables.  Each takes a parameter ``t``
        (a scalar or anything ``np.power`` accepts) and evaluates
        ``sum_k C(N, k) * t**k * (1 - t)**(N - k) * points[k][axis]`` with
        ``N = len(points) - 1``.
    """
    order = len(points) - 1

    def coordinate(axis):
        # One Bernstein-weighted sum per coordinate axis.
        def evaluate(t):
            return sum([
                choose(order, k) * np.power(t, k) * np.power(1 - t, order - k)
                * points[k][axis]
                for k in range(order + 1)
            ])
        return evaluate

    return coordinate(0), coordinate(1)
Пример #3
0
def abund_log_prob(genotype, abundance, refrabund=None, mean=30.0, sd=8.0,
                   error=0.001):
    """Return the log probability of a k-mer abundance given a genotype.

    `genotype` is the assumed number of allele copies: 0, 1 or 2
    (genotypes 0/0, 0/1 and 1/1).  `mean` and `sd` parameterize the normal
    distribution of abundances observed for k-mers with copy number 2, and
    `error` is the sequencing error rate.

    * Genotype 2: normal log-density with parameters (`mean`, `sd`).
    * Genotype 1: normal log-density with both parameters halved.
    * Genotype 0: binomial log-probability of seeing `abundance` erroneous
      k-mers out of `mean * refrabund` trials, each with probability `error`.
      The abundance is capped at the number of trials.

    For SNVs every alternate-allele k-mer has a matching reference-allele
    k-mer, so `refrabund` (the reference k-mer frequency) scales the number
    of trials for repetitive sequence.  Indels have no such mapping: when
    `refrabund` is falsy it is treated as 1 and the error rate is reduced by
    a factor of 100.

    Any other `genotype` value yields None.
    """
    if genotype == 1:
        return scipy.stats.norm.logpdf(abundance, mean / 2, sd / 2)
    if genotype == 2:
        return scipy.stats.norm.logpdf(abundance, mean, sd)
    if genotype != 0:
        return None

    if not refrabund:  # INDEL mode
        refrabund, error = 1, error * 0.01
    trials = mean * refrabund
    # Cannot observe more error k-mers than there are trials.
    observed = trials if abundance > trials else abundance

    log_combinations = log(choose(trials, observed, exact=True))
    log_errors = observed * log(error)
    log_correct = (trials - observed) * log(1.0 - error)
    return log_combinations + log_errors + log_correct
Пример #4
0
def _chao_7d(x, n, f1, p1, q):
    """Evaluate the two-part estimator ``(A + B)**(1 / (1 - q))``.

    NOTE(review): the name suggests this is equation (7d) of a Chao et al.
    diversity estimator -- confirm against the reference used by the caller.

    Args:
        x: Observed values (abundances); only the distinct values and how
            often each occurs are used.
        n: Sample size used in the binomial terms.
        f1: Singleton count; ``B`` is zero when this is 0.
        p1: Probability parameter; ``B`` is zero when this is 1.
        q: Order of the estimator (must differ from 1, otherwise the final
            exponent divides by zero).

    Returns:
        ``(A + B)**(1 / (1 - q))`` where ``A`` sums, over distinct values,
        ``count * sum_k C(k - q, k) * C(n - k - 1, z - 1) / C(n, z)`` and
        ``B`` is the correction term built from ``f1`` and ``p1``.
    """
    values, multiplicity = np.unique(x, return_counts=True)
    log_norm = stats.lchoose(n, values)

    per_value = np.zeros(values.shape[0])
    for idx, value in enumerate(values):
        ks = np.arange(n - value + 1)
        # The ratio of binomials is formed in log space before exponentiating.
        log_ratio = stats.lchoose(n - ks - 1, value - 1) - log_norm[idx]
        per_value[idx] = np.sum(choose(ks - q, ks) * np.exp(log_ratio))
    observed_part = np.sum(multiplicity * per_value)

    if f1 == 0 or p1 == 1:
        unseen_part = 0
    else:
        scale = f1 / n * (1 - p1)**(1. - n)
        orders = np.arange(n)
        bracket = (p1**(q - 1)) - np.sum(
            choose(q - 1, orders) * (p1 - 1) ** orders)
        unseen_part = scale * bracket
    return (observed_part + unseen_part)**(1 / (1 - q))
Пример #5
0
    def _get_vacancies(self):
        '''Create copies of the seed configuration with atoms removed.

        Reads ``self.atoms`` (the seed configuration), ``self.vac_per_atom``
        (fraction of atoms to remove, e.g. 0.1 is 1 in every 10 atoms) and
        ``self.ran_seed`` (seed for ``np.random``, set before any selection
        so that runs are reproducible).

        The number of vacancies per cell is
        ``int(num_atoms * self.vac_per_atom)``.  When the number of possible
        vacancy combinations exceeds 1000 the index sets come from
        ``self._get_random_choice``; otherwise they come from
        ``self._get_combinations``.

        Returns:
            tuple: ``(atom_seed, select_atoms)`` where ``atom_seed`` is an
                 ``AtomsList`` holding one copy of ``self.atoms`` per entry
                 of ``select_atoms`` with that entry deleted, and
                 ``select_atoms`` is the list of index selections removed.
        '''
        atom_count = int(len(self.atoms.get_positions()))
        vacancy_count = int(atom_count * self.vac_per_atom)

        # Seed before selecting so the chosen vacancies are reproducible.
        np.random.seed(self.ran_seed)
        if choose(atom_count, vacancy_count) > 1000:
            selector = self._get_random_choice
        else:
            selector = self._get_combinations
        select_atoms = selector([], atom_count, vacancy_count)

        atom_seed = AtomsList()
        for removal in select_atoms:
            trimmed = self.atoms.copy()
            del trimmed[removal]
            atom_seed.append(trimmed)
        return atom_seed, select_atoms
Пример #6
0
    def _get_combinations(self, select_atoms, num_atoms, num_vac):
        '''Pick vacancy index sets by shuffling every possible combination.

        This approach allows for simple, efficient random iteration over all
        possible vacancies for small cell sizes.  The caller limits it to
        cases where ``num_atoms`` choose ``num_vac`` is at most 1000.

        The ranks ``0 .. C(num_atoms, num_vac) - 1`` are shuffled with
        ``np.random.shuffle`` and ``self.nconfigs`` of them are taken starting
        at position ``self.min_index``.  Each rank is then mapped to the
        combination at that position in
        ``itertools.combinations(range(num_atoms), num_vac)``.

        Args:
            select_atoms(list): the selected indices to be removed
                 from each config; new selections are appended to it.
            num_atoms(int): number of atoms present in the cell.
            num_vac(int): number of vacancies to include in each config.
        Returns:
            select_atoms(list): the indices to remove from each
                 config.  Every appended entry is a one-element list
                 holding the tuple of atom indices.
        '''
        # np.random.shuffle permutes its argument in place, so it needs a
        # mutable sequence: on Python 3 a bare ``range`` object is immutable
        # and shuffling it raises TypeError.
        atomic_vacancies = list(
            range(choose(num_atoms, num_vac, exact=True)))
        np.random.shuffle(atomic_vacancies)  # shuffle all possible options
        atomic_vacancies = list(
            islice(atomic_vacancies, self.min_index,
                   (self.min_index + self.nconfigs)))
        for i in atomic_vacancies:
            # islice(..., i, i + 1) extracts the i-th combination.
            select_atoms.append(
                list(islice(combinations(range(num_atoms), num_vac), i,
                            i + 1)))
        return select_atoms
Пример #7
0
    def cdf(self, a, b=None):
        """Computes P(X < `a`) for X distributed like this Gaussian.

        If `b` is also specified, this function will compute P(`a` < X < `b`).

        For multivariate Gaussians, this function performs inclusion-exclusion on (2 ^ N) CDF
        results, which computes the hypercubic intersection between CDF(upper limit) and CDF(lower
        limit). In the 2D case with lower corner `a` = (a1, a2) and upper corner `b` = (b1, b2),
        this works out to CDF(b1, b2) - CDF(a1, b2) - CDF(b1, a2) + CDF(a1, a2).

        When the input holds several query points (one per row), every row is evaluated
        independently and the results are returned as an array, or as a `pd.Series` carrying the
        row labels of `a` when `a` is a pandas object. Rows are always paired by position.

        An interval that is empty or degenerate in any dimension (`b` <= `a`) has probability 0.

        NOTE: Ensure that the Gaussian covariance is positive semi-definite.

        TODO: Figure out how to use `mvnormcdf` from `statsmodels` for more efficient multivariate
        CDF intervals via sampling. Although their code looks sus...

        >>> Gaussian(1, 9).cdf(4)
        0.841344746068543

        >>> Gaussian(1, 9).cdf([4, 1])
        array([0.84134475, 0.5       ])

        >>> Gaussian(0, 1).cdf(-1, 1)
        0.6826894921370861

        >>> Gaussian(0, 1).cdf([-1, -2], [1, 2])
        array([0.68268949, 0.95449974])

        Output is slightly non-deterministic:
        >>> Gaussian(pd.Series([0, 0, 0]), pd.DataFrame([ \
                [ 2, -1,  0], \
                [-1,  2, -1], \
                [ 0, -1,  2] \
            ])).cdf([ \
                [0, 0, 0], \
                [-4, -2, -3] \
            ], [ \
                [1, 1, 1], \
                [1, 2, 4] \
            ]).round(3)
        array([0.017, 0.644])

        >>> Gaussian(pd.Series([0, 2], index=['a', 'b']), [1, 1]) \
                .cdf(pd.Series([2, 0], index=['b', 'a']))
        0.25

        """
        # Consider pre-computing and storing this distribution on the Gaussian.
        distribution = multivariate_normal(self.mean,
                                           self.covariance,
                                           allow_singular=True)

        if self.__should_vectorize(a):
            if b is not None and isinstance(a, pd.DataFrame):
                # Give `b` the labels of `a` so both rows of a pair are labelled alike.
                b = pd.DataFrame(b)
                b.columns, b.index = a.columns, a.index

            def row(values, position):
                # A missing upper limit must stay `None` for every row; otherwise the recursive
                # call would mistake a row of placeholders for a real interval bound.
                if values is None:
                    return None
                # Select by position: row labels need not be 0..n-1.
                if isinstance(values, (pd.DataFrame, pd.Series)):
                    return values.iloc[position]
                return values[position]

            result = np.array(
                [self.cdf(row(a, i), row(b, i)) for i in range(len(a))])

            # Restore row labels as necessary.
            if isinstance(a, (pd.DataFrame, pd.Series)):
                return pd.Series(result, index=a.index)
            return result

        # Sort `a` labels to match `mean` indexing.
        if self.__has_similar_labels(a):
            a = a[self.__mean.index]
        if b is None:
            return distribution.cdf(a)

        # Sort `b` labels to match `mean` indexing.
        if self.__has_similar_labels(b):
            b = b[self.__mean.index]

        # Multivariate intervals require a non-degenerate hypercube. This check runs after the
        # label alignment above so that each lower limit is compared with its own upper limit.
        if np.any(np.array(b) - np.array(a) <= 0):
            return 0

        # Apply inclusion-exclusion (see function header) to compute multivariate intervals.
        num_vars = len(self.__mean)
        # E.g. [(0, 0), (0, 1), (1, 0), (1, 1)] for 2 variables. More generally, these are the
        # (2 ^ N) bit vectors of length N, sorted ascending by the number of ones.
        corner_masks = sorted(itertools.product([0, 1], repeat=num_vars),
                              key=sum)

        result = 0
        for mask in corner_masks:
            # A one bit selects the lower limit (`a`) for that variable, a zero bit the upper
            # limit (`b`). E.g. 1001 takes the first and fourth variables from `a` and the
            # second and third from `b`.
            corner = np.where(np.array(mask, dtype=bool), a, b)
            # Each lower limit used flips the sign: this is the power of (-1) in the standard
            # inclusion-exclusion formula.
            sign = -1 if sum(mask) % 2 else 1
            result += sign * distribution.cdf(corner)
        return result
Пример #8
0
def gp(d_alpha, verbose=False):
    """Recursively compute the invariant indexed by ``d_alpha = (d, alpha)``.

    Results are memoized in the module-level dict ``gp_computed_values``,
    keyed by ``(d, canonical_alpha)``.

    Args:
        d_alpha: Pair ``(d, alpha)`` of an integer degree and a sequence of
            integers.
        verbose: When true, print a line each time a recursion is applied.

    Returns:
        The value of the invariant.  Conventions handled before the
        recursion: any negative entry in ``alpha`` gives 0, except
        ``d == 0`` with ``alpha == (-1,)`` which gives 1; a negative
        ``n = 3*d - 1 - sum(alpha)`` gives 0; ``d == 1`` with
        ``sum(alpha) == 0`` gives 1.

    Raises:
        Exception: if neither branch of the recursion applies, or if the
            division expected in the second branch is not exact.
    """
    d, alpha = d_alpha
    raw_sum = np.sum(alpha)

    if len(alpha) > 0 and np.min(alpha) < 0:
        special_case = d == 0 and len(alpha) == 1 and raw_sum == -1
        return 1 if special_case else 0
    if 3 * d - 1 - raw_sum < 0:
        return 0

    alpha = make_canonical_alpha(alpha)
    key = (d, alpha)
    alpha_sum = np.sum(alpha)

    if key in gp_computed_values:
        return gp_computed_values[key]

    # The number of extra point constraints needed to give the invariant
    # corresponding to d_alpha index 0.
    n = 3 * d - 1 - alpha_sum

    if d == 1 and alpha_sum == 0:
        return 1

    if verbose:
        print('Applying GP recursion for d = %s, alpha = %s' %(d,str(alpha)))

    # This condition checks whether the invariant is automatically zero by
    # the adjunction formula.
    if np.sum([elt * (elt - 1) for elt in alpha]) > (d - 1) * (d - 2):
        gp_computed_values[key] = 0
        return 0

    if n >= 3:
        total = 0
        for (d1, alpha1), (d2, alpha2) in decompositions(key):
            n1 = 3 * d1 - 1 - np.sum(alpha1)
            pair_value = gp((d1, alpha1), verbose) * gp((d2, alpha2), verbose)
            intersection = d1 * d2 - np.dot(alpha1, alpha2)
            weight = d1 * d2 * choose(n - 3, n1 - 1) - d1**2 * choose(n - 3, n1)
            total += pair_value * intersection * weight
        gp_computed_values[key] = total
        return total

    if len(alpha) == 0:
        raise Exception('Error! Was not able to apply either branch of the recursion...')

    a = alpha[0]
    lowered = (d, tuple([alpha[0] - 1] + list(alpha[1:])))

    total = (d**2 - (a - 1)**2) * gp(lowered, verbose)
    for (d1, alpha1), (d2, alpha2) in decompositions(lowered):
        n1 = 3 * d1 - 1 - np.sum(alpha1)
        b = alpha1[0]
        c = alpha2[0]
        pair_value = gp((d1, alpha1), verbose) * gp((d2, alpha2), verbose)
        intersection = d1 * d2 - np.dot(alpha1, alpha2)
        weight = d1 * d2 * b * c - d1**2 * c**2
        total += pair_value * intersection * weight * choose(n, n1)

    divisor = d**2 * a
    if total % divisor != 0:
        raise Exception('Error! Expected divisibility in second branch of the recursion does not hold...')
    total = total // divisor
    gp_computed_values[key] = total
    return total
Пример #9
0
    r = 3
    """
    """ rows and cols of square grid """

    ks = range(4, 12 + 1)
    for i, k in enumerate(ks):
        print("k=%d" % k)
        # SQUARE
        m = k**2
        n = m // 2  #(3*m)//4
        H = square_hypergraph(m, k)
        r = 2  # since SIP satisfied by (row,col) pair

        # CIRCLE
        #k = 3 # choose something less than m
        #H = cyclic_hypergraph(m, k)
        #r = k

        N_per_support = int((k - 1) * choose(m, k, exact=True)) + 1
        N = len(H) * N_per_support
        num_trials = 1000
        c2s = np.zeros(num_trials)
        for t in range(num_trials):
            print("Trial %d" % t, end='\r')
            A = np.random.randn(n, m)
            A = np.dot(A, np.diag(1. / np.linalg.norm(A, axis=0)))  # normalize
            c2s[t] = C2(A, H, r)
            if t % 100 == 0:
                np.save("./c2s_k=%d_2x" % k, np.sort(c2s[c2s < np.inf]))

        np.save("./c2s_k=%d_2x" % k, np.sort(c2s[c2s < np.inf]))
Пример #10
0
def hypergeometric(n, k, N, K):
    """Return the hypergeometric probability of an overlap of size ``k``.

    Computes ``C(n, k) * C(N - n, K - k) / C(N, K)``: out of a population of
    ``N`` items of which ``n`` are marked, the probability that a subset of
    ``K`` items contains exactly ``k`` marked ones.  The expression is
    symmetric in ``n`` and ``K``, so the two roles may be swapped.

    The denominator must be ``C(N, K)``: by Vandermonde's identity the
    numerators sum to ``C(N, K)`` over all ``k``, which makes the
    probabilities add up to 1.

    Args:
        n: Number of marked items in the population.
        k: Number of marked items in the subset.
        N: Population size.
        K: Subset size.

    Returns:
        The probability as a float.
    """
    return choose(n, k) * choose(N - n, K - k) / choose(N, K)
Пример #11
0
def binomialDist(x, n, p):
    """Return the binomial probability of ``x`` successes in ``n`` trials.

    Evaluates ``C(n, x) * p**x * (1 - p)**(n - x)`` where ``p`` is the
    per-trial success probability.
    """
    ways = choose(n, x)
    success_part = p**x
    failure_part = (1 - p)**(n - x)
    return ways * success_part * failure_part
Пример #12
0
def bij(t, i, n):
    """Evaluate ``C(n, i) * t**i * (1 - t)**(n - i)``.

    This is the ``i``-th Bernstein basis polynomial of degree ``n`` at
    parameter ``t``.
    """
    weight = choose(n, i)  # binomial coefficient
    rising = t**i
    falling = (1 - t)**(n - i)
    return weight * rising * falling
Пример #13
0
def likelihood(n, h, p):
    '''Returns the probability of h heads in n trials, with probability of heads being p.

    The binomial coefficient is computed exactly (integer arithmetic).
    '''
    arrangements = choose(n, h, exact=True)
    heads_part = p**h
    tails_part = (1 - p)**(n - h)
    return arrangements * heads_part * tails_part
Пример #14
0
def projection_fun(x, i, n, N, s=0, u=1e-8):
    """Weight ``afs_inf_sites(x, N, s, u)`` by a binomial sampling term.

    Returns ``C(n, i) * x**i * (1 - x)**(n - i) * afs_inf_sites(x, N, s, u)``.
    NOTE(review): presumably ``x`` is a frequency in [0, 1] and this is the
    integrand for projecting a frequency spectrum onto a sample of size
    ``n`` -- confirm against the caller.
    """
    # this can likely be done better in log space
    sampling_term = choose(n, i) * np.power(x, i) * np.power(1 - x, n - i)
    return sampling_term * afs_inf_sites(x, N, s, u)