def test_cross_validator_with_default_indices():
    n_samples = 4
    n_unique_labels = 4
    n_folds = 2
    p = 2
    n_iter = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_folds)
    skf = StratifiedKFold(n_folds)
    lolo = LeaveOneLabelOut()
    lopo = LeavePLabelOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = no. of unique folds = 2

    n_splits = [n_samples, comb(n_samples, p), n_folds, n_folds,
                n_unique_labels, comb(n_unique_labels, p), n_iter, 2]

    for i, cv in enumerate([loo, lpo, kf, skf, lolo, lopo, ss, ps]):
        # Test if get_n_splits works correctly
        assert_equal(n_splits[i], cv.get_n_splits(X, y, labels))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, labels)),
                                list(cv.split(X_1d, y, labels)))
        # Test that train, test indices returned are integers
        for train, test in cv.split(X, y, labels):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')
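# A minimal standalone cross-check (not part of the original test) that the
# comb-based split counts above match what scikit-learn reports. Assumes the
# modern sklearn.model_selection API and scipy.special.comb (the current home
# of the old scipy.misc.comb).
import numpy as np
from scipy.special import comb
from sklearn.model_selection import KFold, LeavePOut

X_check = np.arange(8).reshape(4, 2)
assert LeavePOut(2).get_n_splits(X_check) == comb(4, 2, exact=True)  # 6 held-out pairs
assert KFold(2).get_n_splits(X_check) == 2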
Exemplo n.º 2
0
def findF(k,p,q,x, side):
    
    #print k, p, q, x
    
    if k in Fdict:
        if p in Fdict[k]:
            if q in Fdict[k][p]:
                if x in Fdict[k][p][q]:
                    if side in Fdict[k][p][q][x]:
                        return Fdict[k][p][q][x][side]
    
    if q>0:
        if side==LEFT: #left
            ret=0
            for l in xrange(min(p*q+1, x+1)):
                ret+=findF(k, p, 0, x-l, LEFT)*sc.comb(p*q,l)
            
        elif side==RIGHT: #right
            ret=0
            for l in xrange(min(p*q+1, x+1)):
                if l==0:
                    ret+=0 #findF(k, p, 0, x-l, LEFT)  it requires at least one edge from left to right.
                else:
                    
                    ret+=findF(k, p, 0, x-l, LEFT)*sc.comb(q,l) #external nodes can only connect the "root" of the right tree
                    
                    #ret+=findF(k, p, 0, x-l, LEFT)*sc.comb(q,1)*sc.comb((p-1)*q,l-1)   
    else:
        if k==0:
            if p==1:
                if q==0:
                    if x==0:
                        ret=1
                    else:
                        ret=0
                else:
                    print "Error 1"
            else:
                print "Errro 0"
        else:
            ret=0
            for i in xrange(k):
                tmp=0
                for j in xrange(k-i-1,x-i):
                    tmp += findF(k-i-1,k-i,i+1,j, LEFT)*findF(i, i+1, k-i, x-j, RIGHT)
                ret += tmp*sc.comb(k-1,i)
                
    
    if k not in Fdict:
        Fdict[k]={}
    if p not in Fdict[k]:
        Fdict[k][p]={}
    if q not in Fdict[k][p]:
        Fdict[k][p][q]={}
    if x not in Fdict[k][p][q]: 
        Fdict[k][p][q][x]={}
    Fdict[k][p][q][x][side]=ret
     
    
    return ret
Exemplo n.º 3
0
def solution3():
    '''
    >>> solution3()
    137846528820
    '''
    from scipy.misc import comb
    print comb(40, 20, exact=True)
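# Hedged cross-check of the same count using only the standard library: the
# number of 20x20 lattice paths is the central binomial coefficient C(40, 20).
from math import factorial

def central_binom(n):
    # exact integer arithmetic, no SciPy needed
    return factorial(2 * n) // (factorial(n) ** 2)

assert central_binom(20) == 137846528820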
Exemplo n.º 4
0
Arquivo: pf.py Projeto: ivanov/PyPsy
    def eval_gof( self, data):
        probs = self.eval(data.levels)
        expected = probs * data.Ntrials
        # Compute various goodness of fit statistics
        LL = -2*sum((data.Ncorr*np.log(expected/(data.Ncorr+np.finfo(float).eps))
            +(data.Ntrials-data.Ncorr)*np.log((data.Ntrials-expected)/(data.Ntrials-data.Ncorr+np.finfo(float).eps))))
        X2 = sum((data.Ncorr-expected)**2./expected/(1.-probs))

        # Adding eps to avoid log(0) NaNs
        self.prinsNLL = -sum( data.Ncorr*np.log(probs+np.finfo(float).eps)+(data.Ntrials-data.Ncorr)*np.log(1.-probs+np.finfo(float).eps) )

        # Treutwein/Strasburger 1999 Eq 6 (likelihood of the data)
        L_ts = 2**(sum( data.Ntrials ))
        LL_ts = 0.0
        #L_ts = 1.0
        for level in np.arange( len(data.levels) ):
            # TODO: is it right to use the observed data or the fitted function values? The next two lines could be changed to try the fitted values.
            thisN = data.Ntrials[level]
            thisCorr = data.Ncorr[level]
            L_ts *= misc.comb( thisN, thisCorr ) * (probs[level]**thisCorr) * (1.0 - probs[level])**(thisN-thisCorr) 
            LL_ts += np.log(misc.comb( thisN, thisCorr )) + thisCorr*np.log(probs[level]) +np.log(1.0 - probs[level])*(thisN-thisCorr) 

        # TODO: This is how Prins clamps the lapse.  Parameterize.
        if (self.params[self.PARAM_UPPER] < 0) or (self.params[self.PARAM_UPPER] > 0.05):
            self.prinsNLL=np.inf

        return probs,LL,X2,L_ts,LL_ts,self.prinsNLL
Exemplo n.º 5
0
 def centroid(self):
     '''Find the centroid x and y from these coefficients'''
     
     xcen = 0.0
     ycen = 0.0
     
     fluxtot = self.total_flux()
     
     for i1 in range(self.n1):            
         if i1%2==0: continue #consider odd i1
                         
         for i2 in range(self.n2):                    
             if i2%2!=0: continue # consider even i2
                                                                     
             xcen = xcen + np.power(i1+1,0.5)*np.power(2,0.5*(2-i1-i2))* np.power(comb(i1+1,(i1+1)/2)*comb(i2,i2/2),0.5)*self.coeff[i1,i2]
                 
             
     for i1 in range(self.n1):
         if i1%2!=0: continue #consider even i1   
                  
         for i2 in range(self.n2):
             if i2%2==0:continue # consider odd i2
                                                      
             ycen = ycen + np.power(i2+1,0.5)*np.power(2,0.5*(2-i2-i1))* np.power(comb(i2+1,(i2+1)/2)*comb(i1,i1/2),0.5)*self.coeff[i1,i2]
         
     xcen = xcen*np.sqrt(pi)*self.beta*self.beta/fluxtot                    
     ycen = ycen*np.sqrt(pi)*self.beta*self.beta/fluxtot
             
     return xcen,ycen
Exemplo n.º 6
0
def get_motifspace_size(q, n):
    """return length of search space according to equation which is mentioned in Section 3.1 of the paper"""
    return reduce(
        lambda x, y: x + (int(sc.comb(q, y, exact=True)) * 4 ** (q - y)),
        [i for i in range(1, n + 1)],
        int(sc.comb(q, 0, exact=True)) * 4 ** (q - 0),
    )
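# A hedged, loop-based restatement of the same search-space formula,
# sum_{i=0..n} C(q, i) * 4**(q - i), which may read more easily than the
# reduce() form; assumes scipy.special.comb, the current home of scipy.misc.comb
# (imported as `sc` above).
from scipy.special import comb as _comb

def get_motifspace_size_loop(q, n):
    return sum(int(_comb(q, i, exact=True)) * 4 ** (q - i) for i in range(n + 1))

# e.g. get_motifspace_size_loop(6, 2) should equal get_motifspace_size(6, 2)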
Exemplo n.º 7
0
def brive(N, replace_zeros=True):
    """
    The brive estimator

    Parameters
    ----------
    N : np.array, int
       Counts vector
    replace_zeros: bool
       Replaces zeros with uniform prior

    Returns
    -------
    pvals
    """
    N = N.astype(int)
    n = sum(N)
    pvals = np.zeros(len(N), dtype=np.float64)
    for i in range(len(N)):
        if N[i]==0 or N[i]==1: continue
        trials = [comb(t-1, N[i]-1) / (t * (comb(n, N[i])))
                  for t in range(N[i], n+1)]
        pvals[i] = (float(N[i]-1)) * sum(trials)

    if replace_zeros:
        m = sum(pvals)
        if 0 < m < 1 and (pvals==0).sum() > 0:
            pvals[pvals==0] = (1 - m) / (pvals==0).sum()
    return pvals
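# Minimal usage sketch (illustrative only), assuming the brive() definition above
# with NumPy and SciPy's comb in scope: zero and singleton counts are skipped, and
# with replace_zeros=True the leftover mass is spread uniformly over the zero entries.
counts = np.array([5, 0, 3, 1, 7])
pvals = brive(counts)
print(pvals, pvals.sum())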
Exemplo n.º 8
0
    def pdf(self, x, k, n, p):
        '''distribution of success runs of length k or more

        Parameters
        ----------
        x : float
            count of runs of length k or more
        k : int
            length of runs
        n : int
            total number of observations or trials
        p : float
            probability of success in each Bernoulli trial

        Returns
        -------
        pdf : float
            probability that x runs of length k or more are observed

        Notes
        -----
        not yet vectorized

        References
        ----------
        Muselli 1996, theorem 3
        '''

        q = 1-p
        m = np.arange(x, (n+1)//(k+1)+1)[:,None]
        terms = (-1)**(m-x) * comb(m, x) * p**(m*k) * q**(m-1) \
                * (comb(n - m*k, m - 1) + q * comb(n - m*k, m))
        return terms.sum(0)
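# Hedged, self-contained restatement of the same Muselli-style formula as a free
# function, plus a sanity check: with p = 1 every trial succeeds, so for n = 4,
# k = 2 exactly one run of length >= 2 occurs and the mass sits at x = 1.
# Uses scipy.special.comb, the current home of scipy.misc.comb.
import numpy as np
from scipy.special import comb

def runs_pdf(x, k, n, p):
    q = 1 - p
    m = np.arange(x, (n + 1) // (k + 1) + 1)[:, None]
    terms = ((-1) ** (m - x) * comb(m, x) * p ** (m * k) * q ** (m - 1)
             * (comb(n - m * k, m - 1) + q * comb(n - m * k, m)))
    return terms.sum(0)

print(runs_pdf(1, k=2, n=4, p=1.0))  # ~1.0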
Exemplo n.º 9
0
def daub(p):
    """
    The coefficients for the FIR low-pass filter producing Daubechies wavelets.

    p>=1 gives the order of the zero at f=1/2.
    There are 2p filter coefficients.

    Parameters
    ----------
    p : int
        Order of the zero at f=1/2, can have values from 1 to 34.

    """
    sqrt = np.sqrt
    if p < 1:
        raise ValueError("p must be at least 1.")
    if p==1:
        c = 1/sqrt(2)
        return np.array([c,c])
    elif p==2:
        f = sqrt(2)/8
        c = sqrt(3)
        return f*np.array([1+c,3+c,3-c,1-c])
    elif p==3:
        tmp  = 12*sqrt(10)
        z1 = 1.5 + sqrt(15+tmp)/6 - 1j*(sqrt(15)+sqrt(tmp-15))/6
        z1c = np.conj(z1)
        f = sqrt(2)/8
        d0 = np.real((1-z1)*(1-z1c))
        a0 = np.real(z1*z1c)
        a1 = 2*np.real(z1)
        return f/d0*np.array([a0, 3*a0-a1, 3*a0-3*a1+1, a0-3*a1+3, 3-a1, 1])
    elif p<35:
        # construct polynomial and factor it
        if p<35:
            P = [comb(p-1+k,k,exact=1) for k in range(p)][::-1]
            yj = np.roots(P)
        else:  # try different polynomial --- needs work
            P = [comb(p-1+k,k,exact=1)/4.0**k for k in range(p)][::-1]
            yj = np.roots(P) / 4
        # for each root, compute two z roots, select the one with |z|>1
        # Build up final polynomial
        c = np.poly1d([1,1])**p
        q = np.poly1d([1])
        for k in range(p-1):
            yval = yj[k]
            part = 2*sqrt(yval*(yval-1))
            const = 1-2*yval
            z1 = const + part
            if (abs(z1)) < 1:
                z1 = const - part
            q = q * [1,-z1]

        q = c * np.real(q)
        # Normalize result
        q = q / np.sum(q) * sqrt(2)
        return q.c[::-1]
    else:
        raise ValueError("Polynomial factorization does not work "
              "well for p too large.")
Exemplo n.º 10
0
Arquivo: ARI.py Projeto: DennisLZL/ari
def ARI(trueLab, predLab):
    """
    Compute the adjusted Rand index. It ranges in [-1, 1], with expected score 0 for random assignments and 1 for a perfect match.
    :param trueLab: ground truth labels
    :param predLab: predicted labels
    :return: adjusted rand index
    """
    n = len(trueLab)
    trueLab = np.array(trueLab)
    predLab = np.array(predLab)

    trueCluster = dict(zip(set(trueLab), [np.where(trueLab == x)[0] for x in set(trueLab)]))
    predCluster = dict(zip(set(predLab), [np.where(predLab == x)[0] for x in set(predLab)]))

    nTrue = len(trueCluster)
    nPred = len(predCluster)

    cTable = np.zeros((nTrue, nPred))

    for i in range(nTrue):
        for j in range(nPred):
            cTable[i, j] = len(np.intersect1d(list(trueCluster.values())[i], list(predCluster.values())[j]))

    a = comb(np.sum(cTable, axis=1), 2).sum()
    b = comb(np.sum(cTable, axis=0), 2).sum()
    c = comb(n, 2)

    return (comb(cTable, 2).sum() - (a * b) / c) / (0.5 * (a + b) - (a * b) / c)
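# Hedged cross-check against scikit-learn's implementation (assumes sklearn is
# installed and that `comb` above is SciPy's floating-point version):
from sklearn.metrics import adjusted_rand_score

true_lab = [0, 0, 1, 1, 2, 2]
pred_lab = [0, 0, 1, 2, 2, 2]
print(ARI(true_lab, pred_lab), adjusted_rand_score(true_lab, pred_lab))  # should agree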
Exemplo n.º 11
0
def rare(y, size):
	notabs = ~np.isnan(y)
	t = y[notabs]
	N = np.sum(t)
	diff = N - t
	rare_calc = np.sum(1 - comb(diff, size)/comb(N, size))
	return rare_calc
Exemplo n.º 12
0
def estimate(E, r, f, lp):
    if f <= r:
        return 0.0
    if E - f < lp:
        return 1.0
    probs = [(1 - (1.0 * scal(comb(E - f, lp)) / scal(comb(e, lp)))) for e in range(E, E - (r+ 1), -1)]
    return reduce(lambda x, y: x * y, probs, 1.0)
def bilinear(b, a, fs=1.0):
    """Return a digital filter from an analog filter using the bilinear transform.
    The bilinear transform substitutes ``(z-1) / (z+1)`` for ``s``.
    """
    #This function has been copied out of scipy
    fs = float(fs)
    a, b = map(num.atleast_1d, (a, b))
    D = len(a) - 1
    N = len(b) - 1
    artype = float
    M = max([N,D])
    Np = M
    Dp = M
    bprime = num.zeros(Np+1,artype)
    aprime = num.zeros(Dp+1,artype)
    for j in range(Np+1):
        val = 0.0
        for i in range(N+1):
            for k in range(i+1):
                for l in range(M-i+1):
                    if k+l == j:
                        val += comb(i,k)*comb(M-i,l)*b[N-i]*pow(2*fs,i)*(-1)**k
        bprime[j] = num.real(val)
    for j in range(Dp+1):
        val = 0.0
        for i in range(D+1):
            for k in range(i+1):
                for l in range(M-i+1):
                    if k+l == j:
                        val += comb(i,k)*comb(M-i,l)*a[D-i]*pow(2*fs,i)*(-1)**k
        aprime[j] = num.real(val)
    #return aprime, bprime
    return normalize(bprime, aprime)
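# Hedged cross-check: the copy above should agree with scipy.signal.bilinear for a
# simple analog low-pass H(s) = 1/(s + 1). Assumes `num` above is the numpy module,
# `normalize` is scipy.signal.normalize, and `comb` is in scope.
import numpy as np
import scipy.signal as sig

bz, az = bilinear([1.0], [1.0, 1.0], fs=2.0)            # the local copy above
bz_ref, az_ref = sig.bilinear([1.0], [1.0, 1.0], 2.0)   # SciPy's own version
print(np.allclose(bz, bz_ref), np.allclose(az, az_ref))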
def multiple_alignment(word_list):
    '''Returns the multiple alignment of a given list of words.'''
    from itertools import product
    from operator import add, mul
    from scipy.misc import comb

    # There are some issues scoring the first symbols, so force a match here and remove it from the alignment later.
    word_list = ['$'+word for word in word_list]

    # Initialize scoring and backtrack dictionaries, along with the indices and base score.
    S, backtrack = {}, {}
    perm_list = list(product([0, -1], repeat=len(word_list)))[1:]
    base_score = -1*comb(len(word_list), 2, exact=True)

    for index in product(*map(xrange,map(lambda s: len(s) + 1, word_list))):

        # We forced a match with the first symbols, so the zero-shell should lead to the zero index.
        if reduce(mul, index) == 0:
            # Since we forced a match with the first symbol, we want to force the starting point to be the zero index.
            if sum(index) == 0:
                # All symbols match.
                S[index] = 0
            else:
                # Make it smaller than the lowest possible score.
                S[index] = 2*base_score*reduce(add, map(len, word_list))

        else:
            # Use previous scores to determine the best score for the current index.
            previous_scores = [S[tuple(map(add, index, perm))] for perm in perm_list]
            current_index_scores = []
            for perm in perm_list:
                chars = [word_list[i][index[i]-1] if perm_value == -1 else '-' for i, perm_value in enumerate(perm)]
                current_index_scores.append(base_score + sum([comb(chars.count(ch), 2, exact=True) for ch in set(chars)]))

            scores = map(add, previous_scores, current_index_scores)
            backtrack[index], S[index] = max(enumerate(scores), key=lambda p: p[1])

    # Initialize the alignment and indices.
    alignment = word_list
    current_index = map(len, word_list)

    # Get the max score.
    # Note: The forced match at start of each word does not change the max score, as matched symbols have a score of zero.
    max_score = S[tuple(current_index)]

    # Quick lambda function to insert indels.
    insert_indel = lambda word, i: word[:i] + '-' + word[i:]

    # Insert indels to get the alignment.
    while reduce(mul, current_index) != 0:
        for i, perm_value in enumerate(perm_list[backtrack[tuple(current_index)]]):
            if perm_value == 0:
                alignment[i] = insert_indel(alignment[i], current_index[i])
            else:
                current_index[i] -= 1

    # Note: We don't need to prepend any indels because we forced a match at the start of all words.
    # Remove the forced match from all alignments to recover the correct alignment.
    return [str(max_score)] + [str(aligned[1:]) for aligned in alignment]
 def rectangles_numbers(M, N):
     '''
     A square is a kind of rectangle; in other words, a square is a special case of a rectangle. A rectangle is defined as a quadrilateral whose four angles are all 90°.
     To count the rectangles that are not squares, count all rectangles and then subtract the number of squares.
     '''
     horizontal = sm.comb(M+1, 2, exact=True)
     vertical = sm.comb(N+1, 2, exact=True)
     return horizontal * vertical
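# Worked example (hedged sketch): a 3x2 grid of unit squares has
# C(4, 2) * C(3, 2) = 6 * 3 = 18 axis-aligned rectangles, which is what
# rectangles_numbers(3, 2) returns. Uses scipy.special.comb, the current home
# of scipy.misc.comb (aliased `sm` above).
from scipy.special import comb
assert comb(3 + 1, 2, exact=True) * comb(2 + 1, 2, exact=True) == 18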
Exemplo n.º 16
0
def hyperg(k, N, m, n, verbose=False):
    """
    k = intersection; m = population 1; n = population 2; N = total population.
    """
    exact=1
    if verbose and k%100==0: printnow("k: %s" % k)
    #return comb(m,k,exact) * comb(N-m,n-k,exact)/ comb(N,n,exact)
    return comb(m,k) * comb(N-m,n-k)/ comb(N,n)
Exemplo n.º 17
0
 def hypergeometricown(self, N, K, n, k):
   """
     N= total number of genes in population
     K= number of GOA
     n= select a sample (top 50, bottom half, etc.)
     k= number of successes in the sample
   """
   return comb(K, k) * comb(N-K, n-k) / comb(N, n)
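# Hedged cross-check against scipy.stats.hypergeom, whose pmf(k, M, n, N) takes
# M = population size, n = number of marked items, N = sample size:
from scipy.stats import hypergeom
from scipy.special import comb

N_pop, K_marked, n_draw, k_hits = 50, 10, 12, 3
manual = comb(K_marked, k_hits) * comb(N_pop - K_marked, n_draw - k_hits) / comb(N_pop, n_draw)
print(manual, hypergeom.pmf(k_hits, N_pop, K_marked, n_draw))  # should match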
Exemplo n.º 18
0
 def runs_prob_odd(self, r):
     n0, n1 = self.n0, self.n1
     k = (r+1)//2
     tmp0 = comb(n0-1, k-1)
     tmp1 = comb(n1-1, k-2)
     tmp3 = comb(n0-1, k-2)
     tmp4 = comb(n1-1, k-1)
     return (tmp0 * tmp1 + tmp3 * tmp4)  / self.comball
def elm_comb(rho, N, m):
    offered_traffic = rho / (1 + rho)
    numerator = comb(N-1, m, exact=True)*pow(offered_traffic, m)*pow(1-offered_traffic, N-m)
    denominator = 0
    for i in range(0, m+1):
        denominator += comb(N-1, i, exact=True)*pow(offered_traffic,  i)*pow(1-offered_traffic, N-i)
    
    return numerator / denominator
Exemplo n.º 20
0
def mv_hypergeometric(x, m):
    """
    x : number of draws for each category.
    m : size of each category.
    """
    x = np.asarray(x)
    m = np.asarray(m)
    return log(comb(m, x).prod() / comb(m.sum(), x.sum()))
Exemplo n.º 21
0
def prob(N, k, n, m):
    part1 = comb(N - 2, k)
    part2 = comb(N - 2 - k, n - k - 1)
    part3 = comb(N - n - 1, m - k - 1)
    part4 = comb(N - 2, n - 1)
    part5 = comb(N - 2, m - 1)
    p = (part1 * part2 * part3) / (part4 * part5)
    return p
Exemplo n.º 22
0
def colorcount(n):
    if n > 1:
        return comb(7, n, exact=True) * (
            comb(19, n - 1, exact=True) - n * sum(comb(i, n - 2, exact=True) for i in range(0, 9))
        )
    # 	elif n==2:
    # 		return comb(7,n,exact=True)
    else:
        return 0
Exemplo n.º 23
0
def vlj(l,j,m_a,p, q):
    
    vlj = ((-1)**p)*(m_a + l + 2*j)
    vlj *= comb(m_a + j + l - 1, l-1)
    vlj *= comb(j+l-1, l-1)
    vlj *= comb(l-1, p - j)
    vlj /= comb(q+l+j, l)
    
    return vlj
Exemplo n.º 24
0
def peter(n):
    if n >= 23:
        return peter(45 - n)
    else:
        count = 0
        for k in range((n-9)/4+1):
            j = n-9-4*k
            count += misc.comb(9,k)*misc.comb(9+j-1,j)*((-1)**(k+j*2))
        return count/(4**9)
Exemplo n.º 25
0
def compute_data_points(e, c):
    ue = np.linspace(c, max_ue, max_ue - c + 1)
    num_chosen = [x for x in range(c, e)] + [e]*(max_ue - e + 1)

    all_comb = comb(ue, num_chosen)
    good_comb = comb([x-c for x in ue], [x-c for x in num_chosen])
    expected = all_comb / good_comb

    return ue, expected
Exemplo n.º 26
0
def lamp_test(p_type, p_mut, total):
    significance = 0.05
    tests = 2503339*3
    sigma = float(significance)/float(tests)
    function = misc.comb(p_mut, p_type)/misc.comb(total, p_type)
    if sigma < function:
        return False
    else:
        return True
Exemplo n.º 27
0
def gevrey_tanh(T, n, sigma=sigma_tanh, K=K_tanh):
    """
    Provide the flat output y(t) = phi(t), with Gevrey order
    1+1/sigma, and its derivatives up to order n.
    :param T: end time; phi is evaluated on t in [0, ..., T]  (float)
    :param n: (integer)
    :param sigma: (float)
    :param K: (float)
    :return: np.array([[phi], ... ,[phi^(n)]])
    """

    t_init = t = np.linspace(0., T, int(0.5*10**(2+np.log10(T))))

    # pop
    t = np.delete(t, 0, 0)
    t = np.delete(t, -1, 0)

    # main
    tau = t/T

    a = dict()
    a[0] = K*(4*tau*(1-tau))**(1-sigma)/(2*(sigma-1))
    a[1] = (2*tau - 1)*(sigma-1)/(tau*(1-tau))*a[0]
    for k in xrange(2, n+2):
        a[k] = (tau*(1-tau))**-1 * ((sigma-2+k)*(2*tau-1)*a[k-1]+(k-1)*(2*sigma-4+k)*a[k-2])

    yy = dict()
    yy[0] = np.tanh(a[1])
    if n > 0:
        yy[1] = a[2]*(1-yy[0]**2)
    z = dict()
    z[0] = (1-yy[0]**2)
    for i in xrange(2, n+1):
        sum_yy = np.zeros(len(t))
        for k in xrange(i):
            if k == 0:
                sum_z = np.zeros(len(t))
                for j in xrange(i):
                    sum_z += -sm.comb(i-1, j)*yy[j]*yy[i-1-j]
                z[i-1] = sum_z
            sum_yy += sm.comb(i-1, k)*a[k+2]*z[i-1-k]
        yy[i] = sum_yy

    # push
    phi = np.nan*np.zeros((n+1, len(t)+2))
    for i in xrange(n+1):
        phi_temp = 0.5*yy[i]
        if i == 0:
            phi_temp += 0.5
            phi_temp = np.insert(phi_temp, 0, [0.], axis=0)
            phi[i, :] = np.append(phi_temp, [1.])
        else:
            phi_temp = np.insert(phi_temp, 0, [0.], axis=0)
            # attention divide by T^i
            phi[i, :] = np.append(phi_temp, [0.])/T**i

    return phi, t_init
def serial_perm_equal_params(n):
    ev = n / math.factorial(n)
    ev2 = n**2 / math.factorial(n)
    for i in range(n-2, 0, -1):
        pmatch = np.prod([1/k for k in range(n, n-i, -1)])
        pfail = 1/(n-i)
        ev += i * comb(n, i) * pmatch * pfail
        ev2 += i**2 * comb(n, i) * pmatch * pfail
    var = ev2 - (ev)**2
    return ev, var
Exemplo n.º 29
0
    def central_geom_moment(self, p, q):
        m = np.arange(0, p + 1)
        n = np.arange(0, q + 1)
        x_0 = self.centroid()['x']
        y_0 = self.centroid()['y']
        M = self.geom_moments(p, q)

        # return (comb(p,m,exact=False)*(-x_0)**(p-m)).dot(M.dot((comb(q,n,exact=False)*(-y_0)**(q-n))))
        return (M.dot((comb(q, n, exact=False) * (-y_0) ** (q - n)))).dot(comb(p, m, exact=False) * (-x_0) ** (p - m))
Exemplo n.º 30
0
def comb(n, k):
    res = np.rint(spm.comb(n, k, False)).astype(int)
    if np.all(res >= 0) and np.all(res < _MAX_INT_FLOAT):
        return res
    elif isinstance(n, abc.Iterable) or isinstance(k, abc.Iterable):
        broad = np.broadcast(np.asarray(n), np.asarray(k))
        res = np.empty(broad.shape, dtype=object)
        res.flat = [spm.comb(n_, k_, True) for n_, k_ in broad]
        return res
    else:
        return spm.comb(n, k, True)
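# Hedged usage note for the wrapper above (assumes numpy as np and the scipy comb
# module as spm, matching the aliases used in the function):
print(comb(10, 3))            # 120, via the rounded-float fast path
print(comb(np.arange(5), 2))  # array input: [0 0 1 3 6]
# comb(1000, 500) would overflow the float path and fall back to exact Python ints.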
Exemplo n.º 31
0
def c_b(B):
    # 2**(B-2)
    s = 0
    for i in range(B-1):
        s += sc_m.comb(B-2, i, exact=True)
    return s
Exemplo n.º 32
0
print dta[[
    'AVYRSEXP', 'AVSALK', 'PERSPENK', 'PTRATIO', 'PCTAF', 'PCTCHRT', 'PCTYRRND'
]].head(10)

formula = 'NABOVE + NBELOW ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT '
formula += '+ PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF'

##### Aside: Binomial distribution

# Toss a six-sided die 5 times, what's the probability of exactly 2 fours?

stats.binom(5, 1. / 6).pmf(2)

from scipy.misc import comb
comb(5, 2) * (1 / 6.)**2 * (5 / 6.)**3

from statsmodels.formula.api import glm
glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit()

print glm_mod.summary()

# The number of trials

glm_mod.model.data.orig_endog.sum(1)

glm_mod.fittedvalues * glm_mod.model.data.orig_endog.sum(1)

# First differences: We hold all explanatory variables constant at their means and manipulate the percentage of low income households to assess its impact
# on the response variables:
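# A hedged sketch of the first-differences idea above: hold the explanatory
# variables at their means, vary LOWINC between its 25th and 75th percentiles,
# and predict from the fitted formula model. Column names and percentile choices
# are illustrative, not taken from the original notebook.
import pandas as pd

means = dta.mean(numeric_only=True).to_frame().T
scenarios = pd.concat([means, means], ignore_index=True)
scenarios['LOWINC'] = [dta['LOWINC'].quantile(0.25), dta['LOWINC'].quantile(0.75)]
print(glm_mod.predict(scenarios))  # predicted P(NABOVE) at low vs. high LOWINC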
Exemplo n.º 33
0
def bernstein_poly(i, n, t):
    """
     The Bernstein polynomial of n, i as a function of t
    """
    return comb(n, i) * ( t**(n-i) ) * (1 - t)**i
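# Hedged usage sketch: a quadratic Bezier curve built from the basis above. Note
# the convention here pairs index i with t**(n - i), so control point 0 is reached
# at t = 1. Assumes NumPy and SciPy's comb in scope.
import numpy as np

ctrl = np.array([[0.0, 0.0], [1.0, 2.0], [2.0, 0.0]])  # illustrative control points
t = np.linspace(0.0, 1.0, 5)
n = len(ctrl) - 1
curve = sum(np.outer(bernstein_poly(i, n, t), ctrl[i]) for i in range(n + 1))
print(curve)  # traces from ctrl[2] at t=0 to ctrl[0] at t=1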
Exemplo n.º 34
0
def multivariate_lagrange(points, n):
    """
    Given the list of values *points*, construct a multidimensional
    (with dimension equal to the length of each of the given points)
    polynomial of degree *n* using the method developed in the
    following reference:

    Kamron Saniee, A Simple Expression for Multivariate Lagrange
    Interpolation, SIAM, 2007.

    The method has a restriction: if p is the number of *points* and m
    is one less than the dimension of the problem domain (i.e., m =
    `len(points[0]) - 1`), then p must equal (*n* + m choose *n*).
    """

    # check dimensions
    p = len(points)
    m = len(points[0]) - 1

    if not comb(n + m, n, exact=True) == p:
        raise ValueError('dimension mismatch')

    # setup symbols and build expression
    z_vec = SYM.Matrix([points[i][-1] for i in range(p)])
    coef = SYM.symbols(' '.join(['a{}'.format(i) for i in range(1, p + 1)]))
    var = SYM.symbols(' '.join(['x{}'.format(i) for i in range(1, m + 1)]))
    # handle trivial case, n = 0
    if n == 0:
        return SYM.Float(points[0][-1]), var, 1., [1.]
    z_terms = [SYM.Poly.from_dict({x: 1}, var) for x in poly_power_seq(m, n)]
    z = SYM.Poly.from_dict(dict(zip(poly_power_seq(m, n), coef)), var)
    # build M matrix
    M_rows = []
    for point_i in points:
        z_i = z(*point_i[:-1])
        M_rows.append([float(z_i.coeff(x)) for x in coef])
    M = NP.array(M_rows)
    delta = NP.linalg.det(M)
    # compute delta_i
    delta_i_list = []
    B = NP.ones(p - 1)
    C = NP.array(z_terms[:-1])
    D = M[-1, -1]
    for i in range(p):
        # using block matrix property of determinants which assumes
        # that the matrix A below is non-singular (the typical case)
        # --- the code fails safe to calculating the symbolic
        # determinant when A is singular (which will be slow for large
        # problems)
        A = NP.vstack((M[ :i,  :-1],
                       M[i+1:, :-1]))
        try:
            b = NP.linalg.solve(A, B)
            # using row interchange property
            delta_i_list.append((-1)**(p-1-i) * NP.linalg.det(A) * (D - NP.dot(C, b)))
        except NP.linalg.linalg.LinAlgError:
            logger.warning('singular matrix encountered (this will happen when, e.g., points contain many zeros) --- resorting to symbolic determinant calculation which will be slow for large problems')
            row_i = SYM.Matrix([z_terms])
            M_i = SYM.Matrix(M_rows)
            M_i.row_del(i)
            M_i = M_i.row_insert(p, row_i)
            # using row interchange property
            delta_i_list.append(M_i.det() * (-1)**(p-1-i))
    f = lagrange_interpolator(points, delta, delta_i_list)
    return SYM.Poly(f.simplify()), var, delta, delta_i_list
Exemplo n.º 35
0
#!/usr/bin/env python3.6
"""
PROBLEM: 053
AUTHOR:  Dirk Meijer
STATUS:  done
EXPLANATION:
    scipy combinatorics
"""

from Euler.tictoc import tic, toc
from Euler.eprint import eprint
from scipy.misc import comb

if __name__ == "__main__":
    tic()
    S = 0
    for n in range(23, 101):
        for k in range(1, n + 1):
            S += comb(n, k) > 1e6
    print(S)
    toc()
    exit()
Exemplo n.º 36
0
def pmf(n, k, p):
    return comb(n, k) * p**k * (1 - p)**(n - k)
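# Hedged cross-check against scipy.stats.binom:
from scipy.stats import binom
from scipy.special import comb

n_, k_, p_ = 10, 3, 0.4
print(comb(n_, k_) * p_**k_ * (1 - p_)**(n_ - k_), binom.pmf(k_, n_, p_))  # should match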
Exemplo n.º 37
0
]  # window specifies the starting and ending time of the period that the data user is interested in

# step 3: pre-sanitize the database
sanitized_profile_baseline = util.sanitize_data(
    day_profile,
    distance_metric='euclidean',
    anonymity_level=anonymity_level,
    rep_mode=rep_mode)
loss_generic_metric = pe.get_information_loss(
    data_gt=day_profile,
    data_sanitized=sanitized_profile_baseline.round(),
    window=window)
print("information loss with generic metric %s" % loss_generic_metric)
df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(
    frac=1)
subsample_size_max = int(comb(len(df_subsampled_from), 2))
print('total number of pairs is %s' % subsample_size_max)

# step 4: sample a subset of pre-sanitized database and form the data points into pairs
subsample_size = int(round(subsample_size_max))
sp = Subsampling(data=df_subsampled_from)
data_pair = sp.uniform_sampling(subsample_size=subsample_size)

# The user receives the data pairs and labels their similarity
sim = Similarity(data=data_pair)
sim.extract_interested_attribute(interest=interest, window=window)
similarity_label, class_label = sim.label_via_silhouette_analysis(
    range_n_clusters=range(2, 8))

# step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
# lam_vec is a set of candidate lambda's for weighting the l1-norm penalty in the metric learning optimization problem.
Exemplo n.º 38
0
def daub(p):
    """
    The coefficients for the FIR low-pass filter producing Daubechies wavelets.

    p>=1 gives the order of the zero at f=1/2.
    There are 2p filter coefficients.

    Parameters
    ----------
    p : int
        Order of the zero at f=1/2, can have values from 1 to 34.

    Returns
    -------
    daub : ndarray
        The 2p filter coefficients of the low-pass FIR filter.

    """
    sqrt = np.sqrt
    if p < 1:
        raise ValueError("p must be at least 1.")
    if p == 1:
        c = 1 / sqrt(2)
        return np.array([c, c])
    elif p == 2:
        f = sqrt(2) / 8
        c = sqrt(3)
        return f * np.array([1 + c, 3 + c, 3 - c, 1 - c])
    elif p == 3:
        tmp = 12 * sqrt(10)
        z1 = 1.5 + sqrt(15 + tmp) / 6 - 1j * (sqrt(15) + sqrt(tmp - 15)) / 6
        z1c = np.conj(z1)
        f = sqrt(2) / 8
        d0 = np.real((1 - z1) * (1 - z1c))
        a0 = np.real(z1 * z1c)
        a1 = 2 * np.real(z1)
        return f / d0 * np.array([a0, 3 * a0 - a1, 3 * a0 - 3 * a1 + 1,
                                  a0 - 3 * a1 + 3, 3 - a1, 1])
    elif p < 35:
        # construct polynomial and factor it
        if p < 35:
            P = [comb(p - 1 + k, k, exact=1) for k in range(p)][::-1]
            yj = np.roots(P)
        else:  # try different polynomial --- needs work
            P = [comb(p - 1 + k, k, exact=1) / 4.0**k
                 for k in range(p)][::-1]
            yj = np.roots(P) / 4
        # for each root, compute two z roots, select the one with |z|>1
        # Build up final polynomial
        c = np.poly1d([1, 1])**p
        q = np.poly1d([1])
        for k in range(p - 1):
            yval = yj[k]
            part = 2 * sqrt(yval * (yval - 1))
            const = 1 - 2 * yval
            z1 = const + part
            if (abs(z1)) < 1:
                z1 = const - part
            q = q * [1, -z1]

        q = c * np.real(q)
        # Normalize result
        q = q / np.sum(q) * sqrt(2)
        return q.c[::-1]
    else:
        raise ValueError("Polynomial factorization does not work "
              "well for p too large.")
Exemplo n.º 39
0
def bp_easyline(n):
    return comb(2 * n, n, exact = True)
Exemplo n.º 40
0
def pascal(n, kind='symmetric', exact=True):
    """
    Returns the n x n Pascal matrix.

    The Pascal matrix is a matrix containing the binomial coefficients as
    its elements.

    .. versionadded:: 0.11.0

    Parameters
    ----------
    n : int
        The size of the matrix to create; that is, the result is an n x n
        matrix.
    kind : str, optional
        Must be one of 'symmetric', 'lower', or 'upper'.
        Default is 'symmetric'.
    exact : bool, optional
        If `exact` is True, the result is either an array of type
        numpy.uint64 (if n <= 35) or an object array of Python long integers.
        If `exact` is False, the coefficients in the matrix are computed using
        `scipy.misc.comb` with `exact=False`.  The result will be a floating
        point array, and the values in the array will not be the exact
        coefficients, but this version is much faster than `exact=True`.

    Returns
    -------
    p : (n, n) ndarray
        The Pascal matrix.

    Notes
    -----
    See http://en.wikipedia.org/wiki/Pascal_matrix for more information
    about Pascal matrices.

    Examples
    --------
    >>> from scipy.linalg import pascal
    >>> pascal(4)
    array([[ 1,  1,  1,  1],
           [ 1,  2,  3,  4],
           [ 1,  3,  6, 10],
           [ 1,  4, 10, 20]], dtype=uint64)
    >>> pascal(4, kind='lower')
    array([[1, 0, 0, 0],
           [1, 1, 0, 0],
           [1, 2, 1, 0],
           [1, 3, 3, 1]], dtype=uint64)
    >>> pascal(50)[-1, -1]
    25477612258980856902730428600L
    >>> from scipy.misc import comb
    >>> comb(98, 49, exact=True)
    25477612258980856902730428600L

    """

    if kind not in ['symmetric', 'lower', 'upper']:
        raise ValueError("kind must be 'symmetric', 'lower', or 'upper'")

    if exact:
        if n > 35:
            L_n = np.empty((n, n), dtype=object)
            L_n.fill(0)
        else:
            L_n = np.zeros((n, n), dtype=np.uint64)
        for i in range(n):
            for j in range(i + 1):
                L_n[i, j] = comb(i, j, exact=True)
    else:
        L_n = comb(*np.ogrid[:n, :n])

    if kind == 'lower':
        p = L_n
    elif kind == 'upper':
        p = L_n.T
    else:
        p = np.dot(L_n, L_n.T)

    return p
Exemplo n.º 41
0
def invhilbert(n, exact=False):
    """
    Compute the inverse of the Hilbert matrix of order `n`.

    The entries in the inverse of a Hilbert matrix are integers.  When `n`
    is greater than 14, some entries in the inverse exceed the upper limit
    of 64 bit integers.  The `exact` argument provides two options for
    dealing with these large integers.

    Parameters
    ----------
    n : int
        The order of the Hilbert matrix.
    exact : bool
        If False, the data type of the array that is returned is np.float64,
        and the array is an approximation of the inverse.
        If True, the array is the exact integer inverse array.  To represent
        the exact inverse when n > 14, the returned array is an object array
        of long integers.  For n <= 14, the exact inverse is returned as an
        array with data type np.int64.

    Returns
    -------
    invh : (n, n) ndarray
        The data type of the array is np.float64 if `exact` is False.
        If `exact` is True, the data type is either np.int64 (for n <= 14)
        or object (for n > 14).  In the latter case, the objects in the
        array will be long integers.

    See Also
    --------
    hilbert : Create a Hilbert matrix.

    Notes
    -----
    .. versionadded:: 0.10.0

    Examples
    --------
    >>> from scipy.linalg import invhilbert
    >>> invhilbert(4)
    array([[   16.,  -120.,   240.,  -140.],
           [ -120.,  1200., -2700.,  1680.],
           [  240., -2700.,  6480., -4200.],
           [ -140.,  1680., -4200.,  2800.]])
    >>> invhilbert(4, exact=True)
    array([[   16,  -120,   240,  -140],
           [ -120,  1200, -2700,  1680],
           [  240, -2700,  6480, -4200],
           [ -140,  1680, -4200,  2800]], dtype=int64)
    >>> invhilbert(16)[7,7]
    4.2475099528537506e+19
    >>> invhilbert(16, exact=True)[7,7]
    42475099528537378560L

    """
    if exact:
        if n > 14:
            dtype = object
        else:
            dtype = np.int64
    else:
        dtype = np.float64
    invh = np.empty((n, n), dtype=dtype)
    for i in xrange(n):
        for j in xrange(0, i + 1):
            s = i + j
            invh[i, j] = ((-1) ** s * (s + 1) *
                          comb(n + i, n - j - 1, exact) *
                          comb(n + j, n - i - 1, exact) *
                          comb(s, i, exact) ** 2)
            if i != j:
                invh[j, i] = invh[i, j]
    return invh
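# Hedged sanity check using SciPy's packaged versions (which the listing above
# mirrors): the inverse times the Hilbert matrix should be close to the identity.
import numpy as np
from scipy.linalg import hilbert, invhilbert

print(np.allclose(invhilbert(5) @ hilbert(5), np.eye(5)))  # True (float version)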
Exemplo n.º 42
0
Rosalind #: 090
URL: http://rosalind.info/problems/wfmd/
'''

from scipy.misc import comb

with open('data/rosalind_wfmd.txt') as input_data:
    N, m, g, k = [int(num) for num in input_data.read().strip().split()]

# Determine the probability of a given number of recessive alleles in the first generation.
# Use a binomial random variable with the given parameters.
# Note:  We omit the 0th term throughout the problem, as it has no contribution to the desired probability.
#        For future problems, start the ranges at 0 if the 0 term ever becomes necessary.
p_rec = 1.0 - (m / (2.0 * N))
p = [
    comb(2 * N, i) * ((p_rec)**i) * (1.0 - p_rec)**(2 * N - i)
    for i in range(1, 2 * N + 1)
]

# Determine the probability of a given number of recessive alleles in the 2nd to k-th generations.
# Use the law of total probability, along with the probabilities from the previous generation, i.e.,
# P(1 Rec) = P(1 Rec | 0 Rec in prev gen)*P(0 Rec in prev gen) + ... + P(1 Rec | 2N Rec in prev gen)*P(2N Rec in prev gen)
# Notice that the conditional probabilities are binomial terms, similar to the first generation calculations.
for gen in range(2, g + 1):
    temp_p = []
    for j in range(1, 2 * N + 1):
        temp_term = [
            comb(2 * N, j) * ((x / (2.0 * N))**j) *
            (1.0 - (x / (2.0 * N)))**(2 * N - j) for x in range(1, 2 * N + 1)
        ]
        temp_p.append(sum([temp_term[i] * p[i]
Exemplo n.º 43
0
  def run(self, niter):
    for i in xrange(niter):
      # sample z w/ limit on size 
      # random permute dimensions
      for d in np.random.permutation(range(self.xdim)):
        # sample z_d
        # initialize
        final_z = self.z.copy()
        zd_old = self.z[d]
        self.z[d] = -1;
        a_size = np.sum(self.z == zd_old)

        max_log_prob_perturbed = np.log(a_size + self.alpha[zd_old]) \
                                 - self.gp.nll + helper.gumbel()
        
        # find all possible category assignments
        
        # if z[d] is alone, the possible other category assignment is
        other_cat = np.unique(self.z)
        other_cat = other_cat[np.logical_and(other_cat != zd_old, other_cat != -1)]
        # otherwise, need to remove z[d] and add one additional category
        if a_size > 0 and other_cat.size + 1 < self.n_add:
          for a in xrange(self.n_add):
            if (a not in other_cat) and (a != zd_old):
              other_cat = np.append(other_cat, [a])
              break

        # start sampling
        for a in np.random.permutation(other_cat):
          a_size = np.sum(self.z == a)
          if a_size < self.dim_limit:
            self.z[d] = a
            gp = self.get_gp()
            log_prob = np.log(a_size + self.alpha[a]) - gp.nll + helper.gumbel()
            if log_prob > max_log_prob_perturbed:
              max_log_prob_perturbed = log_prob
              self.gp = gp
              final_z = self.z.copy()
        self.z = final_z
        # end of sample z_d

        # sample k_d
        # initialize
        final_k = self.k.copy()
        kd_old = self.k[d]
        beta_post = lambda x: comb(self.beta[0]+x-1., x)/((1./self.beta[1]+1.)**x)
        max_log_prob_perturbed = beta_post(kd_old) - self.gp.nll + helper.gumbel()

        # define range for k_d? current k_d \pm 10
        other_k = np.arange(-5, 5) + kd_old
        other_k = other_k[np.logical_and(other_k >= 2, other_k != kd_old)]

        # start sampling
        for b in np.random.permutation(other_k):
          self.k[d] = b
          gp = self.get_gp()
          log_prob = beta_post(b) - gp.nll + helper.gumbel()
          if log_prob > max_log_prob_perturbed:
              max_log_prob_perturbed = log_prob
              self.gp = gp
              final_k = self.k.copy()
        self.k = final_k
    return self.gp, self.z, self.k
Exemplo n.º 44
0
def myComb(a, b):
    return comb(a, b, exact=True)
Exemplo n.º 45
0
    NXSUBAPS = numpy.array([7] * NWFS)
    NSUBAPS = numpy.array([36] * NWFS)
    SUBAPDIAM = numpy.array([telConfig["WFS"]["subapDiam"]] * NWFS)
    GSALT = numpy.array([0] * NWFS)
    GSTYPE = numpy.array([1] * NWFS)
    PUPILSHIFT = numpy.array(([1, 1], [0, 0]))
    PUPILMAG = numpy.array([NXSUBAPS[0]] * NWFS)
    PUPILROT = numpy.array([0] * NWFS)
    PUPILROT[0] = 0
    OBS = 0.285
    NCPU = 1
    PART = 0
    PUPIL_MASK = telConfig["WFS"]["pupilMask"]
    waveL = 500e-9
    gam = numpy.array([waveL] * NWFS)
    combs = int(comb(GSPOS.shape[1], 2, exact=True))
    selector = numpy.array((range(GSPOS.shape[0])))
    selector = numpy.array((list(itertools.combinations(selector, 2))))

    NLAYERS = 2
    r0 = numpy.array([0.1] * NLAYERS)
    L0 = numpy.array([25.] * NLAYERS)
    LAYERHEIGHTS = numpy.array([0., 9281.91628112])
    fitL0 = True
    offsets = True

    params = CovarianceMatrix(NWFS, PUPIL_MASK, TEL_DIAM, SUBAPDIAM, GSALT,
                              GSPOS, gam, NLAYERS, LAYERHEIGHTS, offsets,
                              fitL0, L0, PUPILSHIFT, PUPILROT, True)
    s = time.time()
    nmat = params.make_covariance_matrix(r0, L0, PUPILSHIFT, PUPILROT)
Exemplo n.º 46
0
 def test_big(self):
     p = pascal(50)
     assert_equal(p[-1, -1], comb(98, 49, exact=True))
Exemplo n.º 47
0

h3k27ac_cell_type_names = ["neuron", "microglia", "glia"]
marker_names = ["NeuN+", "Pu.1+", "NeuN-/Pu.1-"]
brain_regions = [
    "hpc", "dlpfc", "allbr", "hpc_female_controls", "hpc_female_cases",
    "hpc_male_controls", "hpc_male_cases"
]
clusters = [
    "exPFC1", "exPFC2", "exCA1", "exCA3", "GABA1", "GABA2", "exDG", "MG",
    "ODC1", "ODC2", "OPC", "ASC1", "ASC2", "NSC", "END"
]
numClusters = len(clusters)
totGreaterThanZeroTests = len(clusters) * len(h3k27ac_cell_type_names) * len(
    brain_regions)
totCellTypeTests = misc.comb(len(h3k27ac_cell_type_names),
                             2) * len(clusters) * len(brain_regions)
padj_threshold = 0.05
padj_stringent = 0.01
logfc_cutoff = 0.5
numColsPlot = 5
numRowsPlot = 3
h1 = 0.2
h2 = 0.05
prefix = "/habib_markers_analysis_disttss_filter/"
suffix = "l2fc.txt"

tStatMatrix = dict()
pValMatrix = dict()
l2fcMeansMatrix = dict()

plotColor = dict()
Exemplo n.º 48
0
def evaluation_total_usage(n):
    """
    In this demo we showcase an example of special-purpose publication:
    the data user wants the published energy database to retain as much information as possible about peak-time energy usage.
    """

    # Initialization of some useful classes
    util = Utilities()
    pe = PerformanceEvaluation()

    # step 1: get the database to be published
    day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl')
    day_profile = day_profile.fillna(0)
    day_profile = day_profile.iloc[
        0:90, 0::
        4]  # subsample the database to improve the speed for demonstration purpose
    day_profile.index = range(len(day_profile.index))
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
    # information of the cumulative energy use during peak time. In this case, he/she would also need to specify the
    # starting and ending time of the peak usage time
    interest = 'window-usage'
    window = [17, 21]

    sanitized_profile_best = util.sanitize_data(
        day_profile,
        distance_metric='self-defined',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        mode=interest,
        window=window)

    # step 3: pre-sanitize the database
    sanitized_profile_baseline = util.sanitize_data(
        day_profile,
        distance_metric='euclidean',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_best,
        mode=interest,
        window=window)

    loss_generic_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_baseline,
        mode=interest,
        window=window)
    # print("information loss with learned metric %s" % loss_generic_metric)

    df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(
        frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # The user receives the data pairs and labels their similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics',
                                     stat_type=interest,
                                     window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm,
        window=window)

    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm,
                                           window=window)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        mode=interest,
        window=window)

    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile,
        mode=interest,
        window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size
Exemplo n.º 49
0
def ensemble_error(n_classifier, error):
    k_start = int(math.ceil(n_classifier / 2.))
    probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)
            for k in range(k_start, n_classifier + 1)]
    return sum(probs)
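# Hedged usage sketch: with 11 base classifiers that are each wrong 25% of the
# time, the majority-vote ensemble error drops to roughly 0.034.
print(ensemble_error(n_classifier=11, error=0.25))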
Exemplo n.º 50
0
    def W(self, n1, n2, n3):
        """
        Calculates an instance of W from Fluke. et. al.
        """

        return np.sqrt(comb(n1, n1 / 2) * comb(n2, n2 / 2) * comb(n3, n3 / 2))
Exemplo n.º 51
0
b = np.zeros(n)
head = np.where(a > tt)
tail = np.where(a < tt)
b[head] = 1
b[tail] = 0
nh = np.size(head)
nt = np.size(tail)

x = np.arange(n)
plt.figure('T=' + str(tt))
plt.plot(x, b, '.k')
plt.ylim(-0.1, 2.)
plt.figure('histogram')
plt.hist(b, bins=2)

pdata = misc.comb(n, nt) * (tt**nt) * ((1. - tt)**nh)
pdatanr = 1.
ptnr = 1.
pt = pdata * ptnr
plt.plot(tt, pt)

nn = 501
t = np.zeros(nn)
pt = np.zeros(nn)
plt.figure('T vs P(T)')

for ii in np.arange(nn):
    t0 = 1. / (nn - 1) * ii
    pdata = (t0**nt) * ((1. - t0)**nh)
    ptnr = 1.
Exemplo n.º 52
0
def background_noise(unlabel_intensity, na, parent_atoms, parent_label, daughter_atoms, daughter_label):
    noise = unlabel_intensity * math.pow(na, parent_label)\
        * comb(parent_atoms - daughter_atoms, parent_label - daughter_label)\
        * comb(daughter_atoms, daughter_label)
    return noise
Exemplo n.º 53
0
# rosalind_eval

import numpy as np
from scipy.misc import comb
    
f = open('rosalind_indc.txt', 'r')
t = f.readlines()
n = 2 * np.int(t[0].rstrip())



out = np.log10(np.array([comb(n, i, exact=1) * .5**n 
                                  for i in range(n+1)]).cumsum()[::-1])[1:]

out.tofile('rosalind_indc_sub.txt', sep=' ')

Exemplo n.º 54
0
 def prob(self, k):
     assert isinstance(k,
                       int), "event must occur an integer number of times"
     return comb(self.n, k) * (self.p**k) * ((1 - self.p)**(self.n - k))
def gen_dice_pdf(N,diceLim):
    P = diceLim/6
    return [comb(N,i)* P**i * (1-P)**(N-i) for i in range(N+1)]
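# Hedged sanity check (assumes Python 3 division and SciPy's comb in scope):
# the binomial terms returned by gen_dice_pdf should sum to 1.
print(abs(sum(gen_dice_pdf(10, 3)) - 1.0) < 1e-12)  # True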
Exemplo n.º 56
0
pdatanr = 1.
ptnr = 1.
pt = pdata * ptnr
plt.plot(t, pt)
#plt.show()

n = 256
nn = 501
t = np.zeros(nn)
pt = np.zeros(nn)
plt.figure('T vs P(T)')

for ii in np.arange(nn):
    t0 = 1. / (nn - 1) * ii
    pdata = (t0**nt) * ((1. - t0)**nh)
    ptnr = 1.

    t[ii] = t0
    pt[ii] = pdata * ptnr
    print(t0, pdata, t0**nt, (1. - t0)**nh, nt, nh, misc.comb(n, nt))

plt.plot(t, pt, '-')

plt.figure('normalized to max')
mp = np.max(pt)

pt = 1. / mp * pt
plt.plot(t, pt, '-')
print(pt[0:10])
plt.show()
Exemplo n.º 57
0
def evaluation_total_usage(n, df_subsampled_from, day_profile):
    interest = 'window-usage'
    window = [17, 21]
    anonymity_level = n
    rep_mode = 'mean'

    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % subsample_size_max)
    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # The user receives the data pairs and labels their similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics',
                                     stat_type=interest,
                                     window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm,
        window=window)

    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm,
                                           window=window)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        mode=interest,
        window=window)

    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile,
        mode=interest,
        window=window)
    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size
Exemplo n.º 58
0
def wignerd(j,m,n=0,approx_lim=10):
    '''
        Wigner "small d" matrix. (Euler z-y-z convention)
        example:
            j = 2
            m = 1
            n = 0
            beta = linspace(0,pi,100)
            wd210 = wignerd(j,m,n)(beta)

        some conditions have to be met:
             j >= 0
            -j <= m <= j
            -j <= n <= j

        The approx_lim determines at what point
        bessel functions are used. Default is when:
            j > m+10
              and
            j > n+10

        for integer l and n=0, we can use the spherical harmonics. If in
        addition m=0, we can use the ordinary legendre polynomials.
    '''

    if (j < 0) or (abs(m) > j) or (abs(n) > j):
        raise ValueError("wignerd(j = {0}, m = {1}, n = {2}) value error.".format(j,m,n) \
            + " Valid range for parameters: j>=0, -j<=m,n<=j.")

    if (j > (m + approx_lim)) and (j > (n + approx_lim)):
        #print('bessel (approximation)')
        return lambda beta: jv(m-n, j*beta)

    if (floor(j) == j) and (n == 0):
        if m == 0:
            #print('legendre (exact)')
            return lambda beta: legendre(j)(cos(beta))
        elif False:
            #print('spherical harmonics (exact)')
            a = sqrt(4.*pi / (2.*j + 1.))
            return lambda beta: a * conjugate(sph_harm(m,j,beta,0.))

    jmn_terms = {
        j+n : (m-n,m-n),
        j-n : (n-m,0.),
        j+m : (n-m,0.),
        j-m : (m-n,m-n),
        }

    k = min(jmn_terms)
    a, lmb = jmn_terms[k]

    b = 2.*j - 2.*k - a

    if (a < 0) or (b < 0):
        raise ValueError("wignerd(j = {0}, m = {1}, n = {2}) value error.".format(j,m,n) \
            + " Encountered negative values in (a,b) = ({0},{1})".format(a,b))

    coeff = power(-1.,lmb) * sqrt(comb(2.*j-k,k+a)) * (1./sqrt(comb(k+b,b)))

    #print('jacobi (exact)')
    return lambda beta: coeff \
        * power(sin(0.5*beta),a) \
        * power(cos(0.5*beta),b) \
        * jacobi(k,a,b)(cos(beta))
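# Hedged usage check, assuming wignerd and its numpy/scipy helpers (power, sin,
# cos, jacobi, comb, ...) are in scope as in the source module: the exact Jacobi
# branch should reproduce the textbook value d^2_{1,0}(beta) = -sqrt(3/8)*sin(2*beta).
import numpy as np

beta = np.linspace(0.1, np.pi - 0.1, 7)
print(np.allclose(wignerd(2, 1, 0)(beta), -np.sqrt(3.0 / 8.0) * np.sin(2 * beta)))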
Exemplo n.º 59
0
def comb2(n):
    # the exact version is faster for k == 2: use it by default globally in
    # this module instead of the float approximate variant
    return comb(n, 2, exact=1)
Exemplo n.º 60
0
def adjusted_rand_score(labels_true, labels_pred):
    """Rand index adjusted for chance

    The Rand Index computes a similarity measure between two clusterings
    by considering all pairs of samples and counting pairs that are
    assigned in the same or different clusters in the predicted and
    true clusterings.

    The raw RI score is then "adjusted for chance" into the ARI score
    using the following scheme::

        ARI = (RI - Expected_RI) / (max(RI) - Expected_RI)

    The adjusted Rand index is thus ensured to have a value close to
    0.0 for random labeling independently of the number of clusters and
    samples and exactly 1.0 when the clusterings are identical (up to
    a permutation).

    ARI is a symmetric measure::

        adjusted_rand_score(a, b) == adjusted_rand_score(b, a)

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        Ground truth class labels to be used as a reference

    labels_pred : array, shape = [n_samples]
        Cluster labels to evaluate

    Returns
    -------
    ari: float
       Similarity score between -1.0 and 1.0. Random labelings have an ARI
       close to 0.0. 1.0 stands for perfect match.

    Examples
    --------

    Perfectly matching labelings have a score of 1 even

      >>> from sklearn.metrics.cluster import adjusted_rand_score
      >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1])
      1.0
      >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0])
      1.0

    Labelings that assign all classes members to the same clusters
    are complete but not always pure, hence penalized::

      >>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1])  # doctest: +ELLIPSIS
      0.57...

    ARI is symmetric, so labelings that have pure clusters with members
    coming from the same classes but unnecessary splits are penalized::

      >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2])  # doctest: +ELLIPSIS
      0.57...

    If classes members are completely split across different clusters, the
    assignment is totally incomplete, hence the ARI is very low::

      >>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3])
      0.0

    References
    ----------

    .. [Hubert1985] `L. Hubert and P. Arabie, Comparing Partitions,
      Journal of Classification 1985`
      http://www.springerlink.com/content/x64124718341j1j0/

    .. [wk] http://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index

    See also
    --------
    adjusted_mutual_info_score: Adjusted Mutual Information

    """
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    n_samples = labels_true.shape[0]
    classes = np.unique(labels_true)
    clusters = np.unique(labels_pred)
    # Special limit cases: no clustering since the data is not split;
    # or trivial clustering where each document is assigned a unique cluster.
    # These are perfect matches hence return 1.0.
    if (classes.shape[0] == clusters.shape[0] == 1
            or classes.shape[0] == clusters.shape[0] == 0
            or classes.shape[0] == clusters.shape[0] == len(labels_true)):
        return 1.0

    contingency = contingency_matrix(labels_true, labels_pred)

    # Compute the ARI using the contingency data
    sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1))
    sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0))

    sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten())
    prod_comb = (sum_comb_c * sum_comb_k) / float(comb(n_samples, 2))
    mean_comb = (sum_comb_k + sum_comb_c) / 2.
    return ((sum_comb - prod_comb) / (mean_comb - prod_comb))