Example #1
import numpy as np  # the snippet also assumes a `stats` module exposing zprob()

def prob(sample, reference):
    sample_mean = 0
    sample_var_lo = 1
    sample_var_hi = 1
    n_samples = 0
    for col in sample.index:
        if col.endswith('FPKM') or '_' not in col:
            sample_mean += sample[col]
            n_samples += 1
            # str.strip('_FPKM') strips any of those characters from both
            # ends, not the suffix; slice the suffix off instead
            col_base = col[:-len('_FPKM')] if col.endswith('_FPKM') else col
            if col_base + '_conf_lo' in sample.index:
                D_lo = sample[col] - sample[col_base + '_conf_lo']
                D_hi = sample[col_base + '_conf_hi'] - sample[col]
                sample_var_lo += D_lo**2
                sample_var_hi += D_hi**2
            else:
                sample_var_lo += sample[col]
                sample_var_hi += sample[col]

    sample_mean /= n_samples
    lo = sample_mean - np.sqrt(sample_var_lo)
    hi = sample_mean + np.sqrt(sample_var_hi)
    lo_prob = stats.zprob((lo - np.mean(reference,axis=1)) /
                          (np.std(reference,axis=1) + 10))
    hi_prob = stats.zprob((hi - np.mean(reference,axis=1)) /
                          (np.std(reference,axis=1) + 10))
    return float(hi_prob - lo_prob)
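Every snippet on this page calls zprob(), either bare or as an attribute of a stats module. Old SciPy releases deprecated stats.zprob in favour of scipy.special.ndtr (the standard-normal CDF), while some other stats packages ship a two-tailed variant under the same name, so check each snippet's origin. A minimal shim under the CDF reading, so examples like the one above can run:

from scipy.special import ndtr

def zprob(z):
    # standard-normal CDF, P(Z <= z); swap in 2 * (1 - ndtr(abs(z)))
    # if the source library used the two-tailed convention instead
    return ndtr(z)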
Example #2
import math  # `st` (a stats alias exposing zprob) and `wilcoxon_table` are assumed

def wilcoxon_test(a, b, alpha=.05):
    """
    Performs the Wilcoxon Rank non-parametric test for two algorithms.
    """
    N = len(a) # Number of datasets
    # Compute the differences and keep the signs
    differences, signs = ([abs(a[i] - b[i]) for i in range(N)],
                          [a[i] - b[i] for i in range(N)])
    tmp = sorted(differences)
    # The rank is the average of the 1-based index of the first element
    # equal to v in tmp (index(v)+1) and that of the last (index(v)+count(v))
    ranks = [(tmp.count(v) + 2 * tmp.index(v) + 1) / 2.0 for v in differences]
    # Add up the ranks for positive and negative signs
    r_plus = r_minus = 0.0
    for i in range(N):
        if signs[i] < 0:
            r_minus += ranks[i]
        elif signs[i] > 0:
            r_plus += ranks[i]
        else:
            r_minus += ranks[i] / 2.0
            r_plus += ranks[i] / 2.0
    # Compute the minimum of both sums
    T = min([r_plus, r_minus])
    # For N <= 30 use the table of critical values; for larger N the
    # statistic is well approximated by a gaussian distribution
    if N <= 30:
        return {"result" : T > wilcoxon_table[alpha][N],
                "statistic" : T,
                "critical" : wilcoxon_table[alpha][N]}
    else:
        z = (T - N*(N + 1)/4) / math.sqrt(N*(N + 1)*(2*N + 1)/24)
        return {"result" : st.zprob(z)*2 > alpha,
                "statistic" : z,
                "critical" : st.zprob(z)}
Example #3
 def probability_of_f2_being_better_than(self, design):
     m1 = self.f2_mean
     s1 = self.f2_std
     n1 = len(self.f2_vals)
     m2 = design.f2_mean
     s2 = design.f2_std
     n2 = len(design.f2_vals)
     t = -(m1 - m2) / (s1**2/n1 + s2**2/n2) ** 0.5  # z statistic for the difference of means
     return stats.zprob(t)
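Here t is the z statistic for the difference of the two designs' mean f2 values, so under the CDF reading of zprob the return value is the normal-approximation probability that m2 - m1 is positive, i.e. that `design`'s mean f2 exceeds self's. An equivalent standalone computation, with made-up summary statistics:

import math
from scipy.stats import norm

m1, s1, n1 = 10.0, 2.0, 30   # made-up summary stats for self
m2, s2, n2 = 11.0, 2.5, 30   # made-up summary stats for `design`
t = -(m1 - m2) / math.sqrt(s1**2 / n1 + s2**2 / n2)
print(norm.cdf(t))           # ~0.96 under the CDF reading of zprob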
Example #4
def wald_wolfowitz(sequence):
    """
    implements the wald-wolfowitz runs test:
    http://en.wikipedia.org/wiki/Wald-Wolfowitz_runs_test
    http://support.sas.com/kb/33/092.html

    :param sequence: any iterable with at most 2 values. e.g.
                     '1001001'
                     [1, 0, 1, 0, 1]
                     'abaaabbba'

    :rtype: a dict with keys:
        `n_runs`: the number of runs in the sequence
        `p`: the p-value for the null hypothesis that the sequence is random
        `z`: the z-score, used to calculate the p-value
        `sd`, `mean`: the expected standard deviation and mean of the number
                      of runs, given the ratio of 1's and 0's in the sequence

    >>> r = wald_wolfowitz('1000001')
    >>> r['n_runs'] # should be 3, because 1, 0, 1
    3

    >>> r['p'] < 0.05 # no evidence to reject H0 that the sequence is random
    False

    # this should show significance for non-randomness
    >>> li = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    >>> wald_wolfowitz(li)['p'] < 0.05
    True

    """
    # a "run" is a maximal stretch of identical values
    R = n_runs = sum(1 for _ in groupby(sequence))

    n = float(sum(1 for s in sequence if s == sequence[0]))
    m = float(sum(1 for s in sequence if s != sequence[0]))

    # expected mean runs
    ER = ((2 * n * m ) / (n + m)) + 1
    # expected variance runs
    VR = (2 * n * m * (2 * n * m - n - m )) / ((n + m)**2 * (n + m - 1)) 
    # the same variance derived from ER; kept as a consistency check
    O = (ER - 1) * (ER - 2) / (n + m - 1.)
    assert VR - O < 0.001, (VR, O)

    SD = math.sqrt(VR)
    # Z-score
    Z = (R - ER) / SD

    return {'z': Z, 'mean': ER, 'sd': SD, 'p': zprob(Z), 'n_runs': R}
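To actually run the doctests, the snippet needs its imports; a minimal sketch, reusing the CDF reading of zprob from Example #1 (zprob(Z) then acts as a one-sided p for the negative z-scores these short sequences produce, and both doctests pass under it):

import math
from itertools import groupby
from scipy.special import ndtr as zprob  # assumed CDF reading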
Example #6
from math import sqrt  # needed by the snippet; a `stats` module exposing zprob() is also assumed

def example_test_a_proportion(p=0.1, n=100, target=0.1, significance_level=0.95):
    """ The math behind testing a single proportion """

    # standard error of the sample proportion
    sigma = sqrt(p*(1-p)/n)

    # z-score
    z = (p-target)/sigma

    # check in statistical table
    prob = stats.zprob(z)

    # If we observe a large p-value, for example larger than 0.05 or 0.1,
    # then we cannot reject the null hypothesis of identical proportions
    alpha = 1 - significance_level
    if prob > alpha:
        return 'Proportion is equal to target'
    else:
        return 'Proportion is not equal to target'
Example #7
from math import sqrt  # as above; `stats` must expose zprob()

def example_compare_two_proportions(p1=0.1, n1=100, p2=0.1, n2=100, significance_level=0.95):
    """ The math behind comparing two proportions """

    # overall sample proportion
    p = ((p1*n1)+(p2*n2))/(n1+n2)
    # standard error
    se = sqrt(p*(1.-p)*((1./n1)+(1./n2)))
    # z-score
    z = (p1-p2)/se

    # check in statistical table
    prob = stats.zprob(z)

    # If we observe a large p-value, for example larger than 0.05 or 0.1,
    # then we cannot reject the null hypothesis of identical proportions
    alpha = 1 - significance_level
    if prob > alpha:
        return 'Proportions are equal'
    else:
        return 'Proportions are not equal'
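A hedged usage sketch of both helpers with made-up inputs. The comparison `prob > alpha` only makes sense if `stats.zprob` returns a two-tailed p-value, so the stand-in below builds one from the normal CDF:

from scipy.special import ndtr

class stats:                 # stand-in: two-tailed p from |z|
    @staticmethod
    def zprob(z):
        return 2 * (1 - ndtr(abs(z)))

# 16/100 successes against a 10% target: z ~ 1.64, p ~ 0.10 > 0.05
print(example_test_a_proportion(p=0.16, n=100, target=0.10))

# 10% of 100 vs 22% of 100: z ~ -2.31, p ~ 0.02 < 0.05
print(example_compare_two_proportions(p1=0.10, n1=100, p2=0.22, n2=100))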
Example #10
  def getMoranGeary(self, myLayer, myBand, m):

    # formula for Moran's I
    # I = [ sum i=<1..n> sum j=<1..n> w(i,j) (x(i) - x(m)) (x(j) - x(m)) /
    #       sum i=<1..n> (x(i) - x(m))^2 ] *
    #     [ n / sum i=<1..n> sum j=<1..n> w(i,j) ]
    # where n = number of pixels, 
    #       w(i,j) = weight (1 if j is next to i, 0 otherwise)
    #       x(i) = value at position i
    #       x(m) = global mean of layer

    # formula for Geary's C
    # C = [ sum i=<1..n> sum j=<1..n> w(i,j) (x(i) - x(j))^2 /
    #       sum i=<1..n> (x(i) - x(m))^2 ] *
    #     [ (n - 1) / (2 * sum i=<1..n> sum j=<1..n> w(i,j)) ]
    # variables as for Moran's I

    # Variance Moran's I (assuming normality)
    # Variance = [ (n^2S1 - nS2 + 3S0^2) / (S0^2(n^2-1)) ] - E^2
    # Where
    # S0= sum i=<1..n> sum j=<1..n> (w(ij)), i<>j
    # S1= 1/2 sum i=<1..n> sum j=<1..n> (w(ij) + w(ji))^2, i<>j
    # S2= sum i=<1..n> [sum j=<1..n> w(ij) + sum j=<1..n> w(ji)]^2
    # E = Expected value = -1/(N-1)
    # Zscore for Moran's I assuming normality
    # Zscore = (I - E) / Variance^0.5

    # Variance Moran's I (randomisation test version)
    # Variance = [ [n((n^2-3n+3)S1 - nS2 + 3S0^2) - k((n^2-n)S1-2nS2+6S0^2)] ] /
    #              [ (n-1)(n-2)(n-3)S0^2 ] ] - E^2
    # Where S0,1,2,E as above
    # k = [ (sum i=<1..n> (x(i) - x(m))^4 ) / n ] / 
    #     [ (sum i=<1..n> (x(i) - x(m))^2 ) / n ]^2
    # Zscore as above

    # initialise variables
    myN=0
    # denominator for Moran & Geary are the same
    myDenominator=0
    # Numerator is different
    myNumeratorMI=float(0)
    myNumeratorGC=float(0)
    myNumeratorCount=0

    # initialise variables
    [myS0,myS1,myS2,myKNum,myKDenom,myK]=[0,0,0,0,0,0]
    [myMoranI,myVarianceMIAN,myZMIAN,myPMIAN,
     myVarianceMIRV,myZMIRV,myPMIRV,
     myGearyC,myVarianceGCAN,myZGCAN,myPGCAN,
     myVarianceGCRV,myZGCRV,myPGCRV]=[None,None,None,None,None,
                                      None,None,None,None,None,
                                      None,None,None,None]

    # remember ndv
    myNDV=QString(u'null (no data)')
    myOE=QString(u'out of extent')
    # set up extent parameters
    xMin=myLayer.extent().xMinimum()
    yMin=myLayer.extent().yMinimum()
    xMax=myLayer.extent().xMaximum()
    yMax=myLayer.extent().yMaximum()
    xDim=myLayer.width()
    yDim=myLayer.height()
    xSize=(xMax-xMin)/xDim
    ySize=(yMax-yMin)/yDim

    # loop through all points
    for i in range(xDim):
      x=xMin+(xSize/2)+(i*xSize)
      for j in range(yDim):
        y=yMin+(ySize/2)+(j*ySize)
        # get value
        zstr=myLayer.identify(QgsPoint(x,y))[1].values()[myBand]

        # do sums if we have a value
        if not zstr==myNDV:
          z=float(zstr)
          myN+=1
          myDenominator+=pow(z-m,2)
          myKNum+=pow(z-m,4)
          myKDenom=myDenominator # same calculation at this point
          myS2_ct=0

          # loop through adjacent points
          for ii in range(-1*self.Radius,self.Radius+1):
            xx=x+(ii*xSize)
            for jj in range(-1*self.Radius,self.Radius+1):
              yy=y+(jj*ySize)
              zzstr=myLayer.identify(QgsPoint(xx,yy))[1].values()[myBand]

              ## ignore if nodata or on the diagonal
              if not (zzstr==myNDV or zzstr==myOE or abs(ii)==abs(jj)) :
                zz=float(zzstr)
                myNumeratorMI = myNumeratorMI + (z-m)*(zz-m)
                myNumeratorGC = myNumeratorGC + pow(z-zz,2)
                myNumeratorCount+=1
                myS0+=1 ## w(ij) = 1
                myS1+=4 ## (w(ij) + w(ji))^2 = (1+1)^2 = 4
                myS2_ct+=2 ## w(ij) + w(ji)

          # finish S2 running total by squaring the total adjacents
          myS2+=pow(myS2_ct,2)

    # now put numerator and denominator together 
    if myDenominator==0 or myNumeratorCount==0:
      myMoranI=None
      myGearyC=None
    else:
      myMoranI= float(myN)/float(myNumeratorCount)*myNumeratorMI/myDenominator
      myGearyC= float(myN-1)/(2*float(myNumeratorCount))*myNumeratorGC/myDenominator

      # Stats for Moran's I
      # Expected value of Moran's I
      myE=-1*pow(myN-1,-1)

      # Finish S1 calculation
      myS1=myS1/2

      # Variance of Moran's I Assuming Normality
      myVarianceMIAN=(((pow(myN,2)*myS1) - (myN*myS2) + (3*(pow(myS0,2)))) /\
                       (pow(myS0,2)*(pow(myN,2)-1))) - pow(myE,2)


      # Variance Moran's I Randomisation Version
      if myN>0 and myKDenom>0:
        myK = (myKNum / myN) / pow(myKDenom / myN,2)
        myVarianceMIRV = ((myN*((pow(myN,2)-(3*myN)+3)*myS1 - myN*myS2 + 3*pow(myS0,2)) \
                             - myK*((pow(myN,2)-myN)*myS1-2*myN*myS2+6*pow(myS0,2))) /\
                            ( (myN-1)*(myN-2)*(myN-3)*pow(myS0,2) )) - pow(myE,2)

      if myVarianceMIAN > 0:
        # Zscore for Moran's I assuming Normality
        myZMIAN =  (myMoranI-myE) / pow(myVarianceMIAN,0.5)
        # P value that Moran's I shows no significant autocorrelation
        myPMIAN = 2*(1-zprob(myZMIAN))

      if myVarianceMIRV > 0:
        # Zscore for Moran's I randomisation version
        myZMIRV =  (myMoranI-myE) / pow(myVarianceMIRV,0.5)
        # P value that Moran's I shows no significant autocorrelation
        myPMIRV = 2*(1-zprob(myZMIRV))

      # Variance, z-score & p value for Geary's C
      # Normality version
      myVarianceGCAN = ((((2*myS1)+myS2)*(myN-1)-(4*pow(myS0,2))))/(2*(myN+1)*pow(myS0,2))
      if myVarianceGCAN>0:
        myZGCAN = -1*(myGearyC - 1) / pow(myVarianceGCAN,0.5)
        myPGCAN = 2*(1-zprob(myZGCAN))

      # Now the random version
      myVarianceGCRV = ((((myN-1)*myS1)*((pow(myN,2)-(3*myN)+3-((myN-1)*myK)))) \
                          - ((((myN-1)*myS2)*((pow(myN,2)+(3*myN)-6-((pow(myN,2)-myN+ 2)*myK))))/4) \
                          + (pow(myS0,2)*(pow(myN,2)-3-(pow(myN-1,2)*myK)))) /\
                          ((myN*(myN-2)*(myN-3))*pow(myS0,2))
      if myVarianceGCRV>0:
        myZGCRV = -1*(myGearyC - 1) / pow(myVarianceGCRV,0.5)
        myPGCRV = 2*(1-zprob(myZGCRV))

    return [myMoranI,
            myVarianceMIAN,myZMIAN,myPMIAN,
            myVarianceMIRV,myZMIRV,myPMIRV,
            myGearyC,
            myVarianceGCAN,myZGCAN,myPGCAN,
            myVarianceGCRV,myZGCRV,myPGCRV]
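As a sanity check of the Moran's I formula in the comments above, a tiny self-contained sketch on a 1-D series with adjacent-neighbour weights (w(i,j) = 1 when |i - j| = 1), no QGIS required:

import numpy as np

x = np.array([1.0, 2.0, 8.0, 9.0, 2.0, 1.0])
n, m = len(x), x.mean()
num = sum((x[i] - m) * (x[j] - m)
          for i in range(n) for j in range(n) if abs(i - j) == 1)
den = ((x - m) ** 2).sum()
w_sum = sum(1 for i in range(n) for j in range(n) if abs(i - j) == 1)
print(n / w_sum * num / den)  # ~0.27: neighbouring values cluster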
Example #11
        mpl.figure(figsize=(args.figwidth, args.figheight))
        slice = slice.dropna(how='any')
        priors = np.ones((n_pos, len(FPKM_cols))) / n_pos
        widgets = ['Time %s:'%ts, Percentage(), Bar(), ETA()]
        progress = ProgressBar(widgets=widgets)
        for gene in progress(slice.index):
            if gene not in frame.index: continue
            #if sum(np.isnan(slice.ix[gene])):
            #    assert False
            #    continue
            normed = (slice.ix[gene] / max(slice.ix[gene]) *
                      np.mean(best_cycle.ix[gene], axis=1))
            for i, col in enumerate(FPKM_cols):
                std = get_std(col, frame)

                evidence = stats.zprob(-np.abs((normed -
                                                frame[col][gene])/(std+1)))

                updated = bayes(priors[:,i], evidence)
                #assert not sum(np.isnan(updated))
                priors[:,i] = updated
        my_cm = getattr(mpl.cm, args.colormap)
        n_pos, n_samples = np.shape(priors)
        plots = []
        for i in range(n_samples):
            plots.extend(mpl.plot(priors[:,i],
                                  label=FPKM_cols[i].replace('_FPKM', ''),
                                  color = my_cm(i * 256 / (n_samples-1))))
        ax = mpl.gca()
        Y = priors.max()
        dY = 0.25 * Y
        Y += dY
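This fragment comes from a larger script: mpl is evidently matplotlib.pyplot, slice/frame/best_cycle are pandas objects, and get_std and bayes are helpers that are not shown. A minimal sketch of the update bayes() presumably performs, labelled as an assumption:

import numpy as np

def bayes(prior, likelihood):
    # hypothetical helper: elementwise Bayes update over candidate
    # positions, renormalised so the posterior sums to 1
    posterior = prior * likelihood
    return posterior / posterior.sum()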
Example #12
 def d_two(self):
     n_number = stats.zprob(self.calc_d_two())
     return n_number
Example #13
 def delta(self):
     n_number = stats.zprob(self.calc_d_one())
     return n_number
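Taken together, the two methods above read like the N(d2) and N(d1) terms of a Black-Scholes pricer (with zprob as the normal CDF; a call's delta is N(d1)). The calc_d_one/calc_d_two helpers are not shown; a sketch of what they presumably compute, with hypothetical parameter names:

import math

def calc_d_one(S, K, r, sigma, t):
    # hypothetical: Black-Scholes d1 (S spot, K strike, r rate,
    # sigma volatility, t years to expiry)
    return (math.log(S / K) + (r + sigma**2 / 2) * t) / (sigma * math.sqrt(t))

def calc_d_two(S, K, r, sigma, t):
    # hypothetical: d2 = d1 - sigma * sqrt(t)
    return calc_d_one(S, K, r, sigma, t) - sigma * math.sqrt(t)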
Example #14
def zTransform(r, n):
    # Fisher z-transform of the correlation r: z = atanh(r) * sqrt(n - 3)
    z = np.log((1 + r) / (1 - r)) * (np.sqrt(n - 3) / 2)
    p = zprob(-z)
    return p
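A usage sketch under the CDF shim from Example #1; note p = zprob(-z) is a one-sided p-value, sensible for positive r:

import numpy as np
from scipy.special import ndtr as zprob  # CDF reading again

print(zTransform(0.5, 30))  # ~0.002: r = 0.5 over 30 pairs is significant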