예제 #1
0
 def compare(self, subWins=5, baseSteps=25, pCutoff=.00001, diffCutoff=1, minSize=1e3):
     """Compare the control and the test sample, recursing into sub-windows.

     A runs test is applied to the sign of the (test - control) coverage
     difference, sampled every baseSteps-th base.  If the test yields
     p <= pCutoff the region is cut into sub-windows which are compared
     recursively.  The region itself is reported only when no sub-window
     was reported and its mean coverage difference exceeds diffCutoff in
     absolute value.

     Side effects: sets self.aveDiff and self.p (unless the region is
     smaller than minSize, in which case nothing is computed).

     subWins: number of windows to split a region into
     baseSteps: to avoid some of the autocorrelation given by the reads, take only every
     baseSteps-th base
     pCutoff: the p-value cutoff to either follow up or not
     diffCutoff: minimal absolute mean coverage difference (test - control)
     for a region to be reported
     minSize: minimal size of a region to still be tested
     return: a list of significant regions"""
     if self.size < minSize:
         return []
     realDiff = self.testCov[::baseSteps] - self.contCov[::baseSteps]
     self.aveDiff = realDiff.mean()
     isBigDiff = abs(self.aveDiff) > diffCutoff
     diff = sign01vec(realDiff)
     try:
         self.p = wald_wolfowitz(diff)['p']
     except ZeroDivisionError:  # this means that there are only zeroes (or ones)
         self.p = 0
         return [self] if isBigDiff else []
     if self.p > pCutoff:
         return []
     sigRegs = []
     nextSize = int(self.size/subWins)
     # Subdivide only when the windows are non-empty and strictly smaller
     # than this region: a zero step would make range() raise, and a window
     # as large as the region itself would recurse forever.
     if 0 < nextSize < self.size:
         # range() instead of xrange(): identical here, and valid on Python 3.
         for startPoint in range(self.start, self.end, nextSize):
             # Clamp the last window so it never reaches past this region's end
             # when the size is not a multiple of subWins.
             stopPoint = min(startPoint + nextSize, self.end)
             relStart = startPoint - self.start
             relEnd = stopPoint - self.start
             reg = genomicRegion(self.chrom, startPoint, stopPoint,
                                 self.contCov[relStart:relEnd], self.testCov[relStart:relEnd])
             sigRegs.extend(reg.compare(subWins, baseSteps, pCutoff, diffCutoff, minSize))
     # No sub-window was reported: fall back to the region as a whole.
     if not sigRegs and isBigDiff:
         sigRegs = [self]
     return sigRegs
예제 #2
0
def wald_wolfowitz_test(x):
    """Run the Wald-Wolfowitz runs test on ``x`` and summarise the result.

    Returns a 3-tuple ``(significant, p, run_ratio)`` where ``significant``
    is ``p < 0.05``, ``p`` is the value reported by ``wald_wolfowitz`` and
    ``run_ratio`` is the number of runs divided by ``len(x)``.

    Returns ``(None, None, None)`` when the test cannot be applied: ``x``
    holds a single distinct value, or has at most two elements.

    NOTE(review): the previous docstring described the null hypothesis as
    "samples from alternating sequence 010101010101" -- confirm against the
    ``wald_wolfowitz`` implementation in use.
    """
    if len(set(x)) == 1 or len(x) <= 2:
        return None, None, None
    # Look results up by key: unpacking ``.values()`` positionally depends on
    # dict ordering and can silently swap p, z and the run count.
    # NOTE(review): 'n_runs' is the key name used by skidmarks -- confirm if
    # ``wald_wolfowitz`` comes from somewhere else.
    result = wald_wolfowitz(x)
    p = result['p']
    return p < 0.05, p, int(result['n_runs']) / float(len(x))
예제 #3
0
	def match(self, pair):
		"""Score ``pair`` with a minimum-spanning-tree based runs statistic.

		The vectors in ``pair.s1["vector"]`` and ``pair.s2["vector"]`` are pooled,
		a minimum spanning tree is built over the k-nearest-neighbour distance
		graph of the pooled points, and the number of runs ``R`` is derived from
		the tree edges that join two points of the same set.  ``R`` is then
		standardised into ``W`` (see Eq. 1 in Friedman 1979).

		Side effects: always stores ``pair`` on ``self.pair``.  On the
		non-degenerate path it also sets ``self.tsim`` and ``self._features``
		(``[tsim]``, followed by one runs-test z value per dimension when
		``self.dimfeatures`` is true).

		Returns 0 when either vector set is empty or has zero norm; otherwise
		returns ``self.tsim``, which is ``W`` or -1 when ``W`` is NaN.
		"""
		self.pair = pair
		v1 = pair.s1["vector"]
		v2 = pair.s2["vector"]
		m = len(v1)
		n = len(v2)
		N = m + n
		k = min(N - 1, self.k)
		if m == 0 or n == 0 or np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
			return 0

		vs = np.concatenate((v1, v2))
		g = kneighbors_graph(vs, mode='distance', n_neighbors=k)
		mst = minimum_spanning_tree(g, overwrite=True)
		edges = np.array(mst.nonzero()).T
		# 0 marks points from v1, 1 marks points from v2 (same order as vs)
		labels = np.array([0] * m + [1] * n)

		# keep only the tree edges whose two endpoints carry the same label
		c = labels[edges]
		runs_edges = edges[c[:, 0] == c[:, 1]]

		# number of runs is the total number of observations minus edges within each run
		R = N - len(runs_edges)

		# expected value of R
		e_R = ((2.0 * m * n) / N) + 1

		# variance of R; the 2.0 forces float arithmetic so the quotient is not
		# truncated by integer division on Python 2 (e_R above does the same)
		v = 2.0 * m * n * (2 * m * n - N) / (N ** 2 * (N - 1))

		# see Eq. 1 in Friedman 1979
		# W approaches a standard normal distribution
		W = (R - e_R) / np.sqrt(v)

		self.tsim = -1 if np.isnan(W) else W

		self._features = [self.tsim]
		if self.dimfeatures:
			# per-dimension runs tests are only needed (and only computed) when
			# requested: order the labels by each coordinate and take the z score
			for d in range(len(v1[0])):
				sorteddim = np.argsort(vs[:, d])
				self._features.append(wald_wolfowitz(labels[sorteddim])['z'])

		return self.tsim
예제 #4
0
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(CP_SPE(p1, RNoise))
# print(myData)
# print(fitData)
print("###############")

# Iterative nonlinear regression: fit nlmFit on the index window [i, j).
i = 15
j = 21
# list(...) keeps xdata a concrete list on Python 3, where range() is lazy.
xdata = list(range(i, j))
# NOTE: ydata is taken one index ahead of xdata (myData[i + 1:j + 1]).
ydata = myData[i + 1:j + 1]

lmParams, lmCov = optimize.curve_fit(nlmFit,
                                     xdata=xdata,
                                     ydata=ydata,
                                     maxfev=5000)
lmFitData = [nlmFit(x, lmParams[0], lmParams[1], lmParams[2]) for x in xdata]
lmResids = nlmFitResiduals(xdata, ydata, lmParams[0], lmParams[1], lmParams[2])

# Runs test on the sign of the residuals (True where the residual is >= 0).
run = [x >= 0 for x in lmResids]
runsTest = wald_wolfowitz(run)

print(lmParams)
print(xdata)
print(ydata)
print(lmFitData)
print(lmResids)

print("#################")
print(run)
# Prints the complement of the 'p' value reported by the runs test.
print(1 - runsTest['p'])
예제 #5
0
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(RNoise)
print(CP_FDM(p1))
print(CP_SDM(p1))
print(CP_SPE(p1, RNoise))
# print(myData)
# print(fitData)
print("###############")

# Iterative nonlinear regression: fit nlmFit on the index window [i, j).
i = 15
j = 21
# list(...) keeps xdata a concrete list on Python 3, where range() is lazy.
xdata = list(range(i, j))
# NOTE: ydata is taken one index ahead of xdata (myData[i + 1:j + 1]).
ydata = myData[i + 1:j + 1]

lmParams, lmCov = optimize.curve_fit(nlmFit, xdata=xdata, ydata=ydata, maxfev=5000)
lmFitData = [nlmFit(x, lmParams[0], lmParams[1], lmParams[2]) for x in xdata]
lmResids = nlmFitResiduals(xdata, ydata, lmParams[0], lmParams[1], lmParams[2])

# Runs test on the sign of the residuals (True where the residual is >= 0).
run = [x >= 0 for x in lmResids]
runsTest = wald_wolfowitz(run)

print(lmParams)
print(xdata)
print(ydata)
print(lmFitData)
print(lmResids)

print("#################")
print(run)
# Prints the complement of the 'p' value reported by the runs test.
print(1 - runsTest['p'])