def compare(self, subWins=5, baseSteps=25, pCutoff=.00001, diffCutoff=1, minSize=1e3):
    """Compare the control and the test sample.

    The region is tested as a whole with a runs test on the sign of the
    coverage difference (test minus control).  If the test is significant
    the region is split into sub-windows which are tested recursively.

    subWins: number of windows to split a region into
    baseSteps: to avoid some of the autocorrelation given by the reads,
        take only every baseSteps-th base
    pCutoff: the p-value cutoff to either follow up or not
    diffCutoff: a region is only reported if the absolute mean coverage
        difference over the sampled bases exceeds this value
    minSize: minimal size of a region to still be tested
    return: a list of significant regions

    Side effects: sets self.aveDiff and self.p whenever the region is at
    least minSize large.
    """
    sigRegs = []
    if self.size < minSize:
        return sigRegs

    # Sub-sample both coverage tracks identically before differencing.
    # NOTE(review): presumably numpy arrays (slicing, subtraction, .mean()).
    realDiff = self.testCov[::baseSteps] - self.contCov[::baseSteps]
    self.aveDiff = realDiff.mean()
    diff = sign01vec(realDiff)
    try:
        testRes = wald_wolfowitz(diff)
        self.p = testRes['p']
    except ZeroDivisionError:
        # this means that there are only zeroes (or ones)
        self.p = 0
        if abs(self.aveDiff) > diffCutoff:
            sigRegs = [self]
        return sigRegs

    if self.p > pCutoff:
        return sigRegs

    nextSize = int(self.size / subWins)
    # A step of zero (region smaller than subWins) would make range() raise
    # ValueError; such a region cannot be split further and is handled as a
    # leaf by the final check below.
    if nextSize >= 1:
        for startPoint in range(self.start, self.end, nextSize):
            # Clamp the last window so it never reaches past this region.
            endPoint = min(startPoint + nextSize, self.end)
            relStart = startPoint - self.start
            relEnd = endPoint - self.start
            reg = genomicRegion(self.chrom, startPoint, endPoint,
                                self.contCov[relStart:relEnd],
                                self.testCov[relStart:relEnd])
            sigRegs.extend(reg.compare(subWins, baseSteps, pCutoff, diffCutoff, minSize))

    # No sub-window was significant: fall back to reporting the whole region
    # if its mean difference is large enough.
    if not sigRegs and abs(self.aveDiff) > diffCutoff:
        sigRegs = [self]
    return sigRegs
def wald_wolfowitz_test(x):
    '''
    Run the Wald-Wolfowitz runs test on the sequence x.

    Original note on the null hypothesis: samples from alternating
    sequence 010101010101.

    Returns (None, None, None) when x holds a single distinct value or has
    at most two elements.  Otherwise returns a 3-tuple:
    (p < 0.05, p, number of runs divided by len(x)).
    '''
    # Degenerate inputs are not tested at all.
    if len(set(x)) == 1 or len(x) <= 2:
        return None, None, None

    # NOTE(review): positional unpacking relies on the result's values coming
    # back in the order (p, z, n_runs, sd, mean) -- confirm against
    # wald_wolfowitz, dict order is not guaranteed on older Pythons.
    p_value, _z_score, run_count, _std_dev, _expected = wald_wolfowitz(x).values()

    is_significant = p_value < 0.05
    runs_per_item = int(run_count) / float(len(x))
    return is_significant, p_value, runs_per_item
def match(self, pair):
    """Score a pair with a runs statistic computed on a minimum spanning tree.

    The two vector sets pair.s1["vector"] and pair.s2["vector"] are pooled,
    a k-nearest-neighbour distance graph is built over the pooled points and
    reduced to a minimum spanning tree.  The number of runs R is derived
    from the tree edges that join two points of the same set, and is
    standardised into W (the code cites Eq. 1 in Friedman 1979).

    pair: object exposing s1 and s2 mappings with a "vector" entry each
        (assumed to be 2-D, samples x dimensions -- TODO confirm).
    return: 0 if either set is empty or has zero norm; otherwise the
        statistic W, or -1 when W is NaN.

    Side effects: stores pair in self.pair; on the non-trivial path also
    sets self.tsim and self._features ([tsim], plus the per-dimension
    runs-test z-scores when self.dimfeatures is truthy).
    """
    self.pair = pair
    v1 = pair.s1["vector"]
    v2 = pair.s2["vector"]
    m = len(v1)
    n = len(v2)
    N = m + n
    # A point can have at most N - 1 neighbours.
    k = min(N - 1, self.k)
    if m == 0 or n == 0 or np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
        return 0

    vs = np.concatenate((v1, v2))
    g = kneighbors_graph(vs, mode='distance', n_neighbors=k)
    mst = minimum_spanning_tree(g, overwrite=True)
    edges = np.array(mst.nonzero()).T
    # Label 0 for points from v1, 1 for points from v2.
    labels = np.array([0] * m + [1] * n)
    c = labels[edges]
    # Edges whose two endpoints carry the same label.
    runs_edges = edges[c[:, 0] == c[:, 1]]
    # number of runs is the total number of observations minus edges within each run
    R = N - len(runs_edges)
    # expected value of R
    e_R = ((2.0 * m * n) / N) + 1
    # variance of R; the leading 2.0 forces true division so the value is
    # not floored to an integer under Python 2 (m, n and N are ints).
    v = 2.0 * m * n * (2 * m * n - N) / (N ** 2 * (N - 1))
    # see Eq. 1 in Friedman 1979
    # W approaches a standard normal distribution
    W = (R - e_R) / np.sqrt(v)
    self.tsim = -1 if np.isnan(W) else W

    # Univariate runs test per dimension: order the pooled labels by the
    # value of that dimension and keep the z-score.
    bydim = []
    for d in range(len(v1[0])):
        sorteddim = np.argsort(vs[:, d])
        wd = wald_wolfowitz(labels[sorteddim])
        bydim.append(wd['z'])

    self._features = [self.tsim]
    if self.dimfeatures:
        self._features += bydim
    return self.tsim
# Show the CP_SPE result for p1 and RNoise.
# (Single-argument print(...) prints the same under the print statement.)
print(CP_SPE(p1, RNoise))
# print myData
# print fitData
print("###############")

# Iterative Nonlinear Regression
# Fit window: x runs over [i, j); y is read from myData one index later.
i, j = 15, 21
xdata = range(i, j)
ydata = myData[i + 1:j + 1]

# NOTE(review): `optimize` is presumably scipy.optimize -- confirm import.
lmParams, lmCov = optimize.curve_fit(
    nlmFit,
    xdata=xdata,
    ydata=ydata,
    maxfev=5000
)
lmFitData = [
    nlmFit(x, lmParams[0], lmParams[1], lmParams[2])
    for x in xdata
]
lmResids = nlmFitResiduals(
    xdata, ydata, lmParams[0], lmParams[1], lmParams[2]
)

# P-value for runs test on resids: the residual signs (True when >= 0)
# form the two-valued sequence handed to the runs test.
run = [x >= 0 for x in lmResids]
runsTest = wald_wolfowitz(run)

# Dump the fit inputs, outputs and residuals.
print(lmParams)
print(xdata)
print(ydata)
print(lmFitData)
print(lmResids)
print("#################")
print(run)
# Complement of the runs-test p-value.
print(1 - runsTest['p'])
# Show RNoise followed by the CP_FDM, CP_SDM and CP_SPE results for p1.
# (Single-argument print(...) prints the same under the print statement.)
print(RNoise)
print(CP_FDM(p1))
print(CP_SDM(p1))
print(CP_SPE(p1, RNoise))
# print myData
# print fitData
print("###############")

# Iterative Nonlinear Regression
# Fit window: x runs over [i, j); y is read from myData one index later.
i, j = 15, 21
xdata = range(i, j)
ydata = myData[i + 1:j + 1]

# NOTE(review): `optimize` is presumably scipy.optimize -- confirm import.
lmParams, lmCov = optimize.curve_fit(
    nlmFit,
    xdata=xdata,
    ydata=ydata,
    maxfev=5000
)
lmFitData = [
    nlmFit(x, lmParams[0], lmParams[1], lmParams[2])
    for x in xdata
]
lmResids = nlmFitResiduals(
    xdata, ydata, lmParams[0], lmParams[1], lmParams[2]
)

# P-value for runs test on resids: the residual signs (True when >= 0)
# form the two-valued sequence handed to the runs test.
run = [x >= 0 for x in lmResids]
runsTest = wald_wolfowitz(run)

# Dump the fit inputs, outputs and residuals.
print(lmParams)
print(xdata)
print(ydata)
print(lmFitData)
print(lmResids)
print("#################")
print(run)
# Complement of the runs-test p-value.
print(1 - runsTest['p'])