def read_Interactions(contactCountsFile, biasFile, outliers=None):
    """Read a gzipped contact-count file and sum counts per genomic distance.

    Every input line must contain exactly five whitespace-separated fields:
    ``chr1 mid1 chr2 mid2 contactCount``.  Each line is turned into a
    ``myUtils.Interaction`` and classified with ``getType(distLowThres,
    distUpThres)`` (both thresholds are module-level globals).

    Args:
        contactCountsFile: Path to the gzip-compressed contact counts file.
        biasFile: Not used by this function; kept so existing callers work.
        outliers: Optional sequence of 0-based line indices to skip.  The
            indices are consumed front to back, so they only take effect if
            they are in ascending order (NOTE(review): confirm the caller
            sorts them).

    Returns:
        Tuple ``(mainDic, observedInterAllSum, observedIntraAllSum,
        observedIntraInRangeSum)`` where ``mainDic`` maps each observed
        in-range distance to ``[0, summed contact count]``.

    Side effects:
        Prints progress to stdout and overwrites the module-level ``logfile``
        with a summary of what was read.
    """
    mainDic = {}
    print("Reading the contact counts file to generate bins...")
    startT = time.time()

    observedInterAllSum = 0
    observedIntraAllSum = 0
    observedInterAllCount = 0
    observedIntraAllCount = 0
    observedIntraInRangeSum = 0
    observedIntraInRangeCount = 0
    minObservedGenomicDist = float('inf')
    maxObservedGenomicDist = 0

    linectr = 0
    outlierposctr = 0
    # Identity test instead of "!= None": never triggers element-wise
    # comparison on array-like inputs.
    noOfOutliers = len(outliers) if outliers is not None else 0

    with gzip.open(contactCountsFile, 'rt') as f:
        for line in f:
            # Skip the line when its index is the next pending outlier.
            if (outlierposctr < noOfOutliers
                    and linectr == outliers[outlierposctr]):
                linectr += 1
                outlierposctr += 1
                continue

            ch1, mid1, ch2, mid2, contactCount = line.split()
            interxn = myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
            interxn.setCount(float(contactCount))
            interactionType = interxn.getType(distLowThres, distUpThres)
            count = interxn.getCount()

            if interactionType == 'inter':
                observedInterAllSum += count
                observedInterAllCount += 1
            else:  # any type of intra
                observedIntraAllSum += count
                observedIntraAllCount += 1
                if interactionType == 'intraInRange':
                    dist = interxn.getDistance()
                    minObservedGenomicDist = min(minObservedGenomicDist, dist)
                    maxObservedGenomicDist = max(maxObservedGenomicDist, dist)
                    if dist not in mainDic:
                        mainDic[dist] = [0, 0]
                    mainDic[dist][1] += count
                    observedIntraInRangeSum += count
                    observedIntraInRangeCount += 1
            linectr += 1

    endT = time.time()
    print("Interactions file read. Time took %s" % (endT - startT))

    with open(logfile, 'w') as log:
        log.write("\n\nInteractions file read successfully\n")
        log.write("------------------------------------------------------------------------------------\n")
        log.write("Observed, Intra-chr in range: pairs= "
                  + str(observedIntraInRangeCount)
                  + "\t totalCount= " + str(observedIntraInRangeSum) + "\n")
        log.write("Observed, Intra-chr all: pairs= "
                  + str(observedIntraAllCount)
                  + "\t totalCount= " + str(observedIntraAllSum) + "\n")
        log.write("Observed, Inter-chr all: pairs= "
                  + str(observedInterAllCount)
                  + "\t totalCount= " + str(observedInterAllSum) + "\n")
        log.write("Range of observed genomic distances [%s %s]"
                  % (minObservedGenomicDist, maxObservedGenomicDist) + "\n")
        log.write("\n")

    return (mainDic, observedInterAllSum, observedIntraAllSum,
            observedIntraInRangeSum)  # from read_Interactions
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit the distance/probability spline and write p- and q-values.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline``
    is fitted to ``(x, y)`` with smoothing ``min(y)**2``, evaluated at the
    keys of ``mainDic`` that fall inside ``[min(x), max(x)]`` and made
    non-increasing with ``IsotonicRegression(increasing=False)``.  The
    interaction file is then read twice: once to compute a binomial tail
    p-value (``scsp.bdtrc``) per line, and once to write the
    ``*.significances.txt.gz`` table and collect outliers.

    Args:
        mainDic: Mapping whose keys are the genomic distances at which the
            spline is evaluated.
        x, y, yerr: Bin distances, bin probabilities and their errors.  When
            ``outliersdist`` is not None, ``x`` is sorted IN PLACE and ``y``
            is reordered to match.
        infilename: Gzip-compressed interactions file (five fields per line).
        outfilename: Prefix for the output table and plots.
        biasDic: Optional ``{chr: {mid: bias}}`` mapping; missing entries
            default to a bias of 1.0.
        outliersline, outliersdist: Sets updated in place with the line index
            and distance of every interaction whose p-value is below the
            outlier threshold.
        observedIntraInRangeSum, observedInterAllSum: Trial counts for the
            intra-in-range and inter-chromosomal binomial tests.
        possibleIntraInRangeCount, possibleInterAllCount: Numbers of possible
            pairs, used for the BH correction and the outlier threshold.
        observedIntraAllSum: Accepted for interface compatibility; unused.
        resolution: If truthy, added to the output file name.
        passNo: Spline pass number, used only in the plot label.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist, FDRx,
        FDRy]``.  The spline entries are None when ``interOnly`` is set; the
        FDR entries are None unless ``visual`` is True.

    Side effects:
        Appends to ``logfile``, prints to stdout, writes a gzip table and
        (when ``visual``) PNG plots.  Calls ``sys.exit(2)`` if the bin
        distances are not strictly increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write("------------------------------------------------------------------------------------\n")

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None
    tempMinX = None
    tempMaxX = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1, len(x)):
            if x[i] <= x[i - 1]:
                # The check enforces strictly increasing distances, so the
                # message now says so (it used to say "decrease").
                print("ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i - 1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError = min(y) * min(y)
        # fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)

        tempMaxX = max(x)
        tempMinX = min(x)
        # Only evaluate the spline inside [min(x), max(x)], where it is
        # defined by the fit.
        splineX = [dis for dis in sorted(mainDic)
                   if tempMinX <= dis <= tempMaxX]
        splineY = ius(splineX)

        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)

        residual = sum([i * i for i in (y - ius(x))])

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-', label="spline-" + str(passNo), linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.', label="Mean with std. error", linewidth=2)
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],
                                              toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3,
                                                          prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)
            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')
            plt.savefig(outfilename + '.png')

    # First pass: one p-value (and one bias pair) per input line, in order.
    p_vals = []
    q_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(float(contactCount))
            interactionType = interxn.getType(distLowThres, distUpThres)

            # assume no bias unless biasDic has an entry for the locus
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if ch1 in biasDic and mid1 in biasDic[ch1]:
                    bias1 = biasDic[ch1][mid1]
                if ch2 in biasDic and mid2 in biasDic[ch2]:
                    bias2 = biasDic[ch2][mid2]
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # clamp the distance into the range covered by the bins
                distToLookUp = max(interxn.getDistance(), tempMinX)
                distToLookUp = min(distToLookUp, tempMaxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedIntraInRangeSum, prior_p)
            elif interactionType == 'intraShort' and not interOnly:
                p_val = 1.0
            elif interactionType == 'intraLong' and not interOnly:
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedInterAllSum, prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if allReg:
        outlierThres = 1.0 / (possibleIntraInRangeCount
                              + possibleInterAllCount)
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraInRangeCount)
    elif interOnly and not allReg:
        outlierThres = 1.0 / possibleInterAllCount
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount)
    else:
        outlierThres = 1.0 / possibleIntraInRangeCount
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass: write the values back next to the original lines.
    if resolution:
        outname = (outfilename + '.res' + str(resolution)
                   + '.significances.txt.gz')
    else:
        outname = outfilename + '.significances.txt.gz'
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(outname, 'wt') as outfile:
        print("Writing p-values and q-values to file %s"
              % (outfilename + ".significances.txt"))
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]

            if chr1 != chr2:
                writeRow = allReg or interOnly
            else:
                writeRow = (allReg or not interOnly) and \
                    myUtils.in_range_check(abs(midPoint1 - midPoint2),
                                           distLowThres, distUpThres)
            if writeRow:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n"
                              % (str(chr1), midPoint1, str(chr2), midPoint2,
                                 interactionCount, p_val, q_val,
                                 biasl[count], biasr[count]))

            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1 - midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
        minFDR = 0.0
        maxFDR = 0.05
        increment = 0.001
        FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                                  outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [splineX, newSplineY, residual, outliersline, outliersdist,
            FDRx, FDRy]  # from fit_Spline
def read_All_Interactions(infilename, biasDic):
    """Read every interaction and return the in-range intra pairs by distance.

    Each line of the gzip file must hold ``chr1 mid1 chr2 mid2 count``.
    Pairs with a fragment missing from ``listOfMappableFrags`` are reported
    on stderr and skipped.  For every intra-chromosomal pair classified as
    ``'intraInRange'`` the matching entry of the module-level
    ``possiblePairsPerDistance`` dictionary is replaced with
    ``[distance, hitCount, bias1 * bias2]``.

    Args:
        infilename: Path to the gzip-compressed interactions file.
        biasDic: ``{chr: {mid: bias}}`` mapping; may be empty or None, in
            which case every bias is 1.0.

    Returns:
        A list with all values of ``possiblePairsPerDistance``, sorted in
        place on column 0 by ``myUtils.sort_by_column``.

    Side effects:
        Updates the ``observed*`` and ``*ObservedGenomicDist`` module
        globals, writes progress to stderr, and calls
        ``sys.exit("Illegal fragment pair")`` if an in-range pair has no
        entry in ``possiblePairsPerDistance``.
    """
    # global variables updated by this function
    global observedIntraAllSum
    global observedIntraAllCount
    global observedIntraInRangeSum
    global observedIntraInRangeCount
    global observedInterAllSum
    global observedInterAllCount
    global minObservedGenomicDist
    global maxObservedGenomicDist

    sys.stderr.write("\nReading all the interactions and then sorting the intra chr ones in range according to genomic distance\n")
    sys.stderr.write("------------------------------------------------------------------------------------\n")

    # NOTE(review): mode 'r' yields bytes lines on Python 3; this block is
    # written for Python 2 string semantics.
    # The with-block closes the file even when sys.exit fires in the loop.
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            chr2 = words[2]
            midPoint1 = int(words[1])
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(int(words[4]))
            chrIndex1 = chrList.index(interxn.chr1)
            chrIndex2 = chrList.index(interxn.chr2)

            # assume no bias unless biasDic has an entry for the locus
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if (interxn.mid1 not in listOfMappableFrags[chrIndex1]
                    or interxn.mid2 not in listOfMappableFrags[chrIndex2]):
                sys.stderr.write(
                    "Not-mappable fragment pair: %s\t%d\t%s\t%d\n"
                    % (str(interxn.chr1), interxn.mid1,
                       str(interxn.chr2), interxn.mid2))
                continue

            # .type is tested BEFORE getType() on purpose: the order of the
            # original is kept in case getType() refines .type.
            if interxn.type == 'inter':
                observedInterAllSum += interxn.hitCount
                observedInterAllCount += 1
                continue

            # any type of intra
            observedIntraAllSum += interxn.hitCount
            observedIntraAllCount += 1
            if interxn.getType(distLowThres, distUpThres) != 'intraInRange':
                continue

            minObservedGenomicDist = min(minObservedGenomicDist,
                                         interxn.distance)
            maxObservedGenomicDist = max(maxObservedGenomicDist,
                                         interxn.distance)
            # every pair should already be in the dictionary with a zero
            # interaction count
            dictkey = (str(interxn.chr1) + '-'
                       + str(min(interxn.mid1, interxn.mid2)) + '-'
                       + str(max(interxn.mid1, interxn.mid2)))
            if dictkey not in possiblePairsPerDistance:
                sys.exit("Illegal fragment pair")
            possiblePairsPerDistance[dictkey] = [
                interxn.distance, interxn.hitCount, bias1 * bias2
            ]
            observedIntraInRangeSum += interxn.hitCount
            observedIntraInRangeCount += 1

    sys.stderr.write(
        "Total of \t" + str(observedIntraAllCount)
        + " observed intra-chr fragment pairs,\t"
        + str(observedIntraInRangeCount)
        + " observed intra-chr fragment pairs in range,\t"
        + str(observedInterAllCount)
        + " observed inter-chr fragment pairs\n")
    sys.stderr.write(
        "Total of \t" + str(observedIntraAllSum)
        + " observed intra-chr read counts,\t"
        + str(observedIntraInRangeSum)
        + " observed intra-chr read counts in range,\t"
        + str(observedInterAllSum)
        + " observed inter-chr read counts\n")
    sys.stderr.write("Range of observed genomic distances [%d %d]"
                     % (minObservedGenomicDist, maxObservedGenomicDist)
                     + "\n")

    # sort the interactions according to genomic distance (column 0)
    sortedInteractions = list(possiblePairsPerDistance.values())
    t = time.time()
    myUtils.sort_by_column(sortedInteractions, 0)
    sys.stderr.write(
        "Total time for sorting interactions according to genomic distance: %.3f\n"
        % (time.time() - t))
    return sortedInteractions  # from read_All_Interactions
def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname,
               passNo):
    """Fit a monotone spline, write significances and flag outliers.

    A ``UnivariateSpline`` with smoothing ``min(y)**2`` is fitted to
    ``(x, y)`` and evaluated at the distinct integer distances found in
    column 0 of ``sortedInteractions`` that lie inside ``[min(x), max(x)]``.
    The result is made non-increasing with the R function ``monoreg``
    (``type="antitonic"``) called through ``ro``.  The interaction file is
    then read twice (p-values, then the output table) and every entry of
    ``sortedInteractions`` is tested against ``1 / possibleIntraInRangeCount``
    to build the outlier mask.

    Args:
        x, y, yerr: Bin distances, bin probabilities and their errors.
        infilename: Gzip-compressed interactions file
            (``chr1 mid1 chr2 mid2 count`` per line).
        sortedInteractions: Iterable of ``(distance, count, bias1*bias2)``.
        biasDic: ``{chr: {mid: bias}}`` mapping; may be empty or None.
        figname: Base name, relative to the global ``outdir``, for outputs.
        passNo: Spline pass number, used only in plot labels.

    Returns:
        ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]``;
        ``FDRx``/``FDRy`` are None unless the global ``visual`` is True.

    Fixes relative to the previous implementation:
        * the inter-chromosomal baseline is plotted from
          ``baselineInterChrProb`` (it used the intra value);
        * ``FDRx``/``FDRy`` are always bound, so returning with
          ``visual`` off no longer raises ``NameError``;
        * a p-value of 1.0 is recorded for lines that are not tested, so
          ``p_vals`` stays aligned with the input lines;
        * the down-sampling size uses integer division;
        * files are closed by ``with`` blocks.
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write("------------------------------------------------------------------------------------\n")
    sys.stderr.write("baseline intra-chr probability: "
                     + repr(baselineIntraChrProb)
                     + "\tbaseline inter-chr probability: "
                     + repr(baselineInterChrProb) + "\n")

    FDRx = None
    FDRy = None

    # maximum residual allowed for the spline is min(y)^2
    splineError = min(y) * min(y)
    # fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    tempMaxX = max(x)
    tempMinX = min(x)
    # Keep only distances inside [min(x), max(x)], where the spline is
    # defined by the fit.
    splineX = [d for d in sorted(set(int(i[0]) for i in sortedInteractions))
               if tempMinX <= d <= tempMaxX]
    splineY = ius(splineX)

    # antitonic (non-increasing) regression via R's monoreg
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([i * i for i in (y - ius(x))])

    if visual == True:
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2, 1, 1)
        plt.plot(myUtils.scale_a_list(splineX, toKb),
                 myUtils.scale_a_list(newSplineY, toProb),
                 'g-', label="spline-" + str(passNo), linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list(y, toProb),
                     myUtils.scale_a_list(yerr, toProb),
                     fmt='r.', label="Mean with std. error", linewidth=2)
        if useInters:
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for i in x],
                                          toProb),
                     'k-', label="Baseline intra-chromosomal")
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineInterChrProb for i in x],
                                          toProb),
                     'b-', label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
        plt.xlabel('Genomic distance (kb)', fontsize='large')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
        plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2, 1, 2)
        plt.loglog(splineX, newSplineY, 'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
        if useInters:
            plt.loglog(x, [baselineIntraChrProb for i in x], 'k-')
            plt.loglog(x, [baselineInterChrProb for i in x], 'b-')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)', fontsize='large')
        plt.xlabel('Genomic distance (log-scale)', fontsize='large')
        plt.savefig(outdir + '/' + figname + '.png')

    # First pass: exactly one p-value per input line, in file order.
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres)
                     + "\tdistUpThres " + repr(distUpThres) + "\n")
    p_vals = []
    q_vals = []
    # NOTE(review): modes 'r'/'w' are binary on Python 3; this block relies
    # on Python 2 string semantics for the gzip streams.
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            chr2 = words[2]
            midPoint1 = int(words[1])
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(int(words[4]))

            # assume no bias unless biasDic has an entry for the locus
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            # .type is tested before getType() is called, as in the
            # original statement order.
            if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
                p_vals.append(1.0)
                continue

            interactionType = interxn.getType(distLowThres, distUpThres)
            if interactionType == 'intraInRange':
                # clamp the distance into the range covered by the bins
                distToLookUp = max(interxn.distance, tempMinX)
                distToLookUp = min(distToLookUp, tempMaxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                intraInRangeCount += 1
                # the tail is evaluated at hitCount - 1 on purpose
                p_val = scsp.bdtrc(interxn.hitCount - 1,
                                   observedIntraInRangeSum, prior_p)
            elif interactionType == 'intraShort':
                p_val = 1.0
                intraVeryProximalCount += 1
            elif interactionType == 'intraLong':
                # out of range, bigger than distUpThres; the prior is 1.0
                prior_p = 1.0
                p_val = scsp.bdtrc(interxn.hitCount - 1,
                                   observedIntraAllSum, prior_p)
                intraOutOfRangeCount += 1
            elif useInters:
                prior_p = baselineInterChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.hitCount - 1,
                                   observedInterAllSum, prior_p)
                interCount += 1
            else:
                # Untested line: still record a value so the list index
                # keeps matching the line number in the second pass.
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount "
                         + repr(possibleInterAllCount + possibleIntraAllCount)
                         + "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount "
                         + repr(possibleIntraInRangeCount) + "\n")

    # Second pass: write p-values and q-values next to the original lines.
    with gzip.open(infilename, 'r') as infile, \
            gzip.open(outdir + '/' + figname + '.significances.txt.gz',
                      'w') as outfile:
        sys.stderr.write("Writing p-values to file %s" % figname
                         + ".significances.txt.gz\n")
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])

            if useInters == False and chrNo1 == chrNo2:  # intra
                writeRow = myUtils.in_range_check(
                    abs(midPoint1 - midPoint2), distLowThres, distUpThres)
            else:
                writeRow = useInters == True and chrNo1 != chrNo2
            if writeRow:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n"
                              % (str(chrNo1), midPoint1, str(chrNo2),
                                 midPoint2, interactionCount,
                                 p_vals[count], q_vals[count]))

    # Outlier check for every entry of sortedInteractions.
    isOutlier = []
    distsBelow = []
    intcountsBelow = []
    intcountsAbove = []
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        distToLookUp = max(interactionDistance, tempMinX)
        distToLookUp = min(distToLookUp, tempMaxX)
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum,
                           prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
        else:
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s"
                         % figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # The non-outlier sample is not plotted; the call is kept so the
        # random stream seen by the outlier sample below is unchanged.
        downsample = 30  # for the non-outliers
        randIndcsAbove = sorted(sample(list(range(len(intcountsAbove))),
                                       len(intcountsAbove) // downsample))
        downsample = 20  # for the outliers
        randIndcsBelow = sorted(sample(list(range(len(intcountsBelow))),
                                       len(intcountsBelow) // downsample))

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],
                                      toKb),
                 [intcountsBelow[i] for i in randIndcsBelow],
                 'r.', label="Outliers (p-value < 1/M)")
        plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist],
                                      toKb),
                 [newSplineY[i] * observedIntraInRangeSum
                  for i in range(len(newSplineY))]
                 + [newSplineY[-1] * observedIntraInRangeSum],
                 'g-', label="spline-" + str(passNo) + " (x N)",
                 linewidth=2.5)
        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        # Trailing comma: newline suppression on Python 2, a harmless
        # one-element tuple on Python 3.
        print(repr(len(intcountsBelow)) + "\t"),
        # this limits the y-axis of the hit count plot
        if len(intcountsBelow) > 0:
            plt.ylim([0, min(max(intcountsBelow), 1500)])
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([0, distUpThres * toKb])
        ax.legend(loc="upper right", fancybox=True)
        plt.savefig(outdir + '/' + figname + '.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount)
                     + "\tintraOutOfRangeCount " + repr(intraOutOfRangeCount)
                     + "\tintraVeryProximalCount "
                     + repr(intraVeryProximalCount)
                     + "\tinterCount " + repr(interCount) + "\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname
                         + ".qplot.png\n")
        minFDR = 0.0
        maxFDR = 0.05
        increment = 0.001
        FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                                  figname + ".qplot")

    return [splineX, newSplineY, residual, isOutlier, FDRx, FDRy]  # from fit_Spline
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic):
    """Fit a monotone spline, plot it and write p-/q-values for every line.

    A ``UnivariateSpline`` with smoothing ``min(y)**2`` is fitted to
    ``(x, y)``, evaluated at the keys of ``mainDic`` inside
    ``[min(x), max(x)]`` and made non-increasing with the R function
    ``monoreg`` (``type="antitonic"``) called through ``ro``.  The plot is
    always written.  The interaction file is then read twice: once to
    compute binomial tail p-values (``scsp.bdtrc``) and once to write every
    line with its p-value and BH-corrected q-value.

    Args:
        mainDic: Mapping whose keys are the distances at which the spline is
            evaluated.
        x, y: Bin distances and bin probabilities.
        yerr: Accepted for interface compatibility; not used here.
        infilename: Gzip-compressed interactions file
            (``chr1 mid1 chr2 mid2 count`` per line).
        outfilename: Prefix for the plot and the significances table; the
            global ``resolution`` is appended to both names.
        biasDic: ``{chr: {mid: bias}}`` mapping; may be empty or None.  A
            bias of exactly -1 marks a locus to discard (p-value 1.0).

    Returns:
        ``[splineX, newSplineY, residual]``.

    Fixes relative to the previous implementation:
        * counts are parsed with ``float`` in both passes (the write pass
          used ``int`` and raised ``ValueError`` on non-integer counts that
          the first pass accepted);
        * the number of discarded pairs is now included in the message that
          announces it;
        * files are closed by ``with`` blocks.
    """
    # Trailing commas after print(): newline suppression on Python 2, a
    # harmless one-element tuple on Python 3.
    print("\nFit a univariate spline to the probability means\n"),
    print("------------------------------------------------------------------------------------\n"),

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y) * min(y)
    # fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    tempMaxX = max(x)
    tempMinX = min(x)
    # Keep only distances inside [min(x), max(x)], where the spline is
    # defined by the fit.
    splineX = [dis for dis in sorted(mainDic)
               if tempMinX <= dis <= tempMaxX]
    splineY = ius(splineX)

    # antitonic (non-increasing) regression via R's monoreg
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # newSplineY holds the monotonic contact probabilities
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([i * i for i in (y - ius(x))])

    # Plot the fit (linear axes on top, log-log below).
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title('Univariate spline fit to the output of equal occupancy binning. \n Residual= %e' % (residual),
              size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y], 'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX],
             [i * 100000 for i in newSplineY], 'g-', label="Spline fit")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min(x) / 1000.0, max(x) / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min(x), max(x)])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    # First pass: one p-value per input line, in file order.
    discardCount = 0
    print("lower bound on mid-range distances " + repr(distLowThres)
          + ", upper bound on mid-range distances " + repr(distUpThres)
          + "\n"),
    p_vals = []
    q_vals = []
    # NOTE(review): modes 'r'/'w' are binary on Python 3; this block relies
    # on Python 2 string semantics for the gzip streams.
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            chr2 = words[2]
            midPoint1 = int(words[1])
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(float(words[4]))

            # assume no bias unless biasDic has an entry for the locus
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if bias1 == -1 or bias2 == -1:
                p_val = 1.0
                discardCount += 1
            elif interxn.type == 'intra':
                # .type is tested before getType() is called, as in the
                # original statement order.
                interactionType = interxn.getType(distLowThres, distUpThres)
                if interactionType == 'intraInRange':
                    # clamp the distance into the range covered by the bins
                    distToLookUp = max(interxn.distance, tempMinX)
                    distToLookUp = min(distToLookUp, tempMaxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                elif interactionType == 'intraLong':
                    # out of range: use the baseline intra-chr probability
                    prior_p = baselineIntraChrProb * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraAllSum, prior_p)
            else:  # inter
                prior_p = interChrProb * (bias1 * bias2)
                # the tail is evaluated at hitCount - 1 on purpose
                p_val = scsp.bdtrc(interxn.hitCount - 1,
                                   observedInterAllSum, prior_p)
            p_vals.append(p_val)

    # Do the BH FDR correction
    q_vals = myStats.benjamini_hochberg_correction(
        p_vals, possibleInterAllCount + possibleIntraAllCount)

    # Second pass: write every line with its p-value and q-value.
    with gzip.open(infilename, 'r') as infile, \
            gzip.open(outfilename + '.res' + str(resolution)
                      + '.significances.txt.gz', 'w') as outfile:
        print("Writing p-values and q-values to file %s" % outfilename
              + ".significances.txt\n"),
        print("Number of pairs discarded due to bias not in range [0.5 2]: "
              + repr(discardCount) + "\n"),
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            # float(), like the first pass; %d below truncates for output
            interactionCount = float(words[4])
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n"
                          % (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                             interactionCount, p_vals[count], q_vals[count]))

    return [splineX, newSplineY, residual]  # from fit_Spline
def read_All_Interactions(mainDic, contactCountsFile, noOfFrags):
    """Read all contact counts and add in-range counts to ``mainDic``.

    Each line of the gzip file must hold exactly five whitespace-separated
    fields: ``chr1 mid1 chr2 mid2 contactCount``.  For every intra pair
    classified as ``'intraInRange'`` by ``getType(distLowThres,
    distUpThres)`` whose distance is already a key of ``mainDic``, the
    contact count is added to element 1 of that entry.  Distances that are
    not keys of ``mainDic`` are still counted in the global totals.

    Args:
        mainDic: ``{distance: [_, summed count]}`` mapping, updated in place.
        contactCountsFile: Path to the gzip-compressed contact counts file.
        noOfFrags: Not used by this function; kept so existing callers work.

    Returns:
        The same ``mainDic`` object that was passed in.

    Side effects:
        Updates the ``observed*`` and ``*ObservedGenomicDist`` module
        globals and prints progress and a summary to stdout.
    """
    global observedInterAllSum
    global observedInterAllCount
    global observedIntraAllSum
    global observedIntraAllCount
    global observedIntraInRangeSum
    global observedIntraInRangeCount
    global minObservedGenomicDist
    global maxObservedGenomicDist

    # Trailing commas after print(): newline suppression on Python 2, a
    # harmless one-element tuple on Python 3.
    print("\nReading all the contact counts\n"),
    print("------------------------------------------------------------------------------------\n"),

    # NOTE(review): mode 'r' yields bytes lines on Python 3, so chromosome
    # names would be bytes there.
    with gzip.open(contactCountsFile, 'r') as infile:
        for count, line in enumerate(infile, 1):
            ch1, mid1, ch2, mid2, contactCount = line.split()
            contactCount = float(contactCount)
            interxn = myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
            interxn.setCount(contactCount)

            # progress indicator; the call form is valid on Python 2 and 3
            if count % 1000000 == 0:
                print(count)

            # .type is tested before getType() is called, as in the
            # original statement order.
            if interxn.type == 'inter':
                observedInterAllSum += interxn.hitCount
                observedInterAllCount += 1
                continue

            # any type of intra
            observedIntraAllSum += interxn.hitCount
            observedIntraAllCount += 1
            if interxn.getType(distLowThres, distUpThres) == 'intraInRange':
                minObservedGenomicDist = min(minObservedGenomicDist,
                                             interxn.distance)
                maxObservedGenomicDist = max(maxObservedGenomicDist,
                                             interxn.distance)
                if interxn.distance in mainDic:
                    mainDic[interxn.distance][1] += contactCount
                observedIntraInRangeSum += interxn.hitCount
                observedIntraInRangeCount += 1

    print("Observed, Intra-chr in range: pairs= "
          + str(observedIntraInRangeCount)
          + "\t totalCount= " + str(observedIntraInRangeSum))
    print("Observed, Intra-chr all: pairs= " + str(observedIntraAllCount)
          + "\t totalCount= " + str(observedIntraAllSum))
    print("Observed, Inter-chr all: pairs= " + str(observedInterAllCount)
          + "\t totalCount= " + str(observedInterAllSum))
    print("Range of observed genomic distances [%d %d]"
          % (minObservedGenomicDist, maxObservedGenomicDist) + "\n"),

    return mainDic  # from read_All_Interactions