def coverages(intersect):
    """Record, per query SV, how much of it is covered by same-type overlaps.

    Each row of ``intersect`` is read as eleven fields: the query SV
    (chrom, start, end, svtype, id) in positions 0-4, then the overlapping
    SV (chrom, start, end, svtype, source, id) in positions 5-10.  Overlaps
    whose svtype differs from the query's are ignored.

    For every query id with at least one same-type overlap, the query
    records and the overlap records are turned into two ``BedTool`` objects
    and field 7 of each ``coverage`` result row is stored as a float in
    ``covs[query_id]``.  With a four-column query record that field is the
    covered fraction in bedtools' default coverage output.

    Nothing is returned.
    NOTE(review): ``covs`` is not defined in this function -- presumably a
    module-level dict; confirm it exists wherever this is called.
    """
    query_records = {}
    same_type_hits = {}

    for row in intersect:
        q_chrom, q_start, q_end, q_type, q_id = row[0:5]
        query_records.setdefault(q_id, []).append(
            [q_chrom, q_start, q_end, q_type])

        h_chrom, h_start, h_end, h_type, h_source, h_id = row[5:11]
        hits = same_type_hits.setdefault(q_id, [])
        if q_type == h_type:
            hits.append([h_chrom, h_start, h_end, h_type, h_source, h_id])

    for q_id, hits in same_type_hits.items():
        if not hits:
            # Only different-type overlaps were seen; nothing to measure.
            continue
        query_bed = BedTool(query_records[q_id])
        hit_bed = BedTool(hits)
        for record in query_bed.coverage(hit_bed):
            # Every row overwrites the previous one, so the last row wins.
            covs[q_id] = float(record[7])
def pybedtoolcoverage(dicoInit, dicoThread, thread_name):
    """Compute per-base coverage for one thread's job and store the result.

    Reads the ``'bed'``, ``'bam'`` and ``'bam_num'`` entries of
    ``dicoThread[thread_name]``, runs ``coverage`` with ``d=True`` on the
    BED against the BAM, and moves the resulting file to
    ``<dicoInit['tmp']>/<bam_num>.cov``.  Nothing is returned.
    """
    job = dicoThread[thread_name]
    regions = BedTool(job['bed'])
    alignments = BedTool(job['bam'])
    per_base = regions.coverage(alignments, d=True)

    destination = dicoInit['tmp'] + "/" + str(job['bam_num']) + ".cov"
    shutil.move(per_base.fn, destination)
def compute_unique_coverage(unique_bam, repnames_bedfile):
    """Count the reads of ``unique_bam`` overlapping each repeat range.

    Prints a progress message, then returns the ``BedTool`` produced by
    running ``coverage`` with ``counts=True`` on the intervals of
    ``repnames_bedfile`` against ``unique_bam``.
    """
    print(f"Computing overlaps between {unique_bam} and {repnames_bedfile}...")
    repeat_ranges = BedTool(repnames_bedfile)
    unique_reads = BedTool(unique_bam)
    return repeat_ranges.coverage(unique_reads, counts=True)
def find_telomeres(window, genomefile, telomeres_gff, percentile, outfilename):
    """Write the genome windows richest in telomere annotations to a file.

    The genome described by ``genomefile`` is tiled into windows of
    ``window`` bp, and ``coverage`` is run on them against
    ``telomeres_gff``.  The fourth output column divided by the window
    length gives a per-bp score.  Windows scoring at or above the given
    ``percentile`` of all scores are labelled ``tel0``, ``tel1``, ... and
    written to ``outfilename`` as tab-separated chr/start/end/label rows
    with no header and no index.
    """
    from pybedtools import BedTool
    import pandas as pd
    import numpy as np

    column_names = ["chr", "start", "end", "depth", "n bp", "len", "cov"]

    genome_windows = BedTool().window_maker(g=genomefile, w=window)
    telomere_features = BedTool(telomeres_gff)
    window_cov = genome_windows.coverage(telomere_features)

    table = pd.read_csv(window_cov.fn, sep="\t", names=column_names)
    table["per_bp"] = table["depth"] / table["len"]

    cutoff = np.percentile(table["per_bp"], percentile)
    selected = table[table["per_bp"] >= cutoff].reset_index()
    # After reset_index the index is 0..n-1, which yields tel0, tel1, ...
    selected["num"] = "tel" + selected.index.map(str)

    output = selected[["chr", "start", "end", "num"]]
    output.to_csv(outfilename, sep="\t", header=None, index=None)
def normalize(geneid2tval): norma = sum(geneid2tval.values()) return dict([ (x[0], x[1]*1000000/norma) for x in geneid2tval.items() ]) def get_geneid(intersection): attrs = dict( [x.split("=") for x in intersection[-2].split(";")]) return attrs["ID"] mapped_reads = BedTool(args.path) genes = BedTool(args.genes); geneid2tval = {}; if(args.stranded): mapped_to_genes = 0 for cov in genes.coverage(b=mapped_reads, F=0.51, s=True, sorted=True): geneid2tval[cov.name] = int(cov[9])/len(cov); mapped_to_genes += int(cov[9]) sys.stderr.write("\nTotal reads: %d\nReads mapped to genes: %d\nFraction mapped %1.2f\n\n" % (len(mapped_reads), mapped_to_genes, mapped_to_genes/len(mapped_reads))) else: geneid2count = defaultdict(float); curname = '' cur_geneids = []; shared_reads = 0; for interval in mapped_reads.intersect(b = genes, wo = True, f=0.51, sorted=True): #print(curname); #print(interval.name)
counter += 1 if stop: break bins = BedTool(binfile) Results = {} tracknames = [] for bed in sorted(bedfiles): (vol, fname) = os.path.split(bed) # fname = "Density_"+re.sub('\.(bed|vcf)','',fname) fname = re.sub('\.(bed|vcf\.gz|gff)', '', fname) print(fname) tracknames.append(fname) bedtrack = BedTool(bed) cov = bins.coverage(bedtrack) for interval in cov: window = interval[3] # print("window is",window) if not window in Results: Results[window] = { "coords": [interval[0], interval[1], interval[2]] } m = Results[window] if not "coverage" in m: m["coverage"] = {fname: interval[6]} else: m["coverage"][fname] = interval[6] with open(tracks, "w") as ggtrack:
def coverage_bed(window_bed, input_coverage_bed, output_coverage_bed):
    """Compute TFBS coverage over sliding windows and save it to a file.

    Runs ``coverage`` on the windows in ``window_bed`` against the motifs
    in ``input_coverage_bed``, passing ``output=output_coverage_bed`` so the
    result is written to that path.  Nothing is returned.
    """
    sliding_windows = BedTool(window_bed)
    tfbs_motifs = BedTool(input_coverage_bed)
    sliding_windows.coverage(tfbs_motifs, output=output_coverage_bed)
# Build per-chromosome bins, measure VCF coverage over them, and plot a
# histogram of variant positions for every chromosome.
#
# Relies on `chromosomes` (keys are chromosome names) and `chrom_Intervals`
# (chromosome name -> ordered bin edges) being defined earlier in the script.
# Single-argument print(...) behaves identically under Python 2 and 3.

vcf_path = '/mnt/disks/data-vcf/GSN79Tumor_normal.vcf'

# Show the first ten bin edges of each chromosome.
for k in chromosomes.keys():
    print(chrom_Intervals[k][0:10])

# One BED row per pair of consecutive edges: [edge_i, edge_{i+1} - 1].
with open('humanBedInterval.bed', 'w') as f:
    for k in chromosomes.keys():
        edges = chrom_Intervals[k]
        f.write('\n'.join('%s\t%d\t%d' % (k, edges[i], edges[i + 1] - 1)
                          for i in range(len(edges) - 1)) + '\n')

a = BedTool('humanBedInterval.bed').sort()
b = BedTool(vcf_path)
print(a.head())
print(b.head())
a.coverage(b).saveas('VCFCoverage.bed')

# Collect variant positions per chromosome.  The file is streamed line by
# line and each record is split once; header ('#') and blank lines are
# skipped so a trailing empty line no longer raises IndexError.
positionHistogram = defaultdict(list)
with open(vcf_path, 'r') as f:
    for line in f:
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.split('\t')
        positionHistogram[fields[0]].append(int(fields[1]))

for k in chromosomes.keys():
    # A fresh figure per chromosome: without it every plt.hist call draws
    # onto the same axes and each PNG also shows the earlier chromosomes.
    plt.figure()
    plt.hist(positionHistogram[k], bins=chrom_Intervals[k])
    plt.savefig(k + 'SNPDense.png')
    plt.close()
def _probe_windows(chrom, start, end):
    """Build 20-bp probe intervals centred every 5 bp inside [start, end)."""
    rows = ['%s\t%d\t%d' % (chrom, centre - 10, centre + 10)
            for centre in np.arange(start + 10, end - 10, 5)]
    return BedTool('\n'.join(rows), from_string=True)


def _coverage_fractions(probes, variants):
    """Return the last coverage column of each probe as a float array."""
    rows = str(probes.coverage(variants)).split('\n')
    return np.array([float(row.split('\t')[-1]) for row in rows if row],
                    dtype=float)


def genDataset(genes, testTrain):
    """Build SNP and indel density profiles for genes on chromosomes 1 and 22.

    Parameters
    ----------
    genes : list of str
        Tab-separated gene records; fields 0-3 are read as chromosome,
        start, end and gene name.  Records not starting with ``'1\\t'`` or
        ``'22\\t'`` are printed but otherwise ignored.
    testTrain : dict
        Must provide ``'SNP'`` and ``'indel'`` entries that are accepted as
        the second operand of ``BedTool.coverage``.

    Returns
    -------
    dict
        ``{'SNP': {...}, 'indel': {...}}`` where each inner mapping goes from
        ``'<gene name>|<chrom>-<window start>-<window end>'`` to a float
        array holding one coverage value per probe interval.

    When more than one gene is supplied, only the first ten genes and the
    first three 100-bp windows of each are processed, and the SNP mapping is
    also pickled to ``testData.p``.  Otherwise every gene is processed with
    two window grids, the second offset by 50 bp.

    NOTE(review): the original source had lost its indentation; placing the
    trailing prints/dump after the gene loop and the ``return`` at function
    level is a reconstruction -- confirm against the upstream file.
    """
    dataset = {'SNP': defaultdict(list), 'indel': defaultdict(list)}

    limited = len(genes) > 1
    selected_genes = genes[0:10] if limited else genes

    for gene in selected_genes:
        # Single-argument print(...) is identical under Python 2 and 3.
        print(gene)
        if not (gene and gene.startswith(('1\t', '22\t'))):
            continue

        geneInfo = gene.split('\t')
        gene_start, gene_end = int(geneInfo[1]), int(geneInfo[2])

        if limited:
            grids = [np.arange(gene_start, gene_end, 100)[0:4]]
        else:
            grids = [
                np.arange(gene_start, gene_end, 100),
                np.arange(gene_start + 50, gene_end, 100),
            ]

        for grid in grids:
            # Consecutive grid points delimit one window.
            for win_start, win_end in zip(grid[:-1], grid[1:]):
                geneNaming = geneInfo[3].strip('\n') + '|' + '-'.join(
                    [geneInfo[0], str(win_start), str(win_end)])
                probes = _probe_windows(geneInfo[0], win_start, win_end)
                dataset['SNP'][geneNaming] = _coverage_fractions(
                    probes, testTrain['SNP'])
                dataset['indel'][geneNaming] = _coverage_fractions(
                    probes, testTrain['indel'])

    if limited:
        print('duasdf')
        print(dataset)
        # Context manager so the pickle file is flushed and closed.
        with open('testData.p', 'wb') as handle:
            dump(dataset['SNP'], handle)
    else:
        print('FINISH 1')
        print(dataset)
    return dataset
for line in f: chromCount += 1 # bedread = '\n'.join('\t'.join('%s\t%d\t%d'%tuple([line.split('\t')[0]]+sorted(np.vectorize(lambda x: int(x))(line.split('\t')[1:3]))))) #interval = sorted(np.vectorize(lambda x: int(x))(line.split('\t')[1:3])) interval = line.split('\t')[1] histInterval[line.split('\t')[0]] = list( np.arange(0., int(interval), 250000.)) + [int(interval)] # [0] if chromCount > 22: break bedHist = BedTool('\n'.join('\n'.join('\t'.join([key] + [ str(int(x)) for x in [histInterval[key][i], histInterval[key][i + 1]] ]) for i in range(len(histInterval[key]) - 1)) for key in histInterval.keys()), from_string=True).saveas('BedHist.bed') bedTrans = bedHist.coverage( BedTool(transposonGFF)).sort().saveas('coverage.bed') transposonDensity = [] for line in str(bedTrans).splitlines(): if line: lineList = line.split('\t') transposonDensity.append(lineList[0:3] + [float(lineList[-1])]) transposonDensityArr = np.array(transposonDensity) print transposonDensityArr window = 21 centromereRegions = '' nonCentromereRegions = '' for chrom in histInterval: print chrom if len(histInterval[chrom]) > window: arraySubset = transposonDensityArr[transposonDensityArr[:, 0] == chrom]
def _read_cds_indices(gtfname):
    """Return (CDS coordinate pairs, strand) from a GTF; ([], '+') if absent."""
    cds_indices = []
    strand = '+'
    if not os.path.isfile(gtfname):
        return cds_indices, strand
    with open(gtfname, 'r') as gtf:
        for line in gtf:
            if not line.strip() or line.startswith('#'):
                continue
            tabbed = line.split('\t')
            feature = tabbed[2]
            # The strand of the last record read is the one reported.
            strand = tabbed[6]
            if feature != 'CDS':
                continue
            start = int(tabbed[3]) - 1
            end = int(tabbed[4])
            if strand == '-':
                cds_indices.append((end - 1, start))
            else:
                cds_indices.append((start, end - 1))
    return cds_indices, strand


def _ordered_unique(seq):
    """Return the items of seq without duplicates, keeping first-seen order."""
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def plot_gene(gene, outpref, bams, bedname, gtfname, peakname, labels,
              nsubplots, normalize, scale):
    """Plot per-base read coverage of one gene for several BAM files.

    Parameters
    ----------
    gene : str
        Gene name; used as the x-axis label and in the output file name.
    outpref : str
        Prefix of the output file ``<outpref>_<gene>_coverage.pdf``.
    bams, labels : sequences
        BAM paths and their labels, paired by position.  Repeated labels are
        plotted once; when a label has more than one count track, the mean
        is drawn with a +/- one standard deviation band.
    bedname : str
        BED file with the gene's intervals, used as the coverage target.
    gtfname : str
        GTF with the gene's features.  CDS rows are shaded in grey; the
        strand is taken from the file, or '+' if the file does not exist.
    peakname : str
        Passed to ``get_peakset`` and ``get_peaks``; returned peaks are
        shaded in yellow.
    nsubplots : int
        Number of stacked subplots the labels are distributed over.
    normalize : bool
        If true, positions found in the peak set are excluded from the total
        passed to ``fill_coverage``, and the y label reads "normalized".
    scale : bool
        If true, each subplot gets its own y range instead of a shared one.

    Relies on ``get_peakset``, ``fill_coverage`` and ``get_peaks`` defined
    elsewhere; ``fill_coverage`` must return a mapping whose entries expose
    ``'gene_ind'`` and ``'count'``.

    NOTE(review): the original source had lost its indentation; the nesting
    of ``if len(xs) == 1``, the ``maxpeak`` reset and ``ax.legend`` is a
    reconstruction -- confirm against the upstream file.
    """
    cds_indices, strand = _read_cds_indices(gtfname)
    print('{} strand'.format(strand))

    peakset = get_peakset(peakname, normalize)
    coverage = {}
    bed = BedTool(bedname)
    for label, bamname in zip(labels, bams):  # add replicates here
        bam = BedTool(bamname)
        cov = bed.coverage(bam, d=True, stream=True)
        indices = {}
        gene_cov = []
        tot_cov, fullcov = 0, 0
        for i, line in enumerate(cov):
            items = str(line).split('\t')
            # Keep the first two and last two fields, whatever lies between.
            del items[2:-2]
            csome, start, intind, nreads = items
            indices[int(start) + int(intind) - 1] = i
            gene_cov.append(int(nreads))
            if normalize and (csome, start) not in peakset:
                tot_cov += int(nreads)
            fullcov += int(nreads)
        if strand == '-':
            # Mirror plot positions so the gene reads 5'->3'.
            length_exon = len(indices)
            for ind in indices:
                indices[ind] = length_exon - indices[ind] - 1
        print('coverage', tot_cov, fullcov)
        if not normalize:
            tot_cov = fullcov
        coverage = fill_coverage(coverage, strand, gene_cov, label, normalize,
                                 tot_cov)
        print("coverage calculated for {}: {}...".format(
            label, ','.join([str(x) for x in gene_cov[:10]])))

    # `indices` here is the mapping built for the last BAM processed.
    peaks = get_peaks(peakname, indices, strand)

    sns.set_style("ticks")
    if not scale:
        _, axes = plt.subplots(figsize=(5, 2 * nsubplots),
                               nrows=nsubplots,
                               sharey=True,
                               sharex=True)
    else:
        _, axes = plt.subplots(figsize=(3.5, 1.5 * nsubplots),
                               nrows=nsubplots,
                               sharey=False,
                               sharex=True)  # change size
    if nsubplots == 1:
        axes = [axes]
    print("{} subplots".format(len(axes)))

    # don't like the colours? change them here!
    colours = ['#000000', '#2294a3'] * nsubplots

    gene_len = coverage[labels[-1]]['gene_ind'][-1]
    maxpeak = 10
    sp = 0

    uniq_labels = _ordered_unique(labels)
    npersubplot = len(uniq_labels) / nsubplots
    maxpeaks = {subplot: 10 for subplot in range(len(axes))}

    for i, label, col in zip(range(len(uniq_labels)), uniq_labels, colours):
        if len(axes) > 1:
            ax = axes[sp]
        else:
            ax = axes[0]
        xs = coverage[label]['gene_ind']
        counts = coverage[label]['count']
        # Decided per label: a function-wide flag would shade a label that
        # has no replicates with the previous label's deviation.
        has_reps = len(counts) > 1
        if has_reps:
            std = np.std(counts, axis=0)
            ys = np.mean(counts, axis=0)
        else:
            ys = counts[0]
        if len(xs) == 1:
            xs = xs[0]
        ax.plot(xs, ys, label=label, color=col)
        if has_reps:
            ax.fill_between(xs,
                            ys - std,
                            ys + std,
                            alpha=0.3,
                            facecolor=col)
        maxpeak = max(maxpeak, ax.get_ylim()[1])
        if scale:
            maxpeaks[sp] = max(maxpeaks[sp], maxpeak)
        if i >= (sp + 1) * (npersubplot) - 1:
            # Last label of this subplot: move on to the next one.
            sp += 1
            if scale:
                maxpeak = 10

    for i, ax in enumerate(axes):
        if scale:
            maxpeak = maxpeaks[i]
        for (s, e) in cds_indices:
            rect = matplotlib.patches.Rectangle((indices[s], 0),
                                                indices[e] - indices[s],
                                                maxpeak,
                                                angle=0.0,
                                                alpha=0.1,
                                                color='#a5abaf')
            ax.add_patch(rect)
        for peak in peaks:
            rect = matplotlib.patches.Rectangle((peak[0], 0),
                                                peak[1] - peak[0],
                                                maxpeak,
                                                angle=0.0,
                                                alpha=0.3,
                                                color='#ffff00')
            ax.add_patch(rect)
        ax.xaxis.set_ticks(
            np.arange(0, gene_len,
                      max(100 * np.round(gene_len / 500., 0), 500)))
        ax.set_xlim([0, gene_len + 1])
        ax.set_ylim([0, maxpeak])
        if normalize:
            ax.set_ylabel('normalized\ncoverage')
        else:
            ax.set_ylabel('coverage')
        if i == nsubplots - 1:
            ax.set_xlabel(gene)
        ax.legend(loc=2)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.savefig(outpref + '_' + gene + '_coverage.pdf',
                dpi=500,
                bbox_inches='tight')
rpk = coverage.mean() * 1000 return [ rpk, sum(coverage), len(coverage) / 1000, coverage.mean(), variation(coverage) ] mapping = BedTool(args.path) annotation = BedTool(args.genes) name2stat = {} curname = '' curints = [] for cov in annotation.coverage(b=mapping, F=0.2, s=True, sorted=True, d=True): if (cov.name == curname): curints.append(cov) else: if (curints): name2stat[curname] = assign_coverage(curints) curints = [cov] curname = cov.name else: name2stat[curname] = assign_coverage(curints) ### Normalize to RPKM normfactor = sum([x[1] for x in name2stat.values() if x[1]]) / 1000000 for l in name2stat.values(): l[1] = (l[1] / normfactor) / l[2]
break bins = BedTool(binfile) strains = {} Results = {} with open(bamlist, "r") as fh: reader = csv.reader(fh, delimiter=",") for row in reader: filename = row[0] strain = row[1] group = row[2] print(row) strains[strain] = group bam = BedTool(filename) cov = bins.coverage(bam, sorted=True, g=args.genomefile, bed=True) for interval in cov: window = interval[3] print("window is", window) if not window in Results: Results[window] = { "coords": [interval[0], interval[1], interval[2]] } m = Results[window] normdepth = "%.2f" % (int(interval[6]) / straindepths[strain]) if not "coverage" in m: m["coverage"] = {strain: normdepth} else: m["coverage"][strain] = normdepth with open(tracks, "w") as ggtrack: