def main():
    """
    Print every SNP together with each repeat annotation it overlaps.

    One space-separated line is written per (SNP, repeat) pair:
    the SNP's chrom/start/end/name followed by the repeat's
    chrom/start/end/name.
    """
    snps = IntervalFile("bedtools/tests/data/snps.hg18.chr21.bed")
    rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # look up the repeat annotations underneath each SNP
    for snp in snps:
        for repeat in rmsk.search(snp.chrom, snp.start, snp.end):
            columns = (snp.chrom, snp.start, snp.end, snp.name,
                       repeat.chrom, repeat.start, repeat.end, repeat.name)
            # single-argument print: same output under Python 2 and 3
            print(" ".join(str(col) for col in columns))
def main():
    """
    Print every mapped BAM alignment together with each repeat
    annotation it overlaps.

    One space-separated line is written per (alignment, repeat) pair:
    the alignment's chrom/start/end/query-name followed by the repeat's
    chrom/start/end/name.

    Unmapped reads are skipped: they have no reference sequence or end
    coordinate to search with.
    """
    bam = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    try:
        rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

        for al in bam:
            # an unmapped read has no usable rname/aend
            if al.is_unmapped:
                continue

            chrom = bam.getrname(al.rname)
            start = al.pos
            end = al.aend
            name = al.qname

            for hit in rmsk.search(chrom, start, end):
                columns = (chrom, start, end, name,
                           hit.chrom, hit.start, hit.end, hit.name)
                print(" ".join(str(col) for col in columns))
    finally:
        # release the BAM handle even if the search raises
        bam.close()
def main():
    """
    Example 1 -- IntervalFile.all_hits()

    For each mapped BAM alignment, report the rmsk features returned by
    all_hits() with same_strand=True and ovlp_pct=0.75.  Each output line
    is the alignment interval and the hit, tab-separated.

    Unmapped reads are skipped: they have no reference sequence or end
    coordinate from which to build an Interval.
    """
    bam = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    try:
        rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

        for al in bam:
            if al.is_unmapped:
                continue

            strand = "-" if al.is_reverse else "+"
            i = Interval(bam.getrname(al.rname), al.pos, al.aend, strand)

            for hit in rmsk.all_hits(i, same_strand=True, ovlp_pct=0.75):
                print("\t".join(str(x) for x in [i, hit]))
    finally:
        # release the BAM handle even if a lookup raises
        bam.close()
def main():
    """
    Example 1 -- IntervalFile.all_hits()

    For each mapped BAM alignment, report the rmsk features returned by
    all_hits() with same_strand=True and ovlp_pct=0.75.  Each output line
    is the alignment interval and the hit, tab-separated.

    Unmapped reads are skipped: they have no reference sequence or end
    coordinate from which to build an Interval.
    """
    bam = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    try:
        rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

        for al in bam:
            if al.is_unmapped:
                continue

            strand = "-" if al.is_reverse else "+"
            i = Interval(bam.getrname(al.rname), al.pos, al.aend, strand)

            for hit in rmsk.all_hits(i, same_strand=True, ovlp_pct=0.75):
                print("\t".join(str(x) for x in [i, hit]))
    finally:
        # release the BAM handle even if a lookup raises
        bam.close()
def main():
    """
    Examples of printing each interval in an interval file.

    - Works with BED, GTF and VCF files.
    - Can be uncompressed or GZIP compressed.
    """
    example_files = (
        # 0.1 Each interval in a BED file
        "bedtools/tests/data/exons.hg18.chr21.bed",
        # 0.2 Each gene in a GTF file
        "bedtools/tests/data/genes.hg18.chr21.gtf",
        # 0.3 Each gene in a _compressed_ GTF file
        "bedtools/tests/data/genes.hg18.chr21.gtf.gz",
    )

    # the files are opened one at a time, in the order listed above
    for path in example_files:
        for feature in IntervalFile(path):
            print(feature)
def main(args):
    """
    Report the coordinates of overlap between genes and peaks.

    args.genefile and args.peakfile are opened as interval files.  For
    every gene, each hit returned by IntervalFile.all_hits() on the peak
    file is printed as a tab-separated line: the gene's chrom followed by
    the hit's o_start and o_end.

    Comparable to: intersectBed -a genes -b peaks
    """
    genes = IntervalFile(args.genefile)
    peaks = IntervalFile(args.peakfile)

    for gene in genes:
        for peak_hit in peaks.all_hits(gene):
            row = (gene.chrom, peak_hit.o_start, peak_hit.o_end)
            print("\t".join(map(str, row)))
class IntervalFileTest(unittest.TestCase):
    """Tests for IntervalFile.search() against the chr21 rmsk BED file."""

    # path of the test data, relative to PATH
    file = "data/rmsk.hg18.chr21.bed"

    def setUp(self):
        """Resolve the data file against PATH and open it."""
        self.file = os.path.join(PATH, self.file)
        self.bed = IntervalFile(self.file)

    def testOverlaps(self):
        """The query window yields 8 hits, each overlapping the window."""
        hits = self.bed.search("chr21", 9719768, 9739768)
        # assertEqual reports the actual count on failure, so no debug
        # print is needed here
        self.assertEqual(len(hits), 8)
        for hit in hits:
            self.assertTrue(hit.start <= 9739768 and hit.end >= 9719768)

    def testStrands(self):
        """A strand-restricted search only returns hits on that strand."""
        for strand in ("+", "-"):
            hits = self.bed.search("chr21", 9719768, 9739768, strand)
            for hit in hits:
                self.assertEqual(hit.strand, strand)
def main(window=1000):
    """
    Report rmsk features lying within `window` bp of each exon.

    Each exon is widened by `window` bp on both sides and the widened
    interval is searched with IntervalFile.all_hits().  The ORIGINAL exon
    and each hit are printed as one tab-separated line.

    window -- number of bases of "slop" added to each side of the exon
              (default 1000, i.e. 1kb).
    """
    exons = IntervalFile("bedtools/tests/data/exons.hg18.chr21.bed")
    rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    for exon in exons:
        # add the slop; clamp at 0 so an exon closer than `window` bp to
        # the start of the chromosome does not get a negative start
        slop_start = max(0, exon.start - window)
        slop_end = exon.end + window
        exon_slop = Interval(exon.chrom, slop_start, slop_end, exon.strand)

        for rmsk_hit in rmsk.all_hits(exon_slop):
            print("\t".join(str(f) for f in [exon, rmsk_hit]))
def main(window=1000):
    """
    Report rmsk features lying within `window` bp of each exon.

    Each exon is widened by `window` bp on both sides and the widened
    interval is searched with IntervalFile.all_hits().  The ORIGINAL exon
    and each hit are printed as one tab-separated line.

    window -- number of bases of "slop" added to each side of the exon
              (default 1000, i.e. 1kb).
    """
    exons = IntervalFile("bedtools/tests/data/exons.hg18.chr21.bed")
    rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    for exon in exons:
        # add the slop; clamp at 0 so an exon closer than `window` bp to
        # the start of the chromosome does not get a negative start
        slop_start = max(0, exon.start - window)
        slop_end = exon.end + window
        exon_slop = Interval(exon.chrom, slop_start, slop_end, exon.strand)

        for rmsk_hit in rmsk.all_hits(exon_slop):
            print("\t".join(str(f) for f in [exon, rmsk_hit]))
class IntervalFileTest(unittest.TestCase):
    """Tests for IntervalFile.all_hits() against the chr21 rmsk BED file."""

    # path of the test data, relative to PATH
    file = "data/rmsk.hg18.chr21.bed"

    def setUp(self):
        """Resolve the data file against PATH and open it."""
        self.file = os.path.join(PATH, self.file)
        self.bed = IntervalFile(self.file)

    def testOverlaps(self):
        """The query interval yields 8 hits, each overlapping the query."""
        i = Interval("chr21", 9719768, 9739768)
        hits = self.bed.all_hits(i)
        # assertEqual reports the actual count on failure, so no debug
        # print is needed here
        self.assertEqual(len(hits), 8)
        for hit in hits:
            self.assertTrue(hit.start <= 9739768 and hit.end >= 9719768)

    def testStrands(self):
        """With same_strand=True every hit shares the query's strand."""
        for strand in ("+", "-"):
            i = Interval("chr21", 9719768, 9739768, strand)
            hits = self.bed.all_hits(i, same_strand=True)
            for hit in hits:
                self.assertEqual(hit.strand, strand)
def Main():
    """
    Count reads from several BAM files over a set of intervals.

    Reads the command line via ParseArg(), opens every BAM in args.bams
    (labelled by the matching entry of args.name) and writes a
    tab-separated table to "<args.output>_count.txt": one header line,
    then one line per interval holding chr/start/end/name/score followed
    by one Count_num() result per BAM.

    When args.normalize is set, each BAM is first scanned once to count
    its mapped reads; that total is handed to Count_num() (otherwise 0 is
    passed).  Intervals whose chromosome name contains 'random' are
    skipped.  Progress messages go to stderr.
    """
    args = ParseArg()

    # store bam files and count information:
    bams = {}
    total_reads = np.zeros(len(args.bams))
    try:
        for i, bam_path in enumerate(args.bams):
            temp_name = args.name[i]
            sys.stderr.write("\nReading bam file:" + temp_name + "...\n")
            bams[temp_name] = pysam.Samfile(bam_path, 'rb')
            if args.normalize:
                for b in bams[temp_name]:
                    if not b.is_unmapped:
                        total_reads[i] += 1
                        if total_reads[i] % 10000 == 0:
                            # "\r" keeps the progress counter on one line
                            sys.stderr.write(
                                "  reading %d reads..\r" % (total_reads[i]))

        # `with` guarantees the table is flushed and closed even if
        # counting fails part-way through
        with open(args.output + "_count.txt", 'w') as output:
            # read interval regions:
            intervals = IntervalFile(args.interval)

            header = ('\t'.join(['chr', 'start', 'end', 'name', 'score'])
                      + '\t' + '\t'.join(str(f) for f in args.name))
            output.write(header + '\n')

            sys.stderr.write("\n\n Start counting reads for intervals...\n")
            for interval in intervals:
                if 'random' in interval.chrom:
                    continue
                fields = [str(f) for f in [interval.chrom, interval.start,
                                           interval.end, interval.name,
                                           interval.score]]
                for i in range(len(args.bams)):
                    name = args.name[i]
                    count = Count_num(bams[name], interval, args.len,
                                      args.fragmentL, total_reads[i])
                    fields.append(str(count))
                output.write('\t'.join(fields) + '\n')
    finally:
        # close every BAM that was successfully opened
        for handle in bams.values():
            handle.close()
def main():
    """
    Print the DNA sequence underneath every interval of a BED file.

    For each interval the sequence is fetched from the FASTA file; for
    intervals on the "-" strand it is reverse-complemented.  One
    space-separated line is printed per interval:
    chrom, start, end, strand, sequence.
    """
    # setup a complement translation table
    # (string.maketrans exists on Python 2, str.maketrans on Python 3)
    maketrans = getattr(string, "maketrans", None) or str.maketrans
    rev_table = maketrans('ACGTacgt', 'TGCAtgca')

    def revcomp(seq, rev_table):
        # complement every base, THEN reverse: translating alone only
        # yields the complement, which is not the opposite strand read
        # 5'->3'
        return seq.translate(rev_table)[::-1]

    # open your fasta file
    fasta = Fastafile("bedtools/tests/data/chr21.fa")

    # open your bed file
    bed = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # for each bed, grab the DNA in that interval
    for b in bed:
        # grab the seq, rev. comp if necessary
        seq = fasta.fetch(b.chrom, b.start, b.end)
        if b.strand == "-":
            seq = revcomp(seq, rev_table)

        # print the interval and the seq
        columns = (b.chrom, b.start, b.end, b.strand, seq)
        print(" ".join(str(col) for col in columns))
def main():
    """
    Eight worked examples of intersecting exons with rmsk annotations
    through IntervalFile, each mirroring an intersectBed invocation.
    Every example re-opens both files and prints tab-separated lines.
    """
    exon_path = "bedtools/tests/data/exons.hg18.chr21.bed"
    rmsk_path = "bedtools/tests/data/rmsk.hg18.chr21.bed"

    def reopen():
        # fresh handles for each example; exons are opened first
        return IntervalFile(exon_path), IntervalFile(rmsk_path)

    def emit(*fields):
        # one tab-separated output line
        print("\t".join(str(f) for f in fields))

    def bed6(feature):
        # chrom, start, end, name, score, strand
        return (feature.chrom, feature.start, feature.end,
                feature.name, feature.score, feature.strand)

    def stranded(feature):
        # chrom, start, end, strand
        return (feature.chrom, feature.start, feature.end, feature.strand)

    ##########################################################
    # ex1. Report the coordinates of overlap b/w exons and rmsk
    #
    # Equivalent to: intersectBed -a exons -b rmsk
    # Uses:          IntervalFile.all_hits()
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        for rmsk_hit in rmsk.all_hits(exon):
            emit(exon.chrom, rmsk_hit.o_start, rmsk_hit.o_end)

    ##########################################################
    # ex2. Report the original features for overlapping
    #      exons and rmsk
    #
    # Equivalent to: intersectBed -a exons -b rmsk -wa -wb
    # Uses:          IntervalFile.all_hits()
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        for rmsk_hit in rmsk.all_hits(exon):
            emit(*(bed6(exon) + bed6(rmsk_hit)))

    ##########################################################
    # ex3. Report the count of rmsk overlapping each exon
    #
    # Equivalent to: intersectBed -a exons -b rmsk -c
    # Uses:          IntervalFile.count_hits()
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        # get the number of hits in rmsk
        num_hits = rmsk.count_hits(exon)
        emit(*(bed6(exon) + (num_hits,)))

    ##########################################################
    # ex4. Report exons that overlap at least one rmsk
    #
    # Equivalent to: intersectBed -a exons -b rmsk -u
    # Uses:          IntervalFile.any_hits()
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        # does this exon overlap any rmsk?
        if rmsk.any_hits(exon):
            emit(*bed6(exon))

    ##########################################################
    # ex5. Report exons that DO NOT overlap at least one rmsk
    #
    # Equivalent to: intersectBed -a exons -b rmsk -v
    # Uses:          IntervalFile.any_hits()
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        # keep only the exons without a single rmsk hit
        if not rmsk.any_hits(exon):
            emit(*bed6(exon))

    ##########################################################
    # ex6. Report overlap b/w exons and rmsk on the same strand
    #
    # Equivalent to: intersectBed -a exons -b rmsk -s
    # Uses:          IntervalFile.all_hits(same_strand=True)
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        # use "same_strand" to enforce, well, same strand.
        for rmsk_hit in rmsk.all_hits(exon, same_strand=True):
            emit(*(stranded(exon) + stranded(rmsk_hit)))

    ##########################################################
    # ex7. Report overlap b/w exons and rmsk where the rmsk
    #      feature covers at least 50% of the exon.
    #
    # Equivalent to: intersectBed -a exons -b rmsk -f 0.50
    # Uses:          IntervalFile.all_hits(ovlp_pct=0.50)
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        # use "ovlp_pct" to enforce the fraction of overlap w.r.t. the exon
        for rmsk_hit in rmsk.all_hits(exon, ovlp_pct=0.50):
            emit(*(stranded(exon) + stranded(rmsk_hit)))

    ##########################################################
    # ex8. Report overlap b/w exons and rmsk on the same strand
    #      where the rmsk feature covers at least 50% of the exon.
    #
    # Equivalent to: intersectBed -a exons -b rmsk -s -f 0.50
    # Uses:          IntervalFile.all_hits(same_strand=True, ovlp_pct=0.50)
    ##########################################################
    exons, rmsk = reopen()
    for exon in exons:
        # combine the strand and overlap-fraction requirements
        for rmsk_hit in rmsk.all_hits(exon, same_strand=True, ovlp_pct=0.50):
            emit(*(stranded(exon) + stranded(rmsk_hit)))
["help", "interval", "folder", "ovlp_pct", "suffix"]) folder = "./" percent = 0.5 for o, a in opts: if o in ("-h", "--help"): show_help() exit(0) elif o in ("-i", "--interval"): interval_file = a elif o in ("-f", "--folder"): folder = a elif o in ("-p", "--ovlp_pct"): percent = float(a) elif o in ("-s", "--suffix"): suffix = a bedlist = [] name = 'chr\tstart\tend' for i in restlist: i = i.strip() bedlist.append(IntervalFile(folder + i + suffix)) name = name + '\t' + i.split("_")[-1] print name intervals = IntervalFile(interval_file) for i in intervals: line = i.chrom + '\t' + str(i.start) + '\t' + str(i.end) for j in bedlist: num = j.count_hits(i, ovlp_pct=percent) line = line + '\t' + str(num) print line
from bedtools import Interval, IntervalFile # Input inFileName = sys.argv[1] inFileName2 = sys.argv[2] outFileName = sys.argv[3] min_value = 1000000 inFile = open(inFileName2, "r") for line in inFile: line = line.strip("\n") ll = line.split("\t") min_value = min(min_value, float(ll[3]) - 0.01) inFile.close() footprints = IntervalFile(inFileName2) inFile = open(inFileName, "r") outFile = open(outFileName, "w") for line in inFile: line = line.strip("\n") ll = line.split("\t") chr = ll[0] pos1 = int(ll[1]) pos2 = int(ll[2]) query = Interval(chr, pos1, pos2) score = min_value for h in footprints.search(query): score = max(score, float(h.name))
########################################################################## # Add footprint column to features file ########################################################################## newFeaturesFileName = outLoc + "newFeaturesFile.bed" toRemove.append(newFeaturesFileName) min_value = 1000000 inFile = open(bitInsideFootFileName, "r") for line in inFile: line = line.strip("\n") ll = line.split("\t") min_value = min(min_value, float(ll[3]) - 0.01) inFile.close() footprints = IntervalFile(bitInsideFootFileName) inFile = open(featuresFileName, "r") outFile = open(newFeaturesFileName, "w") for line in inFile: line = line.strip("\n") ll = line.split("\t") chr = ll[0] pos1 = int(ll[1]) pos2 = int(ll[2]) query = Interval(chr, pos1, pos2) score = min_value for h in footprints.search(query): score = max(score, float(h.name))
def setUp(self):
    """Resolve the data file against PATH and open it as an IntervalFile."""
    bed_path = os.path.join(PATH, self.file)
    self.file = bed_path
    self.bed = IntervalFile(bed_path)