def testNoOverlap(self):
    """truncating with a mask that only abuts the input returns the input."""
    # (intervals, mask, expected) - in every case the mask merely
    # shares an end point with the intervals, so nothing is removed.
    cases = [
        ([(0, 5), (10, 15)], [(5, 10)], [(0, 5), (10, 15)]),
        ([(5, 10)], [(0, 5), (10, 15)], [(5, 10)]),
        ([(0, 5), (5, 10)], [(10, 15)], [(0, 5), (5, 10)]),
    ]
    for intervals, mask, expected in cases:
        self.assertEqual(Intervals.truncate(intervals, mask), expected)
def processChunk(contig, regions):
    """append the intergenic segments of *contig* to ``locations``.

    *regions* are merged with ``Intervals.combineIntervals`` and the
    complement within ``0`` .. ``contigs[contig]`` is recorded, one
    tuple per segment.  Nothing is done if *contig* is ``None``.

    ``contigs`` and ``locations`` are taken from the enclosing scope.
    """
    # identity test: None marks "no chunk collected yet"
    if contig is None:
        return

    start = 0
    end = contigs[contig]

    regions = Intervals.combineIntervals(regions)
    for xstart, xend in Intervals.complementIntervals(regions, start, end):
        locations.append(
            ("intergenic", "intergenic", contig, "+", xstart, xend, "."))
def testMultiple(self):
    """test intersection when one side holds several intervals."""
    # (first, second, expected)
    cases = [
        ([(0, 5), (10, 15)], [(0, 5)], [(0, 5)]),
        ([(0, 5), (10, 15)], [(0, 10)], [(0, 5)]),
        ([(0, 5), (10, 15)], [(0, 15)], [(0, 5), (10, 15)]),
        ([(0, 5), (5, 10)], [(0, 10)], [(0, 5), (5, 10)]),
    ]
    for first, second, expected in cases:
        self.assertEqual(Intervals.intersect(first, second), expected)
def testSingle(self):
    """test truncation of a single interval by a single mask interval."""
    # (intervals, mask, expected)
    cases = [
        ([(0, 5)], [(0, 5)], []),
        ([(0, 5)], [(0, 3)], [(3, 5)]),
        ([(0, 3)], [(0, 5)], []),
        ([(0, 5)], [(3, 5)], [(0, 3)]),
        ([(3, 5)], [(0, 5)], []),
        ([(5, 10)], [(5, 10)], []),
        ([(5, 10)], [(5, 20)], []),
        ([(5, 10)], [(0, 10)], []),
        # this case is checked twice, exactly as before
        ([(5, 10)], [(0, 10)], []),
        ([(5, 10)], [(0, 20)], []),
    ]
    for intervals, mask, expected in cases:
        self.assertEqual(Intervals.truncate(intervals, mask), expected)
def count(self, bed):
    '''update internal counts.

    For every track in ``self.tracks`` the intervals found in
    ``self.index[track][bed.contig]`` for the range of *bed* are
    collected.  ``self.data`` is set to a list with one tuple per
    track: the number of intervals found and the value returned by
    ``Intervals.calculateOverlap`` for *bed* against the merged
    intervals.
    '''
    per_track = []
    for track in self.tracks:
        try:
            hits = self.index[track][bed.contig].find(bed.start, bed.end)
            overlaps = [(hit[0], hit[1]) for hit in hits]
        except KeyError:
            # track or contig not present in the index
            overlaps = []

        covered = Intervals.calculateOverlap(
            [(bed.start, bed.end)],
            Intervals.combine(overlaps))
        per_track.append((len(overlaps), covered))

    self.data = per_track
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals
    3. report all intervals that overlap with an interval in each track.

    This is a generator yielding ``(contig, start, end)`` tuples,
    ordered by contig name.
    '''

    # get all intervals
    data_per_contig = collections.defaultdict(list)

    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            data_per_contig[contig].extend(
                [(bed.start, bed.end)
                 for bed in bedfile.fetch(contig, parser=pysam.asBed())])

    # merge intervals - only values are replaced, the set of keys
    # stays the same, so iterating over a snapshot of the keys is safe
    for contig in list(data_per_contig.keys()):
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles.
    # items() works on python 2 and 3 (iteritems() is python 2 only);
    # sorting makes the output order deterministic.
    for contig, data in sorted(data_per_contig.items()):
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
def UTR3(transcript):
    """return the 3' UTR segments of *transcript* as GTF entries.

    The exon ranges are truncated by the CDS ranges; of the remaining
    segments those starting at or after the last CDS end ("+" strand)
    or ending at or before the first CDS start (any other strand) are
    kept.  Each segment is returned as a new entry built from the
    first exon of *transcript* with start and end replaced.  An empty
    list is returned if the transcript has no CDS.
    """
    exons = GTF.asRanges(transcript, "exon")
    cds = GTF.asRanges(transcript, "CDS")

    if len(cds) == 0:
        return list()

    utrs = Intervals.truncate(exons, cds)

    if transcript[0].strand == "+":
        utr3 = [segment for segment in utrs if segment[0] >= cds[-1][1]]
    else:
        utr3 = [segment for segment in utrs if segment[-1] <= cds[0][0]]

    # the first exon serves as template for all returned entries
    for candidate in transcript:
        if candidate.feature == "exon":
            template_exon = candidate
            break

    returned_exons = []
    for segment in utr3:
        entry = GTF.Entry().fromGTF(template_exon)
        entry.start = segment[0]
        entry.end = segment[1]
        returned_exons.append(entry)

    return returned_exons
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals
    3. report all intervals that overlap with an interval in each track.

    Yields ``(contig, start, end)`` tuples ordered by contig.
    '''
    # step 1: pool the intervals of all tracks per contig
    pooled = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            pooled[contig].extend(
                [(bed.start, bed.end)
                 for bed in bedfile.fetch(contig, parser=pysam.asBed())])

    # step 2: merge overlapping intervals on each contig
    for contig in list(pooled):
        pooled[contig] = Intervals.combine(pooled[contig])

    # step 3: report merged intervals that are present in all bedfiles
    for contig in sorted(pooled):
        for start, end in pooled[contig]:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    The intervals of all entries in *chunk* are merged and the
    sequence of each merged interval is fetched from *fasta* and
    concatenated.  On the negative strand the intervals are converted
    to ``contig length - coordinate`` and visited in reverse order.

    All entries must be on the same contig and strand.  An empty
    *chunk* gives an empty string.
    """
    if len(chunk) == 0:
        return ""

    head = chunk[0]
    contig, strand = head.contig, head.strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    merged = Intervals.combine([(gff.start, gff.end) for gff in chunk])
    lcontig = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        merged = [(lcontig - end, lcontig - start) for start, end in merged]
        merged.reverse()

    return "".join(
        [fasta.getSequence(contig, strand, start, end)
         for start, end in merged])
def toIntronIntervals(chunk):
    """convert a set of gtf elements within a transcript to intron
    coordinates.

    The ``exon`` entries of *chunk* are merged and the complement of
    the merged intervals is returned.  All entries must be on the same
    contig and strand.  An empty *chunk* gives an empty list.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    """
    if len(chunk) == 0:
        return []

    head = chunk[0]
    contig, strand, transcript_id = (head.contig,
                                     head.strand,
                                     head.transcript_id)

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    exons = [(gff.start, gff.end) for gff in chunk if gff.feature == "exon"]
    return Intervals.complement(Intervals.combine(exons))
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    Intervals of all entries in *chunk* are merged; the sequences of
    the merged intervals are fetched from *fasta* and joined.  For the
    negative strand each interval is mapped to
    ``contig length - coordinate`` and the order is reversed.

    All entries must share contig and strand.  Returns an empty string
    for an empty *chunk*.
    """
    if len(chunk) == 0:
        return ""

    contig = chunk[0].contig
    strand = chunk[0].strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    segments = Intervals.combine([(entry.start, entry.end) for entry in chunk])
    lcontig = fasta.getLength(contig)
    positive = Genomics.IsPositiveStrand(strand)

    if not positive:
        # flip coordinates and walk the segments back to front
        segments = [(lcontig - last, lcontig - first)
                    for first, last in segments]
        segments.reverse()

    pieces = []
    for first, last in segments:
        pieces.append(fasta.getSequence(contig, strand, first, last))

    return "".join(pieces)
def cropGFF(gffs, options):
    """crop intervals in gff file.

    The regions in the file ``options.crop`` are removed from each
    entry in *gffs*.  Entries without overlap are written unchanged to
    ``options.stdout``; entries with overlap are written once for each
    remaining segment, or not at all if nothing remains.  A summary is
    written to ``options.stdlog`` if ``options.loglevel >= 1``.
    """

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    # the file is only needed while the intervals are read, so close
    # it right away instead of leaking the handle
    with IOTools.openFile(options.crop, "r") as infile:
        cropper = GTF.readAsIntervals(GTF.iterator(infile))

    ntotal = 0
    # values are replaced while looping, hence the snapshot of the keys
    for contig in list(cropper.keys()):
        intersector = bx.intervals.intersection.Intersecter()
        for start, end in cropper[contig]:
            intersector.add_interval(bx.intervals.Interval(start, end))
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:

            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(start, end)

            if overlaps:

                # one flag per base of the entry, set to 0 where a
                # crop region overlaps
                length = end - start
                keep = numpy.ones(length)
                for overlap in overlaps:
                    s = max(0, overlap.start - start)
                    e = min(length, overlap.end - start)
                    keep[s:e] = 0

                segments = Intervals.fromArray(keep)

                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                # the same entry is re-used for each remaining segment
                for s, e in segments:
                    gff.start, gff.end = s + start, e + start
                    noutput += 1
                    options.stdout.write("%s\n" % gff)

                continue

        noutput += 1
        options.stdout.write("%s\n" % gff)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i\n" %
            (ninput, noutput, ncropped, ndeleted))
def annotateRegulons(iterator, fasta, tss, options):
    """annotate regulons within iterator.

    Entries specified with ``--restrict-source`` are annotated.

    For each transcript a window is placed around one end of the
    transcript: if *tss* is set, around the smallest start (largest
    end on the negative strand), otherwise around the largest end
    (smallest start on the negative strand).  The window extends by
    ``options.upstream`` and ``options.downstream`` and is clipped to
    the range ``0`` .. contig length.  If ``options.merge_promotors``
    is set, the windows of a gene are merged and numbered.  One entry
    with source ``regulon`` is written to ``options.stdout`` per window.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []

        for transcript in gene:
            ntranscripts += 1
            mi = min([x.start for x in transcript])
            ma = max([x.end for x in transcript])

            # pick the transcript end the window is anchored on
            if tss:
                anchor = ma if is_negative_strand else mi
            else:
                anchor = mi if is_negative_strand else ma

            if is_negative_strand:
                left, right = anchor - downstream, anchor + upstream
            else:
                left, right = anchor - upstream, anchor + downstream

            # clip to the contig
            regulons.append((min(lcontig, max(0, left)),
                             min(lcontig, max(0, right))))
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % rank
                              for rank in range(1, len(regulons) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        for (start, end), transcript_id in zip(regulons, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def annotateRegulons(iterator, fasta, tss, options):
    """annotate regulons within iterator.

    Entries specified with ``--restrict-source`` are annotated.

    For each transcript a window is placed around one end of the
    transcript: if *tss* is set, around the smallest start (largest
    end on the negative strand), otherwise around the largest end
    (smallest start on the negative strand).  The window extends by
    ``options.upstream`` and ``options.downstream`` and is clipped to
    the range ``0`` .. contig length.  If ``options.merge_promotors``
    is set, the windows of a gene are merged and numbered.  One entry
    with source ``regulon`` is written to ``options.stdout`` per window.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []

        for transcript in gene:
            ntranscripts += 1
            mi = min([x.start for x in transcript])
            ma = max([x.end for x in transcript])

            # pick the transcript end the window is anchored on
            if tss:
                anchor = ma if is_negative_strand else mi
            else:
                anchor = mi if is_negative_strand else ma

            if is_negative_strand:
                left, right = anchor - downstream, anchor + upstream
            else:
                left, right = anchor - upstream, anchor + downstream

            # clip to the contig
            regulons.append((min(lcontig, max(0, left)),
                             min(lcontig, max(0, right))))
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % rank
                              for rank in range(1, len(regulons) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        for (start, end), transcript_id in zip(regulons, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def cropGFF(gffs, options):
    """crop intervals in gff file.

    The regions in the file ``options.crop`` are removed from each
    entry in *gffs*.  Entries without overlap are written unchanged to
    ``options.stdout``; entries with overlap are written once for each
    remaining segment, or not at all if nothing remains.  A summary is
    written to ``options.stdlog`` if ``options.loglevel >= 1``.
    """

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    # the file is only needed while the intervals are read, so close
    # it right away instead of leaking the handle
    with IOTools.openFile(options.crop, "r") as infile:
        cropper = GTF.readAsIntervals(GTF.iterator(infile))

    ntotal = 0
    # values are replaced while looping, hence the snapshot of the keys
    for contig in list(cropper.keys()):
        intersector = bx.intervals.intersection.Intersecter()
        for start, end in cropper[contig]:
            intersector.add_interval(bx.intervals.Interval(start, end))
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:

            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(start, end)

            if overlaps:

                # one flag per base of the entry, set to 0 where a
                # crop region overlaps
                length = end - start
                keep = numpy.ones(length)
                for overlap in overlaps:
                    s = max(0, overlap.start - start)
                    e = min(length, overlap.end - start)
                    keep[s:e] = 0

                segments = Intervals.fromArray(keep)

                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                # the same entry is re-used for each remaining segment
                for s, e in segments:
                    gff.start, gff.end = s + start, e + start
                    noutput += 1
                    options.stdout.write("%s\n" % gff)

                continue

        noutput += 1
        options.stdout.write("%s\n" % gff)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i\n" %
            (ninput, noutput, ncropped, ndeleted))
def count(self, bed):
    '''update internal counts.

    ``self.data`` is replaced by a list holding one tuple per track in
    ``self.tracks``: the number of indexed intervals found for the
    range of *bed*, and the value returned by
    ``Intervals.calculateOverlap`` for *bed* against those intervals
    after merging.
    '''
    summary = []
    for track in self.tracks:
        try:
            found = self.index[track][bed.contig].find(bed.start, bed.end)
            overlaps = [(item[0], item[1]) for item in found]
        except KeyError:
            # nothing indexed for this track/contig
            overlaps = []

        merged = Intervals.combine(overlaps)
        overlap = Intervals.calculateOverlap([(bed.start, bed.end)], merged)
        summary.append((len(overlaps), overlap))

    self.data = summary
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """select only those genes with a minimum length of a given feature.

    For each group of entries from *gff_iterator* the intervals of all
    entries of type *feature* are merged; the group is yielded if the
    summed length of the merged intervals is at least *min_length*.
    """
    for gffs in gff_iterator:
        selected = [(gff.start, gff.end)
                    for gff in gffs if gff.feature == feature]

        total = 0
        for segment in Intervals.combine(selected):
            total += segment[1] - segment[0]

        if total >= min_length:
            yield gffs
def processChunk(gene_id, contig, strand, frame, regions):
    """append the gaps between *regions* of a gene to ``locations``.

    The complement of *regions* within the computed bounds is obtained
    from ``Intervals.complementIntervals`` and one tuple per gap is
    appended.  Nothing is done if *gene_id* is ``None``.

    ``locations`` is taken from the enclosing scope.
    """
    # identity test: None marks "no chunk collected yet"
    if gene_id is None:
        return

    start = min(region[0] for region in regions)
    # NOTE(review): the upper bound is taken from the start coordinates
    # (region[0]), not the end coordinates - kept as is, confirm that
    # this is intended.
    end = max(region[0] for region in regions)

    intervals = Intervals.complementIntervals(regions, start, end)
    for start, end in intervals:
        locations.append(
            (gene_id, gene_id, contig, strand, start, end, frame))
def FilterEliminateOverlappingTranscripts(exons,
                                          filter_exons,
                                          eliminated_predictions,
                                          contig_sizes,
                                          options):
    """eliminate predictions that overlap or span a positive set of
    transcripts.

    *filter_exons* are converted to merged ranges; every exon in
    *exons* that is not separated from the current filter range is
    reported on ``options.stdout``, its id is set to ``0`` in
    *eliminated_predictions* and ``(id, "f")`` is added to the result.

    Returns the list of ``(id, "f")`` tuples, one per reported exon.
    """

    eliminated = []

    # convert list of filter exons into a list of ranges.
    filter_ranges = getRangesFromExons(
        filter_exons,
        both_strands=options.filter_remove_spanning_both_strands,
        contig_sizes=contig_sizes)

    # a list is passed explicitly - a lazy map() object (python 3) can
    # neither be sorted nor measured
    for k, r in filter_ranges.items():
        filter_ranges[k] = Intervals.combineIntervals([x[:2] for x in r])

    exon_ranges = getRangesFromExons(exons,
                                     both_strands=False)

    # and now go through exons and delete transcripts whose
    # exons overlap one of the forbidden ranges
    for k, ee in exon_ranges.items():

        if k not in filter_ranges:
            continue

        ff = filter_ranges[k]
        ee.sort()

        # set exon index e and filter index f
        # (both are indices in sorted lists)
        e, f = 0, 0

        while e < len(ee):

            efrom, eto, prediction_id = ee[e]

            # increment filter, such that its extent
            # is larger than current range ee[e] to test.
            while f < len(ff) and ff[f][1] < efrom:
                f += 1
            if f == len(ff):
                break

            if eto >= ff[f][0]:
                # not separated from the filter range: eliminate
                options.stdout.write(
                    "%s\t%s\n" %
                    (prediction_id,
                     "eliminated: filtered by %s:%i:%i" %
                     (k, ff[f][0], ff[f][1])))
                eliminated_predictions[prediction_id] = 0
                eliminated.append((prediction_id, "f"))

            e += 1

    return eliminated
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron
    coordinates.

    Intervals of the ``exon`` entries in *chunk* are merged and the
    complement of the merged intervals is returned.  All entries must
    share contig and strand.  Returns an empty list for an empty
    *chunk*.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []

    contig = chunk[0].contig
    strand = chunk[0].strand
    transcript_id = chunk[0].transcript_id

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    exon_intervals = []
    for entry in chunk:
        if entry.feature == "exon":
            exon_intervals.append((entry.start, entry.end))

    merged = Intervals.combine(exon_intervals)
    return Intervals.complement(merged)
def FilterEliminateOverlappingTranscripts(
        exons,
        filter_exons,
        eliminated_predictions,
        contig_sizes,
        options):
    """eliminate predictions that overlap or span a positive set of
    transcripts.

    *filter_exons* are converted to merged ranges; every exon in
    *exons* that is not separated from the current filter range is
    reported on ``options.stdout``, its id is set to ``0`` in
    *eliminated_predictions* and ``(id, "f")`` is added to the result.

    Returns the list of ``(id, "f")`` tuples, one per reported exon.
    """

    eliminated = []

    # convert list of filter exons into a list of ranges.
    filter_ranges = getRangesFromExons(
        filter_exons,
        both_strands=options.filter_remove_spanning_both_strands,
        contig_sizes=contig_sizes)

    # a list is passed explicitly - a lazy map() object (python 3) can
    # neither be sorted nor measured
    for k, r in filter_ranges.items():
        filter_ranges[k] = Intervals.combineIntervals([x[:2] for x in r])

    exon_ranges = getRangesFromExons(exons,
                                     both_strands=False)

    # and now go through exons and delete transcripts whose
    # exons overlap one of the forbidden ranges
    for k, ee in exon_ranges.items():

        if k not in filter_ranges:
            continue

        ff = filter_ranges[k]
        ee.sort()

        # set exon index e and filter index f
        # (both are indices in sorted lists)
        e, f = 0, 0

        while e < len(ee):

            efrom, eto, prediction_id = ee[e]

            # increment filter, such that its extent
            # is larger than current range ee[e] to test.
            while f < len(ff) and ff[f][1] < efrom:
                f += 1
            if f == len(ff):
                break

            if eto >= ff[f][0]:
                # not separated from the filter range: eliminate
                options.stdout.write(
                    "%s\t%s\n" %
                    (prediction_id,
                     "eliminated: filtered by %s:%i:%i" %
                     (k, ff[f][0], ff[f][1])))
                eliminated_predictions[prediction_id] = 0
                eliminated.append((prediction_id, "f"))

            e += 1

    return eliminated
def get_windows(pvalues, window_size, threshold):
    """merge windows around each position and get their minimum p-value.

    A window of *window_size* on either side is placed around each
    index value of *pvalues*; overlapping windows are merged.  Returns
    the pairing (``zip``) of each merged window with the minimum of
    *pvalues* between the window bounds.

    *threshold* is accepted but not used here.
    """
    # windows are half-open; the +1 makes position + window_size part
    # of the window
    windows = [(pos - window_size, pos + window_size + 1)
               for pos in pvalues.index.values]

    merged_windows = Intervals.combine(windows)

    # label based slicing includes both bounds, hence end - 1.
    # (.loc replaces the .ix indexer that was removed from pandas)
    windows_min_p = [pvalues.loc[float(start):float(end - 1)].min()
                     for start, end in merged_windows]

    return zip(merged_windows, windows_min_p)
def processChunk(gene_id, contig, strand, frame, regions):
    """append the gaps between *regions* of a gene to ``locations``.

    The complement of *regions* within the computed bounds is obtained
    from ``Intervals.complementIntervals`` and one tuple per gap is
    appended.  Nothing is done if *gene_id* is ``None``.

    ``locations`` is taken from the enclosing scope.
    """
    # identity test: None marks "no chunk collected yet"
    if gene_id is None:
        return

    start = min(region[0] for region in regions)
    # NOTE(review): the upper bound is taken from the start coordinates
    # (region[0]), not the end coordinates - kept as is, confirm that
    # this is intended.
    end = max(region[0] for region in regions)

    intervals = Intervals.complementIntervals(regions, start, end)
    for start, end in intervals:
        locations.append(
            (gene_id, gene_id, contig, strand, start, end, frame))
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron
    coordinates.

    Intervals of the ``exon`` entries in *chunk* are merged and the
    complement of the merged intervals is returned.  Returns an empty
    list for an empty *chunk*.

    Will raise an error if more than one transcript is submitted or
    if entries are on different contigs or strands.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []

    head = chunk[0]
    contig, strand, transcript_id = (
        head.contig, head.strand, head.transcript_id)

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, \
            "more than one transcript submitted"

    exons = [(gff.start, gff.end) for gff in chunk if gff.feature == "exon"]
    return Intervals.complement(Intervals.combine(exons))
def testSingle(self):
    """test intersection of two single intervals."""
    # (first, second, expected)
    cases = [
        ([(0, 5)], [(0, 5)], [(0, 5)]),
        ([(0, 5)], [(0, 3)], [(0, 3)]),
        ([(0, 3)], [(0, 5)], [(0, 3)]),
        ([(0, 5)], [(3, 5)], [(3, 5)]),
        ([(3, 5)], [(0, 5)], [(3, 5)]),
        ([(5, 10)], [(5, 20)], [(5, 10)]),
        ([(5, 10)], [(0, 20)], [(5, 10)]),
    ]
    for first, second, expected in cases:
        self.assertEqual(Intervals.intersect(first, second), expected)
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source`` are annotated.

    For each transcript a window of ``options.promotor`` bases is
    recorded next to the smallest start (negative strand) or the
    largest end (otherwise) of the transcript, bounded by ``0`` and
    the contig length.  If ``options.merge_promotors`` is set, the
    windows of a gene are merged and numbered.  One entry with source
    ``tts`` is written to ``options.stdout`` per window.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []

        for transcript in gene:
            ntranscripts += 1
            mi = min([x.start for x in transcript])
            ma = max([x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)

            # if tts is directly at start/end of contig, the tss will
            # be within an exon.  otherwise, it is outside an exon.
            if is_negative_strand:
                window = (max(0, mi - options.promotor),
                          max(options.promotor, mi))
            else:
                window = (min(ma, lcontig - options.promotor),
                          min(lcontig, ma + options.promotor))
            tts.append(window)

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % rank for rank in range(1, len(tts) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        for (start, end), transcript_id in zip(tts, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator.

    For each gene every distinct exon interval is written once to
    ``options.stdout`` with the attributes ``ntranscripts`` (number of
    transcripts in the gene), ``nused`` (number of times the interval
    occurs) and ``pos`` (``rank:total`` of the exon in each
    transcript, counted in transcription direction).  Genes in which
    distinct exon intervals can be merged are counted as overlapping.

    *fasta* is not used.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        usage = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for rank, exon in enumerate(exons, 1):
                usage[(exon.start, exon.end)].append((rank, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for coords, positions in usage.items():
            entry = GTF.Entry().copy(gtf)
            entry.start, entry.end = coords
            entry.addAttribute("nused", len(positions))
            entry.addAttribute(
                "pos", ",".join(["%i:%i" % x for x in positions]))
            gtfs.append(entry)

        gtfs.sort(key=lambda x: x.start)

        for entry in gtfs:
            options.stdout.write("%s\n" % str(entry))

        # check for exon overlap
        distinct = [(entry.start, entry.end) for entry in gtfs]
        if len(Intervals.combine(distinct)) != len(distinct):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator.

    For each gene every distinct exon interval is written once to
    ``options.stdout`` with the attributes ``ntranscripts`` (number of
    transcripts in the gene), ``nused`` (number of times the interval
    occurs) and ``pos`` (``rank:total`` of the exon in each
    transcript, counted in transcription direction).  Genes in which
    distinct exon intervals can be merged are counted as overlapping.

    *fasta* is not used.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        # items() exists on python 2 and 3, iteritems() only on python 2
        for r, pos in intervals.items():
            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def get_windows(pvalues, window_size, threshold):
    """merge windows around each position and get their minimum p-value.

    A window of *window_size* on either side is placed around each
    index value of *pvalues*; overlapping windows are merged.  Returns
    the pairing (``zip``) of each merged window with the minimum of
    *pvalues* between the window bounds.

    *threshold* is accepted but not used here.
    """
    # windows are half-open; the +1 makes position + window_size part
    # of the window
    windows = [(pos - window_size, pos + window_size + 1)
               for pos in pvalues.index.values]

    merged_windows = Intervals.combine(windows)

    # label based slicing includes both bounds, hence end - 1.
    # (.loc replaces the .ix indexer that was removed from pandas)
    windows_min_p = [
        pvalues.loc[float(start):float(end - 1)].min()
        for start, end in merged_windows
    ]

    return zip(merged_windows, windows_min_p)
def findRetainedIntrons(infile, outfile):
    """write introns that are retained in another transcript of a gene.

    All ordered pairs of transcripts of each gene in the GTF file
    *infile* are compared.  If the first transcript has introns that
    the second lacks while all introns of the second are present in
    the first, the additional introns that lie strictly between the
    start of the first and the end of the last exon of the second
    transcript are collected.  The collected introns of a gene are
    merged and written to *outfile* as ``exon`` entries based on the
    first entry of the gene.
    """
    # both files are closed on exit, also if an error is raised
    with IOTools.openFile(outfile, "w") as outf:
        with IOTools.openFile(infile) as inf:

            for gene in GTF.gene_iterator(GTF.iterator(inf)):

                introns_out = []

                # now find if any of the transcripts are retained intron
                # versions of any of the others
                for first, second in itertools.product(gene, gene):

                    first = sorted(
                        [entry for entry in first
                         if entry.feature == "exon"],
                        key=lambda x: x.start)
                    second = sorted(
                        [entry for entry in second
                         if entry.feature == "exon"],
                        key=lambda x: x.start)

                    first_introns = set(GTF.toIntronIntervals(first))
                    second_introns = set(GTF.toIntronIntervals(second))

                    if len(first_introns - second_introns) > 0 and \
                       len(second_introns - first_introns) == 0:

                        # a list (not a lazy filter object) so that it
                        # works the same on python 2 and 3
                        novel_introns = [
                            intron
                            for intron in first_introns - second_introns
                            if intron[0] > second[0].start and
                            intron[1] < second[-1].end]

                        introns_out.extend(novel_introns)

                introns_out = Intervals.combine(introns_out)

                # note: the first entry of the gene is modified in place
                template = gene[0][0]
                template.feature = "exon"

                for gff in introns_out:
                    entry = GTF.Entry().copy(template)
                    entry.start = gff[0]
                    entry.end = gff[1]
                    outf.write("%s\n" % str(entry))
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    *feature* may be a single feature name, a collection of feature
    names or ``None``/empty to use all entries in *gffs*.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """
    # basestring only exists on python 2; fall back to str on python 3
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(feature, string_types):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    *feature* may be a single feature name (a string), a collection of
    feature names, or ``None``/empty to use all entries in *gffs*.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """
    if isinstance(feature, str):
        selected = [gff for gff in gffs if gff.feature == feature]
    elif feature:
        selected = [gff for gff in gffs if gff.feature in feature]
    else:
        selected = gffs[:]

    return Intervals.combine([(gff.start, gff.end) for gff in selected])
def transform_third_codon(start, end, intervals_with_gff):
    """transform: only return nucleotide positions in window (start, end)
    that are in third codon position.

    *intervals_with_gff* holds ``(start, end, gff)`` tuples; the frame
    of each gff is required.  Each selected position is recorded as an
    interval of length one and the intervals are combined before they
    are returned.

    Raises ValueError if an entry has no frame (``"."``).
    """
    window_length = end - start
    positions = []

    for seg_start, seg_end, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # work in 0-based coordinates relative to the window; on the
        # negative strand positions are counted from the window end
        if Genomics.IsNegativeStrand(gff.strand):
            reverse, coordinate_offset = True, end
            rel_start, rel_end = end - seg_end, end - seg_start
        else:
            reverse, coordinate_offset = False, start
            rel_start, rel_end = seg_start - start, seg_end - start

        # interval starts before the window: adjust the frame to the
        # first position inside the window
        if rel_start < 0:
            frame = (frame + rel_start) % 3
            rel_start = 0

        # move back to the first position of the codon in progress ...
        if frame != 0:
            rel_start -= (3 - frame)
        # ... and on to its third position
        rel_start += 2

        rel_end = min(rel_end, window_length)

        for x in range(rel_start, rel_end, 3):
            c = coordinate_offset - x - 1 if reverse else coordinate_offset + x
            positions.append((c, c + 1))

    return Intervals.combineIntervals(positions)
def __str__(self):
    """return the counts as a tab-separated string.

    Fields: number of gene ids, number of transcript ids, number of
    single exon transcripts, followed by ``Stats.Summary`` of exons
    per transcript, exon sizes, intron sizes and transcript lengths.
    Exons of each transcript are sorted in place and merged first.
    """
    single_exon_transcripts = 0
    exons_per_transcript = []
    intron_sizes = []
    transcript_lengths = []
    exon_sizes = []

    for x in self.counts_exons_per_transcript.values():
        x.sort()
        x = Intervals.combine(x)
        transcript_lengths.append(x[-1][1] - x[0][0])
        exons_per_transcript.append(len(x))

        exon_sizes.extend([end - start for start, end in x])

        if len(x) == 1:
            single_exon_transcripts += 1
            continue

        # an intron is the gap between consecutive merged exons
        for (_, previous_end), (next_start, _) in zip(x, x[1:]):
            intron_sizes.append(next_start - previous_end)

    fields = (
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        single_exon_transcripts,
        Stats.Summary(exons_per_transcript),
        Stats.Summary(exon_sizes),
        Stats.Summary(intron_sizes),
        Stats.Summary(transcript_lengths),
    )
    return "\t".join([str(field) for field in fields])
def find_retained_introns(gene):
    '''Given a bundle of transcripts, find intervals matching retained
    introns. A retained intron is defined as an interval from an
    exon/intron boundary to the next where both boundaries are in the
    same exon of another transcript.

    This is a generator.  For each transcript the introns of all
    transcripts that are contained in one of its merged ranges are
    collected, merged and yielded as copies of the first entry of the
    transcript with start and end set to the intron.
    '''

    intron_intervals = [GTF.toIntronIntervals(transcript)
                        for transcript in gene]
    intron_intervals = sorted(
        set(itertools.chain.from_iterable(intron_intervals)))

    for transcript in gene:
        exons = iter(sorted(GTF.asRanges(transcript)))
        introns = iter(intron_intervals)
        retained_introns = []

        # walk both sorted lists in parallel; the walk ends when
        # either list is exhausted.  next() works on python 2 and 3.
        try:
            intron = next(introns)
            exon = next(exons)
            while True:

                if exon[1] < intron[0]:
                    # exon lies before the intron: advance exon
                    exon = next(exons)
                    continue

                if intron[0] >= exon[0] and intron[1] <= exon[1]:
                    E.debug("exon %s of transcript %s contains intron %s" %
                            (exon, transcript[0].transcript_id, intron))
                    retained_introns.append(intron)

                intron = next(introns)
        except StopIteration:
            pass

        retained_introns = Intervals.combine(retained_introns)

        for intron in retained_introns:
            entry = GTF.Entry()
            entry = entry.copy(transcript[0])
            entry.start = intron[0]
            entry.end = intron[1]
            yield entry
def __str__(self):
    """return the counts as a tab-separated string.

    Fields: number of gene ids, number of transcript ids, number of
    single exon transcripts, followed by ``Stats.Summary`` of exons
    per transcript, exon sizes, intron sizes and transcript lengths.
    Exons of each transcript are sorted in place and merged first.
    """
    single_exon_transcripts = 0
    exons_per_transcript, exon_sizes = [], []
    intron_sizes, transcript_lengths = [], []

    for exons in self.counts_exons_per_transcript.values():
        exons.sort()
        merged = Intervals.combine(exons)

        transcript_lengths.append(merged[-1][1] - merged[0][0])
        exons_per_transcript.append(len(merged))

        for start, end in merged:
            exon_sizes.append(end - start)

        if len(merged) == 1:
            single_exon_transcripts += 1
            continue

        # gaps between consecutive merged exons are the introns
        last_end = merged[0][1]
        for start, end in merged[1:]:
            intron_sizes.append(start - last_end)
            last_end = end

    columns = [
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        single_exon_transcripts,
        Stats.Summary(exons_per_transcript),
        Stats.Summary(exon_sizes),
        Stats.Summary(intron_sizes),
        Stats.Summary(transcript_lengths),
    ]
    return "\t".join(str(column) for column in columns)
def testHalfEmpty(self):
    """test intersection when one of the two inputs is empty."""
    # (first, second) - the expected intersection is always empty
    for first, second in (([(0, 5)], []),
                          ([], [(0, 5)])):
        self.assertEqual(Intervals.intersect(first, second), [])
def testEmpty(self):
    """test intersection of two empty inputs."""
    result = Intervals.intersect([], [])
    self.assertEqual(result, [])
def testEmpty(self):
    """test truncation of an empty input by an empty mask."""
    result = Intervals.truncate([], [])
    self.assertEqual(result, [])
def testArray2(self):
    """test longer array."""
    # three runs of ones separated by runs of zeros
    a = [1, 1, 1,
         0, 0, 0,
         1, 1, 1,
         0, 0, 0,
         1, 1, 1]
    inverted = [not value for value in a]

    self.assertEqual(Intervals.fromArray(a),
                     [(0, 3), (6, 9), (12, 15)])
    self.assertEqual(Intervals.fromArray(inverted),
                     [(3, 6), (9, 12)])
def transform_overlap(start, end, intervals_with_gff):
    """transform: overlap of intervals in x with y.

    The first two fields of each entry in *intervals_with_gff* are
    used as interval; the intervals are combined and then pruned with
    *start* and *end*.
    """
    segments = []
    for item in intervals_with_gff:
        segments.append((item[0], item[1]))

    combined = Intervals.combineIntervals(segments)
    return Intervals.pruneIntervals(combined, start, end)
def transform_complement(start, end, intervals_with_gff):
    """transform: complement of intervals within start and end.

    The first two fields of each entry in *intervals_with_gff* are
    used as interval; the intervals are combined and their complement
    between *start* and *end* is returned.
    """
    # pass a list - a lazy map() object (python 3) can neither be
    # sorted nor measured
    y = Intervals.combineIntervals(
        [(x[0], x[1]) for x in intervals_with_gff])
    return Intervals.complementIntervals(y, start, end)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Features are read as GTF from stdin and grouped by gene,
    transcript or single exon (``--feature``).  Tag counts are taken
    from bigwig files, a bed file or - if none of these is given -
    from the alignment file named as first positional argument.  For
    each feature the summed counts over its exons and over the
    complement of its exons are written as a tab-separated table to
    stdout.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw", dest="unstranded_wig",
                      type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts")
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true", default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        # wrap each exon in a list so that all feature types are
        # handled as a group of entries below
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    # select the source of the tag counts
    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    outlines = []
    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32",
                                            use_centre=options.centre)

        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32",
                                              use_centre=options.centre)

        intron_counts = intron_counts.sum()

        if options.feature == "exon":

            # use exon_id if available, fall back to exon_number
            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                try:
                    exon_id = feature[0].exon_number
                except AttributeError:
                    exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            # a single exon has no introns
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = float(intron_counts)

        outlines.append([gene_id,
                         transcript_id,
                         exon_id,
                         str(float(exon_counts)),
                         str(intron_counts)])

    options.stdout.write("\t".join(["gene_id",
                                    "transcript_id",
                                    "exon_id",
                                    "exon_count",
                                    "intron_count"]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    GFF/GTF entries are read from ``options.stdin`` and grouped (per
    transcript for GTF input, per chunk or joined chunk otherwise).
    For each group the interval sequences are fetched from the indexed
    genome given by ``--genome-file``, optionally truncated by mask
    regions, extended, masked and folded, and written as one fasta
    record to ``options.stdout``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    # honour an explicitly supplied *argv* instead of silently falling
    # back to sys.argv
    (options, args) = E.Start(parser, argv=argv)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

            # convert intervals to intersectors
            for contig in list(e.keys()):
                intersector = bx.intervals.intersection.Intersecter()
                for start, end in e[contig]:
                    intersector.add_interval(
                        bx.intervals.Interval(start, end))
                masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                # fields without a "=" (e.g. the empty field after a
                # trailing ";") are skipped instead of raising IndexError
                attr_dict = {}
                for field in chunk[0].attributes.split(";"):
                    parts = field.split("=")
                    if len(parts) > 1:
                        attr_dict[parts[0]] = parts[1]
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # ``out`` holds the coordinates reported in the fasta header; it
        # aliases ``intervals`` until the latter is rebound below.
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # convert to reverse-strand coordinates, 5' segment first
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # the 5' extension is prepended to the FIRST segment; the
                # previous s[1] skipped it and raised IndexError for
                # entries consisting of a single interval
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand,
             ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.Stop()
def transform_complement(start, end, intervals_with_gff):
    """return the complement of the given intervals within [start, end).

    Only the first two fields (start, end) of each element of
    *intervals_with_gff* are used; they are combined before the
    complement is taken.
    """
    spans = []
    for entry in intervals_with_gff:
        spans.append((entry[0], entry[1]))
    combined = Intervals.combineIntervals(spans)
    return Intervals.complementIntervals(combined, start, end)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    GFF/GTF entries are read from ``options.stdin`` and grouped (per
    transcript for GTF input, per chunk or joined chunk otherwise).
    For each group the interval sequences are fetched from the indexed
    genome given by ``--genome-file``, optionally truncated by mask
    regions, extended, masked and folded, and written as one fasta
    record to ``options.stdout``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge-adjacent", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes."
        " [default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'."
        " If set to the empty string, all entries are output "
        "[%default].")

    parser.add_option(
        "-f", "--maskregions-bed-file", dest="filename_masks",
        type="string", metavar="gff",
        help="mask sequences with regions given in gff file "
        "[%default].")

    parser.add_option(
        "--remove-masked-regions", dest="remove_masked_regions",
        action="store_true",
        help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-interval-length", dest="min_length", type="int",
        help="set minimum length for sequences output "
        "[%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output "
        "[%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no end, 3', 5' or both ends. If "
        "3only or 5only are set, only the added sequence "
        "is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--extend-with", dest="extend_with", type="string",
        help="extend using base [default=%default]")

    parser.add_option(
        "--masker", dest="masker", type="choice",
        choices=("dust", "dustmasker", "softmask", "none"),
        help="apply masker [%default].")

    parser.add_option(
        "--fold-at", dest="fold_at", type="int",
        help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False
    )

    # honour an explicitly supplied *argv* instead of silently falling
    # back to sys.argv
    (options, args) = E.Start(parser, argv=argv)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

            # convert intervals to intersectors
            for contig in e.keys():
                intersector = bx.intervals.intersection.Intersecter()
                for start, end in e[contig]:
                    intersector.add_interval(
                        bx.intervals.Interval(start, end))
                masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            # a list (not a lazy ``filter`` object) is required because
            # the chunk is measured with len() and indexed below
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                # fields without a "=" (e.g. the empty field after a
                # trailing ";") are skipped instead of raising IndexError
                attr_dict = {}
                for field in chunk[0].attributes.split(";"):
                    parts = field.split("=")
                    if len(parts) > 1:
                        attr_dict[parts[0]] = parts[1]
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    # string exceptions are not valid; raise a real
                    # exception type carrying the same message
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write("# skipped because fully masked: "
                                             "%s: regions=%s masks=%s\n" %
                                             (name,
                                              str([(x.start, x.end)
                                                   for x in chunk]),
                                              masked_regions))
                    continue

        # ``out`` holds the coordinates reported in the fasta header; it
        # aliases ``intervals`` until the latter is rebound below.
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # convert to reverse-strand coordinates, 5' segment first
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # the 5' extension is prepended to the FIRST segment; the
                # previous s[1] skipped it and raised IndexError for
                # entries consisting of a single interval
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(">%s %s:%s:%s\n%s\n" % (name,
                                                     contig,
                                                     strand,
                                                     ";".join(
                                                         ["%i-%i" %
                                                          x for x in out]),
                                                     seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.Stop()
def processChunk(query_id, matches):
    """process a set of matches from query_id

    The matches are passed through the absolute thresholds (percent
    identity, number of matches, error rate), then through the coverage
    and gap thresholds, then checked against forbidden regions.  The
    surviving matches are classified by query coverage, handed to
    ``selectMatches`` and the selected ones are written to
    ``options.stdout``.  Queries that lose all matches are reported to
    ``outfile_empty`` if it is set.

    All bookkeeping is done through module-level counters and lists;
    nothing is returned.
    """

    global ninput, noutput, nskipped
    global nfull_matches, npartial_matches, ngood_matches
    global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches
    global nremoved_regions, nqueries_removed_region
    global outfile_empty
    ninput += 1

    full_matches = []
    good_matches = []
    partial_matches = []

    # per-query removal counts, reported when a query loses all matches
    x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
    nmatches = len(matches)

    new_matches = []
    # absolute filters applicable to non-fragmentory matches
    for match in matches:

        if match.mPid < options.threshold_min_pid:
            nremoved_pid += 1
            # keep the per-query count in step with the global one so the
            # diagnostic line below reports it
            x_nremoved_pid += 1
            continue

        if match.mNMatches < options.threshold_min_matches:
            nremoved_nmatches += 1
            x_nremoved_nmatches += 1
            continue

        if options.threshold_max_error_rate:
            # math.power does not exist; math.pow is the stdlib function
            r = 100.0 * math.pow(options.threshold_max_error_rate,
                                 match.mNMatches + match.mNMismatches)
            if match.mPid < r:
                nremoved_pid += 1
                x_nremoved_pid += 1
                continue

        new_matches.append(match)

    matches = new_matches

    # filter matches
    if len(matches) == 0:
        if outfile_empty:
            outfile_empty.write(
                "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                 x_nremoved_gaps, x_nremoved_nmatches))
        nskipped += 1
        return

    if options.keep_unique_matches and len(matches) == 1:
        pass
    else:
        new_matches = []

        # NOTE(review): the four gap filters below discard a match when
        # the configured maximum is LARGER than the observed value, which
        # reads as inverted for a "max" threshold - confirm the intended
        # semantics before changing them.
        for match in matches:

            if match.mQueryCoverage < options.threshold_min_query_coverage:
                nremoved_query_coverage += 1
                x_nquery_coverage += 1
                continue

            if options.threshold_max_query_gaps and options.threshold_max_query_gaps > match.mQueryNGapsCounts:
                nremoved_gaps += 1
                x_nremoved_gaps += 1
                continue

            if options.threshold_max_query_gapchars and options.threshold_max_query_gapchars > match.mQueryNGapsBases:
                nremoved_gaps += 1
                x_nremoved_gaps += 1
                continue

            if options.threshold_max_sbjct_gaps and options.threshold_max_sbjct_gaps > match.mSbjctNGapsCounts:
                nremoved_gaps += 1
                x_nremoved_gaps += 1
                continue

            if options.threshold_max_sbjct_gapchars and options.threshold_max_sbjct_gapchars > match.mSbjctNGapsBases:
                nremoved_gaps += 1
                x_nremoved_gaps += 1
                continue

            new_matches.append(match)

        matches = new_matches

    if len(matches) == 0:
        if outfile_empty:
            outfile_empty.write(
                "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                 x_nremoved_gaps, x_nremoved_nmatches))
        nskipped += 1
        return

    # Remove queries matching to a forbidden region. This section
    # will remove the full query if any of its matches matches in a
    # forbidden region.
    keep = True
    for match in matches:
        if intersectors and match.mSbjctId in intersectors:
            found = intersectors[match.mSbjctId].find(
                match.mSbjctFrom, match.mSbjctTo)
            # the original repeated this condition on both sides of an
            # "or"; one test is equivalent
            if found and not options.keep_forbidden:
                nremoved_regions += 1
                keep = False
                continue

    if not keep:
        nqueries_removed_region += 1
        if outfile_empty:
            outfile_empty.write(
                "%s\toverlap with forbidden region\n" % query_id)
        return

    # check for full length matches; note that a full match is also
    # recorded as a good or partial match
    for match in matches:
        if match.mQueryCoverage >= 99.9:
            full_matches.append(match)
        if match.mQueryCoverage > options.threshold_good_query_coverage:
            good_matches.append(match)
        else:
            partial_matches.append(match)

    if full_matches:
        nfull_matches += 1
    elif good_matches:
        ngood_matches += 1
    elif partial_matches:
        npartial_matches += 1

    # compute coverage of sequence with matches
    intervals = []
    for match in full_matches + good_matches + partial_matches:
        intervals.append((match.mQueryFrom, match.mQueryTo))

    # ``match`` is the last match of the loop above; its query length is
    # used for the whole query
    rest = Intervals.complement(intervals, 0, match.mQueryLength)

    uncovered = sum(x[1] - x[0] for x in rest)
    query_coverage = 100.0 * \
        (match.mQueryLength - uncovered) / match.mQueryLength

    if query_coverage >= 99.9:
        fully_matched.append(query_id)
    elif query_coverage > options.threshold_good_query_coverage:
        well_matched.append(query_id)
    else:
        partially_matched.append(query_id)

    aggregate_coverages.append(query_coverage)

    # select matches to output
    matches, msg = selectMatches(query_id, matches, options, queries_fasta)

    if len(matches) > 0:
        for match in matches:
            if options.query_forward_coordinates:
                match.convertCoordinates()

            if options.output_format == "map":
                options.stdout.write("%s\n" %
                                     "\t".join(map(str, (
                                         match.mQueryId, match.mSbjctId,
                                         match.strand,
                                         "%5.2f" % match.mQueryCoverage,
                                         "%5.2f" % match.mSbjctCoverage,
                                         "%5.2f" % match.mPid,
                                         match.mQueryLength,
                                         match.mSbjctLength,
                                         match.mQueryFrom, match.mQueryTo,
                                         match.mSbjctFrom, match.mSbjctTo,
                                         ",".join(
                                             map(str, match.mBlockSizes)),
                                         ",".join(
                                             map(str, match.mQueryBlockStarts)),
                                         ",".join(
                                             map(str, match.mSbjctBlockStarts)),
                                     ))))
            elif options.output_format == "psl":
                options.stdout.write(str(match) + "\n")

            noutput += 1
    else:
        if outfile_empty:
            outfile_empty.write(
                "%s\tno matches selected: %s\n" % (query_id, msg))
        nempty += 1
def testNoOverlap(self):
    """abutting but non-overlapping interval sets intersect to nothing."""
    cases = (
        ([(0, 5), (10, 15)], [(5, 10)]),
        ([(5, 10)], [(0, 5), (10, 15)]),
    )
    # the same pair of sets is checked in both argument orders
    for first, second in cases:
        observed = Intervals.intersect(first, second)
        self.assertEqual(observed, [])
def testEmpty(self):
    """an empty array yields an empty list of intervals."""
    observed = Intervals.fromArray([])
    expected = []
    self.assertEqual(observed, expected)
def testArray1(self):
    """runs of true values in a simple array become intervals."""
    flags = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    observed = Intervals.fromArray(flags)
    self.assertEqual(observed, [(0, 3), (6, 9)])

    # negating every element leaves only the middle run
    negated = [not flag for flag in flags]
    observed = Intervals.fromArray(negated)
    self.assertEqual(observed, [(3, 6)])
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    *iterator* yields gtf entries, *fasta* supplies contig lengths via
    ``getLength`` and *options* provides ``increment``, ``flank``,
    ``detail`` and ``stdout``.  For every gene the annotated intervals
    are combined per feature type and written to ``options.stdout``
    sorted by start coordinate.  Genes on contigs unknown to *fasta*
    and transcripts without exons are counted as skipped.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    results = []
    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if len(exons) == 0:
                nskipped += 1
                # nothing to annotate; without this the exons[0] access
                # below raised IndexError
                continue

            exons.sort()
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

        # groupby requires its input sorted by the grouping key
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.

    One character array per contig is allocated and filled with
    *default_code*.  Each group of gtf entries from *iterator* is then
    painted into the array of its contig: sources listed in
    ``MAP_ENSEMBL`` with their mapped code, ``protein_coding`` entries
    with UTR, CDS and intron annotations.  Splice junctions between
    consecutive CDS parts are written to the ``junctions`` output file.
    Entries on unknown contigs and protein-coding transcripts without a
    CDS are counted and skipped.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    # NOTE(review): the "c" typecode only exists in Python 2's array
    # module; it is left untouched because the helpers that consume
    # these arrays are not visible here.
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("c").itemsize))

    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        annotations[contig] = array.array("c", default_code * size)

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.openOutputFile("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")

    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError:
            E.warn("contig %s not found - annotation ignored" %
                   gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig],
                        intervals,
                        is_positive,
                        code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            # Truncation leaves the UTR part of a partially coding exon
            # ending exactly at the CDS start.  The previous strict "<"
            # dropped that piece, while the 3' side (">=") kept its
            # counterpart; "<=" makes both ends consistent.
            UTR5 = [x for x in exons if x[1] <= start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig],
                        UTR5,
                        is_positive,
                        "u")

            addIntrons(annotations[contig],
                       UTR5,
                       is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig],
                        UTR3,
                        is_positive,
                        "v")

            addIntrons(annotations[contig],
                       UTR3,
                       is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"],
                   is_positive)

            # add introns between CDS
            addIntrons(annotations[contig],
                       cds,
                       is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" %
                                        (contig,
                                         out_positive,
                                         end,
                                         start,
                                         c.frame,
                                         c.gene_id,
                                         c.transcript_id,
                                         ))
                end = ender(c)

    # flush and release the junctions file once all entries are processed
    outfile_junctions.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Groups of gff/gtf entries read from ``options.stdin`` are converted
    into one ``Blat.Match`` entry each: the combined intervals of a
    group become the alignment blocks, and the entry is written to
    ``options.stdout`` (preceded by a header unless ``--no-header`` is
    given).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries",
                      dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default].""")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    # honour an explicitly supplied *argv* instead of silently falling
    # back to sys.argv
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    # read from options.stdin (set up by E.Start) so that input
    # redirection through the pipe options is respected
    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(options.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        # The original read ``gff`` here, relying on the Python 2 leak of
        # the list-comprehension variable (i.e. the last entry of the
        # group); take that entry explicitly so this also runs on
        # Python 3.
        gff = gffs[-1]

        entry = Blat.Match()
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        # use the length from the fasta file where available and fall
        # back to the extent of the alignment otherwise, so that the
        # length is always defined
        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    If *stranded*, entries are grouped separately for each strand.

    If *resolve_blocks*, the entries of a group are merged whenever
    their block intervals overlap; otherwise a group is collapsed into
    its first entry, extended to the largest end coordinate.

    Groups built from fewer than *min_intervals* entries are dropped.

    The score gives the number of intervals that have been merged.

    The input must be sorted by contig and position.  Raises ValueError
    (on first iteration, as this is a generator) if both
    *remove_inconsistent* and *by_name* are set.
    """

    if remove_inconsistent and by_name:
        # was ``assert ValueError(...)``, which is always true and never
        # reported the conflict
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        """yield lists of consecutive entries that are to be merged."""
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        try:
            last = next(iterator)
        except StopIteration:
            # empty input: nothing to yield (a StopIteration escaping a
            # generator is a RuntimeError since PEP 479)
            return

        if not stranded:
            strand = "."
        else:
            strand = last.strand

        max_end[strand] = last.end
        to_join[strand] = [last]
        if by_name:
            # remember the first entry's name, otherwise it could never
            # be merged with the entry following it
            last_name[strand] = last.name

        for bed in iterator:
            if not stranded:
                strand = "."
            else:
                strand = bed.strand

            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:
                # new contig: flush the pending groups of all strands
                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and last_name[strand] != bed.name)):
                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = []

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        for strand in to_join:
            if to_join[strand]:
                yield to_join[strand]
        return

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set([x.name for x in to_join])
            if len(names) > 1:
                c.skipped_inconcistent_intervals += 1
                continue

        if resolve_blocks:
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1

            # repeat until a full pass performs no merge
            merged = True
            while merged:

                joined = []
                not_joined = []
                merged = False

                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1,
                                                      intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            bed1["score"] += bed2["score"]
                            # continue with the merged blocks; reusing
                            # the stale list discarded earlier merges
                            # when a further entry was joined
                            intervals1 = intervals
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []

            to_join = sorted(to_join, key=lambda x: int(x.start))

            # keep only those with the created from the merge of the minimum
            # number of intervals
            for bed in to_join:
                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:

            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))