def get(self, contig, start, end):
    """Return the indexed intervals overlapping the query range.

    The query is built as ``quicksect.Interval(start, end)`` and handed to
    the ``find`` method of the tree stored for *contig*.

    Returns:
        A list of ``(start, end, data)`` tuples, one per hit.

    Raises:
        KeyError: if *contig* is not present in the index.
    """
    index = self.mIndex
    if contig not in index:
        raise KeyError("contig %s not in index" % contig)

    tree = index[contig]
    hits = []
    for hit in tree.find(quicksect.Interval(start, end)):
        hits.append((hit.start, hit.end, hit.data))
    return hits
def main(argv=None):
    """Compare two GTF files and report which genes overlap.

    The features of the second file are loaded into one interval tree per
    contig. Every feature of the first file is then queried against that
    index and each ``(gene_id1, gene_id2)`` pair with overlapping features
    is recorded. If at least one pair was found, four sections are written
    through ``getFile``: ``genes_ovl`` (the pairs), ``genes_uniq1`` and
    ``genes_uniq2`` (genes without any overlap in file 1 / file 2) and
    ``genes_total`` (per-file summary counts and percentages).

    Args:
        argv: command line arguments; ``sys.argv`` is used when empty
            or ``None``.

    Raises:
        ValueError: if not exactly two positional arguments (the two
            input filenames) are given.
    """
    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--output-equivalent", dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f", "--output-full", dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s", "--ignore-strand", dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    # Build one interval tree per contig from the second file. The file is
    # opened in a ``with`` block so the handle is closed after indexing.
    idx, genes2 = {}, set()
    with iotools.open_file(input_filename2, "r") as infile2:
        for e in GTF.readFromFile(infile2):
            genes2.add(e.gene_id)
            if e.contig not in idx:
                idx[e.contig] = quicksect.IntervalTree()
            idx[e.contig].add(e.start, e.end, e)

    E.info("reading data finished: %i contigs" % len(idx))

    # NOTE: the "diff" and "overlap" outputs are not implemented.

    overlapping_genes = set()
    genes1 = set()

    # iterate over exons
    with iotools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):

            genes1.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(
                    quicksect.Interval(this.start, this.end))
            except KeyError:
                # contig absent from the second file: nothing can overlap
                continue

            others = [x.data for x in intervals]
            for other in others:
                overlapping_genes.add((this.gene_id, other.gene_id))

            # check for identical/half-identical matches
            # NOTE(review): ``output`` and ``symbol`` are computed but not
            # used anywhere below; they appear to belong to the
            # unimplemented diff/overlap output.
            output = None
            for other in others:
                if this.start == other.start and this.end == other.end:
                    output, symbol = other, "="
                    break
            else:
                for other in others:
                    if this.start == other.start or this.end == other.end:
                        output, symbol = other, "|"
                        break
                else:
                    symbol = "~"

    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for gene_id1, gene_id2 in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (gene_id1, gene_id2))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        # genes of the first file
        outfile = getFile(options, "genes_uniq1")
        overlapping1 = {pair[0] for pair in overlapping_genes}
        unique1 = genes1.difference(overlapping1)
        outfile.write("gene_id1\n")
        outfile.write("\n".join(sorted(unique1)) + "\n")
        if outfile != options.stdout:
            outfile.close()
        # Percentages are relative to the number of genes in the file.
        # (Previously the overlap percentage was divided by the length of
        # a leftover loop variable holding a gene identifier.)
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename1),
             len(genes1),
             len(overlapping1),
             100.0 * len(overlapping1) / len(genes1),
             len(unique1),
             100.0 * len(unique1) / len(genes1)))

        # genes of the second file
        outfile = getFile(options, "genes_uniq2")
        overlapping2 = {pair[1] for pair in overlapping_genes}
        unique2 = genes2.difference(overlapping2)
        outfile.write("gene_id2\n")
        outfile.write("\n".join(sorted(unique2)) + "\n")
        if outfile != options.stdout:
            outfile.close()
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename2),
             len(genes2),
             len(overlapping2),
             100.0 * len(overlapping2) / len(genes2),
             len(unique2),
             100.0 * len(unique2) / len(genes2)))

        if outfile_total != options.stdout:
            outfile_total.close()

    E.stop()
def after(self, contig, start, end, num_intervals=1, max_dist=2500):
    """Return the closest indexed intervals after *end*.

    The query ``quicksect.Interval(start, end)`` is passed, together with
    *num_intervals* and *max_dist*, to the ``right`` method of the tree
    stored for *contig*.

    Returns:
        A list of ``(start, end, data)`` tuples, one per interval returned
        by the tree.

    Raises:
        KeyError: if *contig* is not present in the index.
    """
    index = self.mIndex
    if contig not in index:
        raise KeyError("contig %s not in index" % contig)

    tree = index[contig]
    neighbours = tree.right(quicksect.Interval(start, end),
                            num_intervals,
                            max_dist)
    result = []
    for hit in neighbours:
        result.append((hit.start, hit.end, hit.data))
    return result
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GFF/GTF entries from ``options.stdin``, groups them into chunks,
    fetches the sequence of every chunk from the indexed genome and writes
    one FASTA record per chunk to ``options.stdout``. Chunks can be
    filtered by feature type, truncated by mask regions, extended at
    either end and filtered by total sequence length.

    Raises:
        ValueError: if no genome file is given.
        NotImplementedError: if mask regions are supplied for a contig
            without ``--remove-masked-regions``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes", dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    # Pass argv on so that an explicitly supplied argument list is honoured.
    (options, args) = E.start(parser, argv=argv)

    # The genome is needed for every record (contig sizes and sequences),
    # so fail with a clear message instead of a NameError further down.
    if not options.genome_file:
        raise ValueError("a genome file is required (--genome-file)")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                # Parse "key=value" pairs; fragments without "=" (for
                # example the empty string after a trailing ";") are
                # skipped instead of raising an IndexError.
                attr_dict = {}
                for field in chunk[0].attributes.split(";"):
                    parts = field.split("=")
                    if len(parts) < 2:
                        continue
                    attr_dict[parts[0]] = parts[1]
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [
                        (x.start, x.end)
                        for x in masks[contig].find(
                            quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # ``out`` holds the coordinates printed in the FASTA header. It is
        # deliberately the same list object as ``intervals``: the in-place
        # extensions below show up in the header, whereas the "5only" /
        # "3only" branches rebind ``intervals`` and leave ``out`` untouched.
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # convert to coordinates counted from the other end of the
            # contig, in reversed order
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        total_length = sum(len(x) for x in s)
        if (total_length < options.min_length or
                (options.max_length and total_length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), total_length))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # The 5' extension belongs in front of the FIRST segment.
                # (Indexing the second segment failed for single-interval
                # entries and misplaced the extension otherwise.)
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]),
                 chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]),
                 seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.stop()
def cropGFF(gffs, filename_gff):
    """crop intervals in gff file.

    Generator. The regions read from *filename_gff* are removed from every
    entry in *gffs*. An entry without overlap is yielded unchanged; an
    entry that overlaps is yielded once per remaining segment with its
    ``start``/``end`` updated, and not at all if nothing remains.

    Args:
        gffs: iterable of entries with ``contig``, ``start`` and ``end``.
        filename_gff: file with the regions to crop with.

    Yields:
        The (possibly modified) entries.
    """
    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    # Read inside a ``with`` block so the file handle is closed once the
    # intervals have been loaded.
    with iotools.open_file(filename_gff, "r") as infile:
        cropper = GTF.readAsIntervals(GTF.iterator(infile))

    ntotal = 0
    for contig in list(cropper.keys()):
        intersector = quicksect.IntervalTree()
        for start, end in cropper[contig]:
            intersector.add(start, end)
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:

            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(quicksect.Interval(start, end))

            if overlaps:

                # One flag per position of the entry: 1 = keep, 0 = cropped.
                length = end - start
                keep = numpy.ones(length)
                for hit in overlaps:
                    first = max(0, hit.start - start)
                    last = min(length, hit.end - start)
                    keep[first:last] = 0

                segments = Intervals.fromArray(keep)
                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                # NOTE(review): the same ``gff`` object is modified and
                # yielded for every segment; a caller that stores the
                # yielded entries instead of consuming them one by one
                # would see only the last coordinates. Confirm callers.
                for seg_start, seg_end in segments:
                    gff.start, gff.end = seg_start + start, seg_end + start
                    noutput += 1
                    yield (gff)

                continue

        noutput += 1
        yield (gff)

    E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
           (ninput, noutput, ncropped, ndeleted))