def buildIndex(self, filename):
    """Read a GTF file and index its intervals per contig.

    Arguments
    ---------
    filename : string
        Path handed to ``iotools.open_file``.

    Returns
    -------
    idx : dict
        Maps each contig seen in the file to an ``NCL.NCLSimple``
        object to which the (start, end) of every entry on that
        contig has been added.
    """
    idx = {}
    # The context manager closes the file even if parsing raises.
    with iotools.open_file(filename, "r") as infile:
        for e in GTF.readFromFile(infile):
            if e.contig not in idx:
                idx[e.contig] = NCL.NCLSimple()
            idx[e.contig].add(e.start, e.end)
    return idx
def main(argv=None):
    """Compare two GTF files and report which genes overlap.

    Entries of the second file are loaded into one interval tree per
    contig; every entry of the first file is then queried against that
    index.  Gene pairs with at least one overlapping entry are written
    to the ``genes_ovl`` section, genes without any overlap to
    ``genes_uniq1`` / ``genes_uniq2`` and per-file counts to
    ``genes_total``.  Output handles are obtained from ``getFile``.

    Arguments
    ---------
    argv : list, optional
        Command line; ``sys.argv`` is used when empty or ``None``.

    Raises
    ------
    ValueError
        If not exactly two positional arguments (the two input
        filenames) remain after option parsing.
    """
    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--output-equivalent", dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f", "--output-full", dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s", "--ignore-strand", dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    # Index the second file: one interval tree per contig, each interval
    # carrying its GTF entry as payload.
    idx, genes2 = {}, set()
    with iotools.open_file(input_filename2, "r") as infile:
        for e in GTF.readFromFile(infile):
            genes2.add(e.gene_id)
            if e.contig not in idx:
                idx[e.contig] = quicksect.IntervalTree()
            idx[e.contig].add(e.start, e.end, e)

    E.info("reading data finished: %i contigs" % len(idx))

    # Only gene-level summaries are produced; per-entry diff/overlap
    # output is not implemented.
    overlapping_genes = set()
    genes1 = set()

    # iterate over exons
    with iotools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):
            genes1.add(this.gene_id)

            try:
                tree = idx[this.contig]
            except KeyError:
                # contig absent from the second file: nothing can overlap
                continue

            for hit in tree.find(quicksect.Interval(this.start, this.end)):
                overlapping_genes.add((this.gene_id, hit.data.gene_id))

    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (a, b))
        if outfile != options.stdout:
            outfile.close()

    def percent(part, whole):
        # An empty gene set yields 0% instead of a ZeroDivisionError.
        return 100.0 * part / whole if whole else 0.0

    outfile_total = getFile(options, "genes_total")
    outfile_total.write(
        "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

    # (section, column header, input file, all genes of that file,
    #  position of that file's gene id within an overlap pair)
    summaries = (
        ("genes_uniq1", "gene_id1", input_filename1, genes1, 0),
        ("genes_uniq2", "gene_id2", input_filename2, genes2, 1),
    )

    for section, header, filename, genes, column in summaries:
        outfile = getFile(options, section)
        overlapping = {pair[column] for pair in overlapping_genes}
        unique = genes.difference(overlapping)

        outfile.write(header + "\n")
        outfile.write("\n".join(sorted(unique)) + "\n")
        if outfile != options.stdout:
            outfile.close()

        # Both percentages are relative to the number of genes in the
        # respective input file.
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" % (
                os.path.basename(filename),
                len(genes),
                len(overlapping),
                percent(len(overlapping), len(genes)),
                len(unique),
                percent(len(unique), len(genes))))

    if outfile_total != options.stdout:
        outfile_total.close()

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GTF/GFF entries from ``options.stdin`` and, depending on
    ``--method``:

    * ``histogram`` - sorts the entries by (contig, start) and hands the
      entries of each contig to ``processChunk``.
    * ``genomic`` - writes a table to ``options.stdout`` with the number
      of intervals and bases per (contig, source, feature).  When a
      genome file is given, two percent-coverage columns (relative to
      the contig and to the whole genome) are appended.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f", "--features", dest="features", type="string",
                      action="append",
                      help="features to collect "
                      "[default=%default]")

    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size. "
                      "[default=%default]")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("genomic", "histogram", ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    # Pass argv on so that an explicitly supplied command line is honoured.
    (options, args) = E.start(parser, argv, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)
        gff.sort(key=lambda x: (x.contig, x.start))

        # Collect consecutive entries of the same contig and flush the
        # chunk whenever the contig changes.
        chunk = []
        last_contig = None

        for entry in gff:
            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []
            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":

        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)

        for entry in GTF.iterator(options.stdin):
            key = (entry.contig, entry.source, entry.feature)
            intervals[key] += 1
            bases[key] += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
            # Genome size is only available - and only needed - when a
            # genome file was supplied.
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())
        else:
            options.stdout.write("\n")

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) /
                              fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads windows from ``--windows-bed-file`` and, optionally, data
    entries from ``--filename-data``, then calls ``annotateWindows``
    once per contig that is present in the windows (and in the data,
    if data was given).  Without a data file the transform is forced to
    ``complement`` so that the windows themselves are annotated.

    Raises
    ------
    ValueError
        If no windows file was supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w", "--windows-bed-file", dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d", "--filename-data", dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f", "--features", dest="features", type=str,
                        action="append", choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c", "--decorator", dest="decorator", type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score",
                                 "stddev-score", "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e", "--skip-empty", dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    # The option string carries no trailing "="; argparse handles both
    # "--transform VALUE" and "--transform=VALUE" itself.
    parser.add_argument(
        "-t", "--transform", dest="transform", type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    # Pass argv on so that an explicitly supplied command line is honoured.
    args = E.start(parser, argv)

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    with iotools.open_file(args.filename_windows, "r") as infile:
        windows = GTF.readAsIntervals(GTF.iterator(infile))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        # GTF and GFF data are read the same way here; args.is_gtf is
        # left untouched for downstream consumers of ``args``.
        with iotools.open_file(args.filename_data, "r") as infile:
            gff_data = GTF.readFromFile(infile)

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        # Without a genome, approximate each contig's size by the largest
        # window end coordinate seen on it.
        for contig, values in windows.items():
            map_contig2size[contig] = max(x[1] for x in values)
        fasta = None

    contigs = sorted(map_contig2size.keys())

    # proceed contig wise
    noutput_contigs = 0
    ncontigs_skipped_windows = 0
    ncontigs_skipped_data = 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end",
                  "ngenes", "ntranscripts",
                  "n1", "l1", "n2", "l2",
                  "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            first, last = data_ranges[contig][0], data_ranges[contig][1]
            annotateWindows(contig,
                            windows[contig],
                            gff_data[first:last],
                            fasta,
                            args)
        else:
            annotateWindows(contig,
                            windows[contig],
                            [],
                            fasta,
                            args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i" %
        (len(windows), noutput_contigs, len(contigs),
         ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()