def main(argv=None):
    """Script entry point: convert GFF3 input using the selected method.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    reads GFF3 records from ``options.stdin`` (as a whole, or one chromosome
    at a time with ``--by-chrom``) and hands each chunk to
    ``convert_hierarchy`` or ``convert_set`` depending on ``--method``.

    Args:
        argv: Optional argument list; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice", action="store",
        choices=("hierarchy", "set-field", "set-pattern", "set-none"),
        help="Method to use for conversion")

    parser.add_option(
        "-g", "--gene-type", dest="gene_type", type="string",
        help="feature type to get gene_id from if possible [%default]")

    parser.add_option(
        "-t", "--transcript-type", dest="transcript_type", type="string",
        help="feature type to get transcript_id from if possible [%default]")

    parser.add_option(
        "-d", "--no-discard", dest="discard", action="store_false",
        help="Do not discard feature types specified by GENE_TYPE and "
        "TRANSCRIPT_TYPE")

    parser.add_option(
        "--gene-id", dest="gene_field_or_pattern", type="string",
        help="Either field or pattern for the gene_id [%default]")

    parser.add_option(
        "--transcript-id", dest="transcript_field_or_pattern", type="string",
        help="Either field or pattern for the transcript_id [%default]")

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment below ends with an explicit space.
    parser.add_option(
        "--parent-field", dest="parent", type="string",
        help="field that specifies the parent relationship. Currently only "
        "if left as Parent will features with multiple parents be parsed "
        "correctly")

    parser.add_option(
        "--read-twice", dest="read_twice", action="store_true",
        help="Instead of holding the whole file in memory, read once for "
        "parsing the hierarchy, and then again for actually doing the "
        "conversion. Means a real file and not a pipe must be provided.")

    parser.add_option(
        "--by-chrom", dest="by_chrom", action="store_true",
        help="Parse input file one chromosome at a time. Reduces memory "
        "usage, but input must be sorted by chromosome and features may "
        "not split across multiple chromosomes")

    parser.add_option(
        "--fail-missing-gene", dest="missing_gene", action="store_false",
        help="Fail if no feature of type GENE_TYPE is found instead of "
        "defaulting to the highest object in the hierarchy")

    parser.set_defaults(
        method="hierarchy",
        gene_type="gene",
        transcript_type="mRNA",
        discard=True,
        gene_field_or_pattern="ID",
        transcript_field_or_pattern="ID",
        read_twice=False,
        by_chrom=False,
        missing_gene=True,
        parent="Parent",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gffs = GFF3.flat_file_iterator(options.stdin)
    if options.by_chrom:
        gffs = GFF3.chrom_iterator(gffs)
    else:
        # treat the whole input as a single chunk
        gffs = [gffs]

    second_handle = None
    second_gff = None
    try:
        # Open the second pass before any processing so that a wrong
        # configuration fails early.
        if options.read_twice:
            # Original author's note: this raises IOError if options.stdin
            # is not a normal file.
            second_handle = IOTools.openFile(options.stdin.name)
            second_gff = GFF3.flat_file_iterator(second_handle)
            if options.by_chrom:
                second_gff = GFF3.chrom_iterator(second_gff)
            else:
                second_gff = iter([second_gff])

        for chunk in gffs:
            if options.read_twice:
                # next() builtin instead of the Python 2 only .next() method
                second_gff_chunk = next(second_gff)
            else:
                # hold the chunk in memory so it can be traversed twice
                chunk = list(chunk)
                second_gff_chunk = chunk

            if options.method == "hierarchy":
                convert_hierarchy(chunk, second_gff_chunk, options)
            elif options.method == "set-field":
                # wrap the field name into a %(name)s placeholder
                gene_id_pattern = "%%(%s)s" % options.gene_field_or_pattern
                transcript_id_pattern = (
                    "%%(%s)s" % options.transcript_field_or_pattern)
                convert_set(chunk, gene_id_pattern,
                            transcript_id_pattern, options)
            elif options.method == "set-pattern":
                convert_set(chunk, options.gene_field_or_pattern,
                            options.transcript_field_or_pattern, options)
            elif options.method == "set-none":
                convert_set(chunk, None, None, options)
    finally:
        # release the file opened for the second pass
        if second_handle is not None:
            second_handle.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script entry point: convert GFF3 input using the selected method.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    reads GFF3 records from ``options.stdin`` (as a whole, or one chromosome
    at a time with ``--by-chrom``) and hands each chunk to
    ``convert_hierarchy`` or ``convert_set`` depending on ``--method``.

    Args:
        argv: Optional argument list; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice", action="store",
        choices=("hierarchy", "set-field", "set-pattern", "set-none"),
        help="Method to use for conversion")

    parser.add_option(
        "-g", "--gene-type", dest="gene_type", type="string",
        help="feature type to get gene_id from if possible [%default]")

    parser.add_option(
        "-t", "--transcript-type", dest="transcript_type", type="string",
        help="feature type to get transcript_id from if possible [%default]")

    parser.add_option(
        "-d", "--no-discard", dest="discard", action="store_false",
        help="Do not discard feature types specified by GENE_TYPE and "
        "TRANSCRIPT_TYPE")

    parser.add_option(
        "--gene-id", dest="gene_field_or_pattern", type="string",
        help="Either field or pattern for the gene_id [%default]")

    parser.add_option(
        "--transcript-id", dest="transcript_field_or_pattern", type="string",
        help="Either field or pattern for the transcript_id [%default]")

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment below ends with an explicit space.
    parser.add_option(
        "--parent-field", dest="parent", type="string",
        help="field that specifies the parent relationship. Currently only "
        "if left as Parent will features with multiple parents be parsed "
        "correctly")

    parser.add_option(
        "--read-twice", dest="read_twice", action="store_true",
        help="Instead of holding the whole file in memory, read once for "
        "parsing the hierarchy, and then again for actually doing the "
        "conversion. Means a real file and not a pipe must be provided.")

    parser.add_option(
        "--by-chrom", dest="by_chrom", action="store_true",
        help="Parse input file one chromosome at a time. Reduces memory "
        "usage, but input must be sorted by chromosome and features may "
        "not split across multiple chromosomes")

    parser.add_option(
        "--fail-missing-gene", dest="missing_gene", action="store_false",
        help="Fail if no feature of type GENE_TYPE is found instead of "
        "defaulting to the highest object in the hierarchy")

    parser.set_defaults(
        method="hierarchy",
        gene_type="gene",
        transcript_type="mRNA",
        discard=True,
        gene_field_or_pattern="ID",
        transcript_field_or_pattern="ID",
        read_twice=False,
        by_chrom=False,
        missing_gene=True,
        parent="Parent",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    first_pass = GFF3.flat_file_iterator(options.stdin)
    if options.by_chrom:
        chunks = GFF3.chrom_iterator(first_pass)
    else:
        # treat the whole input as a single chunk
        chunks = [first_pass]

    second_handle = None
    second_gff = None
    try:
        # Open the second pass before any processing so that a wrong
        # configuration fails early.
        if options.read_twice:
            # Original author's note: this raises IOError if options.stdin
            # is not a normal file.
            second_handle = IOTools.openFile(options.stdin.name)
            second_gff = GFF3.flat_file_iterator(second_handle)
            if options.by_chrom:
                second_gff = GFF3.chrom_iterator(second_gff)
            else:
                second_gff = iter([second_gff])

        for chunk in chunks:
            if options.read_twice:
                second_gff_chunk = next(second_gff)
            else:
                # hold the chunk in memory so it can be traversed twice
                chunk = list(chunk)
                second_gff_chunk = chunk

            if options.method == "hierarchy":
                convert_hierarchy(chunk, second_gff_chunk, options)
            elif options.method == "set-field":
                # wrap the field name into a %(name)s placeholder
                gene_id_pattern = "%%(%s)s" % options.gene_field_or_pattern
                transcript_id_pattern = (
                    "%%(%s)s" % options.transcript_field_or_pattern)
                convert_set(chunk, gene_id_pattern,
                            transcript_id_pattern, options)
            elif options.method == "set-pattern":
                convert_set(chunk, options.gene_field_or_pattern,
                            options.transcript_field_or_pattern, options)
            elif options.method == "set-none":
                convert_set(chunk, None, None, options)
    finally:
        # release the file opened for the second pass
        if second_handle is not None:
            second_handle.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script entry point: convert GTF/GFF3 input to a table, or back.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    then runs exactly one of the following modes (checked in this order):

    * ``--attributes-as-columns``: read all records into memory and write
      one column per attribute found in the input.
    * ``--invert``: read a tab-separated table with a header line and write
      each row as ``str()`` of a ``GTF.Entry``.
    * ``--output-map``: write a two-column map between identifiers.
    * default: write a fixed header followed by ``str()`` of every record.

    Args:
        argv: Optional argument list; defaults to ``sys.argv``.

    Raises:
        KeyError: in ``--invert`` mode when a required column is missing
            from the table header.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output-only-attributes", dest="only_attributes",
        action="store_true",
        help="output only attributes as separate columns "
        "[default=%default].")

    parser.add_option(
        "-f", "--attributes-as-columns", dest="output_full",
        action="store_true",
        help="output attributes as separate columns "
        "[default=%default].")

    # The flag switches is_gtf off, so the help must describe GFF3 input.
    parser.add_option(
        "--is-gff3", dest="is_gtf", action="store_false",
        help="input file is in gff3 format (default is gtf).")

    parser.add_option(
        "-i", "--invert", dest="invert", action="store_true",
        help="convert tab-separated table back to gtf "
        "[default=%default].")

    parser.add_option(
        "-m", "--output-map", dest="output_map", type="choice",
        choices=("transcript2gene", "peptide2gene", "peptide2transcript"),
        help="output a map mapping transcripts to genes "
        "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        is_gtf=True,
    )

    (options, args) = E.Start(parser, argv=argv)

    fixed_columns = ["contig", "source", "feature", "start", "end",
                     "score", "strand", "frame"]

    if options.output_full:
        # output full table with a column for each attribute; this needs
        # all records in memory because the column set is only known after
        # the whole input has been seen.
        attributes = set()
        data = []
        if options.is_gtf:
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes.update(gtf.keys())
        else:
            for gff in GFF3.iterator_from_gff(options.stdin):
                data.append(gff)
                attributes.update(gff.attributes)

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])
        attributes = sorted(attributes)

        # select the output columns depending on gtf or gff input
        if options.is_gtf:
            id_columns = ["gene_id", "transcript_id"]
            if options.only_attributes:
                header = id_columns + attributes
            else:
                header = fixed_columns + id_columns + attributes
        else:
            if options.only_attributes:
                header = attributes
            else:
                header = fixed_columns + attributes

        options.stdout.write("\t".join(header) + "\n")

        if options.is_gtf:
            for gtf in data:
                values = []
                for column in header:
                    try:
                        values.append(getattr(gtf, column))
                    except (AttributeError, KeyError):
                        # record lacks this attribute: leave the cell empty
                        values.append("")
                options.stdout.write(
                    "\t".join(str(v) for v in values) + "\n")
        else:
            for gff in data:
                values = []
                # Only emit the fixed columns when the header lists them,
                # so that rows always line up with the header.
                if not options.only_attributes:
                    values.extend(
                        getattr(gff, column) for column in fixed_columns)
                for name in attributes:
                    try:
                        values.append(gff.attributes[name])
                    except (AttributeError, KeyError):
                        values.append("")
                options.stdout.write(
                    "\t".join(str(v) for v in values) + "\n")

    elif options.invert:
        # a single entry object is reused and overwritten for every row
        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            # rstrip instead of line[:-1] so that a final line without a
            # trailing newline does not lose its last character
            data = line.rstrip("\n").split("\t")
            if not header:
                # first non-comment line is the table header
                header = data
                map_header2column = {
                    name: index for index, name in enumerate(header)}
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # NOTE(review): the original comment here mentioned
                # subtracting 1 from start for 0-based coordinates, but
                # the value is used unchanged - confirm which is intended.
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            if gtf.frame is None:
                gtf.frame = "."
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:
        # map type -> (source attribute, target attribute, header line)
        map_specs = {
            "transcript2gene": ("transcript_id", "gene_id",
                                "transcript_id\tgene_id\n"),
            "peptide2gene": ("protein_id", "gene_id",
                             "peptide_id\tgene_id\n"),
            "peptide2transcript": ("protein_id", "transcript_id",
                                   "peptide_id\ttranscript_id\n"),
        }
        from_attr, to_attr, map_header = map_specs[options.output_map]
        options.stdout.write(map_header)

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                target = getattr(gtf, to_attr)
                map_fr2to[getattr(gtf, from_attr)] = target
            except (AttributeError, KeyError):
                # records lacking either identifier are skipped
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))

    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id", "transcript_id",
                  "attributes")
        options.stdout.write("\t".join(header) + "\n")

        # NOTE(review): the header advertises separate gene_id,
        # transcript_id and attributes columns, but each row is written as
        # str(gtf). An attribute string used to be assembled here and was
        # never written, so that dead code was removed - confirm whether
        # rows are meant to follow the header layout.
        for gtf in GTF.iterator(options.stdin):
            # represent a missing frame as "."
            if gtf.frame is None:
                gtf.frame = "."
            options.stdout.write(str(gtf) + "\n")

    E.Stop()