示例#1
0
文件: gff32gtf.py 项目: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    GFF3 records are read from ``options.stdin`` and passed, chunk by
    chunk, to ``convert_hierarchy`` or ``convert_set`` depending on
    ``--method``. Without ``--by-chrom`` the whole input forms a single
    chunk; with it, chunks are whatever ``GFF3.chrom_iterator`` yields.

    With ``--read-twice`` the input is opened a second time by name, so
    the first pass does not have to be held in memory; the second handle
    is closed once all chunks have been processed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice", action="store",
                      choices=(
                          "hierarchy", "set-field", "set-pattern", "set-none"),
                      help="Method to use for conversion")
    parser.add_option("-g", "--gene-type", dest="gene_type", type="string",
                      help="feature type to get gene_id from if possible [%default]")
    parser.add_option("-t", "--transcript-type", dest="transcript_type", type="string",
                      help="feature type to get transcript_id from if possible [%default]")
    parser.add_option("-d", "--no-discard", dest="discard", action="store_false",
                      help="Do not discard feature types specified by GENE_TYPE and TRANSCRIPT_TYPE")
    parser.add_option("--gene-id", dest="gene_field_or_pattern", type="string",
                      help="Either field or pattern for the gene_id [%default]")
    parser.add_option("--transcript-id", dest="transcript_field_or_pattern", type="string",
                      help="Either field or pattern for the transcript_id [%default]")
    # NOTE: adjacent string literals are joined verbatim, so every
    # fragment of a multi-line help text carries its own trailing space.
    parser.add_option("--parent-field", dest="parent", type="string",
                      help="field that specifies the parent relationship. Currently only "
                      "if left as Parent will features with multiple parents be parsed "
                      "correctly")
    parser.add_option("--read-twice", dest="read_twice", action="store_true",
                      help="Instead of holding the whole file in memory, read once for parsing the "
                      "hierarchy, and then again for actually doing the conversion. Means a real file "
                      "and not a pipe must be provided.")
    parser.add_option("--by-chrom", dest="by_chrom", action="store_true",
                      help="Parse input file one chromosome at a time. Reduces memory usage, "
                      "but input must be sorted by chromosome and features may not split across "
                      "multiple chromosomes")
    parser.add_option("--fail-missing-gene", dest="missing_gene", action="store_false",
                      help="Fail if no feature of type GENE_TYPE is found instead of "
                      "defaulting to highest object in hierarchy")

    parser.set_defaults(
        method="hierarchy",
        gene_type="gene",
        transcript_type="mRNA",
        discard=True,
        gene_field_or_pattern="ID",
        transcript_field_or_pattern="ID",
        read_twice=False,
        by_chrom=False,
        missing_gene=True,
        parent="Parent"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gffs = GFF3.flat_file_iterator(options.stdin)

    if options.by_chrom:
        gffs = GFF3.chrom_iterator(gffs)
    else:
        gffs = [gffs]

    # running early so that fails early if configuration is wrong
    second_handle = None
    if options.read_twice:
        # Will throw IOError if options.stdin is not a normal file
        second_handle = IOTools.openFile(options.stdin.name)
        second_gff = GFF3.flat_file_iterator(second_handle)

        if options.by_chrom:
            second_gff = GFF3.chrom_iterator(second_gff)
        else:
            second_gff = iter([second_gff])
    else:
        second_gff = None

    try:
        for chunk in gffs:

            if options.read_twice:
                # builtin next() works on both Python 2.6+ and Python 3,
                # unlike the Python-2-only iterator method .next()
                second_gff_chunk = next(second_gff)
            else:
                # a single pass has to serve both reads, so hold the
                # chunk in memory and reuse it
                chunk = list(chunk)
                second_gff_chunk = chunk

            if options.method == "hierarchy":
                convert_hierarchy(chunk, second_gff_chunk, options)
            elif options.method == "set-field":
                # turn a bare field name into a "%(field)s" pattern
                gene_id_pattern = "%%(%s)s" % options.gene_field_or_pattern
                transcript_id_pattern = "%%(%s)s" % options.transcript_field_or_pattern
                convert_set(chunk, gene_id_pattern, transcript_id_pattern, options)
            elif options.method == "set-pattern":
                convert_set(chunk, options.gene_field_or_pattern,
                            options.transcript_field_or_pattern, options)
            elif options.method == "set-none":
                convert_set(chunk, None, None, options)
    finally:
        # release the extra handle opened for --read-twice
        if second_handle is not None:
            second_handle.close()

    # write footer and output benchmark information.
    E.Stop()
示例#2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    GFF3 records are read from ``options.stdin`` and passed, chunk by
    chunk, to ``convert_hierarchy`` or ``convert_set`` depending on
    ``--method``. Without ``--by-chrom`` the whole input forms a single
    chunk; with it, chunks are whatever ``GFF3.chrom_iterator`` yields.

    With ``--read-twice`` the input is opened a second time by name, so
    the first pass does not have to be held in memory; the second handle
    is closed once all chunks have been processed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      action="store",
                      choices=("hierarchy", "set-field", "set-pattern",
                               "set-none"),
                      help="Method to use for conversion")

    parser.add_option(
        "-g",
        "--gene-type",
        dest="gene_type",
        type="string",
        help="feature type to get gene_id from if possible [%default]")

    parser.add_option(
        "-t",
        "--transcript-type",
        dest="transcript_type",
        type="string",
        help="feature type to get transcript_id from if possible [%default]")

    parser.add_option(
        "-d",
        "--no-discard",
        dest="discard",
        action="store_false",
        help=
        "Do not discard feature types specified by GENE_TYPE and TRANSCRIPT_TYPE"
    )

    parser.add_option(
        "--gene-id",
        dest="gene_field_or_pattern",
        type="string",
        help="Either field or pattern for the gene_id [%default]")

    parser.add_option(
        "--transcript-id",
        dest="transcript_field_or_pattern",
        type="string",
        help="Either field or pattern for the transcript_id [%default]")

    # NOTE: adjacent string literals are joined verbatim, so every
    # fragment of a multi-line help text carries its own trailing space.
    parser.add_option(
        "--parent-field",
        dest="parent",
        type="string",
        help="field that specifies the parent relationship. Currently only "
        "if left as Parent will features with multiple parents be parsed "
        "correctly")

    parser.add_option(
        "--read-twice",
        dest="read_twice",
        action="store_true",
        help=
        "Instead of holding the whole file in memory, read once for parsing the "
        "hierarchy, and then again for actually doing the conversion. Means a real file "
        "and not a pipe must be provided.")

    parser.add_option(
        "--by-chrom",
        dest="by_chrom",
        action="store_true",
        help="Parse input file one chromosome at a time. Reduces memory usage, "
        "but input must be sorted by chromosome and features may not split across "
        "multiple chromosomes")

    parser.add_option(
        "--fail-missing-gene",
        dest="missing_gene",
        action="store_false",
        help="Fail if no feature of type GENE_TYPE is found instead of "
        "defaulting to highest object in hierarchy")

    parser.set_defaults(method="hierarchy",
                        gene_type="gene",
                        transcript_type="mRNA",
                        discard=True,
                        gene_field_or_pattern="ID",
                        transcript_field_or_pattern="ID",
                        read_twice=False,
                        by_chrom=False,
                        missing_gene=True,
                        parent="Parent")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gffs = GFF3.flat_file_iterator(options.stdin)

    if options.by_chrom:
        gffs = GFF3.chrom_iterator(gffs)
    else:
        gffs = [gffs]

    # running early so that fails early if configuration is wrong
    second_handle = None
    if options.read_twice:
        # Will throw IOError if options.stdin is not a normal file
        second_handle = IOTools.openFile(options.stdin.name)
        second_gff = GFF3.flat_file_iterator(second_handle)

        if options.by_chrom:
            second_gff = GFF3.chrom_iterator(second_gff)
        else:
            second_gff = iter([second_gff])
    else:
        second_gff = None

    try:
        for chunk in gffs:

            if options.read_twice:
                second_gff_chunk = next(second_gff)
            else:
                # a single pass has to serve both reads, so hold the
                # chunk in memory and reuse it
                chunk = list(chunk)
                second_gff_chunk = chunk

            if options.method == "hierarchy":
                convert_hierarchy(chunk, second_gff_chunk, options)
            elif options.method == "set-field":
                # turn a bare field name into a "%(field)s" pattern
                gene_id_pattern = "%%(%s)s" % options.gene_field_or_pattern
                transcript_id_pattern = "%%(%s)s" % options.transcript_field_or_pattern
                convert_set(chunk, gene_id_pattern, transcript_id_pattern,
                            options)
            elif options.method == "set-pattern":
                convert_set(chunk, options.gene_field_or_pattern,
                            options.transcript_field_or_pattern, options)
            elif options.method == "set-none":
                convert_set(chunk, None, None, options)
    finally:
        # release the extra handle opened for --read-twice
        if second_handle is not None:
            second_handle.close()

    # write footer and output benchmark information.
    E.Stop()
示例#3
0
def main(argv=None):
    '''
    main function

    parses command line options in sys.argv, unless *argv* is given,
    and then runs exactly one of four modes, checked in this order:

    * ``--attributes-as-columns``: load all records, then write a table
      with one column per attribute seen in the input.
    * ``--invert``: read a tab-separated table with a header line and
      write each row back out via ``str()`` of a ``GTF.Entry``.
    * ``--output-map``: write a sorted two-column mapping between two
      identifier attributes of the gtf records.
    * otherwise: write a header line followed by ``str()`` of each gtf
      record.
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output-only-attributes", dest="only_attributes",
        action="store_true",
        help="output only attributes as separate columns "
        "[default=%default].")

    parser.add_option(
        "-f", "--attributes-as-columns", dest="output_full",
        action="store_true",
        help="output attributes as separate columns "
        "[default=%default].")

    # the flag switches is_gtf off, so the help describes gff3 input
    parser.add_option("--is-gff3", dest="is_gtf", action="store_false",
                      help="input file is in gff3 instead of gtf format "
                      "[default: gtf] ")

    parser.add_option(
        "-i", "--invert", dest="invert", action="store_true",
        help="convert tab-separated table back to gtf "
        "[default=%default].")

    parser.add_option(
        "-m", "--output-map", dest="output_map", type="choice",
        choices=(
            "transcript2gene",
            "peptide2gene",
            "peptide2transcript"),
        help="output a map mapping transcripts to genes "
        "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        is_gtf=True
    )

    (options, args) = E.Start(parser, argv=argv)

    if options.output_full:
        # output full table with column for each attribute

        # first pass: keep every record and collect all attribute names
        attributes = set()
        data = []
        if options.is_gtf:
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes.update(gtf.keys())
        else:
            for gff in GFF3.iterator_from_gff(options.stdin):
                data.append(gff)
                attributes.update(gff.attributes)

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])
        attributes = sorted(attributes)

        # The fixed positional columns are dropped as a whole when only
        # attributes are requested -- for the header AND for the rows,
        # so that both always have the same number of fields.
        if options.only_attributes:
            fixed_columns = []
        else:
            fixed_columns = ["contig", "source", "feature",
                             "start", "end", "score", "strand",
                             "frame"]

        # gtf output additionally carries gene_id/transcript_id columns
        if options.is_gtf:
            header = fixed_columns + ["gene_id", "transcript_id"] + attributes
        else:
            header = fixed_columns + attributes

        options.stdout.write("\t".join(header) + "\n")

        if options.is_gtf:
            for gtf in data:
                row = []
                for column in header:
                    # a record lacking the attribute gets an empty cell
                    try:
                        val = getattr(gtf, column)
                    except (AttributeError, KeyError):
                        val = ""
                    row.append("%s" % (val,))
                options.stdout.write("\t".join(row) + "\n")
        else:
            for gff in data:
                row = ["%s" % (getattr(gff, column),)
                       for column in fixed_columns]
                for a in attributes:
                    # a record lacking the attribute gets an empty cell
                    try:
                        val = gff.attributes[a]
                    except (AttributeError, KeyError):
                        val = ''
                    row.append("%s" % (val,))
                options.stdout.write("\t".join(row) + "\n")

    elif options.invert:

        # a single entry object is refilled and printed for every row
        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            # strip only the line terminator; slicing off the last
            # character would eat data on a final unterminated line
            data = line.rstrip("\n").split("\t")
            if not header:
                # first non-comment line names the columns
                header = data
                map_header2column = {
                    name: index for index, name in enumerate(header)}
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # NOTE(review): start/end are taken over unchanged; no
                # conversion between 0- and 1-based coordinates is
                # applied here -- confirm this matches what str(gtf)
                # expects.
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            if gtf.frame is None:
                gtf.frame = "."
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:

        # map type -> (source attribute, target attribute, header line)
        map_specs = {
            "transcript2gene": (
                "transcript_id", "gene_id", "transcript_id\tgene_id\n"),
            "peptide2gene": (
                "protein_id", "gene_id", "peptide_id\tgene_id\n"),
            "peptide2transcript": (
                "protein_id", "transcript_id", "peptide_id\ttranscript_id\n"),
        }
        fr_attr, to_attr, map_header = map_specs[options.output_map]
        options.stdout.write(map_header)

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                map_fr2to[getattr(gtf, fr_attr)] = getattr(gtf, to_attr)
            except (AttributeError, KeyError):
                # records without one of the two identifiers are
                # deliberately left out of the map
                continue

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))
    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id", "transcript_id", "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):
            # NOTE(review): this attribute string is assembled but never
            # written -- the row below is str(gtf). Whether str(gtf)
            # yields the columns announced in the header cannot be told
            # from this file; confirm against the GTF module.
            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            attributes = "; ".join(attributes)

            # Capture if None and set to . format
            if gtf.frame is None:
                gtf.frame = "."

            options.stdout.write(str(gtf) + "\n")

    E.Stop()