Exemplo n.º 1
0
Arquivo: qc.py Projeto: arvin580/jcvi
def nmd(args):
    """
    %prog nmd gffile

    Identify transcript variants which might be candidates for nonsense
    mediated decay (NMD)

    A transcript is considered to be a candidate for NMD when the CDS stop
    codon is located more than 50nt upstream of terminal splice site donor

    References:
    http://www.nature.com/horizon/rna/highlights/figures/s2_spec1_f3.html
    http://www.biomedcentral.com/1741-7007/7/23/figure/F1
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim.
    import __builtin__
    from jcvi.utils.cbook import enumerate_reversed

    p = OptionParser(nmd.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    fw = must_open(opts.outfile, "w")
    for gene in gff.features_of_type('gene', order_by=('seqid', 'start')):
        # Exons are sorted by start coordinate. On the minus strand they are
        # scanned from index 0 upwards; otherwise enumerate_reversed is used
        # (presumably yielding (index, item) pairs from the last exon back).
        _enumerate = __builtin__.enumerate if gene.strand == "-" else enumerate_reversed
        for mrna in gff.children(gene, featuretype='mRNA', order_by=('start')):
            tracker = dict()
            tracker['exon'] = list(gff.children(mrna, featuretype='exon', order_by=('start')))
            tracker['cds'] = [None] * len(tracker['exon'])

            # Find the first exon (in scan order) holding a CDS of this mRNA
            tcds_pos = None
            for i, exon in _enumerate(tracker['exon']):
                for cds in gff.region(region=exon, featuretype='CDS', completely_within=True):
                    if mrna.id in cds['Parent']:
                        tracker['cds'][i] = cds
                        tcds_pos = i
                        break
                # Index 0 is a valid hit, so compare against None instead of
                # relying on truthiness (which would keep scanning past exon 0)
                if tcds_pos is not None:
                    break

            NMD, distance = False, 0
            # Only evaluate when a CDS was found and at least one more exon
            # follows it in scan direction; transcripts without any CDS are
            # reported with distance 0 and NMD False instead of crashing.
            has_downstream_exon = tcds_pos is not None and (
                (mrna.strand == "+" and tcds_pos + 1 < len(tracker['exon']))
                or (mrna.strand == "-" and tcds_pos - 1 >= 0))
            if has_downstream_exon:
                tcds = tracker['cds'][tcds_pos]
                texon = tracker['exon'][tcds_pos]

                # Distance between the CDS boundary and the boundary of the
                # exon that contains it, both taken on the scan-direction side
                PTC = tcds.end if mrna.strand == '+' else tcds.start
                TDSS = texon.end if mrna.strand == '+' else texon.start
                distance = abs(TDSS - PTC)
                NMD = distance > 50

            # Columns: gene id, mRNA id, children_bp over CDS, distance, NMD flag
            print >> fw, "\t".join(str(x) for x in (gene.id, mrna.id, \
                gff.children_bp(mrna, child_featuretype='CDS'), distance, NMD))

    fw.close()
Exemplo n.º 2
0
def nmd(args):
    """
    %prog nmd gffile

    Identify transcript variants which might be candidates for nonsense
    mediated decay (NMD)

    A transcript is considered to be a candidate for NMD when the CDS stop
    codon is located more than 50nt upstream of terminal splice site donor

    References:
    http://www.nature.com/horizon/rna/highlights/figures/s2_spec1_f3.html
    http://www.biomedcentral.com/1741-7007/7/23/figure/F1
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim.
    import __builtin__
    from jcvi.utils.cbook import enumerate_reversed

    p = OptionParser(nmd.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    fw = must_open(opts.outfile, "w")
    for gene in gff.features_of_type('gene', order_by=('seqid', 'start')):
        # Exons are sorted by start coordinate. On the minus strand they are
        # scanned from index 0 upwards; otherwise enumerate_reversed is used
        # (presumably yielding (index, item) pairs from the last exon back).
        _enumerate = __builtin__.enumerate if gene.strand == "-" else enumerate_reversed
        for mrna in gff.children(gene, featuretype='mRNA', order_by=('start')):
            tracker = dict()
            tracker['exon'] = list(gff.children(mrna, featuretype='exon', order_by=('start')))
            tracker['cds'] = [None] * len(tracker['exon'])

            # Find the first exon (in scan order) holding a CDS of this mRNA
            tcds_pos = None
            for i, exon in _enumerate(tracker['exon']):
                for cds in gff.region(region=exon, featuretype='CDS', completely_within=True):
                    if mrna.id in cds['Parent']:
                        tracker['cds'][i] = cds
                        tcds_pos = i
                        break
                # Index 0 is a valid hit, so compare against None instead of
                # relying on truthiness (which would keep scanning past exon 0)
                if tcds_pos is not None:
                    break

            NMD, distance = False, 0
            # Only evaluate when a CDS was found and at least one more exon
            # follows it in scan direction; transcripts without any CDS are
            # reported with distance 0 and NMD False instead of crashing.
            has_downstream_exon = tcds_pos is not None and (
                (mrna.strand == "+" and tcds_pos + 1 < len(tracker['exon']))
                or (mrna.strand == "-" and tcds_pos - 1 >= 0))
            if has_downstream_exon:
                tcds = tracker['cds'][tcds_pos]
                texon = tracker['exon'][tcds_pos]

                # Distance between the CDS boundary and the boundary of the
                # exon that contains it, both taken on the scan-direction side
                PTC = tcds.end if mrna.strand == '+' else tcds.start
                TDSS = texon.end if mrna.strand == '+' else texon.start
                distance = abs(TDSS - PTC)
                NMD = distance > 50

            # Columns: gene id, mRNA id, children_bp over CDS, distance, NMD flag
            print >> fw, "\t".join(str(x) for x in (gene.id, mrna.id, \
                gff.children_bp(mrna, child_featuretype='CDS'), distance, NMD))

    fw.close()
Exemplo n.º 3
0
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(trimUTR.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    # The indexed view answers CDS-extent queries; the Gff reader streams the
    # records in file order for rewriting.
    g = make_index(gffile)
    gff = Gff(gffile)
    # mRNA accession -> (start, end) range obtained from get_cds_minmax
    mRNA_register = {}
    fw = must_open(opts.outfile, "w")
    for c in gff:
        cid, ctype = c.accn, c.type
        if ctype == "gene":
            start, end = get_cds_minmax(g, cid)
            trim(c, start, end)
        elif ctype == "mRNA":
            start, end = get_cds_minmax(g, cid, level=1)
            trim(c, start, end)
            mRNA_register[cid] = (start, end)
        elif ctype != "CDS":
            # Any other non-CDS feature reuses the range recorded for its
            # parent mRNA (which must have been seen earlier in the file)
            start, end = mRNA_register[c.parent]
            trim(c, start, end)
        if c.start > c.end:
            # Inverted coordinates after trimming: report and drop the feature
            print >> sys.stderr, cid, \
                    "destroyed [{0} > {1}]".format(c.start, c.end)
        else:
            print >> fw, c

    # Release the output handle, as the other commands in this file do
    fw.close()
Exemplo n.º 4
0
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(trimUTR.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    # The indexed view answers CDS-extent queries; the Gff reader streams the
    # records in file order for rewriting.
    g = make_index(gffile)
    gff = Gff(gffile)
    # mRNA accession -> (start, end) range obtained from get_cds_minmax
    mRNA_register = {}
    fw = must_open(opts.outfile, "w")
    for c in gff:
        cid, ctype = c.accn, c.type
        if ctype == "gene":
            start, end = get_cds_minmax(g, cid)
            trim(c, start, end)
        elif ctype == "mRNA":
            start, end = get_cds_minmax(g, cid, level=1)
            trim(c, start, end)
            mRNA_register[cid] = (start, end)
        elif ctype != "CDS":
            # Any other non-CDS feature reuses the range recorded for its
            # parent mRNA (which must have been seen earlier in the file)
            start, end = mRNA_register[c.parent]
            trim(c, start, end)
        if c.start > c.end:
            # Inverted coordinates after trimming: report and drop the feature
            print >> sys.stderr, cid, \
                    "destroyed [{0} > {1}]".format(c.start, c.end)
        else:
            print >> fw, c

    # Release the output handle, as the other commands in this file do
    fw.close()
Exemplo n.º 5
0
def stats(args):
    """
    %prog stats infile.gff

    Collect gene statistics based on gff file. There are some terminology issues
    here and so normally we call "gene" are actually mRNA, and sometimes "exon"
    are actually CDS, but they are configurable.

    Thee numbers are written to text file in four separate folders,
    corresponding to the four metrics:

    Exon length, Intron length, Gene length, Exon count

    With data written to disk then you can run %prog histogram
    """
    # The docstring doubles as the usage text handed to OptionParser.
    p = OptionParser(stats.__doc__)
    p.add_option("--gene", default="mRNA",
                 help="The gene type [default: %default]")
    p.add_option("--exon", default="CDS",
                 help="The exon type [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gff_file,) = args
    g = make_index(gff_file)

    # One accumulator per reported metric
    exon_lengths, intron_lengths, gene_lengths, exon_counts = [], [], [], []
    for feat in g.features_of_type(opts.gene):
        # Direct children of the requested "exon" type, as (chrom, start, stop)
        exons = [(child.chrom, child.start, child.stop)
                 for child in g.children(feat.id, 1)
                 if child.featuretype == opts.exon]
        introns = range_interleave(exons)
        per_exon = [stop - start + 1 for (_, start, stop) in exons]
        per_intron = [stop - start + 1 for (_, start, stop) in introns]
        exon_lengths.extend(per_exon)
        intron_lengths.extend(per_intron)
        # "Gene length" here is the summed length of the selected exons
        gene_lengths.append(sum(per_exon))
        exon_counts.append(len(per_exon))

    summaries = tuple(
        SummaryStats(values)
        for values in (exon_lengths, intron_lengths, gene_lengths, exon_counts))
    for summ, title in zip(summaries, metrics):
        summ.title = title
        print(summ, file=sys.stderr)

    # Output file name is the input name up to its first dot
    prefix = gff_file.partition(".")[0]
    for summ in summaries:
        # Each metric is written into a folder named after its title
        dirname = summ.title
        mkdir(dirname)
        summ.tofile(op.join(dirname, prefix + ".txt"))
Exemplo n.º 6
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring doubles as the usage text handed to OptionParser.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Look up one interval in the reference FASTA
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per category so GC content can be computed later
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in g.features_of_type("gene"):
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))
        # Distinct exon intervals among descendants up to two levels down
        exons = list(set((c.chrom, c.start, c.stop)
                         for c in g.children(gene.id, 2)
                         if c.featuretype == "exon"))
        exonseqs.extend(fetch(chrom, start, stop)
                        for chrom, start, stop in exons)
        introns = range_interleave(exons)
        intronseqs.extend(fetch(chrom, start, stop)
                          for chrom, start, stop in introns)

    r = {}  # Report, keyed by (category, metric)
    categories = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for label, seqs in categories:
        lengths = [len(seq) for seq in seqs]
        st = SummaryStats(lengths, dtype="int")
        r[(label, "Number")] = st.size
        r[(label, "Average size (bp)")] = st.mean
        r[(label, "Median size (bp)")] = st.median
        r[(label, "Total length (Mb)")] = human_size(
            st.sum, precision=0, target="Mb")
        r[(label, "% of genome")] = percentage(
            st.sum, s.totalsize, precision=0, mode=-1)
        r[(label, "% GC")] = gc(seqs)

    print >> sys.stderr, tabulate(r)
Exemplo n.º 7
0
def batcheval(args):
    """
    %prog batcheval model.ids gff_file evidences.bed fastafile

    Get the accuracy for a list of models against evidences in the range of the
    genes. For example:

    $ %prog batcheval all.gff3 isoforms.ids proteins.bed scaffolds.fasta

    Outfile contains the scores for the models can be found in models.scores
    """
    from jcvi.formats.bed import evaluate
    from jcvi.formats.gff import make_index

    # Use this command's own docstring as usage text; the previous code
    # passed evaluate.__doc__, so --help described the wrong command.
    p = OptionParser(batcheval.__doc__)
    p.add_option(
        "--type",
        default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    model_ids, gff_file, evidences_bed, fastafile = args
    # Renamed from `type` to avoid shadowing the builtin
    feature_types = set(opts.type.split(","))

    g = make_index(gff_file)
    prefix = model_ids.rsplit(".", 1)[0]
    # Scratch BED file, rewritten for every model
    cidbed = prefix + ".bed"

    # Context managers make sure both files are closed even if evaluate fails
    with open(model_ids) as fp, open(prefix + ".scores", "w") as fwscores:
        for row in fp:
            cid = row.strip()
            if not cid:
                # Skip blank lines (e.g. a trailing newline in the ids file)
                continue

            # The first direct parent provides the query region
            b = next(g.parents(cid, 1))
            query = "{0}:{1}-{2}".format(b.chrom, b.start, b.stop)

            with open(cidbed, "w") as fw:
                for c in g.children(cid, 1):
                    if c.featuretype in feature_types:
                        fw.write(c.to_bed())

            b = evaluate(
                [cidbed, evidences_bed, fastafile, "--query={0}".format(query)])
            print("\t".join((cid, b.score)), file=fwscores)
            # Flush per model so partial results survive an interrupted run
            fwscores.flush()
Exemplo n.º 8
0
def batcheval(args):
    """
    %prog batcheval model.ids gff_file evidences.bed fastafile

    Get the accuracy for a list of models against evidences in the range of the
    genes. For example:

    $ %prog batcheval all.gff3 isoforms.ids proteins.bed scaffolds.fasta

    Outfile contains the scores for the models can be found in models.scores
    """
    from jcvi.formats.bed import evaluate
    from jcvi.formats.gff import make_index

    # Use this command's own docstring as usage text; the previous code
    # passed evaluate.__doc__, so --help described the wrong command.
    p = OptionParser(batcheval.__doc__)
    p.add_option("--type", default="CDS",
            help="list of features to extract, use comma to separate (e.g."
            "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    model_ids, gff_file, evidences_bed, fastafile = args
    # Renamed from `type` to avoid shadowing the builtin
    feature_types = set(opts.type.split(","))

    g = make_index(gff_file)
    prefix = model_ids.rsplit(".", 1)[0]
    # Scratch BED file, rewritten for every model
    cidbed = prefix + ".bed"

    # Context managers make sure both files are closed even if evaluate fails
    with open(model_ids) as fp, open(prefix + ".scores", "w") as fwscores:
        for row in fp:
            cid = row.strip()
            if not cid:
                # Skip blank lines (e.g. a trailing newline in the ids file)
                continue

            # The first direct parent provides the query region; the builtin
            # next() replaces the iterator's .next() method call
            b = next(g.parents(cid, 1))
            query = "{0}:{1}-{2}".format(b.chrom, b.start, b.stop)

            with open(cidbed, "w") as fw:
                for c in g.children(cid, 1):
                    if c.featuretype in feature_types:
                        fw.write(c.to_bed())

            b = evaluate([cidbed, evidences_bed, fastafile, "--query={0}".format(query)])
            print >> fwscores, "\t".join((cid, b.score))
            # Flush per model so partial results survive an interrupted run
            fwscores.flush()
Exemplo n.º 9
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring doubles as the usage text handed to OptionParser.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Look up one interval in the reference FASTA
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per category so GC content can be computed later
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in g.features_of_type("gene"):
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))
        # Distinct exon intervals among descendants up to two levels down
        exons = list(set((c.chrom, c.start, c.stop)
                         for c in g.children(gene.id, 2)
                         if c.featuretype == "exon"))
        exonseqs.extend(fetch(chrom, start, stop)
                        for chrom, start, stop in exons)
        introns = range_interleave(exons)
        intronseqs.extend(fetch(chrom, start, stop)
                          for chrom, start, stop in introns)

    r = {}  # Report, keyed by (category, metric)
    categories = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for label, seqs in categories:
        lengths = [len(seq) for seq in seqs]
        st = SummaryStats(lengths, dtype="int")
        r[(label, "Number")] = st.size
        r[(label, "Average size (bp)")] = st.mean
        r[(label, "Median size (bp)")] = st.median
        r[(label, "Total length (Mb)")] = human_size(
            st.sum, precision=0, target="Mb")
        r[(label, "% of genome")] = percentage(
            st.sum, s.totalsize, precision=0, mode=-1)
        r[(label, "% GC")] = gc(seqs)

    print(tabulate(r), file=sys.stderr)
Exemplo n.º 10
0
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    # Per-transcript exon totals are cached in a side file with columns
    # (mRNA id, summed exon length, group label); it is only rewritten when
    # need_update(gff_file, tf) is true.
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        # `with` guarantees the cache file is closed even if iteration fails
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                conf_class = feat.attributes.get(gb, "all")
                tsize = sum((c.stop - c.start + 1) for c in g.children(fid, 1) \
                                 if c.featuretype == "exon")
                print >> fw, "\t".join((fid, str(tsize), conf_class))

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1) \
                         if c.featuretype == "mRNA"]
        if not transcripts:
            # Without an mRNA there is no group label to file the gene under;
            # previously this raised IndexError on transcripts[0]
            logging.debug("Gene {0} has no mRNA children, skipped.".format(fid))
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2) \
                         if c.featuretype == "exon")
        # A gene is grouped by the label of its first transcript
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    def mean(total, count):
        # Float mean that yields 0.0 for an empty denominator instead of
        # raising ZeroDivisionError
        return total * 1. / count if count else 0.

    r = {}  # Report, keyed by (metric, group)
    distinct_groups = set(conf_classes.values())
    # Loop variable is named `group` so it no longer shadows the GFF index `g`
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if num_genes == 0:
            # A label that only occurs on non-first transcripts has no genes
            # filed under it; nothing to report for it
            continue

        mean_num_exons = mean(num_exons, num_genes)
        mean_num_transcripts = mean(num_transcripts, num_genes)
        mean_locus_size = mean(cum_locus_size, num_genes)
        mean_transcript_size = mean(cum_transcript_size, num_transcripts)
        mean_exon_size = mean(cum_exon_size, num_exons)

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = mean_num_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    print >> sys.stderr, tabulate(r)
Exemplo n.º 11
0
def reindex(args):
    """
    %prog reindex gffile pep.fasta ref.pep.fasta

    Reindex the splice isoforms (mRNA) in input GFF file, preferably
    generated after PASA annotation update

    In the input GFF file, there can be several types of mRNA within a locus:
    * CDS matches reference, UTR extended, inherits reference mRNA ID
    * CDS (slightly) different from reference, inherits reference mRNA ID
    * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2"
    * Multiple mRNA collapsed due to shared structure, have IDs like "LOCUS.1-LOCUS.1.1"

    In the case of multiple mRNA which have inherited the same reference mRNA ID,
    break ties by comparing the new protein with the reference protein using
    EMBOSS `needle` to decide which mRNA retains ID and which is assigned a new ID.

    All mRNA identifiers should follow the AGI naming conventions.

    When reindexing the isoform identifiers, order mRNA based on:
    * decreasing transcript length
    * decreasing support from multiple input datasets used to run pasa.consolidate()
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from jcvi.formats.gff import make_index
    from jcvi.formats.fasta import Fasta
    from jcvi.apps.emboss import needle
    from jcvi.formats.base import FileShredder
    from tempfile import mkstemp

    p = OptionParser(reindex.__doc__)
    p.add_option("--scores", type="str", \
        help="read from existing EMBOSS `needle` scores file")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, pep, refpep, = args
    gffdb = make_index(gffile)
    reffasta = Fasta(refpep)

    # Without a precomputed scores file, collect (reference ID, candidate
    # mRNA ID) pairs in a temporary file created in the current directory.
    # NOTE(review): the OS-level descriptor `fh` returned by mkstemp is never
    # closed in this function.
    if not opts.scores:
        fh, pairsfile = mkstemp(prefix='pairs', suffix=".txt", dir=".")
        fw = must_open(pairsfile, "w")

    # conflict[locus][iso] -> list of mRNA tuples competing for isoform `iso`
    # novel[locus]         -> list of mRNA tuples that will get new numbers
    # Every tuple is (mRNA id, iso or None, newiso, start, exon bp, len(prefix)).
    conflict, novel = AutoVivification(), {}
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        # presumably the locus portion of an AGI-style identifier — see atg_name
        geneid = atg_name(gene.id, retval='locus')
        novel[geneid] = []
        updated_mrna, hybrid_mrna = [], []
        for mrna in gffdb.children(gene, featuretype='mRNA', order_by=('seqid', 'start')):
            # Only IDs matching atg_name_pat and free of underscores are handled
            if re.match(atg_name_pat, mrna.id) is not None and "_" not in mrna.id:
                pf, mrnaid = parse_prefix(mrna.id)
                mlen = gffdb.children_bp(mrna, child_featuretype='exon')
                # IDs containing "-" are tracked separately as hybrid mRNAs
                # (the docstring's "collapsed" LOCUS.1-LOCUS.1.1 form)
                if "-" in mrna.id:
                    hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf)))
                else:
                    updated_mrna.append((mrna.id, mrna.start, mlen, len(pf)))

        # Process by ascending start, then descending exon bp and prefix length
        for mrna in sorted(updated_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            iso = atg_name(mrnaid, retval='iso')
            # newiso = iso followed by whatever is left of the ID once the
            # atg_name_pat match is removed; equal values mean nothing is left
            newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid))
            if iso == newiso:
                # ID has the plain reference form: candidate for that isoform
                if iso not in conflict[geneid]:
                    conflict[geneid][iso] = []
                conflict[geneid][iso].append((mrna[0], iso, newiso, \
                    mstart, mlen, len(pf)))
            else:
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        for mrna in sorted(hybrid_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            # Evaluate each "-"-separated member ID of the hybrid individually
            _iso, _newiso = [], []
            for id in sorted(mrnaid.split("-")):
                a = atg_name(id, retval='iso')
                b = "{0}{1}".format(a, re.sub(atg_name_pat, "", id))
                _iso.append(a)
                _newiso.append(b)

            # _novel stays None only if the hybrid claimed an isoform slot
            # below (or if it had no members to inspect)
            _novel = None
            newiso = "-".join(str(x) for x in set(_newiso))
            for iso, niso in zip(_iso, _newiso):
                if iso == niso:
                    # A hybrid only claims an isoform that has no candidates yet
                    if iso not in conflict[geneid]:
                        conflict[geneid][iso] = \
                            [(mrna[0], iso, newiso, mstart, mlen, len(pf))]
                        _novel = None
                        break

                _novel = True

            if _novel is not None:
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        if not opts.scores:
            # Emit one pair per candidate, but only for isoforms whose
            # "<locus>.<iso>" ID exists in the reference peptide FASTA
            for isoform in sorted(conflict[geneid]):
                mrnaid = "{0}.{1}".format(geneid, isoform)
                if mrnaid in reffasta.keys():
                    for mrna in conflict[geneid][isoform]:
                        print >> fw, "\t".join(str(x) for x in (mrnaid, mrna[0]))

    scoresfile = None
    if not opts.scores:
        fw.close()
        # Run needle on the pairs, then remove the temporary pairs file
        needle([pairsfile, refpep, pep])
        FileShredder([pairsfile], verbose=False)
        # NOTE(review): rsplit(".") without maxsplit splits on every dot, so
        # [0] is the text before the FIRST dot of the path — confirm this
        # matches the name of the scores file needle produces.
        scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0])
    else:
        scoresfile = opts.scores

    scores = read_scores(scoresfile, sort=True, trimsuffix=False)

    # primary[locus] -> tuples that keep their reference isoform number
    primary = {}
    for geneid in conflict:
        primary[geneid] = []
        for iso in sorted(conflict[geneid]):
            conflict[geneid][iso].sort(key=lambda k:(k[3], -k[4], -k[5]))
            _iso = "{0}.{1}".format(geneid, iso)
            if _iso not in scores:
                # No score available: every candidate is treated as novel
                novel[geneid].extend(conflict[geneid][iso])
                continue
            # ID stored in the first score entry (presumably the best hit,
            # given sort=True above — verify against read_scores)
            top_score = scores[_iso][0][1]
            result = next((i for i, v in enumerate(conflict[geneid][iso]) if v[0] == top_score), None)
            if result is not None:
                # The matching candidate keeps the ID; the rest become novel
                primary[geneid].append(conflict[geneid][iso][result])
                del conflict[geneid][iso][result]
                if geneid not in novel:
                    novel[geneid] = []
                novel[geneid].extend(conflict[geneid][iso])
        novel[geneid].sort(key=lambda k:(k[3], -k[4], -k[5]))

    fw = must_open(opts.outfile, 'w')
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        # NOTE(review): primary/novel are keyed by atg_name(gene.id,
        # retval='locus') above but looked up by gene.id here — presumably
        # the two are identical for gene features; confirm.
        geneid = gene.id
        print >> fw, gene
        # Isoform numbers already taken by primary mRNAs of this gene
        seen = []
        if geneid in primary:
            all_mrna = primary[geneid]
            all_mrna.extend(novel[geneid])
            for iso, mrna in enumerate(all_mrna):
                _mrna = gffdb[mrna[0]]
                _iso = mrna[1]
                if mrna not in novel[geneid]:
                    seen.append(int(mrna[1]))
                else:
                    # Novel mRNAs are numbered after the highest retained
                    # isoform number, offset by their position in the list
                    mseen = 0 if len(seen) == 0 else max(seen)
                    _iso = (mseen + iso + 1) - len(seen)

                _mrnaid = "{0}.{1}".format(geneid, _iso)
                # Record the new ID and keep the previous one in _old_ID
                _mrna['ID'], _mrna['_old_ID'] = [_mrnaid], [_mrna.id]

                print >> fw, _mrna
                # Re-parent every child feature onto the renamed mRNA
                for c in gffdb.children(_mrna, order_by=('start')):
                    c['Parent'] = [_mrnaid]
                    print >> fw, c
        else:
            # Genes without reindexed mRNAs are written out unchanged
            for feat in gffdb.children(gene, order_by=('seqid', 'start')):
                print >> fw, feat

    fw.close()
Exemplo n.º 12
0
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through each locus (shared locus name or overlapping CDS)
    and identify same/different isoforms (shared splicing structure)
    across the input datasets.

    If `slop` is enabled, consolidation will collapse any variation
    in terminal UTR lengths, keeping the longest as representative.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    supported_modes = ["name", "coords"]
    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.add_option("--inferUTR", default=False, action="store_true",
            help="infer presence of UTRs from exon coordinates")
    p.add_option("--mode", default="name", choices=supported_modes,
            help="method used to determine overlapping loci")
    p.add_option("--summary", default=False, action="store_true",
            help="Generate summary table of consolidation process")
    p.add_option("--clusters", default=False, action="store_true",
            help="Generate table of cluster members after consolidation")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop
    inferUTR = opts.inferUTR
    mode = opts.mode

    if len(args) < 2:
        sys.exit(not p.print_help())

    # Database name (as returned by longest_unique_prefix) -> GFF index
    gffdbx = {}
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)

    # Group genes from all inputs into loci
    loci = Grouper()
    for dbn in gffdbx:
        odbns = [odbn for odbn in gffdbx if dbn != odbn]
        for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')):
            if mode == "name":
                # Same gene ID across inputs -> same locus; the bare ID and
                # the (id, db) tuple are joined into one group
                loci.join(gene.id, (gene.id, dbn))
            else:
                # "coords" mode: link genes in other inputs whose CDS falls
                # in this gene's CDS span on the same seqid and strand
                if (gene.id, dbn) not in loci:
                    loci.join((gene.id, dbn))
                    gene_cds = list(gffdbx[dbn].children(gene, \
                        featuretype='CDS', order_by=('start')))
                    # NOTE(review): a gene without CDS children would raise
                    # IndexError on gene_cds[0] here
                    gene_cds_start, gene_cds_stop = gene_cds[0].start, \
                        gene_cds[-1].stop
                    for odbn in odbns:
                        for ogene_cds in gffdbx[odbn].region(seqid=gene.seqid, \
                                start=gene_cds_start, end=gene_cds_stop, \
                                strand=gene.strand, featuretype='CDS'):
                            for ogene in gffdbx[odbn].parents(ogene_cds, featuretype='gene'):
                                loci.join((gene.id, dbn), (ogene.id, odbn))

    # gfeats[gene]    -> representative gene feature for the locus
    # mrna[gene][dbn] -> list of mRNA features of that locus in input `dbn`
    gfeats = {}
    mrna = AutoVivification()
    for i, locus in enumerate(loci):
        # In "coords" mode a synthetic zero-padded ID is used; in "name" mode
        # the ID of the first gene tuple met in the locus is taken
        gene = "gene_{0:0{pad}}".format(i, pad=6) \
                if mode == "coords" else None

        for elem in locus:
            # Only (gene id, db name) tuples are processed; bare IDs joined
            # in "name" mode are skipped
            if type(elem) == tuple:
                _gene, dbn = elem
                if gene is None: gene = _gene

                g = gffdbx[dbn][_gene]
                if gene not in gfeats:
                    gfeats[gene] = g
                    gfeats[gene].attributes['ID'] = [gene]
                else:
                    # Widen the stored feature to cover every member gene
                    if g.start < gfeats[gene].start:
                        gfeats[gene].start = g.start
                    if g.stop > gfeats[gene].stop:
                        gfeats[gene].stop = g.stop

                c = list(gffdbx[dbn].children(_gene, featuretype='mRNA', order_by='start'))
                if len(c) > 0:
                    mrna[gene][dbn] = c

    fw = must_open(opts.outfile, "w")
    print("##gff-version	3", file=fw)
    # Counts how often each consolidated mRNA ID was generated, so that
    # repeats get a numeric suffix
    seen = {}
    # NOTE(review): rsplit(".") without maxsplit splits on every dot, so the
    # companion file names below use the text before the FIRST dot of
    # opts.outfile — confirm that is intended.
    if opts.summary:
        summaryfile = "{0}.summary.txt".format(opts.outfile.rsplit(".")[0])
        sfw = must_open(summaryfile, "w")
        summary = ["id"]
        summary.extend(gffdbx.keys())
        print("\t".join(str(x) for x in summary), file=sfw)
    if opts.clusters:
        clustersfile = "{0}.clusters.txt".format(opts.outfile.rsplit(".")[0])
        cfw = must_open(clustersfile, "w")
        clusters = ["id", "dbns", "members", "trlens"]
        print("\t".join(str(x) for x in clusters), file=cfw)
    for gene in mrna:
        # Cluster members are (db name, mRNA id, genomic span of the mRNA)
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            # Compare every mRNA pair across every pair of inputs
            for dbn1, dbn2 in dbns:
                dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    mrna1s, mrna2s = mrna1.stop - mrna1.start + 1, \
                            mrna2.stop - mrna2.start + 1
                    # Register both as singletons first so that unmatched
                    # mRNAs still show up as their own cluster
                    g.join((dbn1, mrna1.id, mrna1s))
                    g.join((dbn2, mrna2.id, mrna2s))

                    # Merge only when the CDS matches AND every additional
                    # feature type (exon, or both UTR types) matches too
                    if match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype='CDS'):
                        res = []
                        ftypes = ['exon'] if inferUTR else ['five_prime_UTR', 'three_prime_UTR']
                        for ftype in ftypes:
                            res.append(match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype=ftype, slop=slop))

                        if all(r == True for r in res):
                            g.join((dbn1, mrna1.id, mrna1s), (dbn2, mrna2.id, mrna2s))
        else:
            # Locus present in a single input: each mRNA is its own cluster
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1))

        print(gfeats[gene], file=fw)

        for group in g:
            # Longest-span member first; it becomes the representative
            group.sort(key=lambda x: x[2], reverse=True)
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            # New ID: "<db names joined by |>|<distinct member ids joined by ->"
            dbid, _mrnaid = "|".join(str(x) for x in set(dbs)), []
            for x in mrnas:
                if x not in _mrnaid: _mrnaid.append(x)
            mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid))
            if mrnaid not in seen:
                seen[mrnaid] = 0
            else:
                # Repeated ID: append a running counter to keep it unique
                seen[mrnaid] += 1
                mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid])

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            _mrna.attributes['Parent'] = [gene]
            children = gffdbx[d].children(m, order_by='start')
            print(_mrna, file=fw)
            for child in children:
                # Child IDs get the db prefix; parents point at the new mRNA ID
                child.attributes['ID'] = ["{0}|{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print(child, file=fw)

            if opts.summary:
                # One Y/N column per input, in gffdbx iteration order
                summary = [mrnaid]
                summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
                print("\t".join(str(x) for x in summary), file=sfw)

            if opts.clusters:
                # Comma-separated db names, member ids and spans of the cluster
                clusters = [mrnaid]
                clusters.append(",".join(str(el[0]) for el in group))
                clusters.append(",".join(str(el[1]) for el in group))
                clusters.append(",".join(str(el[2]) for el in group))
                print("\t".join(str(x) for x in clusters), file=cfw)

    fw.close()
    if opts.summary: sfw.close()
    if opts.clusters: cfw.close()
Exemplo n.º 13
0
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the GFF
    #       path) is accepted, plus the options declared below.
    # Output: features are written with fprint() to the handle opened from
    #         opts.outfile. When the positional count is wrong, help is
    #         printed and the function leaves through sys.exit().
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        # The two fragments need a separating space, otherwise the help
        # text reads "trim backbased on ...".
        help="File containing gene list for UTR trim back "
        "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile, ) = args
    gff = make_index(gffile)

    # `both` is True only when neither --trim5 nor --trim3 was supplied.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # feature id -> (start, stop), read from the tab-separated --trimrange file.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                fields = tr.split("\t")
                assert (len(fields) == 3
                        ), "Must specify (start, stop) coordinate range"
                range_id, range_start, range_stop = fields
                # int() tolerates the trailing newline left on the last field.
                trimrange[range_id] = (int(range_start), int(range_stop))
        finally:
            # Release the handle even when a line fails to parse.
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(featuretype="gene",
                                          order_by=("seqid", "start"),
                                          level=1):
        for c in feat:
            cid, ctype, cparent = (
                c.id,
                c.featuretype,
                c.attributes.get("Parent", [None])[0],
            )
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # An mRNA is selected when either it or its parent is listed;
                # its own id is then recorded in the corresponding set.
                if any(x in trim5 for x in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(x in trim3 for x in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)
                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes["Parent"]
                                          [0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c,
                                          refc,
                                          trim5=t5,
                                          trim3=t3,
                                          both=trim_both)
                                if t5:
                                    utr_types.append("five_prime_UTR")
                                if t3:
                                    utr_types.append("three_prime_UTR")
                                # Collect the reference UTRs, and the reference
                                # exons in the same region that belong to this
                                # mRNA, for output after the children.
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                            refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                                region=utr,
                                                featuretype="exon"):
                                            if exon.attributes["Parent"][
                                                    0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # NOTE(review): if refgff[cid] succeeds but the parent
                        # lookup raises, refc keeps the reference feature and
                        # the trim() of `c` below is skipped -- confirm this
                        # is intended.
                        pass
                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Dropped here; the reference copies in `extras` are
                        # printed instead.
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        eskip = [
                            range_overlap(to_range(cc), to_range(x))
                            for x in extras if x.featuretype == "exon"
                        ]
                        if any(eskip):
                            continue
                    trim(cc,
                         start,
                         end,
                         trim5=t5,
                         trim3=t3,
                         both=trim_both)
                    fprint(cc, fw)
                for x in extras:
                    fprint(x, fw)
    fw.close()
Exemplo n.º 14
0
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the GFF
    #       path) is accepted, plus --groupby and the outfile option.
    # Output: the tabulate()d report is written to opts.outfile. A side file
    #         "transcript.sizes" is (re)written in the working directory when
    #         need_update() asks for it.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby",
                 default="conf_class",
                 help="Print separate stats groupby")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    def _mean(total, count):
        # Float mean that yields 0.0 instead of raising on an empty count.
        return total * 1. / count if count else 0.0

    # One row per mRNA: id, summed exon length, group label.
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                conf_class = feat.attributes.get(gb, "all")
                # Attribute values may come back as a list; use the first
                # value so the row can still be joined as text.
                if isinstance(conf_class, (list, tuple)):
                    conf_class = conf_class[0] if conf_class else "all"
                tsize = sum((c.stop - c.start + 1)
                            for c in g.children(fid, 1)
                            if c.featuretype == "exon")
                # write() instead of the Python 2-only `print >> fw`.
                fw.write("\t".join((fid, str(tsize), conf_class)) + "\n")

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    skipped = 0
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1)
                       if c.featuretype == "mRNA"]
        if not transcripts:
            # No mRNA child means no group label can be derived; previously
            # this raised IndexError on transcripts[0].
            skipped += 1
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        # A gene takes the group label of its first transcript.
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)
    if skipped:
        logging.debug("Skipped {0} genes without mRNA children.".format(skipped))

    r = {}  # Report, keyed by (row label, group)
    distinct_groups = set(conf_classes.values())
    # `group` rather than `g`, so the GFF index bound above is not clobbered.
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = max_transcripts = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            if gs.num_transcripts > max_transcripts:
                max_transcripts = gs.num_transcripts
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if num_genes == 0:
            # A label carried only by non-first transcripts owns no genes;
            # there is nothing to average or to take a percentage of.
            continue

        mean_num_exons = _mean(num_exons, num_genes)
        mean_num_transcripts = _mean(num_transcripts, num_genes)
        mean_locus_size = _mean(cum_locus_size, num_genes)
        mean_transcript_size = _mean(cum_transcript_size, num_transcripts)
        mean_exon_size = _mean(cum_exon_size, num_exons)

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = mean_num_transcripts
        r[("Max number of transcripts per gene", group)] = max_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    fw = must_open(opts.outfile, "w")
    fw.write(str(tabulate(r)) + "\n")
    fw.close()
Exemplo n.º 15
0
Arquivo: qc.py Projeto: arvin580/jcvi
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; exactly one positional argument (the GFF
    #       path) is accepted, plus the options declared below.
    # Output: features are written with fprint() to the handle opened from
    #         opts.outfile. When the positional count is wrong, help is
    #         printed and the function leaves through sys.exit().
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str",
                 help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str",
                 help="File containing gene list for 3' UTR trimming")
    # The two help fragments need a separating space, otherwise the text
    # reads "trim backbased on ...".
    p.add_option("--trimrange", default=None, type="str",
                 help="File containing gene list for UTR trim back "
                      "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str",
                 help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    # `both` is True only when neither --trim5 nor --trim3 was supplied.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # feature id -> (start, stop), read from the tab-separated --trimrange file.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                fields = tr.split("\t")
                assert len(fields) == 3, \
                    "Must specify (start, stop) coordinate range"
                range_id, range_start, range_stop = fields
                # int() tolerates the trailing newline left on the last field.
                trimrange[range_id] = (int(range_start), int(range_stop))
        finally:
            # Release the handle even when a line fails to parse.
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(featuretype="gene",
                                          order_by=("seqid", "start"),
                                          level=1):
        for c in feat:
            cid, ctype = c.id, c.featuretype
            cparent = c.attributes.get('Parent', [None])[0]
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # An mRNA is selected when either it or its parent is listed;
                # its own id is then recorded in the corresponding set.
                if any(x in trim5 for x in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(x in trim3 for x in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)
                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes['Parent'][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3, both=trim_both)
                                if t5:
                                    utr_types.append('five_prime_UTR')
                                if t3:
                                    utr_types.append('three_prime_UTR')
                                # Collect the reference UTRs, and the reference
                                # exons in the same region that belong to this
                                # mRNA, for output after the children.
                                for utr_type in utr_types:
                                    for utr in refgff.children(refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(region=utr, featuretype="exon"):
                                            if exon.attributes['Parent'][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # NOTE(review): if refgff[cid] succeeds but the parent
                        # lookup raises, refc keeps the reference feature and
                        # the trim() of `c` below is skipped -- confirm this
                        # is intended.
                        pass
                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
                for cc in gff.children(cid, order_by=("start")):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Dropped here; the reference copies in `extras` are
                        # printed instead.
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon":
                        eskip = [range_overlap(to_range(cc), to_range(x))
                                 for x in extras if x.featuretype == 'exon']
                        if any(eskip):
                            continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)
                for x in extras:
                    fprint(x, fw)
    fw.close()
Exemplo n.º 16
0
Arquivo: pasa.py Projeto: BrokeW/jcvi
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than in the docstring.
    #
    # args: command-line tokens; at least two positional GFF paths, plus
    #       --slop and the outfile option.
    # Output: consolidated GFF records go to opts.outfile; a tab-separated
    #         Y/N presence table (one column per input) goes to stderr.
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}       # dataset name -> GFF index
    gene_coords = {}  # gene id -> every start/stop seen across datasets
    mrna = AutoVivification()  # gene id -> dataset name -> list of mRNA features
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                                                 order_by=('seqid', 'start')):
            if gene.id not in gene_coords:
                gene_coords[gene.id] = []
            gene_coords[gene.id].extend([gene.start, gene.stop])

            c = list(gffdbx[dbn].children(gene,
                                          featuretype='mRNA',
                                          order_by='start'))
            if len(c) > 0:
                mrna[gene.id][dbn] = c

    fw = must_open(opts.outfile, "w")
    # write() is used throughout instead of the Python 2-only `print >> f`
    # statement; the bytes written are the same.
    fw.write("##gff-version	3" + "\n")
    summary = ["id"]
    summary.extend(gffdbx.keys())
    sys.stderr.write("\t".join(str(x) for x in summary) + "\n")
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                for mrna1, mrna2 in product(mrna[gene][dbn1],
                                            mrna[gene][dbn2]):
                    # Register both isoforms so each gets a group even when
                    # they are not merged below.
                    g.join((dbn1, mrna1.id))
                    g.join((dbn2, mrna2.id))

                    fUTR, tUTR = None, None
                    if match_subfeats(mrna1, mrna2, gffdbx[dbn1],
                                      gffdbx[dbn2]):
                        fUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2],
                                featuretype='five_prime_UTR', slop=slop)
                        tUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2],
                                featuretype='three_prime_UTR', slop=slop)

                    # Merge only when the default comparison and both UTR
                    # comparisons succeed.
                    if fUTR and tUTR:
                        g.join((dbn1, mrna1.id), (dbn2, mrna2.id))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id))

        # First dataset holding this gene; next(iter(...)) replaces the
        # Python 2-only `.keys()[0]`.
        dbn = next(iter(mrna[gene]))
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        # Stretch the gene to the widest extent seen in any dataset.
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        fw.write(str(_gene) + "\n")

        logging.debug(list(g))
        for group in g:
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]
            if slop:
                # With --slop, the widest member of the group represents it.
                mlen = 0
                for cand_db, cand_id in zip(dbs, mrnas):
                    _mrna = gffdbx[cand_db][cand_id]
                    _mlen = (_mrna.stop - _mrna.start) + 1
                    if _mlen > mlen:
                        d, m, mlen = cand_db, cand_id, _mlen

            in_group = set(dbs)
            # NOTE(review): the order of dataset names in dbid follows set
            # iteration order -- confirm whether a stable order is needed.
            dbid = "".join(str(x) for x in in_group)
            # Order-preserving de-duplication of the member ids. The earlier
            # comprehension compared against an empty list, so an id shared
            # by several datasets was repeated in the merged ID.
            _mrnaid = []
            for x in mrnas:
                if x not in _mrnaid:
                    _mrnaid.append(x)
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            children = gffdbx[d].children(m, order_by='start')
            fw.write(str(_mrna) + "\n")
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                fw.write(str(child) + "\n")

            summary = [mrnaid]
            summary.extend(['Y' if db in in_group else 'N' for db in gffdbx])
            sys.stderr.write("\t".join(str(x) for x in summary) + "\n")

    fw.close()