Example #1
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator."""

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for r, pos in intervals.items():

            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, noverlapping=%i\n" %
                             (ninput, noutput, noverlapping))
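A minimal driver sketch for this and the other annotate* examples below that share the (iterator, fasta, options) signature. The module paths, the options fields, and the index name "genome" are assumptions following cgat conventions, not taken from the examples themselves:

import argparse
import sys

from cgat import GTF, IndexedFasta

# hypothetical options namespace carrying only the fields annotateExons reads
options = argparse.Namespace(stdout=sys.stdout, stdlog=sys.stderr, loglevel=1)
fasta = IndexedFasta.IndexedFasta("genome")  # assumed index prefix
annotateExons(GTF.iterator(sys.stdin), fasta, options)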
Example #2
    def update(self, bed):

        # convert to a gtf entry
        gtf = GTF.Entry()

        gtf.fromBed(bed)
        gtf.feature = 'exon'
        GeneModelAnalysis.Classifier.update(self, [gtf])
Example #3
 def _add(interval, anno):
     gtf = GTF.Entry()
     gtf.contig = transcript[0].contig
     gtf.gene_id = transcript[0].gene_id
     gtf.transcript_id = transcript[0].transcript_id
     gtf.strand = transcript[0].strand
     gtf.feature = anno
     gtf.start, gtf.end = interval
     results.append(gtf)
Example #4
    def test_entry(frame, strand, xfrom, xto, start, end, ref):

        entry = GTF.Entry()
        entry.frame = frame
        entry.strand = strand
        entry.start = xfrom
        entry.end = xto

        intervals = transform_third_codon(start, end, [(xfrom, xto, entry)])
        if ref != intervals:
            print("failed:", ref != intervals)
Example #5
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = (min([x.start for x in transcript]),
                      max([x.end for x in transcript]))
            transcript_ids.append(transcript[0].transcript_id)
            # if the tts is directly at the start/end of the contig, the
            # tts will be within an exon; otherwise it is outside an exon.
            if is_negative_strand:
                tts.append(
                    (max(0, mi - options.promotor), max(options.promotor, mi)))
            else:
                tts.append((min(ma, lcontig - options.promotor),
                            min(lcontig, ma + options.promotor)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        x = 0
        for start, end in tts:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
                             (ngenes, ntranscripts, npromotors))
Example #6
def convert_set(gffs, gene_pattern, transcript_pattern, options):
    ''' creates the gene_id and transcript_id fields from a string format pattern using
    fields of the gff. '''

    for gff in gffs:

        gff.gene_id = str(gene_pattern) % gff.asDict()
        gff.transcript_id = str(transcript_pattern) % gff.asDict()

        gtf_entry = GTF.Entry()

        gtf_entry.copy(gff)
        if "Parent" in gtf_entry:
            gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

        options.stdout.write(str(gtf_entry) + "\n")
Example #7
def buildRepeatTrack(infile, outfile):
    '''build a repeat track as negative control.'''

    nrepeats = 0
    for gff in GTF.iterator(gzip.open(infile, "r")):
        nrepeats += 1
    sample = set(
        random.sample(range(nrepeats), PARAMS["ancestral_repeats_samplesize"]))

    outf = gzip.open(outfile, "w")
    gtf = GTF.Entry()
    for x, gff in enumerate(GTF.iterator(gzip.open(infile, "r"))):
        if x not in sample:
            continue
        gtf.fromGTF(gff, "%08i" % x, "%08i" % x)
        outf.write("%s\n" % str(gtf))
    outf.close()

    E.debug("created sample of %i repeats out of %i in %s" %
            (len(sample), nrepeats, outfile))
Example #8
def addSegment(feature, start, end, template, options):
    """add a generic segment of type *feature*.
    """
    if start >= end:
        return 0

    entry = GTF.Entry()

    if isinstance(template, tuple):
        entry.copy(template[0])
        entry.clearAttributes()
        entry.addAttribute("downstream_gene_id", template[1].gene_id)
    else:
        entry.copy(template)
        entry.clearAttributes()

    entry.start, entry.end = start, end
    entry.feature = feature
    if feature not in ("exon", "CDS", "UTR", "UTR3", "UTR5"):
        entry.score = "."
    options.stdout.write(str(entry) + "\n")

    return 1
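A hypothetical use of addSegment: label the gap between consecutive genes as intergenic, using the (template, downstream_gene) tuple form so the downstream gene's id ends up in the attributes. Assumes position-sorted input plus the imports and options from the sketch under Example #1:

genes = list(GTF.flat_gene_iterator(GTF.iterator(sys.stdin)))
for left, right in zip(genes, genes[1:]):
    if left[0].contig != right[0].contig:
        continue
    # the entry is copied from the last exon of `left`, its attributes
    # cleared, and downstream_gene_id set to the gene_id of `right`
    addSegment("intergenic", left[-1].end, right[0].start,
               (left[-1], right[0]), options)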
Example #9
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored, as they are in ENSEMBL.

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, duplicate mappings are removed.

    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.execute("""SELECT name, COUNT(*) AS c FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
    protAcc, chrom, strand, cdsStart, cdsEnd,
    exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
    AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    outf = iotools.open_file(outfile, "w")

    cc = dbhandle.execute(statement)

    SQLResult = collections.namedtuple(
        'Result', '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        starts = list(map(int, r.starts.split(",")[:-1]))
        ends = list(map(int, r.ends.split(",")[:-1]))
        frames = list(map(int, r.frames.split(",")[:-1]))

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            cds_start, cds_end = max(r.start, start), min(r.end, end)
            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue
            gtf.feature = "CDS"
            # invert the frame: UCSC stores the codon position (phase) of
            # the exon's first base, while GTF frame counts the bases to
            # skip before the first complete codon
            frame = (3 - frame % 3) % 3
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
Example #10
def main(argv=None):
    '''
    main function
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output-only-attributes", dest="only_attributes",
        action="store_true",
        help="output only attributes as separate columns "
        "[default=%default].")

    parser.add_option(
        "-f", "--attributes-as-columns", dest="output_full",
        action="store_true",
        help="output attributes as separate columns "
        "[default=%default].")

    parser.add_option("--is-gff3", dest="is_gtf", action="store_false",
                      help="input file is in gtf format [default=%default] ")

    parser.add_option(
        "-i", "--invert", dest="invert", action="store_true",
        help="convert tab-separated table back to gtf "
        "[default=%default].")

    parser.add_option(
        "-m", "--output-map", dest="output_map", type="choice",
        choices=(
            "transcript2gene",
            "peptide2gene",
            "peptide2transcript"),
        help="output a map mapping transcripts to genes "
        "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        is_gtf=True
    )

    (options, args) = E.start(parser, argv=argv)

    if options.output_full:
        # output full table with column for each attribute

        attributes = set()
        data = []
        if options.is_gtf:
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes = attributes.union(set(gtf.keys()))

        else:
            for gff in GFF3.iterator_from_gff(options.stdin):
                data.append(gff)
                attributes = attributes.union(set(gff.attributes))

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])

        attributes = sorted(list(attributes))

        # Select whether gtf or gff for output columns
        if options.is_gtf:
            if options.only_attributes:
                header = ["gene_id", "transcript_id"] + attributes
            else:
                header = ["contig", "source", "feature",
                          "start", "end", "score", "strand",
                          "frame", "gene_id",
                          "transcript_id", ] + attributes
        else:
            if options.only_attributes:
                header = attributes
            else:
                header = ["contig", "source", "feature",
                          "start", "end", "score", "strand",
                          "frame"] + attributes

        attributes_new = header

        options.stdout.write("\t".join(header) + "\n")

        if options.is_gtf:
            for gtf in data:
                first = True
                for a in attributes_new:
                    try:
                        val = getattr(gtf, a)
                    except (AttributeError, KeyError):
                        val = ""
                    if first:
                        options.stdout.write("%s" % val)
                        first = False
                    else:
                        options.stdout.write("\t%s" % val)
                options.stdout.write("\n")
        else:
            for gff in data:
                options.stdout.write(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t") % (gff.contig,
                                                                             gff.source, gff.feature, gff.start, gff.end,
                                                                             gff.score, gff.strand, gff.frame))

                first = True
                for a in attributes:
                    try:
                        val = (gff.attributes[a])
                    except (AttributeError, KeyError):
                        val = ''
                    if first:
                        options.stdout.write("%s" % val)
                        first = False
                    else:
                        options.stdout.write("\t%s" % val)
                options.stdout.write("\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            data = line[:-1].split("\t")
            if not header:
                header = data
                map_header2column = dict(
                    [(y, x) for x, y in enumerate(header)])
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # start is kept as-is: the table already uses 0-based
                # coordinates
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            if gtf.frame is None:
                gtf.frame = "."
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:

        if options.output_map == "transcript2gene":
            fr = lambda x: x.transcript_id
            to = lambda x: x.gene_id
            options.stdout.write("transcript_id\tgene_id\n")
        elif options.output_map == "peptide2gene":
            fr = lambda x: x.protein_id
            to = lambda x: x.gene_id
            options.stdout.write("peptide_id\tgene_id\n")
        elif options.output_map == "peptide2transcript":
            fr = lambda x: x.protein_id
            to = lambda x: x.transcript_id
            options.stdout.write("peptide_id\ttranscript_id\n")

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                map_fr2to[fr(gtf)] = to(gtf)
            except (AttributeError, KeyError):
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))
    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id", "transcript_id", "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):
            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            attributes = "; ".join(attributes)

            # replace a missing frame with '.'
            if gtf.frame is None:
                gtf.frame = "."

            options.stdout.write(str(gtf) + "\n")

    E.stop()
Example #11
def buildTerritories(iterator, fasta, method, options):
    """build gene territories.

    Exons in a gene are merged and the resulting segments enlarged by
    --radius. Overlapping territories are divided at the midpoint
    between the two genes (the extension rule is sketched in isolation
    after this example).

    If *method* is ``gene``, gene territories will be built.
    If *method* is ``tss``, tss territories will be built.

    """

    ninput, noutput, nambiguous = 0, 0, 0

    assert method in ("gene", "tss")

    dr = 2 * options.radius

    def _iterator(iterator):
        """yield gene plus the locations of the end of the previous gene and
        start of next gene"""

        last_end, prev_end = 0, 0
        last_contig = None
        last = None
        for matches in GTF.iterator_overlaps(iterator):

            this_start = min([x.start for x in matches])
            this_end = max([x.end for x in matches])

            if method == "tss":
                # restrict to tss
                if matches[0].strand == "+":
                    this_end = this_start + 1
                else:
                    this_start = this_end - 1

            this_contig = matches[0].contig

            if last_contig != this_contig:
                if last:
                    yield prev_end, last, fasta.getLength(last_contig)
                last_end, prev_end = 0, 0
            else:
                yield prev_end, last, this_start

            prev_end = last_end
            last_end = this_end
            last = matches
            last_contig = this_contig

        if last:
            yield prev_end, last, fasta.getLength(last_contig)

    for last_end, matches, next_start in _iterator(iterator):
        ninput += 1

        gff = GTF.Entry().copy(matches[0])

        start = min([x.start for x in matches])
        end = max([x.end for x in matches])

        if method == "tss":
            # restrict to tss
            if matches[0].strand == "+":
                end = start + 1
            else:
                start = end - 1

        d = start - last_end
        if d < dr:
            start -= d // 2
        else:
            start -= options.radius

        d = next_start - end
        if d < dr:
            end += d // 2
        else:
            end += options.radius

        gff.gene_id = ":".join(sorted(set([x.gene_id for x in matches])))
        gff.transcript_id = gff.gene_id
        gff.start, gff.end = start, end

        nsegments = len(matches)
        if nsegments > 1:
            gff.addAttribute("ambiguous", nsegments)
            nambiguous += 1

        assert gff.start < gff.end, "invalid segment: %s" % str(gff)
        options.stdout.write(str(gff) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nambiguous=%i" %
           (ninput, noutput, nambiguous))
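The extension rule from buildTerritories, restated in isolation (pure arithmetic, no cgat imports). A gap narrower than 2 * radius is split at its midpoint; otherwise each side grows by the full radius:

def extend(start, end, last_end, next_start, radius):
    d = start - last_end
    start -= d // 2 if d < 2 * radius else radius
    d = next_start - end
    end += d // 2 if d < 2 * radius else radius
    return start, end

# both neighbouring gaps are 1000 bp, well under 2 * 5000, so each
# side of the gene at [1000, 2000) gains half a gap (500 bp)
assert extend(1000, 2000, 0, 3000, radius=5000) == (500, 2500)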
Example #12
def convert_hierarchy(first_gffs, second_gffs, options):
    ''' Converts GFF to GTF by parsing the hierarchy.
    First parses `first_gffs` to build the hierarchy, then iterates over
    `second_gffs`, calling the recursive function search_hierarchy to
    identify gene_ids and transcript_ids.

    If multiple gene and transcript_ids are found, a record is output for
    each combination.

    If no definitive transcript_id is found and options.missing_gene is True,
    the possible_transcript_id is used as transcript_id; this is the ID one
    level below the entry used as gene_id. If this is also None (that is,
    there was only one level), transcript_id is set to gene_id.

    Might raise ValueError if options.missing_gene is False and either no
    gene_id or no transcript_id was found for an entry.

    Might raise RuntimeError if the recursion limit was reached because the
    input contains circular references. '''

    hierarchy = {}

    for gff in first_gffs:

        if not (options.parent == "Parent"):
            if options.parent in gff.asDict():
                gff['Parent'] = gff[options.parent].split(",")
            else:
                gff['Parent'] = []

        hierarchy[gff['ID']] = {
            "type": gff.feature,
            "Parent": gff.asDict().get("Parent", []),
            "gene_id": gff.attributes.get(
                options.gene_field_or_pattern, gff['ID']),
            "transcript_id": gff.attributes.get(
                options.transcript_field_or_pattern, gff['ID']),
        }

    for gff in second_gffs:

        if options.discard and (
            (options.missing_gene and options.parent not in gff) or
            (gff.feature in (options.gene_type, options.transcript_type))):

            continue

        gene_ids, transcript_ids, poss_transcript_ids = search_hierarchy(
            gff['ID'], hierarchy, options)

        assert len(gene_ids) > 0 and len(transcript_ids) > 0

        if options.missing_gene:

            transcript_ids = [
                poss if found is None else found
                for found, poss in zip(transcript_ids, poss_transcript_ids)
            ]

            transcript_ids = [
                gid if found is None else found
                for found, gid in zip(transcript_ids, gene_ids)
            ]

        elif None in transcript_ids:
            raise ValueError("failed to find transcript id for %s" % gff['ID'])

        for gene_id, transcript_id in zip(gene_ids, transcript_ids):

            gff.gene_id = gene_id
            gff.transcript_id = transcript_id

            gtf_entry = GTF.Entry()
            gtf_entry.copy(gff)
            if "Parent" in gtf_entry:
                gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

            options.stdout.write(str(gtf_entry) + "\n")
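The missing_gene fallback above, extracted into a standalone helper (hypothetical name) to show the two-step substitution: first the id one level below the gene, then the gene id itself:

def resolve_transcript_ids(transcript_ids, poss_transcript_ids, gene_ids):
    # fall back to the possible id, then to the gene id
    ids = [poss if found is None else found
           for found, poss in zip(transcript_ids, poss_transcript_ids)]
    return [gid if found is None else found
            for found, gid in zip(ids, gene_ids)]

assert resolve_transcript_ids([None, "t2"], ["p1", "p2"],
                              ["g1", "g2"]) == ["p1", "t2"]
assert resolve_transcript_ids([None], [None], ["g1"]) == ["g1"]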
Example #13
def annotateRegulons(iterator, fasta, tss, options):
    """annotate regulons within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nregulons = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = (min([x.start for x in transcript]),
                      max([x.end for x in transcript]))
            if tss:
                # add range to both sides of tss
                if is_negative_strand:
                    interval = ma - options.downstream, ma + options.upstream
                else:
                    interval = mi - options.upstream, mi + options.downstream
            else:
                # add range to both sides of tts
                if is_negative_strand:
                    interval = mi - options.downstream, mi + options.upstream
                else:
                    interval = ma - options.upstream, ma + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % (x + 1) for x in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        x = 0
        for start, end in regulons:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1
            x += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
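The window arithmetic above in a standalone form (hypothetical helper; the clamping to [0, lcontig] is omitted). For a transcript spanning [mi, ma) the window anchors on the TSS or the TTS depending on `tss` and the strand:

def regulon_interval(mi, ma, tss, negative, upstream, downstream):
    if tss:
        return (ma - downstream, ma + upstream) if negative \
            else (mi - upstream, mi + downstream)
    return (mi - downstream, mi + upstream) if negative \
        else (ma - upstream, ma + downstream)

assert regulon_interval(10, 20, True, False, 2, 1) == (8, 11)
assert regulon_interval(10, 20, True, True, 2, 1) == (19, 22)
assert regulon_interval(10, 20, False, True, 2, 1) == (9, 12)
assert regulon_interval(10, 20, False, False, 2, 1) == (18, 21)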
Example #14
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="method",
        type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features", "sanitize",
                 "to-forward-coordinates", "to-forward-strand", "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand",
                        dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c",
                        "--contigs-tsv-file",
                        dest="input_filename_contigs",
                        type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file",
        dest="input_filename_agp",
        type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file",
                        dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    parser.add_argument(
        "--group-field",
        dest="group_field",
        type=str,
        help="gff field/attribute to group by, such as gene_id, "
        "transcript_id, ... .")

    parser.add_argument(
        "--filter-range",
        dest="filter_range",
        type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method",
                        dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method",
        dest="flank_method",
        type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing",
                        dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern",
        dest="contig_pattern",
        type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report",
        dest="assembly_report",
        type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs",
        type=int,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-ucsccol",
        dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol",
        dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras",
        dest="assembly_extras",
        type=str,
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream",
                        dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream",
                        type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance",
                        dest="min_distance",
                        type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance",
                        dest="max_distance",
                        type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features",
                        dest="min_features",
                        type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features",
                        dest="max_features",
                        type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not chr_map:
            raise ValueError("Empty mapping dictionary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report,
                         comment="#",
                         header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either
                "ucsc" or "ensembl" to specify direction of conversion
                ''')
        else:
            assembly_dict = {}
        if args.assembly_extras is not None:
            assembly_extras = args.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if args.method in ("forward_coordinates", "forward_strand",
                       "add-flank", "add-upstream-flank",
                       "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in ("add-upstream-flank", "add-downstream-flank",
                       "add-flank"):

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(0,
                                             chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(0,
                                             chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=False,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=True,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, sep, end = re.match(r"(\d+)(\.\.|,|-)(\d+)",
                                           args.filter_range).groups()
            except AttributeError:
                raise ValueError("cannot parse range %s" % args.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict:
                id = assembly_dict[id]
            # if not in the dict, the contig name is forced into the
            # desired convention; this is helpful for user-modified gff
            # files that contain additional contigs
            elif args.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif args.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if args.contig_pattern:
                to_remove = [
                    re.compile(x) for x in args.contig_pattern.split(",")
                ]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())), str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(list(
                       outofrange_contigs.keys())), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())), str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            if args.method == "forward_coordinates":
                gff.invert(contigs[gff.contig])

            if args.method == "forward_strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()
Example #15
def combineGFF(gffs,
               min_distance,
               max_distance,
               min_features,
               max_features,
               merge=True,
               output_format="%06i"):
    """join intervals in gff file.

    Note: strandedness is ignored
    """

    E.info("joining features: min distance=%i, max_distance=%i, "
           "at least %i and at most %i features." %
           (min_distance, max_distance, min_features, max_features))

    def iterate_chunks(gffs):

        last = next(gffs)
        to_join = [last]

        for gff in gffs:
            d = gff.start - last.end
            if gff.contig == last.contig:
                assert gff.start >= last.start, "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" % (
                    d, last, gff)

            if gff.contig != last.contig or \
                    (max_distance and d > max_distance) or \
                    (min_distance and d < min_distance) or \
                    (max_features and len(to_join) >= max_features):

                if len(to_join) >= min_features:
                    yield to_join
                to_join = []

            last = gff
            to_join.append(gff)

        if len(to_join) >= min_features:
            yield to_join

    id = 1
    ninput, noutput, nfeatures = 0, 0, 0

    if merge:
        for to_join in iterate_chunks(gffs):

            ninput += 1
            y = GTF.Entry()
            t = output_format % id
            y.fromGTF(to_join[0], t, t)
            y.start = to_join[0].start
            y.end = to_join[-1].end

            yield (y)
            nfeatures += 1

            noutput += 1
            id += 1
    else:

        for to_join in iterate_chunks(gffs):

            ninput += 1
            for x in to_join:
                y = GTF.Entry()
                t = output_format % id
                y.fromGTF(x, t, t)
                yield (y)
                nfeatures += 1

            noutput += 1
            id += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i" %
           (ninput, noutput, nfeatures))
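combineGFF is a generator; a hypothetical invocation that merges runs of features at most 100 bp apart into single renumbered records (imports as in the sketch under Example #1):

for merged in combineGFF(GTF.iterator(sys.stdin),
                         min_distance=0, max_distance=100,
                         min_features=1, max_features=0,
                         merge=True):
    sys.stdout.write(str(merged) + "\n")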
Example #16
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-a",
                        "--as-gtf",
                        dest="as_gtf",
                        action="store_true",
                        help="output as gtf.")

    parser.add_argument(
        "-f",
        "--id-format",
        dest="id_format",
        type=str,
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file .")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    (args) = E.start(parser, add_pipe_options=True)

    as_gtf = args.as_gtf
    id_format = args.id_format

    gff = GTF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(args.stdin):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        args.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
Example #17
def annotateGREATDomains(iterator, fasta, options):
    """build great domains

    extend from TSS a basal region.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene:
    # take all basal regions per transcript and merge them.
    # Thus, the basal region of a gene might be larger than
    # options.upstream + options.downstream.
    for gene in gene_iterator:
        counter.genes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            mi, ma = (min([x.start for x in transcript]),
                      max([x.end for x in transcript]))
            # add range to both sides of tss
            if is_negative_strand:
                interval = ma - options.downstream, ma + options.upstream
            else:
                interval = mi - options.upstream, mi + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        # take first/last entry
        start, end = min(x[0] for x in regulons), max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    regions.sort(key=lambda x: (x.contig, x.start))

    outf = iotools.open_file("test.gff", "w")
    for x in regions:
        outf.write(str(x) + "\n")
    outf.close()

    ####################################################################
    # extend basal regions (already sorted by contig and start above)

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0
    reset = False

    for region_id, group in enumerate(groups):

        # collect basal intervals in group
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            for start, end in intervals:
                if start == pos or end == pos:
                    continue
                if start <= pos < end:
                    return True
                if start > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min([x.start for x in nxt])
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end = basal extension of previous group
        # next_start = basal_extension of next group

        # extend each region towards the previous/next group: always
        # extend downstream, but extend upstream only if the basal
        # region of an interval does not already overlap another basal
        # region within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
Example #18
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    results = []
    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end) for x in transcript
                     if x.feature == "exon"]
            if len(exons) == 0:
                nskipped += 1
                continue
            exons.sort()
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

            results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))