Python GTF.readAndIndex примеры использования

Язык программирования: Python

Пространство имен/Пакет: cgat

Класс/Тип: GTF

Метод/Функция: readAndIndex

Примеров на hotexamples.com: 2

Python GTF.readAndIndex - 2 примера найдено. Это лучшие примеры Python кода для cgat.GTF.readAndIndex, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

iterator(30)

Entry(18)

transcript_iterator(9)

gene_iterator(6)

readFromFile(4)

asRanges(3)

iterator_filtered(3)

joined_iterator(3)

readAsIntervals(3)

flat_gene_iterator(2)

readAndIndex(2)

iterator_overlaps(2)

chunk_iterator(1)

SortPerContig(1)

Overlap(1)

iterator_sorted(1)

quote(1)

toIntronIntervals(1)

Пример #1

Показать файл

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--mask-bed-file",
        "--mask-gff-file",
        dest="filename_bed",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f",
        "--ignore-masked-reads",
        dest="ignore_masked_reads",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i",
        "--num-reads",
        dest="input_reads",
        type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d",
        "--output-details",
        dest="output_details",
        action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option("--output-readmap",
                      dest="output_readmap",
                      action="store_true",
                      help="output map between read name and "
                      "md5/base64 encoded short name[%default]")

    parser.add_option(
        "--add-alignment-details",
        dest="add_alignment_details",
        action="store_true",
        help=
        "add alignment details to per-read details. Implies --output-details "
        "[%default]")

    parser.add_option(
        "-q",
        "--fastq-file",
        dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.add_option(
        "--basic-counts",
        dest="detailed_count",
        action="store_false",
        help="perform basic counting and do not compute per read stats. "
        "This is more memory efficient and faster stats computation, "
        "but only a summary counts table is output [%default]")

    parser.set_defaults(
        filename_bed=None,
        ignore_masked_reads=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        detailed_count=True,
        output_details=False,
        output_readmap=False,
        add_alignment_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.filename_bed:
        bed_mask = GTF.readAndIndex(
            GTF.iterator(iotools.open_file(options.filename_bed)))
    else:
        bed_mask = None

    if options.add_alignment_details:
        options.output_details = True

    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.output_details:
        outfile_details = E.open_output_file("details", "w")
    else:
        outfile_details = None

    if options.output_readmap:
        outfile_readmap = E.open_output_file("readmap", "w")
    else:
        outfile_readmap = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi, details_df) = \
        bam2stats_count(pysam_in,
                        bed_mask=bed_mask,
                        ignore_masked_reads=options.ignore_masked_reads,
                        is_stdin=is_stdin,
                        filename_fastq=options.filename_fastq,
                        outfile_details=outfile_details,
                        add_alignment_details=options.add_alignment_details,
                        outfile_readmap=outfile_readmap,
                        detailed_count=options.detailed_count)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected" % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        percent = iotools.pretty_percent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs, "alignments_total", counter.alignments_input,
           counter.alignments_input, "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs, "alignments_mapped", nalignments_mapped,
           counter.alignments_input, 'alignments_total')
    _write(outs, "alignments_unmapped", nalignments_unmapped,
           counter.alignments_input, 'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs, 'alignments_' + flag, counts, nalignments_mapped,
               'alignments_mapped')

    if options.filename_bed:
        _write(outs, "alignments_masked", counter.alignments_masked,
               nalignments_mapped, 'alignments_mapped')
        _write(outs, "alignments_notmasked", counter.alignments_notmasked,
               nalignments_mapped, 'alignments_mapped')

    _write(outs, "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped, "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs, "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered, normby)
        _write(outs, "alignments_unique",
               counter.aligmnments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered, normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq or not is_stdin:
        nreads_total = counter.total_read
        _write(outs, "reads_total", counter.total_read, nreads_total,
               'reads_total')
        _write(outs, "reads_unmapped", counter.total_read_is_unmapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped", counter.total_read_is_mapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_missing", counter.total_read_is_missing,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_multimapping", counter.total_read_is_mmap,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_mapped_supplementary",
               counter.total_read_has_supplementary,
               counter.total_read_is_mapped, 'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs))
            outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_unmapped, 100.0 *
                        counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq, 100.0 *
                 counter.total_pair_is_proper_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq, 100.0 *
                 counter.total_pair_is_incomplete_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap, 100.0 *
                 counter.total_pair_is_incomplete_mmap / counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate, 100.0 *
                 counter.total_pair_is_proper_duplicate / counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap, 100.0 *
                 counter.total_pair_is_proper_mmap / counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq, 100.0 *
                 counter.total_pair_not_proper_uniq / counter.total_pairs))
            outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_other, 100.0 *
                        counter.total_pair_is_other / counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs, "read1_total", counter.total_read1, nread1_total,
                   'read1_total')
            _write(outs, "read1_unmapped", counter.total_read1_is_unmapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped", counter.total_read1_is_mapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "reads_multimapping", counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_missing", counter.total_read1_is_missing,
                   counter.total_read1_is_mapped, 'read1_total')

            nread2_total = counter.total_read2
            _write(outs, "read2_total", counter.total_read2, nread2_total,
                   'read2_total')
            _write(outs, "read2_unmapped", counter.total_read2_is_unmapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped", counter.total_read2_is_mapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "reads_multimapping", counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_missing", counter.total_read2_is_missing,
                   counter.total_read2_is_mapped, 'read2_total')

        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs, "pairs_total", pairs_total, pairs_total,
                   "pairs_total")
            _write(outs, "pairs_mapped", pairs_mapped, pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.error_counts, counter.error_rate * 100.0))
    outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.insertion_counts, counter.insertion_rate * 100.0))
    outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" %
               (counter.deletion_counts, counter.deletion_rate * 100.0))
    outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" %
               (counter.mismatch_counts, counter.mismatch_rate * 100.0))
    outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.match_counts, counter.match_rate * 100.0))

    if options.force_output or len(nm_filtered) > 0:
        outfile = E.open_output_file("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.open_output_file("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.open_output_file("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.open_output_file("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    if details_df is not None:
        with E.open_output_file("summaries", "w") as outf:
            details_df.describe().transpose().to_csv(outf,
                                                     sep="\t",
                                                     index_label="metric")
        bins = numpy.arange(0, 1.01, 0.01)
        histogram_df = pandas.DataFrame.from_items([
            (x, numpy.histogram(details_df[x].dropna(), bins=bins)[0])
            for x in details_df.columns
        ])

        histogram_df.index = numpy.arange(0, 1.0, 0.01)

        row_sums = histogram_df.sum(axis=1)
        histogram_df = histogram_df[row_sums != 0]

        with E.open_output_file("histogram", "w") as outf:
            histogram_df.to_csv(outf, sep="\t", index_label="bin")

    # write footer and output benchmark information.
    E.stop()

Пример #2

Показать файл

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-e",
                        "--exons-file",
                        "--gtf-file",
                        dest="filename_exons",
                        type=str,
                        metavar="gtf",
                        help="gtf formatted file with non-overlapping exon "
                        "locations (required). ")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    exons = GTF.readAndIndex(
        GTF.iterator(iotools.open_file(args.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_overrun = [0] * 2 * (args.read_length + 10)
    nspliced_exact = 0
    nspliced_inexact = 0
    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    nunspliced_overrun = [0] * (args.read_length + 10)
    overrun_offset = args.read_length + 10
    ninput = 0
    nunmapped = 0

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splicesite over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''

        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP code in cigar string
        cigar = read.cigar
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.getrname(read.tid)
        start = read.pos
        end = read.aend
        if is_spliced:
            # count both ends
            nspliced += 1

            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            if o3 is not None:
                if o3 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o3)] += 1
            if o5 is not None:
                if o5 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o5)] += 1
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            elif len(overlap) >= 1:
                nunspliced_overlap += 1
                # multiple overlap - merge exons (usually: small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                o = min(end, exon_end) - max(start, exon_start)
                overrun = ostart + oend
                nunspliced_overrun[overrun] += 1

    # output histograms
    outfile = E.open_output_file("overrun")
    outfile.write(
        "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n"
    )
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()
    for x, v in enumerate(
            zip(nunspliced_overrun, _nspliced_overrun, _nspliced_underrun)):
        outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))
    outfile.close()

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = args.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.stop()