Python GTF.readFromFile примеры использования

Язык программирования: Python

Пространство имен/Пакет: cgat

Класс/Тип: GTF

Метод/Функция: readFromFile

Примеров на hotexamples.com: 4

Python GTF.readFromFile - 4 примера найдено. Это лучшие примеры Python кода для cgat.GTF.readFromFile, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

iterator(30)

Entry(18)

transcript_iterator(9)

gene_iterator(6)

readFromFile(4)

asRanges(3)

iterator_filtered(3)

joined_iterator(3)

readAsIntervals(3)

flat_gene_iterator(2)

readAndIndex(2)

iterator_overlaps(2)

chunk_iterator(1)

SortPerContig(1)

Overlap(1)

iterator_sorted(1)

quote(1)

toIntronIntervals(1)

Пример #1

Показать файл

Файл: diff_gtf.py Проект: harmeet1990/cgat-apps

    def buildIndex(self, filename):
        """Read a GTF file and index its intervals per contig.

        Arguments
        ---------
        filename : str
            Path of the file to read; it is opened with
            ``iotools.open_file``.

        Returns
        -------
        dict
            Maps each contig name to an ``NCL.NCLSimple`` index to which
            the ``(start, end)`` of every entry on that contig was added.
        """
        idx = {}
        # Use a context manager so the handle is released even when
        # parsing or indexing raises part-way through the file.
        with iotools.open_file(filename, "r") as infile:
            for e in GTF.readFromFile(infile):
                if e.contig not in idx:
                    idx[e.contig] = NCL.NCLSimple()
                idx[e.contig].add(e.start, e.end)
        return idx

Пример #2

Показать файл

def main(argv=None):
    """Compare the entries of two GTF files and report gene-level overlap.

    Exactly two positional arguments are expected: the two input file
    names. Entries of the second file are indexed per contig in an
    interval tree; entries of the first file are then streamed against
    that index and every pair of overlapping gene ids is collected.

    If at least one overlapping pair is found, four outputs are written
    through ``getFile``: ``genes_ovl`` (overlapping gene pairs),
    ``genes_uniq1`` / ``genes_uniq2`` (genes without any overlap in the
    respective file) and ``genes_total`` (summary counts and percentages
    per input file).

    Arguments
    ---------
    argv : list, optional
        Command line arguments; ``sys.argv`` is used when not given
        (or empty).

    Raises
    ------
    ValueError
        If the number of positional arguments is not two.
    """

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--output-equivalent",
                      dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f",
                      "--output-full",
                      dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    # Index the second file: one interval tree per contig, with the
    # entry itself stored as payload so its gene_id can be recovered.
    idx, genes2 = {}, set()
    with iotools.open_file(input_filename2, "r") as infile2:
        for e in GTF.readFromFile(infile2):
            genes2.add(e.gene_id)
            if e.contig not in idx:
                idx[e.contig] = quicksect.IntervalTree()
            idx[e.contig].add(e.start, e.end, e)

    E.info("reading data finished: %i contigs" % len(idx))

    overlapping_genes = set()
    genes1 = set()

    # Stream the first file against the index. Per-entry classification
    # (identical / half-identical / overlapping) is not computed because
    # the diff and overlap outputs that would consume it are not
    # implemented.
    with iotools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):
            genes1.add(this.gene_id)

            tree = idx.get(this.contig)
            if tree is None:
                # contig absent from the second file: nothing can overlap
                continue

            for interval in tree.find(
                    quicksect.Interval(this.start, this.end)):
                overlapping_genes.add((this.gene_id, interval.data.gene_id))

    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for gene1, gene2 in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (gene1, gene2))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        # One pass per input file: (output section, header line, position
        # of that file's gene id within each overlapping pair, file name,
        # all gene ids seen in that file).
        per_file = (
            ("genes_uniq1", "gene_id1\n", 0, input_filename1, genes1),
            ("genes_uniq2", "gene_id2\n", 1, input_filename2, genes2),
        )
        for section, header, column, filename, genes in per_file:
            outfile = getFile(options, section)
            with_overlap = {pair[column] for pair in overlapping_genes}
            unique = genes.difference(with_overlap)
            outfile.write(header)
            outfile.write("\n".join(sorted(unique)) + "\n")
            if outfile != options.stdout:
                outfile.close()

            # Both percentages are relative to the number of genes in
            # this file. ``genes`` cannot be empty here: every gene id in
            # an overlapping pair was added to its file's gene set.
            ngenes = len(genes)
            outfile_total.write(
                "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
                (os.path.basename(filename), ngenes, len(with_overlap),
                 100.0 * len(with_overlap) / ngenes,
                 len(unique), 100.0 * len(unique) / ngenes))

        if outfile_total != options.stdout:
            outfile_total.close()

    E.stop()

Пример #3

Показать файл

Файл: gff2coverage.py Проект: alphaneer/cgat-apps

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GTF/GFF entries from ``options.stdin`` and, depending on
    ``--method``:

    * ``histogram``: loads all entries, sorts them by (contig, start)
      and hands the entries of each contig to ``processChunk``.
    * ``genomic`` (default): counts intervals and covered bases per
      (contig, source, feature) and writes one tab-separated row per
      key to ``options.stdout``. The two percent-coverage columns are
      only written when a genome file is supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      action="append",
                      help="features to collect "
                      "[default=%default]")

    parser.add_option("-w",
                      "--window-size",
                      dest="window_size",
                      type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size.  "
                      "[default=%default]")

    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=(
                          "genomic",
                          "histogram",
                      ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    # Pass argv through so that an explicitly supplied argument list is
    # honoured instead of being silently ignored.
    (options, args) = E.start(parser, argv, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        gff.sort(key=lambda x: (x.contig, x.start))

        chunk = []
        last_contig = None

        for entry in gff:

            if last_contig != entry.contig:
                # NOTE(review): on the first entry this is called with
                # last_contig=None and an empty chunk; presumably
                # processChunk ignores that case - confirm.
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []

            chunk.append(entry)

        # flush the entries of the last contig
        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        for entry in GTF.iterator(options.stdin):
            key = (entry.contig, entry.source, entry.feature)
            intervals[key] += 1
            bases[key] += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
            # Only computable with a genome; without one ``fasta`` is
            # None and the coverage columns are omitted altogether.
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())
        else:
            options.stdout.write("\n")
            total_genome_size = None

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig = key[0]
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) / fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()

Пример #4

Показать файл

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a file of windows (``--windows-bed-file``, required) and,
    optionally, a file of data intervals (``--filename-data``). For every
    contig that is present in the windows (and in the data, if given),
    ``annotateWindows`` is called with that contig's windows and the
    slice of data entries belonging to it. Without a data file the
    transform is forced to ``complement`` and an empty data list is
    passed.

    Raises
    ------
    ValueError
        If no windows file is supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w",
                        "--windows-bed-file",
                        dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d",
                        "--filename-data",
                        dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f",
                        "--features",
                        dest="features",
                        type=str,
                        action="append",
                        choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c",
                        "--decorator",
                        dest="decorator",
                        type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score", "stddev-score",
                                 "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e",
                        "--skip-empty",
                        dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    # The option string must not carry a trailing "=": argparse already
    # accepts both "--transform VALUE" and "--transform=VALUE".
    parser.add_argument(
        "-t",
        "--transform",
        dest="transform",
        type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    # Pass argv through so that an explicitly supplied argument list is
    # honoured instead of being silently ignored.
    (args) = E.start(parser, argv)

    #    test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    with iotools.open_file(args.filename_windows, "r") as infile:
        windows = GTF.readAsIntervals(GTF.iterator(infile))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        # NOTE(review): --is-gtf does not change how the data file is
        # read; both cases go through GTF.readFromFile. Confirm whether
        # a dedicated GFF reader was intended for the non-GTF case.
        with iotools.open_file(args.filename_data, "r") as infile:
            gff_data = GTF.readFromFile(infile)

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        # Without a genome, approximate each contig's size by the largest
        # second element found among its windows.
        # presumably each window is a (start, end) pair - TODO confirm
        for contig, values in windows.items():
            map_contig2size[contig] = max(x[1] for x in values)
        fasta = None

    contigs = sorted(map_contig2size.keys())

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            # data_ranges[contig] holds the bounds of this contig's
            # entries within gff_data, used here as a slice.
            first, last = data_ranges[contig][0], data_ranges[contig][1]
            annotateWindows(
                contig, windows[contig], gff_data[first:last], fasta, args)
        else:
            annotateWindows(contig, windows[contig], [], fasta, args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()