Python GTF.Overlap примеры использования

Язык программирования: Python

Пространство имен/Пакет: CGAT

Класс/Тип: GTF

Метод/Функция: Overlap

Примеров на hotexamples.com: 2

Python GTF.Overlap - 2 примера найдено. Это лучшие примеры Python кода для CGAT.GTF.Overlap, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

Entry(30)

Overlap(2)

SortPerContig(1)

Основные методы

Entry (30)

Overlap (2)

SortPerContig (1)

Пример #1

Показать файл

Файл: gff2coverage.py Проект: lesheng/cgat

def processChunk(contig, chunk, options, fasta=None):
    """Compute windowed, cumulative feature coverage for one contig.

    The contig is divided into windows. For every feature type listed in
    ``options.features`` the running total of covered bases is recorded
    once per window, and the resulting table is passed to ``printValues``.

    This function requires segments to be non-overlapping.

    Arguments
    ---------
    contig :
        Contig name; forwarded to ``fasta.getLength`` and ``printValues``.
    chunk :
        Sequence of entries exposing ``start``, ``end`` and ``feature``.
        An empty chunk is a no-op.
        NOTE(review): the binning loop walks the entries in the given order
        and appears to assume they are sorted by start -- confirm with the
        caller.
    options :
        Object providing ``window_size``, ``num_bins`` and ``features``.
    fasta :
        Optional object with a ``getLength(contig)`` method; required when
        the windows are defined through ``options.num_bins``.

    Raises
    ------
    ValueError
        If two entries of the chunk overlap, if neither a window size nor
        (number of bins + genomic sequence) is available, or if the
        resulting window size is not positive.
    """

    if not chunk:
        return

    # Check whether there are overlapping features or not. Every pair is
    # tested exactly once, by position in the chunk, so entries that compare
    # equal to each other (e.g. exact duplicates) are tested as well.
    for index, feature in enumerate(chunk):
        for otherFeature in chunk[index + 1:]:
            if GTF.Overlap(feature, otherFeature):
                raise ValueError(" Histogram could not be created"
                                 " since the file contains overlapping "
                                 "features! \n%s\n%s  "
                                 % (feature, otherFeature))

    # compute max_coordinate for the histogram
    max_coordinate = max(x.end for x in chunk)

    # compute window size
    if options.window_size:
        window_size = options.window_size
        num_bins = int(math.ceil((float(max_coordinate) / window_size)))
    elif options.num_bins and fasta:
        contig_length = fasta.getLength(contig)
        assert max_coordinate <= contig_length, ("maximum coordinate (%i) "
                                                 "larger than contig size (%i)"
                                                 " for contig %s"
                                                 % (max_coordinate,
                                                    contig_length,
                                                    contig))
        max_coordinate = contig_length
        window_size = int(math.floor(float(contig_length) / options.num_bins))
        num_bins = options.num_bins
    else:
        raise ValueError("please specify a window size or provide "
                         "genomic sequence with number of bins.")

    # A window of size zero (more bins than bases on the contig) or a
    # negative window can never advance past a feature; fail with a clear
    # message instead of running off the end of the bins.
    if window_size <= 0:
        raise ValueError("window size must be positive, got %s for contig %s"
                         % (window_size, contig))

    values = [[] for _ in range(num_bins)]

    # do several parses for each feature, slow, but easier to code
    # alternatively: sort by feature and location.
    for feature in options.features:
        total = 0
        bin_index = 0
        # exclusive end coordinate of the window currently being filled
        end = window_size
        for entry in chunk:
            if entry.feature != feature:
                continue

            # windows that end before this entry starts keep the old total
            while end < entry.start:
                values[bin_index].append(total)
                bin_index += 1
                end += window_size

            # the entry extends beyond the current window: add the part
            # inside the window, close the window and move on
            while entry.end > end:
                seg_start = max(entry.start, end - window_size)
                seg_end = min(entry.end, end)
                total += seg_end - seg_start
                values[bin_index].append(total)
                end += window_size
                bin_index += 1

            # remainder of the entry inside the (still open) current window
            seg_start = max(entry.start, end - window_size)
            seg_end = min(entry.end, end)
            total += seg_end - seg_start

        # all remaining windows carry the final total
        while bin_index < num_bins:
            values[bin_index].append(total)
            bin_index += 1

    printValues(contig, max_coordinate, window_size, values, options)

Пример #2

Показать файл

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two GFF/GTF files (the two positional arguments), keeps only
    ``exon`` entries, sorts both lists by (contig, strand, start, end) and
    walks them in parallel. Overlapping pairs are written to the "overlap"
    output (with ``--write-equivalent``), entries without a partner to the
    "diff" output, and per contig/strand counters plus a grand total to the
    "total" output. With ``--as-gtf`` additional gene-level tables are
    written.

    NOTE(review): *argv* is normalised here but is not forwarded to
    ``E.Start`` in this block -- confirm how ``E.Start`` obtains its
    arguments.

    Raises
    ------
    ValueError
        If the number of positional arguments is not exactly two.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: diff_gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--write-equivalent",
                      dest="write_equivalent",
                      help="write equivalent entries [default=%default].",
                      action="store_true")

    parser.add_option("-f",
                      "--write-full",
                      dest="write_full",
                      help="write full gff entries [default=%default].",
                      action="store_true")

    parser.add_option(
        "-o",
        "--format=",
        dest="format",
        help="output format [flat|multi-line] [default=%default]")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option(
        "-a",
        "--as-gtf",
        "--is-gtf",
        dest="as_gtf",
        action="store_true",
        help=
        "input is in gtf format. Output on overlapping genes will be output [default=%default]."
    )

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        format="flat",
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.Start(parser, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data")

    # Both inputs are read with the same reader whether or not --as-gtf is
    # given; the flag only switches on the gene-level output further down.
    gff1 = GTF.readFromFile(IOTools.openFile(input_filename1, "r"))
    gff2 = GTF.readFromFile(IOTools.openFile(input_filename2, "r"))

    E.info("reading data finished: %i, %i" % (len(gff1), len(gff2)))

    # removing everything but exons
    gff1 = [x for x in gff1 if x.feature == "exon"]
    gff2 = [x for x in gff2 if x.feature == "exon"]

    E.info("after keeping only 'exons': %i, %i" % (len(gff1), len(gff2)))

    if options.ignore_strand:
        for e in gff1:
            e.strand = "."
        for e in gff2:
            e.strand = "."

    E.info("sorting exons")

    gff1.sort(key=lambda x: (x.contig, x.strand, x.start, x.end))
    gff2.sort(key=lambda x: (x.contig, x.strand, x.start, x.end))

    E.info("sorting exons finished")

    subtotals = []
    subtotal = Counts(add_percent=options.add_percent)

    outfile_diff = getFile(options, "diff")
    outfile_overlap = getFile(options, "overlap")

    def write_equivalent(symbol, left, right):
        """Write one overlapping pair to the overlap file, if requested."""
        if not options.write_equivalent:
            return
        if options.format == "flat":
            outfile_overlap.write(
                "%s\t%s\t%s\n" % (symbol, str(left), str(right)))
        elif options.format == "multi-line":
            outfile_overlap.write(
                "%s\t%s\n\t%s\n" % (symbol, str(left), str(right)))

    # pairs of (gene_id1, gene_id2) are only collected in gtf mode
    if options.as_gtf:
        overlapping_genes = []
    else:
        overlapping_genes = None

    i1, i2 = 0, 0
    n1 = len(gff1)
    n2 = len(gff2)
    first_entry2, first_entry1 = None, None
    # Defined up front so that the code after the main loop also works when
    # one (or both) of the inputs contains no exons.
    entry1, entry2 = None, None

    while i1 < n1 and i2 < n2:

        entry1 = gff1[i1]
        entry2 = gff2[i2]

        E.debug("1: i1=%i n1=%i entry1=%s" % (i1, n1, str(entry1)))
        E.debug("2: i2=%i n2=%i entry2=%s" % (i2, n2, str(entry2)))

        # when chromosome/strand have changed in both (and are the same),
        # print summary info:
        if first_entry1:

            if (first_entry1.contig != entry1.contig or
                    first_entry1.strand != entry1.strand) and \
                    (first_entry2.contig != entry2.contig or
                     first_entry2.strand != entry2.strand) and \
                    entry1.contig == entry2.contig and \
                    entry1.strand == entry2.strand:

                subtotals.append(
                    (first_entry1.contig, first_entry1.strand, subtotal))
                subtotal = Counts(add_percent=options.add_percent)
                first_entry1 = entry1
                first_entry2 = entry2

        else:
            first_entry1 = entry1
            first_entry2 = entry2

        output_1, output_2 = None, None

        if GTF.Overlap(entry1, entry2):

            # collect multiple matches
            last_l = True
            while GTF.Overlap(entry1, entry2):

                if overlapping_genes is not None:
                    overlapping_genes.append((entry1.gene_id, entry2.gene_id))

                # stays True only if the loop is left through ``break``
                # before the pair below has been written
                write_last = True
                subtotal.noverlap += 1
                if entry1.start == entry2.start and entry1.end == entry2.end:
                    symbol = "="
                    subtotal.nidentical += 1
                elif entry1.start == entry2.start or entry1.end == entry2.end:
                    symbol = "|"
                    subtotal.nhalf += 1
                else:
                    symbol = "~"

                output_1 = entry1
                output_2 = entry2

                # advance whichever entry ends first
                if entry1.end < entry2.end:
                    i1 += 1
                    subtotal.nleft += 1
                    last_l = True

                    # Left list exhausted. The pending right entry is
                    # consumed exactly once by the "go right" step after
                    # this loop; advancing it here as well would skip an
                    # entry of the right list.
                    if i1 >= n1:
                        break

                    entry1 = gff1[i1]
                    if GTF.Overlap(entry1, entry2):
                        symbol = "/"
                        # outfile.write( "# split right\n" )
                        subtotal.nsplit_right += 1

                else:
                    i2 += 1
                    subtotal.nright += 1
                    last_l = False

                    # Right list exhausted; the pending left entry is
                    # consumed by the "go left" step after this loop.
                    if i2 >= n2:
                        break

                    entry2 = gff2[i2]
                    if GTF.Overlap(entry1, entry2):
                        symbol = "\\"
                        # outfile.write("# split left\n")
                        subtotal.nsplit_left += 1

                # output at the end, so that symbol is known
                write_equivalent(symbol, output_1, output_2)

                write_last = False

            if write_last and output_1 and output_2:
                write_equivalent(symbol, output_1, output_2)

            # if last advance was left, go right, and vice versa
            if last_l:
                i2 += 1
                subtotal.nright += 1
            else:
                i1 += 1
                subtotal.nleft += 1

        # NOTE(review): if _cmp() returns 0 for a non-overlapping pair,
        # neither branch below advances -- confirm _cmp cannot tie here.
        elif _cmp(entry1, entry2) < 0:
            outfile_diff.write("<\t%s\n" % str(entry1))
            subtotal.nunique_left += 1
            i1 += 1
            subtotal.nleft += 1

        elif _cmp(entry1, entry2) > 0:
            outfile_diff.write(">\t%s\n" % str(entry2))
            subtotal.nunique_right += 1
            i2 += 1
            subtotal.nright += 1

    # Whatever is left in either list has no partner. The entries are
    # fetched by index: the entry1/entry2 variables may still refer to an
    # entry that was already consumed by the main loop.
    while i1 < n1:
        entry1 = gff1[i1]
        outfile_diff.write("<\t%s\n" % str(entry1))
        subtotal.nunique_left += 1
        subtotal.nleft += 1
        i1 += 1

    while i2 < n2:
        entry2 = gff2[i2]
        outfile_diff.write(">\t%s\n" % str(entry2))
        subtotal.nunique_right += 1
        subtotal.nright += 1
        i2 += 1

    # label the last subtotal; nothing to label if both inputs were empty
    last_entry = entry1 if entry1 is not None else entry2
    if last_entry is not None:
        subtotals.append((last_entry.contig, last_entry.strand, subtotal))

    if outfile_diff != options.stdout:
        outfile_diff.close()
    if outfile_overlap != options.stdout:
        outfile_overlap.close()

    ##################################################################
    ##################################################################
    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        s = set(overlapping_genes)
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in s:
            outfile.write("%s\t%s\n" % (a, b))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        # genes of the first file: all (a), overlapping (b), unique (d)
        outfile = getFile(options, "genes_uniq1")
        a = set([x.gene_id for x in gff1])
        b = set([x[0] for x in s])
        d = a.difference(b)
        outfile.write("gene_id1\n")
        outfile.write("\n".join(d) + "\n")
        if outfile != options.stdout:
            outfile.close()
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename1), len(a), len(b),
             100.0 * len(b) / len(a), len(d), 100.0 * len(d) / len(a)))

        # genes of the second file: all (a), overlapping (b), unique (d)
        outfile = getFile(options, "genes_uniq2")
        a = set([x.gene_id for x in gff2])
        b = set([x[1] for x in s])
        d = a.difference(b)
        outfile.write("gene_id2\n")
        outfile.write("\n".join(d) + "\n")
        if outfile != options.stdout:
            outfile.close()

        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename2), len(a), len(b),
             100.0 * len(b) / len(a), len(d), 100.0 * len(d) / len(a)))
        if outfile_total != options.stdout:
            outfile_total.close()

    ##################################################################
    ##################################################################
    ##################################################################
    # print totals
    ##################################################################
    outfile = getFile(options, "total")
    outfile.write("chr\tstrand\t%s\n" %
                  Counts(add_percent=options.add_percent).getHeader())

    total = Counts(add_percent=options.add_percent)
    for x in subtotals:
        outfile.write("\t".join((x[0], x[1], str(x[2]))) + "\n")
        total += x[2]

    outfile.write("\t".join(("all", "all", str(total))) + "\n")

    if outfile != options.stdout:
        outfile.close()

    E.Stop()