def getSigHeights(sig_bed, bam_file, outfile):
    '''Take a bedGraph of significant cross-linked bases and return a
    bedGraph of heights.'''

    print(iCLIP.__file__)
    bam = pysam.AlignmentFile(bam_file)
    last_contig = None
    intervals = []
    outlist = []

    for line in IOTools.openFile(sig_bed):
        contig, start, end, pval = line.strip().split("\t")

        if last_contig is None:
            last_contig = contig

        if contig != last_contig:
            print(last_contig)
            out = iCLIP.count_intervals(bam, intervals, last_contig)
            out.index.name = "start"
            out.name = "count"
            out = out.reset_index()
            out["contig"] = last_contig
            outlist.append(out)
            intervals = []
            last_contig = contig

        intervals.append((int(start), int(end)))

    # output the final chrom
    out = iCLIP.count_intervals(bam, intervals, last_contig)
    out.index.name = "start"
    out.name = "count"
    out = out.reset_index()
    out["contig"] = last_contig
    outlist.append(out)

    outframe = pandas.concat(outlist)
    outframe["end"] = outframe["start"] + 1
    outframe = outframe.loc[:, ["contig", "start", "end", "count"]]
    outframe["start"] = outframe["start"].astype("int")
    outframe["end"] = outframe["end"].astype("int")
    outframe.to_csv(IOTools.openFile(outfile, "w"),
                    sep="\t", index=False, header=False)
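# The function above treats the return value of iCLIP.count_intervals as a
# pandas Series of per-base tag counts indexed by genomic position. The short,
# self-contained sketch below mirrors that assumption with made-up numbers to
# show how such a Series is reshaped into bedGraph rows (contig, start, end,
# count) exactly as done above.
import pandas

counts = pandas.Series({100: 3, 107: 1}, name="count")
counts.index.name = "start"
frame = counts.reset_index()
frame["contig"] = "chr1"
frame["end"] = frame["start"] + 1
frame = frame.loc[:, ["contig", "start", "end", "count"]]
print(frame.to_csv(sep="\t", index=False, header=False))
# chr1    100    101    3
# chr1    107    108    1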
def countTagsInClusters(bedfile, bamfile, outfile):

    bam = pysam.AlignmentFile(bamfile)

    outlines = []

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        outlines.append(["%s:%i-%i" % (bed.contig, bed.start, bed.end),
                         str(counts)])

    IOTools.writeLines(outfile, outlines, header=["position", "count"])
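# Hedged usage sketch for countTagsInClusters; the file names below are
# hypothetical placeholders, not taken from the source. Each output row is a
# UCSC-style "contig:start-end" position plus the summed tag count for that
# cluster:
#
#   countTagsInClusters("clusters.bed.gz", "sample.bam", "cluster_counts.tsv.gz")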
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=["ave_dist", "min_dist", "corr"],
                      default="min_dist",
                      help="Method for calculating similarity between profiles")
    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=10,
                      help="Amount to spread each tag by")
    parser.add_option("-k", "--keep-dist", dest="keep_dist",
                      action="store_true",
                      help="Keep the distribution of tag depths")
    parser.add_option("-r", "--rands", dest="rands", type="int",
                      default=100,
                      help="Number of randomisations to use for calculating"
                      " mean and stdev of distance")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    profile1_file, profile2_file = args
    profile1_file = pysam.AlignmentFile(profile1_file)

    if profile2_file.endswith("bed") or profile2_file.endswith("bed.gz"):
        profile2_file = Bed.readAndIndex(profile2_file, with_values=True)
        profile2_counter = bed_counter
    else:
        profile2_file = pysam.AlignmentFile(profile2_file)
        profile2_counter = iCLIP.count_intervals

    if options.method == "min_dist":
        distance_func = iCLIP.findMinDistance
    elif options.method == "ave_dist":
        distance_func = iCLIP.calcAverageDistance
    else:
        def distance_func(profile1, profile2):
            return iCLIP.corr_profile(profile1, profile2, options.spread,
                                      profile2_ready=True)

    for exon in GTF.iterator(options.stdin):
        if exon.feature != "exon":
            continue

        contig = exon.contig
        strand = exon.strand
        transcript_id = exon.transcript_id
        start = exon.start
        end = exon.end

        profile1 = iCLIP.count_intervals(profile1_file, [(start, end)],
                                         contig=contig, strand=strand)
        profile2 = profile2_counter(profile2_file, [(start, end)],
                                    contig=contig, strand=strand)

        if profile1.sum() == 0 or profile2.sum() == 0:
            z = "NA"
            distance = "NA"
            options.stdout.write(
                "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t"
                "%(strand)s\t%(distance)s\t%(z)s\n" % locals())
            continue

        if options.method == "corr":
            profile2 = iCLIP.spread(profile2, options.spread)

        distance = distance_func(profile1, profile2)

        rands = iCLIP.rand_apply(profile=profile1,
                                 exon=exon,
                                 n=options.rands,
                                 func=distance_func,
                                 keep_dist=options.keep_dist,
                                 profile2=profile2)

        z = (distance - rands.mean()) / rands.std()

        options.stdout.write(
            "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t"
            "%(strand)s\t%(distance).3f\t%(z).2f\n" % locals())

    # write footer and output benchmark information.
    E.Stop()
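# Hypothetical invocation of the script above; the script and file names are
# placeholders, not from the source. It reads exon GTF records on stdin and
# writes one line per exon with the observed profile distance and a z-score
# against randomised profiles:
#
#   python profile_distance.py --method=corr --spread=15 --rands=100 \
#       factor1.bam factor2.bam < exons.gtf > distances.tsv
#
# The z-score standardises the observed distance against the randomisation
# distribution; a minimal numpy illustration (numbers are made up):
import numpy

rands = numpy.array([12.0, 15.0, 11.0, 14.0, 13.0])
distance = 4.0
z = (distance - rands.mean()) / rands.std()  # negative z: closer than expected by chance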
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts")
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    outlines = []

    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32",
                                            use_centre=options.centre)
        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32",
                                              use_centre=options.centre)
        intron_counts = intron_counts.sum()

        if options.feature == "exon":
            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                try:
                    exon_id = feature[0].exon_number
                except AttributeError:
                    exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = float(intron_counts)

        outlines.append([gene_id, transcript_id, exon_id,
                         str(float(exon_counts)), str(intron_counts)])

    options.stdout.write("\t".join(["gene_id", "transcript_id", "exon_id",
                                    "exon_count", "intron_count"]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
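# Hypothetical invocations of the script above; script and file names are
# placeholders, not from the source. The tag signal can come from a BAM file
# (positional argument), a pair of stranded BigWigs, or a tabix-indexed bed
# file, matching the make_getter branches above:
#
#   python count_clip_sites.py -f gene sample.bam < genes.gtf > counts.tsv
#   python count_clip_sites.py -f transcript --plus-bw=plus.bw --minus-bw=minus.bw \
#       < transcripts.gtf > counts.tsv
#   python count_clip_sites.py -f exon --bed=tags.bed.gz < exons.gtf > counts.tsv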
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    grouping_choices = ["exons", "utrs", "all"]
    parser.add_option("-g", "--grouping", dest="grouping", type="choice",
                      choices=grouping_choices,
                      help="How to group transcript regions, choices are [%s]"
                      % ",".join(grouping_choices))
    parser.add_option("-p", "--pipeout", dest="pipeout", action="store_true",
                      help="Output continuously to the pipe rather than in a"
                      " chunk at the end")
    parser.add_option("-d", "--dtype", dest="dtype", type="string",
                      default="int32",
                      help="Numpy dtype for storing counts")
    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      default=15,
                      help="Size of window either side of crosslinked base to"
                      " consider")
    parser.add_option("-f", "--fdr", dest="fdr", action="store_true",
                      default=False,
                      help="perform BH fdr correction on p-values, implies not"
                      " --pipeout")
    parser.add_option("-o", "--output-windows", dest="output_windows",
                      action="store_true",
                      default=False,
                      help="Output consolidated windows instead of bases")
    parser.add_option("-b", "--output-both", type="string", dest="output_both",
                      default=None,
                      help="Output both bases bedGraph (stdout) and windows"
                      " bed12 (specified file).")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      default=0.05,
                      help="p-value threshold under which to merge windows")
    parser.add_option("-c", "--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than -1 base when no"
                      " mutation is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Standard in contains the transcripts
    gffs = GTF.gene_iterator(GTF.iterator(options.stdin))

    # bam file is the first positional argument
    bamfile = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    if options.output_both:
        outfile_bases = options.stdout
        outfile_windows = IOTools.openFile(options.output_both, "w")
    elif options.output_windows:
        outfile_bases = None
        outfile_windows = options.stdout
    else:
        outfile_bases = options.stdout
        outfile_windows = None

    if options.fdr and options.pipeout:
        E.warning("--fdr implies not --pipeout, instant output disabled")
        options.pipeout = False

    if options.pipeout:
        output = InstantOutput(outfile_bases=outfile_bases,
                               outfile_windows=outfile_windows,
                               window_size=options.window_size,
                               threshold=options.threshold)
    else:
        output = DeferredOutput(outfile_bases=outfile_bases,
                                outfile_windows=outfile_windows,
                                correct=options.fdr,
                                window_size=options.window_size,
                                threshold=options.threshold)

    E.info("Counting across transcripts ...")

    max_end = 0
    for gene in gffs:

        if options.grouping == "all":
            gene = GTF.merged_gene_iterator(gene)

        transcript_ps = {}

        for transcript in gene:

            # E.debug("Transcript is %s" % transcript[0].transcript_id)
            coords_converter = iCLIP.TranscriptCoordInterconverter(transcript)

            exons = GTF.asRanges(transcript, "exon")
            counts = iCLIP.count_intervals(bamfile,
                                           exons,
                                           strand=transcript[0].strand,
                                           contig=transcript[0].contig,
                                           dtype=options.dtype)

            counts.index = coords_converter.genome2transcript(
                counts.index.values)
            counts = counts.sort_index()

            cds = GTF.asRanges(transcript, "CDS")

            if options.grouping == "utrs" and len(cds) > 0:

                cds_interval = (cds[0][0], cds[-1][1])
                cds_interval = coords_converter.genome2transcript(cds_interval)
                cds_interval.sort()

                cds_length = cds_interval[1] - cds_interval[0]

                p_intervals = [(0, cds_interval[0]),
                               (cds_interval[0], cds_length),
                               (cds_interval[1],
                                coords_converter.length - cds_interval[1])]

            else:  # do not group by cds or there is no cds
                p_intervals = [(0, coords_converter.length)]

            p_values = [calculateProbabilities(counts, options.window_size,
                                               length=length, start=start)
                        for start, length in p_intervals
                        if length > 0]

            if len(p_values) > 1:
                p_values = pd.concat(p_values)
            else:
                p_values = p_values[0]

            p_values.index = coords_converter.transcript2genome(
                p_values.index.values)

            intron_intervals = GTF.toIntronIntervals(transcript)

            if len(intron_intervals) > 0:
                intron_coords = iCLIP.TranscriptCoordInterconverter(
                    transcript, introns=True)
                intron_counts = iCLIP.count_intervals(
                    bamfile,
                    intron_intervals,
                    strand=transcript[0].strand,
                    contig=transcript[0].contig,
                    dtype=options.dtype)

                intron_counts.index = intron_coords.genome2transcript(
                    intron_counts.index.values)
                intron_counts = intron_counts.sort_index()
                intron_pvalues = calculateProbabilities(intron_counts,
                                                        options.window_size,
                                                        intron_coords.length)
                intron_pvalues.index = intron_coords.transcript2genome(
                    intron_pvalues.index.values)
                p_values = p_values.append(intron_pvalues)

            transcript_ps[transcript[0].transcript_id] = p_values

        transcript_df = pd.DataFrame(transcript_ps)
        transcript_df.index.rename("position", inplace=True)
        transcript_df["contig"] = gene[0][0].contig
        transcript_df["strand"] = gene[0][0].strand
        transcript_df["gene_id"] = gene[0][0].gene_id
        transcript_df.set_index("contig", append=True, inplace=True)
        transcript_df.set_index("strand", append=True, inplace=True)
        transcript_df.set_index("gene_id", append=True, inplace=True)

        gene_ps = transcript_df.mean(1)
        gene_ps = gene_ps.reorder_levels(["gene_id", "contig",
                                          "strand", "position"])
        output.write(gene_ps, gene)

    output.close()

    # write footer and output benchmark information.
    E.Stop()
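# Hypothetical invocation of the significant-bases script above; script and
# file names are placeholders, not from the source. Transcripts are read as
# GTF on stdin and the BAM is the first positional argument; per-base p-values
# are written to stdout as bedGraph, and merged windows can additionally be
# written as bed12 via --output-both:
#
#   python significant_bases.py --grouping=utrs -w 15 --fdr \
#       --output-both=windows.bed.gz sample.bam < transcripts.gtf.gz \
#       > base_pvalues.bedgraph.gz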
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    bamfile = pysam.AlignmentFile(args[0])

    outlines = []

    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32")
        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32")
        intron_counts = intron_counts.sum()

        if options.feature == "exon":
            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id

        outlines.append([gene_id, transcript_id, exon_id,
                         str(exon_counts), str(intron_counts)])

    options.stdout.write("\t".join(["gene_id", "transcript_id", "exon_id",
                                    "exon_count", "intron_count"]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
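# Hypothetical invocation of this BAM-only variant of the counting script;
# script and file names are placeholders, not from the source:
#
#   python count_clip_sites.py --feature=transcript sample.bam \
#       < transcripts.gtf.gz > exon_intron_counts.tsv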