def getSigHeights(sig_bed, bam_file, outfile):
    '''Take a bedGraph of significant cross-linked bases and return a
    bedGraph of heights.'''

    print(iCLIP.__file__)
    bam = pysam.AlignmentFile(bam_file)
    last_contig = None
    intervals = []
    outlist = []

    for line in IOTools.openFile(sig_bed):
        contig, start, end, pval = line.strip().split("\t")

        if last_contig is None:
            last_contig = contig

        if contig != last_contig:
            print(last_contig)
            out = iCLIP.count_intervals(bam, intervals, last_contig)
            out.index.name = "start"
            out.name = "count"
            out = out.reset_index()
            out["contig"] = last_contig
            outlist.append(out)
            intervals = []
            last_contig = contig

        intervals.append((int(start), int(end)))

    # output the final chrom
    out = iCLIP.count_intervals(bam, intervals, last_contig)
    out.index.name = "start"
    out.name = "count"
    out = out.reset_index()
    out["contig"] = last_contig
    outlist.append(out)

    outframe = pandas.concat(outlist)
    outframe["end"] = outframe["start"] + 1
    outframe = outframe.loc[:, ["contig", "start", "end", "count"]]
    outframe["start"] = outframe["start"].astype("int")
    outframe["end"] = outframe["end"].astype("int")
    outframe.to_csv(IOTools.openFile(outfile, "w"),
                    sep="\t", index=False, header=False)
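# The function above treats the return value of iCLIP.count_intervals as a
# pandas Series of per-base tag counts indexed by genomic position. The short,
# self-contained sketch below mirrors that assumption with made-up numbers to
# show how such a Series is reshaped into bedGraph rows (contig, start, end,
# count) exactly as done above.
import pandas

counts = pandas.Series({100: 3, 107: 1}, name="count")
counts.index.name = "start"
frame = counts.reset_index()
frame["contig"] = "chr1"
frame["end"] = frame["start"] + 1
frame = frame.loc[:, ["contig", "start", "end", "count"]]
print(frame.to_csv(sep="\t", index=False, header=False))
# chr1    100    101    3
# chr1    107    108    1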
def countTagsInClusters(bedfile, bamfile, outfile):

    bam = pysam.AlignmentFile(bamfile)

    outlines = []

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        outlines.append(["%s:%i-%i" % (bed.contig, bed.start, bed.end),
                         str(counts)])

    IOTools.writeLines(outfile, outlines, header=["position", "count"])
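# Hedged usage sketch for countTagsInClusters; the file names below are
# hypothetical placeholders, not taken from the source. Each output row is a
# UCSC-style "contig:start-end" position plus the summed tag count for that
# cluster:
#
#   countTagsInClusters("clusters.bed.gz", "sample.bam", "cluster_counts.tsv.gz")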
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=["ave_dist", "min_dist", "corr"],
                      default="min_dist",
                      help="Method for calculating similarity between profiles")
    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=10,
                      help="Amount to spread each tag by")
    parser.add_option("-k", "--keep-dist", dest="keep_dist",
                      action="store_true",
                      help="Keep the distribution of tag depths")
    parser.add_option("-r", "--rands", dest="rands", type="int",
                      default=100,
                      help="Number of randomisations to use for calculating"
                      " mean and stdev of distance")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    profile1_file, profile2_file = args
    profile1_file = pysam.AlignmentFile(profile1_file)

    if profile2_file.endswith("bed") or profile2_file.endswith("bed.gz"):
        profile2_file = Bed.readAndIndex(profile2_file, with_values=True)
        profile2_counter = bed_counter
    else:
        profile2_file = pysam.AlignmentFile(profile2_file)
        profile2_counter = iCLIP.count_intervals

    if options.method == "min_dist":
        distance_func = iCLIP.findMinDistance
    elif options.method == "ave_dist":
        distance_func = iCLIP.calcAverageDistance
    else:
        def distance_func(profile1, profile2):
            return iCLIP.corr_profile(profile1, profile2, options.spread,
                                      profile2_ready=True)

    for exon in GTF.iterator(options.stdin):
        if exon.feature != "exon":
            continue

        contig = exon.contig
        strand = exon.strand
        transcript_id = exon.transcript_id
        start = exon.start
        end = exon.end

        profile1 = iCLIP.count_intervals(profile1_file, [(start, end)],
                                         contig=contig, strand=strand)
        profile2 = profile2_counter(profile2_file, [(start, end)],
                                    contig=contig, strand=strand)

        if profile1.sum() == 0 or profile2.sum() == 0:
            z = "NA"
            distance = "NA"
            options.stdout.write(
                "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t"
                "%(strand)s\t%(distance)s\t%(z)s\n" % locals())
            continue

        if options.method == "corr":
            profile2 = iCLIP.spread(profile2, options.spread)

        distance = distance_func(profile1, profile2)

        rands = iCLIP.rand_apply(profile=profile1,
                                 exon=exon,
                                 n=options.rands,
                                 func=distance_func,
                                 keep_dist=options.keep_dist,
                                 profile2=profile2)

        z = (distance - rands.mean()) / rands.std()

        options.stdout.write(
            "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t"
            "%(strand)s\t%(distance).3f\t%(z).2f\n" % locals())

    # write footer and output benchmark information.
    E.Stop()
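# Hypothetical invocation of the script above; the script and file names are
# placeholders, not from the source. It reads exon GTF records on stdin and
# writes one line per exon with the observed profile distance and a z-score
# against randomised profiles:
#
#   python profile_distance.py --method=corr --spread=15 --rands=100 \
#       factor1.bam factor2.bam < exons.gtf > distances.tsv
#
# The z-score standardises the observed distance against the randomisation
# distribution; a minimal numpy illustration (numbers are made up):
import numpy

rands = numpy.array([12.0, 15.0, 11.0, 14.0, 13.0])
distance = 4.0
z = (distance - rands.mean()) / rands.std()  # negative z: closer than expected by chance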
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts")
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    outlines = []

    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32",
                                            use_centre=options.centre)
        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32",
                                              use_centre=options.centre)
        intron_counts = intron_counts.sum()

        if options.feature == "exon":
            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                try:
                    exon_id = feature[0].exon_number
                except AttributeError:
                    exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = float(intron_counts)

        outlines.append([gene_id, transcript_id, exon_id,
                         str(float(exon_counts)), str(intron_counts)])

    options.stdout.write("\t".join(["gene_id", "transcript_id", "exon_id",
                                    "exon_count", "intron_count"]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
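# Hypothetical invocations of the script above; script and file names are
# placeholders, not from the source. The tag signal can come from a BAM file
# (positional argument), a pair of stranded BigWigs, or a tabix-indexed bed
# file, matching the make_getter branches above:
#
#   python count_clip_sites.py -f gene sample.bam < genes.gtf > counts.tsv
#   python count_clip_sites.py -f transcript --plus-bw=plus.bw --minus-bw=minus.bw \
#       < transcripts.gtf > counts.tsv
#   python count_clip_sites.py -f exon --bed=tags.bed.gz < exons.gtf > counts.tsv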
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    grouping_choices = ["exons", "utrs", "all"]
    parser.add_option("-g", "--grouping", dest="grouping", type="choice",
                      choices=grouping_choices,
                      help="How to group transcript regions, choices are [%s]"
                      % ",".join(grouping_choices))
    parser.add_option("-p", "--pipeout", dest="pipeout", action="store_true",
                      help="Output continuously to the pipe rather than in a"
                      " chunk at the end")
    parser.add_option("-d", "--dtype", dest="dtype", type="string",
                      default="int32",
                      help="Numpy dtype for storing counts")
    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      default=15,
                      help="Size of window either side of crosslinked base to"
                      " consider")
    parser.add_option("-f", "--fdr", dest="fdr", action="store_true",
                      default=False,
                      help="perform BH fdr correction on p-values, implies not"
                      " --pipeout")
    parser.add_option("-o", "--output-windows", dest="output_windows",
                      action="store_true",
                      default=False,
                      help="Output consolidated windows instead of bases")
    parser.add_option("-b", "--output-both", type="string", dest="output_both",
                      default=None,
                      help="Output both bases bedGraph (stdout) and windows"
                      " bed12 (specified file).")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      default=0.05,
                      help="p-value threshold under which to merge windows")
    parser.add_option("-c", "--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than -1 base when no"
                      " mutation is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Standard in contains the transcripts
    gffs = GTF.gene_iterator(GTF.iterator(options.stdin))

    # bam file is the first positional argument
    bamfile = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    if options.output_both:
        outfile_bases = options.stdout
        outfile_windows = IOTools.openFile(options.output_both, "w")
    elif options.output_windows:
        outfile_bases = None
        outfile_windows = options.stdout
    else:
        outfile_bases = options.stdout
        outfile_windows = None

    if options.fdr and options.pipeout:
        E.warning("--fdr implies not --pipeout, instant output disabled")
        options.pipeout = False

    if options.pipeout:
        output = InstantOutput(outfile_bases=outfile_bases,
                               outfile_windows=outfile_windows,
                               window_size=options.window_size,
                               threshold=options.threshold)
    else:
        output = DeferredOutput(outfile_bases=outfile_bases,
                                outfile_windows=outfile_windows,
                                correct=options.fdr,
                                window_size=options.window_size,
                                threshold=options.threshold)

    E.info("Counting across transcripts ...")

    max_end = 0
    for gene in gffs:

        if options.grouping == "all":
            gene = GTF.merged_gene_iterator(gene)

        transcript_ps = {}

        for transcript in gene:

            # E.debug("Transcript is %s" % transcript[0].transcript_id)
            coords_converter = iCLIP.TranscriptCoordInterconverter(transcript)

            exons = GTF.asRanges(transcript, "exon")
            counts = iCLIP.count_intervals(bamfile,
                                           exons,
                                           strand=transcript[0].strand,
                                           contig=transcript[0].contig,
                                           dtype=options.dtype)

            counts.index = coords_converter.genome2transcript(
                counts.index.values)
            counts = counts.sort_index()

            cds = GTF.asRanges(transcript, "CDS")

            if options.grouping == "utrs" and len(cds) > 0:

                cds_interval = (cds[0][0], cds[-1][1])
                cds_interval = coords_converter.genome2transcript(cds_interval)
                cds_interval.sort()

                cds_length = cds_interval[1] - cds_interval[0]

                p_intervals = [(0, cds_interval[0]),
                               (cds_interval[0], cds_length),
                               (cds_interval[1],
                                coords_converter.length - cds_interval[1])]

            else:  # do not group by cds or there is no cds
                p_intervals = [(0, coords_converter.length)]

            p_values = [calculateProbabilities(counts, options.window_size,
                                               length=length, start=start)
                        for start, length in p_intervals
                        if length > 0]

            if len(p_values) > 1:
                p_values = pd.concat(p_values)
            else:
                p_values = p_values[0]

            p_values.index = coords_converter.transcript2genome(
                p_values.index.values)

            intron_intervals = GTF.toIntronIntervals(transcript)

            if len(intron_intervals) > 0:
                intron_coords = iCLIP.TranscriptCoordInterconverter(
                    transcript, introns=True)
                intron_counts = iCLIP.count_intervals(
                    bamfile,
                    intron_intervals,
                    strand=transcript[0].strand,
                    contig=transcript[0].contig,
                    dtype=options.dtype)

                intron_counts.index = intron_coords.genome2transcript(
                    intron_counts.index.values)
                intron_counts = intron_counts.sort_index()
                intron_pvalues = calculateProbabilities(intron_counts,
                                                        options.window_size,
                                                        intron_coords.length)
                intron_pvalues.index = intron_coords.transcript2genome(
                    intron_pvalues.index.values)
                p_values = p_values.append(intron_pvalues)

            transcript_ps[transcript[0].transcript_id] = p_values

        transcript_df = pd.DataFrame(transcript_ps)
        transcript_df.index.rename("position", inplace=True)
        transcript_df["contig"] = gene[0][0].contig
        transcript_df["strand"] = gene[0][0].strand
        transcript_df["gene_id"] = gene[0][0].gene_id
        transcript_df.set_index("contig", append=True, inplace=True)
        transcript_df.set_index("strand", append=True, inplace=True)
        transcript_df.set_index("gene_id", append=True, inplace=True)

        gene_ps = transcript_df.mean(1)
        gene_ps = gene_ps.reorder_levels(["gene_id", "contig",
                                          "strand", "position"])
        output.write(gene_ps, gene)

    output.close()

    # write footer and output benchmark information.
    E.Stop()
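# Hypothetical invocation of the significant-bases script above; script and
# file names are placeholders, not from the source. Transcripts are read as
# GTF on stdin and the BAM is the first positional argument; per-base p-values
# are written to stdout as bedGraph, and merged windows can additionally be
# written as bed12 via --output-both:
#
#   python significant_bases.py --grouping=utrs -w 15 --fdr \
#       --output-both=windows.bed.gz sample.bam < transcripts.gtf.gz \
#       > base_pvalues.bedgraph.gz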
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    bamfile = pysam.AlignmentFile(args[0])

    outlines = []

    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32")
        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32")
        intron_counts = intron_counts.sum()

        if options.feature == "exon":
            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id

        outlines.append([gene_id, transcript_id, exon_id,
                         str(exon_counts), str(intron_counts)])

    options.stdout.write("\t".join(["gene_id", "transcript_id", "exon_id",
                                    "exon_count", "intron_count"]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
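# Hypothetical invocation of this BAM-only variant of the counting script;
# script and file names are placeholders, not from the source:
#
#   python count_clip_sites.py --feature=transcript sample.bam \
#       < transcripts.gtf.gz > exon_intron_counts.tsv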