def main():
    """Command-line entry point: build the clipping profile of an alignment file.

    Options: ``-i`` input BAM/SAM file, ``-o`` output prefix (both required).
    Calls ``SAM.ParseBAM(input).clipping_profile`` and then runs ``Rscript``
    on ``<prefix>.clipping_profile.r``.

    Prints the help text and exits when a required option is missing, and
    reports a missing input file on stderr before exiting. These exits keep
    the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s).")
    (options, args) = parser.parse_args()

    # The prefix is concatenated with a file suffix below; without this check
    # a missing -o ends in a TypeError instead of the help text.
    if not (options.input_file and options.output_prefix):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.clipping_profile(outfile=options.output_prefix)

    r_script = options.output_prefix + '.clipping_profile.r'
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')
def main():
    """Command-line entry point: annotate splice junctions of an alignment file.

    Options: ``-i`` input BAM/SAM, ``-r`` reference gene model, ``-o`` output
    prefix (all required), ``-m`` minimum intron length, ``-q`` minimum
    mapping quality.

    Calls ``SAM.ParseBAM(input).annotate_junction``, runs ``Rscript`` on
    ``<prefix>.junction_plot.r`` and finally calls ``generate_bed12`` on
    ``<prefix>.junction.xls``. The last two steps are best effort: a failure
    is reported on stderr but does not abort the program.

    Missing options print the help text; missing files are reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="ref_gene_model",
                      help="Reference gene model in bed format. This file is better to be a pooled gene model as it will be used to annotate splicing junctions [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-m", "--min-intron", action="store", type="int", dest="min_intron", default=50,
                      help="Minimum intron length (bp). default=%default [optional]")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    # Same order as before: the gene model is checked first, then the input.
    for required_path in (options.ref_gene_model, options.input_file):
        if not os.path.exists(required_path):
            sys.stderr.write('\n\n' + required_path + " does NOT exists" + '\n\n')
            sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.annotate_junction(outfile=options.output_prefix, refgene=options.ref_gene_model,
                          min_intron=options.min_intron, q_cut=options.map_qual)

    r_script = options.output_prefix + '.junction_plot.r'
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # The full script path is reported (the old message dropped the prefix).
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')

    junction_table = options.output_prefix + '.junction.xls'
    try:
        generate_bed12(junction_table)
    except Exception as err:
        # Still best effort, but no longer silent; SystemExit and
        # KeyboardInterrupt are allowed to propagate.
        sys.stderr.write("generate_bed12 failed for " + junction_table + ": " + str(err) + '\n')
def main():
    """Command-line entry point: print statistics of a BAM file.

    Options: ``-i`` input BAM file (required), ``-q`` minimum mapping quality.
    Calls ``SAM.ParseBAM(input).stat(q_cut=...)``.

    Prints the help text when ``-i`` is missing and reports a non-existent
    input file on stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM format.")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    if not options.input_file:
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        # The old message referenced an undefined name ``input_file`` and
        # raised NameError instead of reporting the missing file.
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.stat(q_cut=options.map_qual)
def main():
    """Command-line entry point: estimate the mRNA inner distance of read pairs.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix, ``-r`` reference gene
    model (all required), ``-k`` sample size, ``-l``/``-u`` histogram bounds,
    ``-s`` histogram step, ``-q`` minimum mapping quality.

    Calls ``SAM.ParseBAM(input).mRNA_inner_distance`` and then runs
    ``Rscript`` on ``<prefix>.inner_distance_plot.r``.

    Missing options, missing files and a non-positive step are reported and
    the program exits with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s)")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="ref_gene",
                      help="Reference gene model in BED format.")
    parser.add_option("-k", "--sample-size", action="store", type="int", dest="sampleSize", default=1000000,
                      help="Number of read-pairs used to estimate inner distance. default=%default")
    parser.add_option("-l", "--lower-bound", action="store", type="int", dest="lower_bound_size", default=-250,
                      help="Lower bound of inner distance (bp). This option is used for ploting histograme. default=%default")
    parser.add_option("-u", "--upper-bound", action="store", type="int", dest="upper_bound_size", default=250,
                      help="Upper bound of inner distance (bp). This option is used for plotting histogram. default=%default")
    parser.add_option("-s", "--step", action="store", type="int", dest="step_size", default=5,
                      help="Step size (bp) of histograme. This option is used for plotting histogram. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.ref_gene):
        parser.print_help()
        sys.exit(0)

    for required_path in (options.input_file, options.ref_gene):
        if not os.path.exists(required_path):
            sys.stderr.write('\n\n' + required_path + " does NOT exists" + '\n\n')
            parser.print_help()
            sys.exit(0)

    if options.step_size <= 0:
        sys.stderr.write("step size is a positive interger" + '\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.mRNA_inner_distance(outfile=options.output_prefix,
                            low_bound=options.lower_bound_size,
                            up_bound=options.upper_bound_size,
                            step=options.step_size,
                            refbed=options.ref_gene,
                            sample_size=options.sampleSize,
                            q_cut=options.map_qual)

    r_script = options.output_prefix + '.inner_distance_plot.r'
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')
def main():
    """Command-line entry point: compute the GC content of reads.

    Options: ``-i`` input BAM/SAM file, ``-o`` output prefix (both required).
    Calls ``SAM.ParseBAM(input).readGC`` and then runs ``Rscript`` on
    ``<prefix>.GC_plot.r``. The Rscript step is best effort: a failure is
    reported on stderr but does not abort the program.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s).")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.readGC(outfile=options.output_prefix)

    r_script = options.output_prefix + ".GC_plot.r"
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        sys.stderr.write("Cannot run Rscript on " + r_script + '\n')
def main():
    """Command-line entry point: build the insertion profile of an alignment file.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix, ``-s`` sequencing
    layout ``SE`` or ``PE`` (all required), ``-q`` minimum mapping quality.

    Calls ``SAM.ParseBAM(input).insertion_profile`` with ``PE`` set from the
    layout, then runs ``Rscript`` on ``<prefix>.insertion_profile.r``.

    Missing options, a missing input file and an unknown layout are reported
    and the program exits with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s).")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    parser.add_option("-s", "--sequencing", action="store", dest="layout",
                      help="Sequencing layout. \"SE\"(single-end) or \"PE\"(pair-end). ")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.output_prefix and options.layout):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    # Reject an unknown layout up front. Previously the error was printed but
    # Rscript was still launched on a script that had never been written.
    if options.layout not in ("SE", "PE"):
        sys.stderr.write("unknow sequencing layout. Must be \"SE\" or \"PE\"" + '\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.insertion_profile(outfile=options.output_prefix,
                          q_cut=options.map_qual,
                          PE=(options.layout == "PE"))

    r_script = options.output_prefix + '.insertion_profile.r'
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')
def main():
    """Command-line entry point: compute the read duplication rate.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix (both required),
    ``-u`` upper limit used for plotting, ``-q`` minimum mapping quality.

    Calls ``SAM.ParseBAM(input).readDupRate`` and then runs ``Rscript`` on
    ``<prefix>.DupRate_plot.r``. The Rscript step is best effort: a failure
    is reported on stderr but does not abort the program.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s).")
    parser.add_option("-u", "--up-limit", action="store", type="int", dest="upper_limit", default=500,
                      help="Upper limit of reads' occurrence. Only used for plotting, default=%default (times)")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        print('\n\n' + options.input_file + " does NOT exists" + '\n', file=sys.stderr)
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.readDupRate(outfile=options.output_prefix,
                    up_bound=options.upper_limit,
                    q_cut=options.map_qual)

    r_script = options.output_prefix + ".DupRate_plot.r"
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        print("Cannot run Rscript on " + r_script, file=sys.stderr)
def main():
    """Command-line entry point: compute per-position read quality.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix (both required),
    ``-r`` reduce fold forwarded as ``shrink``, ``-q`` minimum mapping
    quality.

    Calls ``SAM.ParseBAM(input).readsQual_boxplot`` and then runs ``Rscript``
    on ``<prefix>.qual.r``. The Rscript step is best effort: a failure is
    reported on stderr but does not abort the program.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format. [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r", "--reduce", action="store", type="int", dest="reduce_fold", default=1000,
                      help="To avoid making huge vector in R, nucleotide with particular phred score less frequent than this number will be ignored. Increase this number save more memory while reduce precision. Set to 1 achieves maximum precision (i.e. every nucleotide will be considered). This option only applies to the 'boxplot'. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        print('\n\n' + options.input_file + " does NOT exists" + '\n', file=sys.stderr)
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.readsQual_boxplot(outfile=options.output_prefix,
                          q_cut=options.map_qual,
                          shrink=options.reduce_fold)

    r_script = options.output_prefix + ".qual.r"
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        print("Cannot run Rscript on " + r_script, file=sys.stderr)
def main():
    """Command-line entry point: convert a sorted, indexed BAM file to wiggle.

    Options: ``-i`` input BAM, ``-s`` chromosome size file, ``-o`` output
    prefix (all required), ``-t`` target wigsum for normalisation, ``-u``
    skip multi-hit reads, ``-d`` strand rule, ``-q`` minimum mapping quality.

    Loads chromosome sizes with ``load_chromsize``. When ``-t`` is given, the
    normalisation factor is ``total_wigsum / calWigSum(...)``; if it cannot
    be computed a warning is printed and no normalisation is applied. Finally
    calls ``SAM.ParseBAM(input).bamTowig``.

    Missing options print the help text; missing input, chromosome-size or
    ``.bai`` files are reported on stderr. All exit with the historical
    status 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM format. BAM file must be sorted and indexed using samTools. .bam and .bai files should be placed in the same directory.")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name/ID, second column is chromosome size. Chromosome name (such as \"chr1\") should be consistent between this file and the BAM file.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output wiggle files(s). One wiggle file will be generated for non strand-specific data, two wiggle files (\"Prefix_Forward.wig\" and \"Prefix_Reverse.wig\") will be generated for strand-specific RNA-seq data.")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum",
                      help="Specified wigsum. Eg: 1,000,000,000 equals to coverage of 10 million 100nt reads. Ignore this option to disable normalization")
    parser.add_option("-u", "--skip-multi-hits", action="store_true", dest="skip_multi",
                      help="Skip non-unique hit reads.")
    parser.add_option("-d", "--strand", action="store", type="string", dest="strand_rule", default=None,
                      help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq data, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'. If you are not sure about the strand rule, run 'infer_experiment.py' default=%default (Not a strand specific RNA-seq data).")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality for an alignment to be called \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    # Single-argument print(...) yields the same output on Python 2 and 3.
    if options.skip_multi:
        print("Skip multi-hits:True")
    else:
        print("Skip multi-hits:False")

    if not (options.output_prefix and options.input_file and options.chromSize):
        parser.print_help()
        sys.exit(0)

    for required_path in (options.input_file, options.chromSize):
        if not os.path.exists(required_path):
            sys.stderr.write('\n\n' + required_path + " does NOT exists" + '\n\n')
            sys.exit(0)

    index_path = options.input_file + '.bai'
    if not os.path.exists(index_path):
        sys.stderr.write("index file " + index_path + " does not exists" + '\n')
        sys.exit(0)

    chromSizes = load_chromsize(options.chromSize)

    norm_factor = None
    if options.total_wigsum:
        obj = SAM.ParseBAM(options.input_file)
        wig_sum = obj.calWigSum(chrom_sizes=chromSizes, skip_multi=options.skip_multi)
        sys.stderr.write("\n\ntotal wigsum is:" + str(wig_sum) + '\n\n')
        try:
            # float() forces true division; on Python 2 two integers would
            # otherwise be floor-divided and could yield a factor of 0.
            norm_factor = float(options.total_wigsum) / wig_sum
        except (ZeroDivisionError, TypeError):
            # Still falls back to no normalisation, but now says so.
            norm_factor = None
            sys.stderr.write("Cannot compute normalization factor; wiggle output will not be normalized" + '\n')

    # NOTE(review): a second ParseBAM is created here as in the original,
    # presumably because the first pass consumes the reader - confirm.
    obj = SAM.ParseBAM(options.input_file)
    obj.bamTowig(outfile=options.output_prefix,
                 chrom_sizes=chromSizes,
                 chrom_file=options.chromSize,
                 q_cut=options.map_qual,
                 skip_multi=options.skip_multi,
                 strand_rule=options.strand_rule,
                 WigSumFactor=norm_factor)
def main():
    """Command-line entry point: compute nucleotide composition of reads (NVC).

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix (both required),
    ``-x`` include N and X, ``-q`` minimum mapping quality.

    Calls ``SAM.ParseBAM(input).readsNVC`` and then runs ``Rscript`` on
    ``<prefix>.NVC_plot.r``. The Rscript step is best effort: a failure is
    reported on stderr but does not abort the program.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Input file in BAM or SAM format.[required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-x", "--nx", action="store_true", dest="unknown_nucleotide",
                      help="Flag option. Presense of this flag tells program to include N,X in output NVC plot [required]")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        print('\n\n' + options.input_file + " does NOT exists" + '\n', file=sys.stderr)
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.readsNVC(outfile=options.output_prefix,
                 nx=options.unknown_nucleotide,
                 q_cut=options.map_qual)

    r_script = options.output_prefix + ".NVC_plot.r"
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        print("Cannot run Rscript on " + r_script, file=sys.stderr)
def main():
    """Command-line entry point: infer the strandedness of an RNA-seq experiment.

    Options: ``-i`` input SAM/BAM, ``-r`` reference gene model (both
    required), ``-s`` number of reads to sample.

    Calls ``SAM.ParseBAM(input).configure_experiment`` and prints the
    protocol it returns together with the three fractions to stdout. A
    negative ``other`` fraction is reported as 0.

    Missing options print the help text plus the module docstring; a missing
    file is reported on stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Input alignment file in SAM or BAM format")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed fomat.")
    parser.add_option("-s", "--sample-size", action="store", type="int", dest="sample_size", default=200000,
                      help="Number of reads sampled from SAM/BAM file. default=%default")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.refgene_bed):
        parser.print_help()
        sys.stderr.write('\n\n' + __doc__ + '\n')
        sys.exit(0)

    for required_path in (options.input_file, options.refgene_bed):
        if not os.path.exists(required_path):
            # Name the file that is actually missing; the old message always
            # printed the input file, even when the gene model was absent.
            sys.stderr.write('\n\n' + required_path + " does NOT exists" + '\n\n')
            sys.exit(0)

    if options.sample_size < 1000:
        sys.stderr.write("Warn: Sample Size too small to give a accurate estimation" + '\n')

    obj = SAM.ParseBAM(options.input_file)
    (protocol, sp1, sp2, other) = obj.configure_experiment(refbed=options.refgene_bed,
                                                           sample_size=options.sample_size)
    if other < 0:
        other = 0.0

    # Single-argument print(...) yields the same output on Python 2 and 3.
    if protocol == "PairEnd":
        print("\n\nThis is PairEnd Data")
        print("Fraction of reads explained by \"1++,1--,2+-,2-+\": %.4f" % sp1)
        print("Fraction of reads explained by \"1+-,1-+,2++,2--\": %.4f" % sp2)
        print("Fraction of reads explained by other combinations: %.4f" % other)
    elif protocol == "SingleEnd":
        print("\n\nThis is SingleEnd Data")
        print("Fraction of reads explained by \"++,--\": %.4f" % sp1)
        print("Fraction of reads explained by \"+-,-+\": %.4f" % sp2)
        print("Fraction of reads explained by other combinations: %.4f" % other)
    else:
        print("Unknown Data type")
def main():
    """Command-line entry point: compute read coverage over gene bodies.

    Options: ``-i`` input BAM/SAM, ``-r`` reference gene model, ``-o`` output
    prefix (all required).

    Calls ``SAM.ParseBAM(input).coverageGeneBody`` and then runs ``Rscript``
    on ``<prefix>.geneBodyCoverage_plot.r``.

    Missing options print the help text; missing files are reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="ref_gene_model",
                      help="Reference gene model in bed format. [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    # Same order as before: the gene model is checked first, then the input.
    for required_path in (options.ref_gene_model, options.input_file):
        if not os.path.exists(required_path):
            sys.stderr.write('\n\n' + required_path + " does NOT exists" + '\n\n')
            sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.coverageGeneBody(outfile=options.output_prefix, refbed=options.ref_gene_model)

    r_script = options.output_prefix + '.geneBodyCoverage_plot.r'
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')
def main():
    """Command-line entry point: build the clipping profile of an alignment file.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix (both required),
    ``-q`` minimum mapping quality.

    Calls ``SAM.ParseBAM(input).clipping_profile`` and then runs ``Rscript``
    on ``<prefix>.clipping_profile.r``.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s).")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    # The prefix is concatenated with a file suffix below; without this check
    # a missing -o ends in a TypeError instead of the help text.
    if not (options.input_file and options.output_prefix):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.clipping_profile(outfile=options.output_prefix, q_cut=options.map_qual)

    r_script = options.output_prefix + '.clipping_profile.r'
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')
def main():
    """Command-line entry point: compute per-position read quality.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix (both required),
    ``-r`` reduce fold.

    Calls ``SAM.ParseBAM(input).readsQual_boxplot`` and then runs ``Rscript``
    on ``<prefix>.qual.r``. The Rscript step is best effort: a failure is
    reported on stderr but does not abort the program.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format. [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r", "--reduce", action="store", type="int", dest="reduce_fold", default=1000,
                      help="To avoid making huge vector in R, nucleotide with particular phred score represented less than this number will be ignored. Increase this number save more memory while reduce precision. This option only applies to the 'boxplot'. default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    # NOTE(review): options.reduce_fold is parsed but never forwarded, so
    # -r/--reduce has no effect here. Left unchanged because the signature of
    # readsQual_boxplot is not visible from this function - confirm.
    obj.readsQual_boxplot(outfile=options.output_prefix)

    r_script = options.output_prefix + ".qual.r"
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        sys.stderr.write("Cannot run Rscript on " + r_script + '\n')
def main():
    """Command-line entry point: RPKM saturation analysis by resampling reads.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix, ``-r`` reference gene
    model (all required), ``-d`` strand rule, ``-l``/``-u``/``-s`` sampling
    percentiles (floor, ceiling, step), ``-c`` RPKM cutoff, ``-q`` minimum
    mapping quality.

    Calls ``SAM.ParseBAM(input).saturation_RPKM``, then ``show_saturation``
    on ``<prefix>.eRPKM.xls`` to write ``<prefix>.saturation.r``, and finally
    runs ``Rscript`` on that file (best effort; a failure is reported on
    stderr).

    Missing options print the help text; invalid percentiles and a missing
    input file are reported on stderr. All exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format. [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed fomat. [required]")
    parser.add_option("-d", "--strand", action="store", type="string", dest="strand_rule", default=None,
                      help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'. If you are not sure about the strand rule, run 'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)")
    parser.add_option("-l", "--percentile-floor", action="store", type="int", dest="percentile_low_bound", default=5,
                      help="Sampling starts from this percentile. A integer between 0 and 100. default=%default")
    parser.add_option("-u", "--percentile-ceiling", action="store", type="int", dest="percentile_up_bound", default=100,
                      help="Sampling ends at this percentile. A integer between 0 and 100. default=%default")
    parser.add_option("-s", "--percentile-step", action="store", type="int", dest="percentile_step", default=5,
                      help="Sampling frequency. Smaller value means more sampling times. A integer between 0 and 100. default=%default")
    parser.add_option("-c", "--rpkm-cutoff", action="store", type="float", dest="rpkm_cutoff", default=0.01,
                      help="Transcripts with RPKM smaller than this number will be ignored in visualization plot. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    # -r is documented as required and is passed on as ``refbed``; enforce it
    # here like the other required options.
    if not (options.output_prefix and options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)

    if options.percentile_low_bound < 0 or options.percentile_low_bound > 100:
        sys.stderr.write("percentile_low_bound must be larger than 0 and samller than 100" + '\n')
        sys.exit(0)
    if options.percentile_up_bound < 0 or options.percentile_up_bound > 100:
        sys.stderr.write("percentile_up_bound must be larger than 0 and samller than 100" + '\n')
        sys.exit(0)
    if options.percentile_up_bound < options.percentile_low_bound:
        sys.stderr.write("percentile_up_bound must be larger than percentile_low_bound" + '\n')
        sys.exit(0)
    # A step of 0 used to pass this check although the message below says
    # the step must be larger than 0.
    if options.percentile_step <= 0 or options.percentile_step > options.percentile_up_bound:
        sys.stderr.write("percentile_step must be larger than 0 and samller than percentile_up_bound" + '\n')
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.saturation_RPKM(outfile=options.output_prefix,
                        refbed=options.refgene_bed,
                        sample_start=options.percentile_low_bound,
                        sample_end=options.percentile_up_bound,
                        sample_step=options.percentile_step,
                        strand_rule=options.strand_rule,
                        q_cut=options.map_qual)

    r_script = options.output_prefix + ".saturation.r"
    show_saturation(infile=options.output_prefix + ".eRPKM.xls",
                    outfile=r_script,
                    rpkm_cut=options.rpkm_cutoff)

    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        sys.stderr.write("Cannot run Rscript on " + r_script + '\n')
def main():
    """Command-line entry point: print statistics of an alignment file.

    Option: ``-i`` input BAM/SAM file (required).
    Calls ``SAM.ParseBAM(input).stat()``.

    Prints the help text when ``-i`` is missing and reports a non-existent
    input file on stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    (options, args) = parser.parse_args()

    if not options.input_file:
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        # The old message referenced an undefined name ``input_file`` and
        # raised NameError instead of reporting the missing file.
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.stat()
def main():
    """Command-line entry point: compute nucleotide composition of reads (NVC).

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix (both required),
    ``-x`` include N and X.

    Calls ``SAM.ParseBAM(input).readsNVC`` and then runs ``Rscript`` on
    ``<prefix>.NVC_plot.r``. The Rscript step is best effort: a failure is
    reported on stderr but does not abort the program.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Input file in BAM or SAM format.[required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-x", "--nx", action="store_true", dest="unknown_nucleotide",
                      help="Flag option. Presense of this flag tells program to include N,X in output NVC plot [required]")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.readsNVC(outfile=options.output_prefix, nx=options.unknown_nucleotide)

    r_script = options.output_prefix + ".NVC_plot.r"
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        sys.stderr.write("Cannot run Rscript on " + r_script + '\n')
def main():
    """Command-line entry point: splice junction saturation analysis.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix, ``-r`` reference gene
    model (all required), ``-l``/``-u``/``-s`` sampling percentiles (floor,
    ceiling, step), ``-m`` minimum intron size, ``-v`` minimum supporting
    reads, ``-q`` minimum mapping quality.

    Calls ``SAM.ParseBAM(input).saturation_junction`` and then runs
    ``Rscript`` on ``<prefix>.junctionSaturation_plot.r``.

    Missing options print the help text; invalid percentiles and a missing
    input file are reported on stderr. All exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.[required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed fomat. This gene model is used to determine known splicing junctions. [required]")
    parser.add_option("-l", "--percentile-floor", action="store", type="int", dest="percentile_low_bound", default=5,
                      help="Sampling starts from this percentile. A integer between 0 and 100. default=%default")
    parser.add_option("-u", "--percentile-ceiling", action="store", type="int", dest="percentile_up_bound", default=100,
                      help="Sampling ends at this percentile. A integer between 0 and 100. default=%default")
    parser.add_option("-s", "--percentile-step", action="store", type="int", dest="percentile_step", default=5,
                      help="Sampling frequency. Smaller value means more sampling times. A integer between 0 and 100. default=%default")
    parser.add_option("-m", "--min-intron", action="store", type="int", dest="minimum_intron_size", default=50,
                      help="Minimum intron size (bp). default=%default")
    parser.add_option("-v", "--min-coverage", action="store", type="int", dest="minimum_splice_read", default=1,
                      help="Minimum number of supportting reads to call a junction. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)

    if options.percentile_low_bound < 0 or options.percentile_low_bound > 100:
        print("percentile_low_bound must be larger than 0 and samller than 100", file=sys.stderr)
        sys.exit(0)
    if options.percentile_up_bound < 0 or options.percentile_up_bound > 100:
        print("percentile_up_bound must be larger than 0 and samller than 100", file=sys.stderr)
        sys.exit(0)
    if options.percentile_up_bound < options.percentile_low_bound:
        print("percentile_up_bound must be larger than percentile_low_bound", file=sys.stderr)
        sys.exit(0)
    # A step of 0 used to pass this check although the message below says
    # the step must be larger than 0.
    if options.percentile_step <= 0 or options.percentile_step > options.percentile_up_bound:
        print("percentile_step must be larger than 0 and samller than percentile_up_bound", file=sys.stderr)
        sys.exit(0)

    if not os.path.exists(options.input_file):
        print('\n\n' + options.input_file + " does NOT exists" + '\n', file=sys.stderr)
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.saturation_junction(outfile=options.output_prefix,
                            refgene=options.refgene_bed,
                            sample_start=options.percentile_low_bound,
                            sample_end=options.percentile_up_bound,
                            sample_step=options.percentile_step,
                            min_intron=options.minimum_intron_size,
                            recur=options.minimum_splice_read,
                            q_cut=options.map_qual)

    r_script = options.output_prefix + '.junctionSaturation_plot.r'
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # The full script path is reported (the old message dropped the prefix).
        print("Cannot generate pdf file from " + r_script, file=sys.stderr)
def main():
    """Command-line entry point: compute the read duplication rate.

    Options: ``-i`` input BAM/SAM, ``-o`` output prefix (both required),
    ``-u`` upper limit used for plotting.

    Calls ``SAM.ParseBAM(input).readDupRate`` and then runs ``Rscript`` on
    ``<prefix>.DupRate_plot.r``. The Rscript step is best effort: a failure
    is reported on stderr but does not abort the program.

    Missing options print the help text; a missing input file is reported on
    stderr. Both exit with the historical status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix",
                      help="Prefix of output files(s).")
    parser.add_option("-u", "--up-limit", action="store", type="int", dest="upper_limit", default=500,
                      help="upper limit of duplicated times. Only used for plotting, default=%default (times)")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.readDupRate(outfile=options.output_prefix, up_bound=options.upper_limit)

    r_script = options.output_prefix + ".DupRate_plot.r"
    try:
        # Argument list instead of a shell string, so spaces or shell
        # metacharacters in the prefix cannot alter the command.
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        # Raised when the Rscript executable cannot be started.
        status = None
    if status != 0:
        # Previously any failure here was swallowed silently.
        sys.stderr.write("Cannot run Rscript on " + r_script + '\n')
def main():
    """Infer the strandedness of an RNA-seq experiment and print the result.

    Reads ``-i`` (alignment file), ``-r`` (reference gene model), ``-s``
    (sample size) and ``-q`` (minimum mapping quality) from the command line,
    hands them to ``SAM.ParseBAM(...).configure_experiment`` and prints the
    returned protocol with its three fractions to stdout.

    Exits with status 0 after printing the help text (plus the module
    docstring) when a required option is missing, or after reporting the
    first required file that does not exist.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input-file",
        action="store", type="string", dest="input_file",
        help="Input alignment file in SAM or BAM format")
    parser.add_option(
        "-r", "--refgene",
        action="store", type="string", dest="refgene_bed",
        help="Reference gene model in bed fomat.")
    parser.add_option(
        "-s", "--sample-size",
        action="store", type="int", dest="sample_size", default=200000,
        help="Number of reads sampled from SAM/BAM file. default=%default")
    parser.add_option(
        "-q", "--mapq",
        action="store", type="int", dest="map_qual", default=30,
        help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    (options, args) = parser.parse_args()

    required_paths = (options.input_file, options.refgene_bed)
    if not all(required_paths):
        parser.print_help()
        print('\n\n' + __doc__, file=sys.stderr)
        sys.exit(0)

    # Lazily find the first required file that is absent, if any.
    first_missing = next((path for path in required_paths if not os.path.exists(path)), None)
    if first_missing is not None:
        print('\n\n' + first_missing + " does NOT exists." + '\n', file=sys.stderr)
        sys.exit(0)

    if options.sample_size < 1000:
        print("Warn: Sample Size too small to give a accurate estimation", file=sys.stderr)

    bam = SAM.ParseBAM(options.input_file)
    (protocol, sp1, sp2, other) = bam.configure_experiment(
        refbed=options.refgene_bed,
        sample_size=options.sample_size,
        q_cut=options.map_qual)
    # A negative remainder is reported as zero.
    other = 0.0 if other < 0 else other

    if protocol not in ("PairEnd", "SingleEnd"):
        print("Unknown Data type")
        return

    if protocol == "PairEnd":
        print("\n\nThis is PairEnd Data")
        print("Fraction of reads failed to determine: %.4f" % other)
        print("Fraction of reads explained by \"1++,1--,2+-,2-+\": %.4f" % sp1)
        print("Fraction of reads explained by \"1+-,1-+,2++,2--\": %.4f" % sp2)
    else:
        print("\n\nThis is SingleEnd Data")
        print("Fraction of reads failed to determine: %.4f" % other)
        print("Fraction of reads explained by \"++,--\": %.4f" % sp1)
        print("Fraction of reads explained by \"+-,-+\": %.4f" % sp2)
def main():
    """Command-line entry point that converts a BAM file to FASTQ.

    Parses ``-i/--input-file``, ``-o/--out-prefix``, ``-s/--single-end`` and
    ``-c/--compress``; calls ``SAM.ParseBAM(input).bam2fq`` and, when
    ``--compress`` is given, runs ``gzip`` on ``<prefix>.fastq`` (single-end)
    or on ``<prefix>.R1.fastq`` and ``<prefix>.R2.fastq`` (paired-end).

    Exits with status 0 after printing help or a "does NOT exists" message
    when required options or the input file are missing.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Alignment file in BAM format")
    parser.add_option("-o", "--out-prefix", action="store", type="string",
                      dest="output_prefix",
                      help="Prefix of output fastq files(s).")
    parser.add_option("-s", "--single-end", action="store_true",
                      dest="single",
                      help="Specificy '-s' or '--single-end' for single-end sequencing.")
    parser.add_option("-c", "--compress", action="store_true",
                      dest="gzip",
                      help="Specificy '-c' or '--compress' to compress output fastq file(s) using 'gzip' command.")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n' + '\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    if options.single is True:
        obj.bam2fq(prefix=options.output_prefix, paired=False)
        fastq_files = [options.output_prefix + '.fastq']
        start_message = "run gzip ... "
    else:
        obj.bam2fq(prefix=options.output_prefix, paired=True)
        fastq_files = [options.output_prefix + '.R1.fastq',
                       options.output_prefix + '.R2.fastq']
        start_message = "run gzip ...\n"

    if options.gzip is True:
        sys.stderr.write(start_message)
        try:
            # argv list instead of a shell string: file names with spaces or
            # shell metacharacters are passed to gzip unchanged.
            for fastq in fastq_files:
                subprocess.call(["gzip", fastq])
        except OSError as err:
            # Raised when the gzip executable cannot be started.
            sys.stderr.write("Cannot run gzip: " + str(err) + "\n")
        else:
            sys.stderr.write("Done.\n")
def main():
    """Infer the strandedness of an RNA-seq experiment and write it to a CSV.

    Options: ``-i`` alignment file, ``-r`` gene model (BED), ``-s`` sample
    size, ``-q`` minimum mapping quality, ``-p`` p-value threshold and
    ``-o`` output file.

    ``SAM.ParseBAM.configure_experiment`` returns the fractions ``sp1`` and
    ``sp2`` of reads explained by each strand layout.  A two-sided binomial
    test (p=0.5) on the estimated read counts decides whether the
    "unstranded" null hypothesis is rejected.  The output holds one header
    line and one data line; ``strand_option`` is 0 (unstranded), 1 or 2.

    Exits with status 0 after printing help when ``-i``, ``-r`` or ``-o`` is
    missing, or when an input file does not exist.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Input alignment file in SAM or BAM format")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat.")
    parser.add_option("-s", "--sample-size", action="store", type="int",
                      dest="sample_size", default=200000,
                      help="Number of reads sampled from SAM/BAM file. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int",
                      dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    parser.add_option("-p", "--pval", action="store", type="float",
                      dest="pval_threshold", default=1e-5,
                      help="Binomial p-value for rejecting null hypothesis that experiment was unstranded. default=%default")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="output_file",
                      help="Name of the output file to write the result to.")
    (options, args) = parser.parse_args()

    # The output file is required too: checking it here avoids doing all the
    # sampling work only to crash on open(None) at the very end.
    if not (options.input_file and options.refgene_bed and options.output_file):
        parser.print_help()
        print('\n\n' + __doc__, file=sys.stderr)
        sys.exit(0)

    for f in (options.input_file, options.refgene_bed):
        if not os.path.exists(f):
            print('\n\n' + f + " does NOT exists." + '\n', file=sys.stderr)
            sys.exit(0)

    if options.sample_size < 1000:
        print("Warn: Sample Size too small to give a accurate estimation", file=sys.stderr)

    obj = SAM.ParseBAM(options.input_file)
    (protocol, sp1, sp2, other) = obj.configure_experiment(
        refbed=options.refgene_bed,
        sample_size=options.sample_size,
        q_cut=options.map_qual)
    if other < 0:
        other = 0.0

    # Assuming "unstranded" is the safe default: downstream counters then
    # skip ambiguous regions.  Only strong evidence should switch to a
    # stranded mode, hence the binomial test below.
    #
    # m is the requested sample size; n1 and n2 are rough estimates of the
    # reads assigned to each layout.  Undetermined reads are ignored, so the
    # number of trials is N = n1 + n2.
    # NOTE(review): m is the requested size, not necessarily the number of
    # reads actually sampled - confirm against configure_experiment.
    m = options.sample_size
    n1 = int(m * sp1)
    n2 = int(m * sp2)
    N = n1 + n2

    # With no assigned reads there is no evidence against the null.
    if N > 0:
        pval = binom_test(n1, N, p=0.5)
    else:
        pval = 1.0

    if pval < options.pval_threshold:
        # sp2 dominant corresponds to "reverse stranded" (featureCounts -s 2,
        # e.g. dUTP libraries); otherwise forward stranded (-s 1).
        strand_option = 2 if sp2 > sp1 else 1
    else:
        strand_option = 0

    header = 'sp1_fraction,sp2_fraction,total_sampled,total_assigned,n1,n2,pval_threshold,pval,strand_option\n'
    # '%.4f' for sp2 as well: the original '%4f' was a typo that printed six
    # decimals for sp2 but four for sp1.
    data = '%.4f,%.4f,%d,%d,%d,%d,%.4E,%.4E,%d' % (
        sp1, sp2, m, N, n1, n2, options.pval_threshold, pval, strand_option)

    with open(options.output_file, 'w') as fout:
        fout.write(header)
        fout.write(data)
def main():
    """Command-line entry point for the mismatch-profile report.

    Parses ``-i`` (BAM file), ``-l`` (read alignment length), ``-o`` (output
    prefix), ``-n`` (number of reads) and ``-q`` (minimum mapping quality),
    calls ``SAM.ParseBAM(input).mismatchProfile(...)`` and then runs
    ``Rscript`` on ``<out-prefix>.mismatch_profile.r``.

    Every validation failure prints a message and/or the help text and exits
    with status 0, as in the original script.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input", action="store", type="string",
                      dest="input_bam",
                      help='Input BAM file. [required]')
    parser.add_option("-l", "--read-align-length", action="store", type="int",
                      dest="read_alignment_length",
                      help="Alignment length of read. It is usually set to the orignial read length. For example, all these cigar strings (\"101M\", \"68M140N33M\", \"53M1D48M\") suggest the read alignment length is 101. [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-n", "--read-num", action="store", type="int",
                      default=1000000, dest="read_number",
                      help="Number of aligned reads with mismatches used to calculate the mismatch profile. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int",
                      dest="map_qual", default=30,
                      help="Minimum mapping quality. default=%default")
    (options, args) = parser.parse_args()

    if not options.input_bam:
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.input_bam):
        sys.stderr.write('\n\n' + options.input_bam + " does NOT exists" + '\n' + '\n')
        parser.print_help()
        sys.exit(0)

    if not options.output_prefix:
        sys.stderr.write('\n\n You must specify the output prefix' + '\n')
        parser.print_help()
        sys.exit(0)

    if not options.read_alignment_length:
        sys.stderr.write('\n\n You must specify read alignment length. It is usually the read length.' + '\n')
        parser.print_help()
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_bam)
    obj.mismatchProfile(read_length=options.read_alignment_length,
                        read_num=options.read_number,
                        q_cut=options.map_qual,
                        outfile=options.output_prefix)

    # argv list instead of a shell string, so prefixes with spaces or shell
    # metacharacters work.  Without a shell, a missing Rscript raises OSError,
    # which makes the error message below reachable (the original bare
    # except around a shell call never fired).
    r_script = options.output_prefix + '.mismatch_profile.r'
    try:
        subprocess.call(["Rscript", r_script])
    except OSError as err:
        sys.stderr.write("Cannot generate pdf file from " + r_script + ": " + str(err) + "\n")
def main():
    """Report how read tags distribute over gene-model features.

    Parses ``-i`` (alignment file) and ``-r`` (gene model in BED), builds the
    feature ranges with ``process_gene_model`` and walks every alignment of
    the input.  QC-failed, duplicate, secondary and unmapped reads are
    skipped.  Each remaining read is split into blocks by
    ``bam_cigar.fetch_exon``; the midpoint of each block ("tag") is assigned
    to the first matching feature in this order: CDS exon, 5'UTR / 3'UTR
    (a tag hitting both is left unassigned), intron, upstream and downstream
    windows (a tag hitting both 10kb windows is left unassigned).

    A summary table with total bases, tag count and tags/Kb per feature is
    printed to stdout.  Missing options or files exit with status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="ref_gene_model",
                      help="Reference gene model in bed format.")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.ref_gene_model):
        sys.stderr.write('\n\n' + options.ref_gene_model + " does NOT exists" + '\n' + '\n')
        sys.exit(0)
    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n' + '\n')
        sys.exit(0)

    # Feature ranges (*_r) and their total sizes in bases (*_base).
    (cds_exon_r, intron_r, utr_5_r, utr_3_r,
     intergenic_up_1kb_r, intergenic_up_5kb_r, intergenic_up_10kb_r,
     intergenic_down_1kb_r, intergenic_down_5kb_r, intergenic_down_10kb_r,
     cds_exon_base, intron_base, utr_5_base, utr_3_base,
     intergenic_up1kb_base, intergenic_up5kb_base, intergenic_up10kb_base,
     intergenic_down1kb_base, intergenic_down5kb_base,
     intergenic_down10kb_base) = process_gene_model(options.ref_gene_model)

    intron_read = 0
    cds_exon_read = 0
    utr_5_read = 0
    utr_3_read = 0
    intergenic_up1kb_read = 0
    intergenic_down1kb_read = 0
    intergenic_up5kb_read = 0
    intergenic_down5kb_read = 0
    intergenic_up10kb_read = 0
    intergenic_down10kb_read = 0
    totalReads = 0
    totalFrags = 0
    unAssignFrags = 0

    obj = SAM.ParseBAM(options.input_file)
    sys.stderr.write("processing " + options.input_file + " ... ")

    # Plain iteration replaces the Python-2-only ``samfile.next()`` loop.
    for aligned_read in obj.samfile:
        if (aligned_read.is_qcfail or aligned_read.is_duplicate
                or aligned_read.is_secondary or aligned_read.is_unmapped):
            continue
        totalReads += 1
        chrom = obj.samfile.getrname(aligned_read.tid).upper()
        exons = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
        totalFrags += len(exons)

        for exn in exons:
            block_st = int(exn[1])
            block_end = int(exn[2])
            mid = block_st + (block_end - block_st) // 2

            # Each range is queried at most once per tag; the original
            # repeated the UTR and 10kb lookups in several elif conditions.
            # NOTE(review): assumes foundone() is a side-effect-free,
            # non-negative hit count - confirm against its definition.
            if foundone(chrom, cds_exon_r, mid, mid) > 0:
                cds_exon_read += 1
                continue

            in_utr5 = foundone(chrom, utr_5_r, mid, mid) > 0
            in_utr3 = foundone(chrom, utr_3_r, mid, mid) > 0
            if in_utr5 and in_utr3:
                unAssignFrags += 1
                continue
            if in_utr5:
                utr_5_read += 1
                continue
            if in_utr3:
                utr_3_read += 1
                continue

            if foundone(chrom, intron_r, mid, mid) > 0:
                intron_read += 1
                continue

            in_up10 = foundone(chrom, intergenic_up_10kb_r, mid, mid) > 0
            in_down10 = foundone(chrom, intergenic_down_10kb_r, mid, mid) > 0
            if in_up10 and in_down10:
                unAssignFrags += 1
                continue

            # The 1kb/5kb/10kb windows are cumulative: a tag in a narrower
            # window is also counted in every wider one.
            if foundone(chrom, intergenic_up_1kb_r, mid, mid) > 0:
                intergenic_up1kb_read += 1
                intergenic_up5kb_read += 1
                intergenic_up10kb_read += 1
            elif foundone(chrom, intergenic_up_5kb_r, mid, mid) > 0:
                intergenic_up5kb_read += 1
                intergenic_up10kb_read += 1
            elif in_up10:
                intergenic_up10kb_read += 1
            elif foundone(chrom, intergenic_down_1kb_r, mid, mid) > 0:
                intergenic_down1kb_read += 1
                intergenic_down5kb_read += 1
                intergenic_down10kb_read += 1
            elif foundone(chrom, intergenic_down_5kb_r, mid, mid) > 0:
                intergenic_down5kb_read += 1
                intergenic_down10kb_read += 1
            elif in_down10:
                intergenic_down10kb_read += 1
            else:
                unAssignFrags += 1

    sys.stderr.write("Finished\n" + "\n")

    separator = "====================================================================="
    print("%-30s%d" % ("Total Reads", totalReads))
    print("%-30s%d" % ("Total Tags", totalFrags))
    print("%-30s%d" % ("Total Assigned Tags", totalFrags - unAssignFrags))
    print(separator)
    print("%-20s%-20s%-20s%-20s" % ('Group', 'Total_bases', 'Tag_count', 'Tags/Kb'))
    table = (
        ('CDS_Exons', cds_exon_base, cds_exon_read),
        ("5'UTR_Exons", utr_5_base, utr_5_read),
        ("3'UTR_Exons", utr_3_base, utr_3_read),
        ("Introns", intron_base, intron_read),
        ("TSS_up_1kb", intergenic_up1kb_base, intergenic_up1kb_read),
        ("TSS_up_5kb", intergenic_up5kb_base, intergenic_up5kb_read),
        ("TSS_up_10kb", intergenic_up10kb_base, intergenic_up10kb_read),
        ("TES_down_1kb", intergenic_down1kb_base, intergenic_down1kb_read),
        ("TES_down_5kb", intergenic_down5kb_base, intergenic_down5kb_read),
        ("TES_down_10kb", intergenic_down10kb_base, intergenic_down10kb_read),
    )
    for group, total_bases, tag_count in table:
        # +1 in the denominator avoids division by zero for empty features.
        print("%-20s%-20d%-20d%-18.2f" % (
            group, total_bases, tag_count,
            tag_count * 1000.0 / (total_bases + 1)))
    print(separator)
def main():
    """Compute fragment count, FPM and FPKM for every transcript of a BED file.

    Options: ``-i`` indexed BAM, ``-o`` output prefix, ``-r`` gene model
    (BED12), ``-d`` strand rule, ``-u`` skip reads below ``-q`` mapping
    quality, ``-e`` normalise by exonic fragments only, ``-s`` weight of a
    pair with only one mapped end.

    The work is done in two passes over the BAM file:

    1. count total and exonic fragments to obtain the denominator;
    2. for each BED line, fetch the alignments in the transcript interval
       and count fragments that touch its exons, per strand when a strand
       rule is given.

    Results go to ``<prefix>.FPKM.xls``.  With a strand rule, only
    transcripts whose strand is '+' or '-' are written.  Missing options,
    index or input files exit with status 0; an unknown strand rule or an
    empty fragment count exits with status 1.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Alignment file in BAM format (SAM is not supported). [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat. [required]")
    parser.add_option("-d", "--strand", action="store", type="string",
                      dest="strand_rule", default=None,
                      help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'. If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)")
    parser.add_option("-u", "--skip-multi-hits", action="store_true",
                      dest="skip_multi",
                      help="How to deal with multiple hit reads. Presence this option renders program to skip multiple hits reads.")
    parser.add_option("-e", "--only-exonic", action="store_true",
                      dest="only_exon",
                      help="How to count total reads. Presence of this option renders program only used exonic (UTR exons and CDS exons) reads, otherwise use all reads.")
    parser.add_option("-q", "--mapq", action="store", type="int",
                      dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    parser.add_option("-s", "--single-read", action="store", type="float",
                      dest="single_read", default=1,
                      help="How to count read-pairs that only have one end mapped. 0: ignore it. 0.5: treat it as half fragment. 1: treat it as whole fragment. default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        sys.stderr.write("cannot find index file of input BAM file" + "\n")
        sys.stderr.write(options.input_file + '.bai' + " does not exists" + "\n")
        sys.exit(0)
    for path in (options.input_file, options.refgene_bed):
        if not os.path.exists(path):
            sys.stderr.write(path + " does NOT exists" + '\n' + "\n")
            sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    OUT = open(options.output_prefix + '.FPKM.xls', 'w')

    # ------------------------------------------------ determine strand rule
    # Paired-end rules look like '1++' (read id, mapped strand, gene strand),
    # single-end rules like '++' (mapped strand, gene strand).
    strandRule = {}
    if options.strand_rule is not None:
        rule_items = options.strand_rule.split(',')
        if len(rule_items) == 4:
            for item in rule_items:
                strandRule[item[0] + item[1]] = item[2]
        elif len(rule_items) == 2:
            for item in rule_items:
                strandRule[item[0]] = item[1]
        else:
            sys.stderr.write("Unknown value of option :'strand_rule' " + options.strand_rule + "\n")
            sys.exit(1)

    # ---------------------------------------------------- counting fragments
    sys.stderr.write("Extract exon regions from " + options.refgene_bed + '...' + "\n")
    gene_ranges = build_range(options.refgene_bed)

    def in_exon(chrom_name, st, end):
        """Return True when [st, end) overlaps an exon range of chrom_name."""
        return (chrom_name in gene_ranges) and len(gene_ranges[chrom_name].find(st, end)) > 0

    sys.stderr.write("Counting total fragment ... ")
    total_frags = 0.0
    exonic_frags = 0.0
    # Plain iteration replaces the Python-2-only ``samfile.next()`` loop.
    for aligned_read in obj.samfile:
        if aligned_read.is_qcfail or aligned_read.is_duplicate or aligned_read.is_secondary:
            continue
        if options.skip_multi and aligned_read.mapq < options.map_qual:
            continue
        try:
            chrom = obj.samfile.getrname(aligned_read.tid).upper()
        except Exception:
            # No usable reference name for this record: skip it.
            continue

        read_st = aligned_read.pos
        # Not exactly the end position in case of splicing, insertion, etc.
        read_end = read_st + aligned_read.rlen

        if not aligned_read.is_paired:
            total_frags += 1
            if in_exon(chrom, read_st, read_end):
                exonic_frags += 1
            continue

        # Paired-end: each fragment is counted once, through read1.
        if aligned_read.is_read2:
            continue
        mate_st = aligned_read.pnext
        mate_end = mate_st + aligned_read.rlen
        if aligned_read.is_unmapped:
            if aligned_read.mate_is_unmapped:
                continue
            total_frags += options.single_read
            if in_exon(chrom, mate_st, mate_end):
                exonic_frags += options.single_read
        elif aligned_read.mate_is_unmapped:
            total_frags += options.single_read
            if in_exon(chrom, read_st, read_end):
                exonic_frags += options.single_read
        else:
            total_frags += 1
            if in_exon(chrom, read_st, read_end) and in_exon(chrom, mate_st, mate_end):
                exonic_frags += 1
    sys.stderr.write("Done" + "\n")

    sys.stderr.write("Total fragment = %-20s" % (str(total_frags)) + "\n")
    sys.stderr.write("Total exonic fragment = %-20s" % (str(exonic_frags)) + "\n")

    if total_frags > 0 and exonic_frags > 0:
        denominator = exonic_frags if options.only_exon else total_frags
    else:
        sys.stderr.write("Total tags cannot be 0 or negative number" + "\n")
        sys.exit(1)

    # ------------------------------------------------- per-transcript counts
    obj = SAM.ParseBAM(options.input_file)
    OUT.write('\t'.join(('#chrom', 'st', 'end', 'accession', 'mRNA_size',
                         'gene_strand', 'Frag_count', 'FPM', 'FPKM')) + '\n')

    gene_finished = 0
    with open(options.refgene_bed, 'r') as bed:
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            fields = line.split()
            chrom = fields[0]
            tx_start = int(fields[1])
            tx_end = int(fields[2])
            geneName = fields[3]
            gstrand = fields[5].replace(" ", "_")
            # Lists (not map objects) so the values can be reused on Python 3.
            exon_starts = [tx_start + int(x) for x in fields[11].rstrip(',\n').split(',')]
            exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
            exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]

            mRNA_size = 0.0
            exon_ranges = Intersecter()
            for st, end in zip(exon_starts, exon_ends):
                mRNA_size += (end - st)
                exon_ranges.add_interval(Interval(st, end))

            try:
                alignedReads = obj.samfile.fetch(chrom, tx_start, tx_end)
            except Exception:
                # fetch() fails e.g. for chromosomes absent from the BAM.
                continue

            frag_count_f = 0.0
            frag_count_r = 0.0
            frag_count_fr = 0.0
            for aligned_read in alignedReads:
                if aligned_read.is_qcfail or aligned_read.is_duplicate or aligned_read.is_secondary:
                    continue
                if options.skip_multi and aligned_read.mapq < options.map_qual:
                    continue

                if not aligned_read.is_paired:
                    frag_st = aligned_read.pos
                    # Fix: the original used ``read_st`` here, a stale value
                    # left over from the last read of the first pass.
                    frag_end = frag_st + aligned_read.rlen
                    if len(exon_ranges.find(frag_st, frag_end)) < 1:
                        continue
                    strand_key = '-' if aligned_read.is_reverse else '+'
                    weight = 1
                else:
                    frag_st = aligned_read.pos
                    frag_end = aligned_read.pnext
                    # Keep the pair when either end starts inside an exon.
                    if (len(exon_ranges.find(frag_st, frag_st + 1)) < 1
                            and len(exon_ranges.find(frag_end, frag_end + 1)) < 1):
                        continue
                    if aligned_read.is_read2:
                        continue
                    strand_key = '1-' if aligned_read.is_reverse else '1+'
                    if aligned_read.is_unmapped:
                        if aligned_read.mate_is_unmapped:
                            continue
                        weight = options.single_read
                    elif aligned_read.mate_is_unmapped:
                        weight = options.single_read
                    else:
                        weight = 1

                if options.strand_rule is None:
                    frag_count_fr += weight
                elif strandRule.get(strand_key) == '+':
                    frag_count_f += weight
                elif strandRule.get(strand_key) == '-':
                    frag_count_r += weight

            if options.strand_rule is None:
                frag_count = frag_count_fr
            elif gstrand == '+':
                frag_count = frag_count_f
            elif gstrand == '-':
                frag_count = frag_count_r
            else:
                frag_count = None  # strand-specific run, gene strand unknown

            if frag_count is not None:
                FPM = frag_count * 1000000 / denominator
                FPKM = frag_count * 1000000000 / (denominator * mRNA_size)
                OUT.write('\t'.join([str(i) for i in (
                    chrom, tx_start, tx_end, geneName, mRNA_size, gstrand,
                    frag_count, FPM, FPKM)]) + '\n')

            gene_finished += 1
            sys.stderr.write(" %d transcripts finished\r" % (gene_finished))

    OUT.close()
def main():
    """Command-line entry point for the mRNA inner-distance report.

    Parses ``-i`` (alignment file), ``-o`` (output prefix), ``-r`` (gene
    model), ``-l``/``-u`` (histogram bounds) and ``-s`` (histogram step),
    calls ``SAM.ParseBAM(input).mRNA_inner_distance(...)`` and then runs
    ``Rscript`` on ``<out-prefix>.inner_distance_plot.r``.

    Missing options, missing files or a non-positive step size end the
    program with exit status 0 after a message, as in the original script.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o", "--out-prefix", action="store", type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s)")
    # Fix: the help text was a copy of the --out-prefix description, although
    # the value is passed on as ``refbed`` (the reference gene model).
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="ref_gene",
                      help="Reference gene model in BED format.")
    parser.add_option("-l", "--lower-bound", action="store", type="int",
                      dest="lower_bound_size", default=-250,
                      help="Lower bound of inner distance (bp). This option is used for ploting histograme. default=%default")
    parser.add_option("-u", "--upper-bound", action="store", type="int",
                      dest="upper_bound_size", default=250,
                      help="Upper bound of inner distance (bp). This option is used for plotting histogram. default=%default")
    parser.add_option("-s", "--step", action="store", type="int",
                      dest="step_size", default=5,
                      help="Step size (bp) of histograme. This option is used for plotting histogram. default=%default")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.ref_gene):
        parser.print_help()
        sys.exit(0)

    for input_file in (options.input_file, options.ref_gene):
        if not os.path.exists(input_file):
            sys.stderr.write('\n\n' + input_file + " does NOT exists" + '\n' + '\n')
            parser.print_help()
            sys.exit(0)

    if options.step_size <= 0:
        sys.stderr.write("step size is a positive interger" + "\n")
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.mRNA_inner_distance(outfile=options.output_prefix,
                            low_bound=options.lower_bound_size,
                            up_bound=options.upper_bound_size,
                            step=options.step_size,
                            refbed=options.ref_gene)

    # argv list instead of a shell string, so prefixes with spaces or shell
    # metacharacters work.  Without a shell, a missing Rscript raises OSError,
    # which makes the error message below reachable.
    r_script = options.output_prefix + '.inner_distance_plot.r'
    try:
        subprocess.call(["Rscript", r_script])
    except OSError as err:
        sys.stderr.write("Cannot generate pdf file form " + r_script + ": " + str(err) + "\n")
def main():
    """Compute tag counts and RPKM for every intron, exon and mRNA of a BED file.

    Options: ``-i`` indexed BAM, ``-o`` output prefix, ``-r`` gene model
    (BED12), ``-d`` strand rule, ``-u`` skip multi-hit reads, ``-e``
    normalise by exonic tags only.

    Two passes over the BAM file:

    1. count reads, tags (blocks from ``bam_cigar.fetch_exon``) and exonic
       tags to obtain the denominator;
    2. for each BED line, collect the tag midpoints of the alignments in the
       transcript interval and count them per intron, per exon and for the
       whole mRNA.  With a strand rule, forward and reverse counts are
       reported in separate columns.

    Results go to ``<prefix>_read_count.xls``.  Missing options, index or
    input files exit with status 0; an unknown strand rule or an empty tag
    count exits with status 1.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Alignment file in BAM format (SAM is not supported). [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat. [required]")
    parser.add_option("-d", "--strand", action="store", type="string",
                      dest="strand_rule", default=None,
                      help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'. If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)")
    parser.add_option("-u", "--skip-multi-hits", action="store_true",
                      dest="skip_multi",
                      help="How to deal with multiple hit reads. Presence this option renders program to skip multiple hits reads.")
    parser.add_option("-e", "--only-exonic", action="store_true",
                      dest="only_exon",
                      help="How to count total reads. Presence of this option renders program only used exonic (UTR exons and CDS exons) reads, otherwise use all reads.")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        sys.stderr.write("cannot find index file of input BAM file" + "\n")
        sys.stderr.write(options.input_file + '.bai' + " does not exists" + "\n")
        sys.exit(0)
    for path in (options.input_file, options.refgene_bed):
        if not os.path.exists(path):
            sys.stderr.write(path + " does NOT exists" + '\n' + "\n")
            sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    OUT = open(options.output_prefix + '_read_count.xls', 'w')

    # ------------------------------------------------ determine strand rule
    # Paired-end rules look like '1++' (read id, mapped strand, gene strand),
    # single-end rules like '++' (mapped strand, gene strand).
    strandRule = {}
    stranded = options.strand_rule is not None
    if stranded:
        rule_items = options.strand_rule.split(',')
        if len(rule_items) == 4:
            for item in rule_items:
                strandRule[item[0] + item[1]] = item[2]
        elif len(rule_items) == 2:
            for item in rule_items:
                strandRule[item[0]] = item[1]
        else:
            sys.stderr.write("Unknown value of option :'strand_rule' " + options.strand_rule + "\n")
            sys.exit(1)

    def is_usable(read):
        """Apply the read filters shared by both passes."""
        if read.is_qcfail or read.is_duplicate or read.is_secondary or read.is_unmapped:
            return False
        if options.skip_multi:
            # Tags look like (("NM", 1), ("RG", "L1")); a multi-hit tag with
            # a value above 1 marks a read mapped to several places.
            for tag in read.tags:
                if tag[0] in SAM.ParseBAM.multi_hit_tags and tag[1] > 1:
                    return False
        return True

    def midpoint(block):
        """Return the middle coordinate of a (chrom, start, end) block."""
        return block[1] + (block[2] - block[1]) // 2

    # -------------------------------------------------------- counting reads
    sys.stderr.write("Retrieve exon regions from " + options.refgene_bed + '...' + "\n")
    gene_ranges = build_range(options.refgene_bed)

    sys.stderr.write("Counting total reads ... ")
    total_reads = 0
    total_tags = 0
    total_exonic_tags = 0
    # Plain iteration replaces the Python-2-only ``samfile.next()`` loop.
    for aligned_read in obj.samfile:
        if not is_usable(aligned_read):
            continue
        total_reads += 1
        chrom = obj.samfile.getrname(aligned_read.tid).upper()
        exon_blocks = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
        total_tags += len(exon_blocks)
        for block in exon_blocks:
            mid = midpoint(block)
            if (chrom in gene_ranges) and len(gene_ranges[chrom].find(mid, mid + 1)) > 0:
                total_exonic_tags += 1
    sys.stderr.write("Done" + "\n")

    sys.stderr.write("Total Reads = %-20s" % (str(total_reads)) + "\n")
    sys.stderr.write("Total Tags = %-20s" % (str(total_tags)) + "\n")
    sys.stderr.write("Total Exon Tags = %-20s" % (str(total_exonic_tags)) + "\n")

    if total_tags > 0 and total_exonic_tags > 0:
        denominator = total_exonic_tags if options.only_exon else total_tags
    else:
        sys.stderr.write("Total tags cannot be 0 or negative number" + "\n")
        sys.exit(1)

    # ---------------------------------------------------- per-gene reporting
    obj = SAM.ParseBAM(options.input_file)

    header = ['#chrom', 'st', 'end', 'accession', 'score', 'gene_strand']
    if stranded:
        header += ['tag_count_Forward', 'tag_count_Reverse', 'RPKM_Forward', 'RPKM_Reverse']
    else:
        header += ['tag_count', 'RPKM']
    OUT.write('\t'.join(header) + '\n')

    # One count column and one RPKM column per track: (plus, minus) when
    # strand specific, a single unstranded track otherwise.
    n_tracks = 2 if stranded else 1
    row_format = '\t'.join(['%s', '%d', '%d', '%s', '%d', '%s']
                           + ['%d'] * n_tracks + ['%.3f'] * n_tracks) + '\n'

    def write_row(chrom, st, end, name, gstrand, hits, size):
        """Write one output row; RPKM = hits * 1e9 / (size * denominator)."""
        rpkms = [h * 1000000000.0 / (size * denominator) for h in hits]
        OUT.write(row_format % ((chrom, st, end, name, 0, gstrand)
                                + tuple(hits) + tuple(rpkms)))

    gene_finished = 0
    with open(options.refgene_bed, 'r') as bed:
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            fields = line.split()
            chrom = fields[0]
            tx_start = int(fields[1])
            tx_end = int(fields[2])
            geneName = fields[3]
            gstrand = fields[5].replace(" ", "_")
            # Lists (not map objects) so they can be sliced on Python 3.
            exon_starts = [tx_start + int(x) for x in fields[11].rstrip(',\n').split(',')]
            exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
            exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
            intron_starts = exon_ends[:-1]
            intron_ends = exon_starts[1:]

            plus_ranges = Intersecter()
            minus_ranges = Intersecter()
            unstrand_ranges = Intersecter()

            try:
                alignedReads = obj.samfile.fetch(chrom, tx_start, tx_end)
            except Exception:
                # fetch() fails e.g. for chromosomes absent from the BAM.
                sys.stderr.write("No alignments for " + geneName + ". Skip" + "\n")
                continue

            for aligned_read in alignedReads:
                if not is_usable(aligned_read):
                    continue

                if stranded:
                    # Key is read id ('1', '2' or '' for single-end) plus the
                    # mapped strand.  read_id now always has a value; the
                    # original left it unbound or stale for a paired read
                    # flagged as neither read1 nor read2.
                    read_id = ''
                    if aligned_read.is_paired:
                        if aligned_read.is_read1:
                            read_id = '1'
                        if aligned_read.is_read2:
                            read_id = '2'
                    strand_key = read_id + ('-' if aligned_read.is_reverse else '+')
                    # .get(): a key the rule does not list is ignored instead
                    # of raising KeyError.
                    assigned = strandRule.get(strand_key)
                    if assigned == '+':
                        target = plus_ranges
                    elif assigned == '-':
                        target = minus_ranges
                    else:
                        continue
                else:
                    target = unstrand_ranges

                exon_blocks = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
                for block in exon_blocks:
                    mid = midpoint(block)
                    target.add_interval(Interval(mid, mid + 1))

            tracks = (plus_ranges, minus_ranges) if stranded else (unstrand_ranges,)

            # Features are numbered in transcript order: descending along the
            # genome for '-' genes, ascending otherwise.  Any strand other
            # than '-' is numbered ascending; the original left the counters
            # undefined for strands that are neither '+' nor '-'.
            if gstrand == '-':
                intron_ids = range(len(intron_starts), 0, -1)
                exon_ids = range(len(exon_starts), 0, -1)
            else:
                intron_ids = range(1, len(intron_starts) + 1)
                exon_ids = range(1, len(exon_starts) + 1)

            # Introns first, then exons, then the mRNA summary row.
            for num, st, end in zip(intron_ids, intron_starts, intron_ends):
                size = max(end - st, 1)  # zero-length features count as 1 bp
                hits = [len(track.find(st, end)) for track in tracks]
                write_row(chrom, st, end, geneName + "_intron_" + str(num),
                          gstrand, hits, size)

            mRNA_hits = [0] * n_tracks
            mRNA_length = 0
            for num, st, end in zip(exon_ids, exon_starts, exon_ends):
                size = max(end - st, 1)
                hits = [len(track.find(st, end)) for track in tracks]
                write_row(chrom, st, end, geneName + "_exon_" + str(num),
                          gstrand, hits, size)
                mRNA_hits = [total + h for total, h in zip(mRNA_hits, hits)]
                mRNA_length += size

            write_row(chrom, tx_start, tx_end, geneName + "_mRNA",
                      gstrand, mRNA_hits, mRNA_length)

            gene_finished += 1
            sys.stderr.write(" %d transcripts finished\r" % (gene_finished))

    OUT.close()