Python SAM примеры, qcmodule.SAM Python примеры использования

Пример #1

0

Показать файл

def main():
    """Command-line entry point: compute the clipping profile of an alignment file.

    Parses ``-i/--input-file`` and ``-o/--out-prefix``, calls
    ``clipping_profile`` on a ``SAM.ParseBAM`` object and then runs
    ``Rscript`` on ``<prefix>.clipping_profile.r``.

    Exits with status 0 after printing the help text when a required option
    is missing, and after an error message when the input file does not exist.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s).")
    (options, args) = parser.parse_args()

    # The prefix is concatenated into the R script path below, so it must be
    # present as well; without it the concatenation fails on None.
    if not (options.input_file and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        # NOTE(review): status 0 on an error path is kept for backward
        # compatibility with existing callers.
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.clipping_profile(outfile=options.output_prefix)

    r_script = options.output_prefix + '.clipping_profile.r'
    # An argument list (no shell) keeps prefixes with spaces or shell
    # metacharacters intact and raises OSError when Rscript is not installed.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = -1
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')

Пример #2

0

Показать файл

Файл: junction_annotation.py Проект: UMMS-Biocore/dolphin-bin

def main():
    """Command-line entry point: annotate splice junctions of an alignment file.

    Parses the command line, checks that the reference gene model and the
    alignment file exist, calls ``annotate_junction`` on a ``SAM.ParseBAM``
    object, runs ``Rscript`` on ``<prefix>.junction_plot.r`` and finally calls
    ``generate_bed12`` on ``<prefix>.junction.xls``.

    Exits with status 0 after printing the help text when a required option
    is missing, and after an error message when an input file does not exist.
    Failures of the plotting and BED12 steps are reported on stderr but do not
    abort the program.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="Alignment file in BAM or SAM format.")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="ref_gene_model", help="Reference gene model in bed format. This file is better to be a pooled gene model as it will be used to annotate splicing junctions [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix", help="Prefix of output files(s). [required]")
    parser.add_option("-m", "--min-intron", action="store", type="int", dest="min_intron", default=50, help="Minimum intron length (bp). default=%default [optional]")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30, help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")

    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)
    # The gene model is checked before the alignment file.
    # NOTE(review): status 0 on these error paths is kept for backward
    # compatibility with existing callers.
    for path in (options.ref_gene_model, options.input_file):
        if not os.path.exists(path):
            sys.stderr.write('\n\n' + path + " does NOT exists" + '\n\n')
            sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.annotate_junction(outfile=options.output_prefix,
                          refgene=options.ref_gene_model,
                          min_intron=options.min_intron,
                          q_cut=options.map_qual)

    r_script = options.output_prefix + '.junction_plot.r'
    # An argument list (no shell) keeps unusual prefixes intact and raises
    # OSError when Rscript is not installed.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = -1
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')

    # Best-effort conversion: report a failure instead of hiding it.
    junction_table = options.output_prefix + '.junction.xls'
    try:
        generate_bed12(junction_table)
    except Exception as err:
        sys.stderr.write("Cannot generate BED12 file from " + junction_table + ": " + str(err) + '\n')

Пример #3

0

Показать файл

Файл: bam_stat.py Проект: UMMS-Biocore/dolphin-bin

def main():
    """Command-line entry point: print mapping statistics of a BAM file.

    Parses ``-i/--input-file`` and ``-q/--mapq`` and calls ``stat`` on a
    ``SAM.ParseBAM`` object with the mapping-quality cutoff.

    Exits with status 0 after printing the help text when the input option is
    missing, and after an error message when the input file does not exist.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Alignment file in BAM format.")
    parser.add_option(
        "-q",
        "--mapq",
        action="store",
        type="int",
        dest="map_qual",
        default=30,
        help=
        "Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default"
    )
    (options, args) = parser.parse_args()

    if not options.input_file:
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        # Name the path taken from the parsed options in the message.
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        # NOTE(review): status 0 on an error path is kept for backward
        # compatibility with existing callers.
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.stat(q_cut=options.map_qual)

Пример #4

0

Показать файл

def main():
    """Entry point: estimate the inner distance between read pairs.

    Reads the command line, verifies that the alignment file and the
    reference gene model exist and that the step size is positive, then calls
    ``mRNA_inner_distance`` on a ``SAM.ParseBAM`` object and runs ``Rscript``
    on ``<prefix>.inner_distance_plot.r``. Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    cli = OptionParser(usage, version="%prog " + __version__)
    cli.add_option("-i", "--input-file", action="store", type="string",
                   dest="input_file",
                   help="Alignment file in BAM or SAM format.")
    cli.add_option("-o", "--out-prefix", action="store", type="string",
                   dest="output_prefix",
                   help="Prefix of output files(s)")
    cli.add_option("-r", "--refgene", action="store", type="string",
                   dest="ref_gene",
                   help="Reference gene model in BED format.")
    cli.add_option("-k", "--sample-size", action="store", type="int",
                   dest="sampleSize", default=1000000,
                   help="Number of read-pairs used to estimate inner distance. default=%default")
    cli.add_option("-l", "--lower-bound", action="store", type="int",
                   dest="lower_bound_size", default=-250,
                   help="Lower bound of inner distance (bp). This option is used for ploting histograme. default=%default")
    cli.add_option("-u", "--upper-bound", action="store", type="int",
                   dest="upper_bound_size", default=250,
                   help="Upper bound of inner distance (bp). This option is used for plotting histogram. default=%default")
    cli.add_option("-s", "--step", action="store", type="int",
                   dest="step_size", default=5,
                   help="Step size (bp) of histograme. This option is used for plotting histogram. default=%default")
    cli.add_option("-q", "--mapq", action="store", type="int",
                   dest="map_qual", default=30,
                   help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")

    opts, _positional = cli.parse_args()

    # All three of prefix, alignment file and gene model must be supplied.
    if not (opts.output_prefix and opts.input_file and opts.ref_gene):
        cli.print_help()
        sys.exit(0)

    for path in (opts.input_file, opts.ref_gene):
        if os.path.exists(path):
            continue
        print >>sys.stderr, '\n\n' + path + " does NOT exists" + '\n'
        cli.print_help()
        sys.exit(0)

    if opts.step_size <= 0:
        print >>sys.stderr, "step size is a positive interger"
        sys.exit(0)

    bam = SAM.ParseBAM(opts.input_file)
    bam.mRNA_inner_distance(
        outfile=opts.output_prefix,
        low_bound=opts.lower_bound_size,
        up_bound=opts.upper_bound_size,
        step=opts.step_size,
        refbed=opts.ref_gene,
        sample_size=opts.sampleSize,
        q_cut=opts.map_qual)

    r_script = opts.output_prefix + '.inner_distance_plot.r'
    try:
        subprocess.call("Rscript " + r_script, shell=True)
    except:
        print >>sys.stderr, "Cannot generate pdf file from " + r_script

Пример #5

0

Показать файл

Файл: read_GC.py Проект: yaskermezli/rseqc

def main():
    """Entry point: compute the GC content of reads in an alignment file.

    Requires both ``-i/--input-file`` and ``-o/--out-prefix``. Calls
    ``readGC`` on a ``SAM.ParseBAM`` object and then runs ``Rscript`` on
    ``<prefix>.GC_plot.r``; a failure to launch the plot is ignored.
    Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    cli = OptionParser(usage, version="%prog " + __version__)
    cli.add_option("-i", "--input-file", action="store", type="string",
                   dest="input_file",
                   help="Alignment file in BAM or SAM format.")
    cli.add_option("-o", "--out-prefix", action="store", type="string",
                   dest="output_prefix",
                   help="Prefix of output files(s).")
    opts, _positional = cli.parse_args()

    if not (opts.output_prefix and opts.input_file):
        cli.print_help()
        sys.exit(0)

    # Bail out early when the alignment file is absent.
    if not os.path.exists(opts.input_file):
        print >> sys.stderr, '\n\n' + opts.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    bam = SAM.ParseBAM(opts.input_file)
    bam.readGC(outfile=opts.output_prefix)
    r_script = opts.output_prefix + ".GC_plot.r"
    try:
        subprocess.call("Rscript " + r_script, shell=True)
    except:
        pass

Пример #6

0

Показать файл

Файл: insertion_profile.py Проект: UMMS-Biocore/dolphin-bin

def main():
    """Command-line entry point: compute the insertion profile of an alignment file.

    Parses the command line, validates the inputs, calls
    ``insertion_profile`` on a ``SAM.ParseBAM`` object with ``PE`` derived
    from the ``-s/--sequencing`` layout, and then runs ``Rscript`` on
    ``<prefix>.insertion_profile.r``.

    Exits with status 0 after printing the help text when a required option
    is missing, and after an error message when the input file does not exist
    or the layout is neither ``"SE"`` nor ``"PE"``.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s).")
    parser.add_option(
        "-q",
        "--mapq",
        action="store",
        type="int",
        dest="map_qual",
        default=30,
        help=
        "Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default"
    )
    parser.add_option(
        "-s",
        "--sequencing",
        action="store",
        dest="layout",
        help="Sequencing layout. \"SE\"(single-end) or \"PE\"(pair-end). ")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.output_prefix and options.layout):
        parser.print_help()
        sys.exit(0)
    # NOTE(review): status 0 on the error paths below is kept for backward
    # compatibility with existing callers.
    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    # Reject an unknown layout before any work is done; otherwise no profile
    # is produced and the plotting step would run on a missing script.
    paired_by_layout = {"SE": False, "PE": True}
    if options.layout not in paired_by_layout:
        sys.stderr.write("unknow sequencing layout. Must be \"SE\" or \"PE\"" + '\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.insertion_profile(outfile=options.output_prefix,
                          q_cut=options.map_qual,
                          PE=paired_by_layout[options.layout])

    r_script = options.output_prefix + '.insertion_profile.r'
    # An argument list (no shell) keeps unusual prefixes intact and raises
    # OSError when Rscript is not installed.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = -1
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')

Пример #7

0

Показать файл

Файл: read_duplication.py Проект: oicr-gsi/RSeQC-GSI

def main():
    """Entry point: compute read duplication rates of an alignment file.

    Requires both ``-i/--input-file`` and ``-o/--out-prefix``. Calls
    ``readDupRate`` on a ``SAM.ParseBAM`` object and then runs ``Rscript`` on
    ``<prefix>.DupRate_plot.r``; a failure to launch the plot is ignored.
    Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    cli = OptionParser(usage, version="%prog " + __version__)
    cli.add_option("-i", "--input-file", action="store", type="string",
                   dest="input_file",
                   help="Alignment file in BAM or SAM format.")
    cli.add_option("-o", "--out-prefix", action="store", type="string",
                   dest="output_prefix",
                   help="Prefix of output files(s).")
    cli.add_option("-u", "--up-limit", action="store", type="int",
                   dest="upper_limit", default=500,
                   help="Upper limit of reads' occurrence. Only used for plotting, default=%default (times)")
    cli.add_option("-q", "--mapq", action="store", type="int",
                   dest="map_qual", default=30,
                   help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    opts, _positional = cli.parse_args()

    if not (opts.output_prefix and opts.input_file):
        cli.print_help()
        sys.exit(0)

    # Bail out early when the alignment file is absent.
    bam_path = opts.input_file
    if not os.path.exists(bam_path):
        print("\n\n%s does NOT exists\n" % bam_path, file=sys.stderr)
        sys.exit(0)

    bam = SAM.ParseBAM(bam_path)
    bam.readDupRate(outfile=opts.output_prefix,
                    up_bound=opts.upper_limit,
                    q_cut=opts.map_qual)
    r_script = opts.output_prefix + ".DupRate_plot.r"
    try:
        subprocess.call("Rscript " + r_script, shell=True)
    except:
        pass

Пример #8

0

Показать файл

Файл: read_quality.py Проект: oicr-gsi/RSeQC-GSI

def main():
    """Entry point: summarise per-position read quality of an alignment file.

    Requires both ``-i/--input-file`` and ``-o/--out-prefix``. Calls
    ``readsQual_boxplot`` on a ``SAM.ParseBAM`` object, forwarding the
    mapping-quality cutoff and the ``--reduce`` value as ``shrink``, and then
    runs ``Rscript`` on ``<prefix>.qual.r``; a failure to launch the plot is
    ignored. Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    cli = OptionParser(usage, version="%prog " + __version__)
    cli.add_option("-i", "--input-file", action="store", type="string",
                   dest="input_file",
                   help="Alignment file in BAM or SAM format. [required]")
    cli.add_option("-o", "--out-prefix", action="store", type="string",
                   dest="output_prefix",
                   help="Prefix of output files(s). [required]")
    cli.add_option("-r", "--reduce", action="store", type="int",
                   dest="reduce_fold", default=1000,
                   help="To avoid making huge vector in R, nucleotide with particular phred score less frequent than this number will be ignored. Increase this number save more memory while reduce precision. Set to 1 achieves maximum precision (i.e. every nucleotide will be considered). This option only applies to the 'boxplot'. default=%default")
    cli.add_option("-q", "--mapq", action="store", type="int",
                   dest="map_qual", default=30,
                   help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    opts, _positional = cli.parse_args()

    if not (opts.output_prefix and opts.input_file):
        cli.print_help()
        sys.exit(0)

    # Bail out early when the alignment file is absent.
    bam_path = opts.input_file
    if not os.path.exists(bam_path):
        print("\n\n%s does NOT exists\n" % bam_path, file=sys.stderr)
        sys.exit(0)

    bam = SAM.ParseBAM(bam_path)
    bam.readsQual_boxplot(outfile=opts.output_prefix,
                          q_cut=opts.map_qual,
                          shrink=opts.reduce_fold)
    r_script = opts.output_prefix + ".qual.r"
    try:
        subprocess.call("Rscript " + r_script, shell=True)
    except:
        pass

Пример #9

0

Показать файл

def main():
    """Command-line entry point: convert a sorted, indexed BAM file to wiggle.

    Parses the command line, checks that the BAM file, its ``.bai`` index and
    the chromosome size file exist, loads the chromosome sizes with
    ``load_chromsize`` and calls ``bamTowig`` on a ``SAM.ParseBAM`` object.
    When ``-t/--wigsum`` is given, the current wigsum is computed first with
    ``calWigSum`` and the ratio ``total_wigsum / wig_sum`` is passed on as
    ``WigSumFactor``; if that ratio cannot be computed, no normalisation
    factor is passed.

    Exits with status 0 after printing the help text when a required option
    is missing, and after an error message when a required file is absent.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="Alignment file in BAM format. BAM file must be sorted and indexed using samTools. .bam and .bai files should be placed in the same directory.")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name/ID, second column is chromosome size. Chromosome name (such as \"chr1\") should be consistent between this file and the BAM file.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix", help="Prefix of output wiggle files(s). One wiggle file will be generated for non strand-specific data, two wiggle files (\"Prefix_Forward.wig\" and \"Prefix_Reverse.wig\") will be generated for strand-specific RNA-seq data.")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", help="Specified wigsum. Eg: 1,000,000,000 equals to coverage of 10 million 100nt reads. Ignore this option to disable normalization")
    parser.add_option("-u", "--skip-multi-hits", action="store_true", dest="skip_multi", help="Skip non-unique hit reads.")
    parser.add_option("-d", "--strand", action="store", type="string", dest="strand_rule", default=None, help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq data, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'.  If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data).")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30, help="Minimum mapping quality for an alignment to be called \"uniquely mapped\". default=%default")

    (options, args) = parser.parse_args()

    # Single-argument print() behaves the same on Python 2 and Python 3.
    if options.skip_multi:
        print("Skip multi-hits:True")
    else:
        print("Skip multi-hits:False")

    if not (options.output_prefix and options.input_file and options.chromSize):
        parser.print_help()
        sys.exit(0)
    # NOTE(review): status 0 on the error paths below is kept for backward
    # compatibility with existing callers.
    for path in (options.input_file, options.chromSize):
        if not os.path.exists(path):
            sys.stderr.write('\n\n' + path + " does NOT exists" + '\n\n')
            sys.exit(0)
    index_path = options.input_file + '.bai'
    if not os.path.exists(index_path):
        sys.stderr.write("index file " + index_path + " does not exists" + '\n')
        sys.exit(0)

    chromSizes = load_chromsize(options.chromSize)

    norm_factor = None
    if options.total_wigsum:
        obj = SAM.ParseBAM(options.input_file)
        wig_sum = obj.calWigSum(chrom_sizes=chromSizes, skip_multi=options.skip_multi)
        sys.stderr.write("\n\ntotal wigsum is:" + str(wig_sum) + '\n\n')
        # float() avoids truncating integer division on Python 2; an unusable
        # wigsum (zero or non-numeric) disables normalisation.
        try:
            norm_factor = float(options.total_wigsum) / wig_sum
        except (ZeroDivisionError, TypeError):
            norm_factor = None

    # A fresh parser object is created for the conversion pass, as before.
    obj = SAM.ParseBAM(options.input_file)
    obj.bamTowig(outfile=options.output_prefix,
                 chrom_sizes=chromSizes,
                 chrom_file=options.chromSize,
                 q_cut=options.map_qual,
                 skip_multi=options.skip_multi,
                 strand_rule=options.strand_rule,
                 WigSumFactor=norm_factor)

Пример #10

0

Показать файл

Файл: read_NVC.py Проект: oicr-gsi/RSeQC-GSI

def main():
    """Entry point: compute the nucleotide-versus-cycle (NVC) profile.

    Requires both ``-i/--input-file`` and ``-o/--out-prefix``. Calls
    ``readsNVC`` on a ``SAM.ParseBAM`` object, forwarding the ``-x/--nx``
    flag and the mapping-quality cutoff, and then runs ``Rscript`` on
    ``<prefix>.NVC_plot.r``; a failure to launch the plot is ignored.
    Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    cli = OptionParser(usage, version="%prog " + __version__)
    cli.add_option("-i", "--input-file", action="store", type="string",
                   dest="input_file",
                   help="Input file in BAM or SAM format.[required]")
    cli.add_option("-o", "--out-prefix", action="store", type="string",
                   dest="output_prefix",
                   help="Prefix of output files(s). [required]")
    cli.add_option("-x", "--nx", action="store_true",
                   dest="unknown_nucleotide",
                   help="Flag option. Presense of this flag tells program to include N,X in output NVC plot [required]")
    cli.add_option("-q", "--mapq", action="store", type="int",
                   dest="map_qual", default=30,
                   help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    opts, _positional = cli.parse_args()

    if not (opts.output_prefix and opts.input_file):
        cli.print_help()
        sys.exit(0)

    # Bail out early when the alignment file is absent.
    bam_path = opts.input_file
    if not os.path.exists(bam_path):
        print("\n\n%s does NOT exists\n" % bam_path, file=sys.stderr)
        sys.exit(0)

    bam = SAM.ParseBAM(bam_path)
    bam.readsNVC(outfile=opts.output_prefix,
                 nx=opts.unknown_nucleotide,
                 q_cut=opts.map_qual)
    r_script = opts.output_prefix + ".NVC_plot.r"
    try:
        subprocess.call("Rscript " + r_script, shell=True)
    except:
        pass

Пример #11

0

Показать файл

Файл: infer_experiment.py Проект: yaskermezli/rseqc

def main():
    """Command-line entry point: infer how reads were stranded.

    Parses the command line, checks that the alignment file and the reference
    gene model exist, calls ``configure_experiment`` on a ``SAM.ParseBAM``
    object and prints the fractions of reads explained by each strand rule
    for the reported protocol (``"PairEnd"`` or ``"SingleEnd"``).

    Exits with status 0 after printing the help text when a required option
    is missing, and after an error message naming the missing file when an
    input file does not exist.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Input alignment file in SAM or BAM format")
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat.")
    parser.add_option(
        "-s",
        "--sample-size",
        action="store",
        type="int",
        dest="sample_size",
        default=200000,
        help="Number of reads sampled from SAM/BAM file. default=%default")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.refgene_bed):
        parser.print_help()
        sys.stderr.write('\n\n' + __doc__ + '\n')
        sys.exit(0)
    # NOTE(review): status 0 on an error path is kept for backward
    # compatibility with existing callers.
    for path in (options.input_file, options.refgene_bed):
        if not os.path.exists(path):
            # Name the file that is actually missing, not always the input.
            sys.stderr.write('\n\n' + path + " does NOT exists" + '\n\n')
            sys.exit(0)
    if options.sample_size < 1000:
        sys.stderr.write("Warn: Sample Size too small to give a accurate estimation" + '\n')

    obj = SAM.ParseBAM(options.input_file)
    (protocol, sp1, sp2, other) = obj.configure_experiment(
        refbed=options.refgene_bed, sample_size=options.sample_size)
    # A negative remainder is reported as zero.
    if other < 0:
        other = 0.0

    # Single-argument print() behaves the same on Python 2 and Python 3.
    if protocol == "PairEnd":
        print("\n\nThis is PairEnd Data")
        print("Fraction of reads explained by \"1++,1--,2+-,2-+\": %.4f" % sp1)
        print("Fraction of reads explained by \"1+-,1-+,2++,2--\": %.4f" % sp2)
        print("Fraction of reads explained by other combinations: %.4f" % other)
    elif protocol == "SingleEnd":
        print("\n\nThis is SingleEnd Data")
        print("Fraction of reads explained by \"++,--\": %.4f" % sp1)
        print("Fraction of reads explained by \"+-,-+\": %.4f" % sp2)
        print("Fraction of reads explained by other combinations: %.4f" % other)
    else:
        print("Unknown Data type")

Пример #12

0

Показать файл

Файл: geneBody_coverage.py Проект: yaskermezli/rseqc

def main():
    """Entry point: compute read coverage over gene bodies.

    Requires ``-i/--input-file``, ``-r/--refgene`` and ``-o/--out-prefix``.
    Checks the gene model first and the alignment file second, calls
    ``coverageGeneBody`` on a ``SAM.ParseBAM`` object and then runs
    ``Rscript`` on ``<prefix>.geneBodyCoverage_plot.r``.
    Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    cli = OptionParser(usage, version="%prog " + __version__)
    cli.add_option("-i", "--input-file", action="store", type="string",
                   dest="input_file",
                   help="Alignment file in BAM or SAM format.")
    cli.add_option("-r", "--refgene", action="store", type="string",
                   dest="ref_gene_model",
                   help="Reference gene model in bed format. [required]")
    cli.add_option("-o", "--out-prefix", action="store", type="string",
                   dest="output_prefix",
                   help="Prefix of output files(s). [required]")
    opts, _positional = cli.parse_args()

    if not (opts.output_prefix and opts.input_file and opts.ref_gene_model):
        cli.print_help()
        sys.exit(0)

    if not os.path.exists(opts.ref_gene_model):
        print >> sys.stderr, '\n\n' + opts.ref_gene_model + " does NOT exists" + '\n'
        sys.exit(0)
    if not os.path.exists(opts.input_file):
        print >> sys.stderr, '\n\n' + opts.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    bam = SAM.ParseBAM(opts.input_file)
    bam.coverageGeneBody(outfile=opts.output_prefix,
                         refbed=opts.ref_gene_model)
    r_script = opts.output_prefix + '.geneBodyCoverage_plot.r'
    try:
        subprocess.call("Rscript " + r_script, shell=True)
    except:
        print >> sys.stderr, "Cannot generate pdf file from " + r_script

Пример #13

0

Показать файл

def main():
    """Command-line entry point: compute the clipping profile of an alignment file.

    Parses ``-i/--input-file``, ``-o/--out-prefix`` and ``-q/--mapq``, calls
    ``clipping_profile`` on a ``SAM.ParseBAM`` object with the mapping-quality
    cutoff and then runs ``Rscript`` on ``<prefix>.clipping_profile.r``.

    Exits with status 0 after printing the help text when a required option
    is missing, and after an error message when the input file does not exist.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s).")
    parser.add_option(
        "-q",
        "--mapq",
        action="store",
        type="int",
        dest="map_qual",
        default=30,
        help=
        "Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default"
    )
    (options, args) = parser.parse_args()

    # The prefix is concatenated into the R script path below, so it must be
    # present as well; without it the concatenation fails on None.
    if not (options.input_file and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        # NOTE(review): status 0 on an error path is kept for backward
        # compatibility with existing callers.
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.clipping_profile(outfile=options.output_prefix, q_cut=options.map_qual)

    r_script = options.output_prefix + '.clipping_profile.r'
    # An argument list (no shell) keeps prefixes with spaces or shell
    # metacharacters intact and raises OSError when Rscript is not installed.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = -1
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')

Пример #14

0

Показать файл

Файл: read_quality.py Проект: yaskermezli/rseqc

def main():
    """Entry point: summarise per-position read quality of an alignment file.

    Requires both ``-i/--input-file`` and ``-o/--out-prefix``. Calls
    ``readsQual_boxplot`` on a ``SAM.ParseBAM`` object and then runs
    ``Rscript`` on ``<prefix>.qual.r``; a failure to launch the plot is
    ignored. Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    cli = OptionParser(usage, version="%prog " + __version__)
    cli.add_option("-i", "--input-file", action="store", type="string",
                   dest="input_file",
                   help="Alignment file in BAM or SAM format. [required]")
    cli.add_option("-o", "--out-prefix", action="store", type="string",
                   dest="output_prefix",
                   help="Prefix of output files(s). [required]")
    # NOTE(review): --reduce is parsed into reduce_fold but is not forwarded
    # to readsQual_boxplot below -- confirm whether that is intended.
    cli.add_option("-r", "--reduce", action="store", type="int",
                   dest="reduce_fold", default=1000,
                   help="To avoid making huge vector in R, nucleotide with particular phred score represented less than this number will be ignored. Increase this number save more memory while reduce precision. This option only applies to the 'boxplot'. default=%default")
    opts, _positional = cli.parse_args()

    if not (opts.output_prefix and opts.input_file):
        cli.print_help()
        sys.exit(0)

    # Bail out early when the alignment file is absent.
    if not os.path.exists(opts.input_file):
        print >> sys.stderr, '\n\n' + opts.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    bam = SAM.ParseBAM(opts.input_file)
    bam.readsQual_boxplot(outfile=opts.output_prefix)
    r_script = opts.output_prefix + ".qual.r"
    try:
        subprocess.call("Rscript " + r_script, shell=True)
    except:
        pass

Пример #15

0

Показать файл

Файл: RPKM_saturation.py Проект: UMMS-Biocore/dolphin-bin

def main():
    """Command-line entry point: RPKM saturation analysis of an alignment file.

    Parses the command line, validates the percentile options, calls
    ``saturation_RPKM`` on a ``SAM.ParseBAM`` object, writes the R script with
    ``show_saturation`` and then runs ``Rscript`` on ``<prefix>.saturation.r``.

    Exits with status 0 after printing the help text when a required option
    (input file, output prefix, reference gene model) is missing, and after an
    error message when a percentile option is out of range or the input file
    does not exist.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="Alignment file in BAM or SAM format. [required]")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix", help="Prefix of output files(s). [required]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed fomat. [required]")
    parser.add_option("-d", "--strand", action="store", type="string", dest="strand_rule", default=None, help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'.  If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)")
    parser.add_option("-l", "--percentile-floor", action="store", type="int", dest="percentile_low_bound", default=5, help="Sampling starts from this percentile. A integer between 0 and 100. default=%default")
    parser.add_option("-u", "--percentile-ceiling", action="store", type="int", dest="percentile_up_bound", default=100, help="Sampling ends at this percentile. A integer between 0 and 100. default=%default")
    parser.add_option("-s", "--percentile-step", action="store", type="int", dest="percentile_step", default=5, help="Sampling frequency. Smaller value means more sampling times. A integer between 0 and 100. default=%default")
    parser.add_option("-c", "--rpkm-cutoff", action="store", type="float", dest="rpkm_cutoff", default=0.01, help="Transcripts with RPKM smaller than this number will be ignored in visualization plot. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30, help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")

    (options, args) = parser.parse_args()

    # The gene model is documented as required, so it is checked together
    # with the other two required options.
    if not (options.output_prefix and options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)

    # NOTE(review): status 0 on the error paths below is kept for backward
    # compatibility with existing callers.
    low = options.percentile_low_bound
    high = options.percentile_up_bound
    step = options.percentile_step
    if low < 0 or low > 100:
        sys.stderr.write("percentile_low_bound must be larger than 0 and samller than 100" + '\n')
        sys.exit(0)
    if high < 0 or high > 100:
        sys.stderr.write("percentile_up_bound must be larger than 0 and samller than 100" + '\n')
        sys.exit(0)
    if high < low:
        sys.stderr.write("percentile_up_bound must be larger than percentile_low_bound" + '\n')
        sys.exit(0)
    # A step of zero can never advance the sampling, so it is rejected too,
    # matching the wording of the message.
    if step <= 0 or step > high:
        sys.stderr.write("percentile_step must be larger than 0 and samller than percentile_up_bound" + '\n')
        sys.exit(0)

    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.saturation_RPKM(outfile=options.output_prefix,
                        refbed=options.refgene_bed,
                        sample_start=low,
                        sample_end=high,
                        sample_step=step,
                        strand_rule=options.strand_rule,
                        q_cut=options.map_qual)
    r_script = options.output_prefix + ".saturation.r"
    show_saturation(infile=options.output_prefix + ".eRPKM.xls",
                    outfile=r_script,
                    rpkm_cut=options.rpkm_cutoff)

    # Plotting stays best-effort, but a failure is reported instead of being
    # hidden. An argument list (no shell) keeps unusual prefixes intact and
    # raises OSError when Rscript is not installed.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = -1
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + '\n')

Пример #16

0

Показать файл

def main():
    """Command-line entry point: run ``stat()`` on one alignment file.

    Parses ``-i/--input-file``, prints the help text and exits when it is
    missing, prints an error to stderr and exits when the file does not
    exist, and otherwise calls ``SAM.ParseBAM(input_file).stat()``.
    Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    (options, args) = parser.parse_args()

    if not options.input_file:
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        # The path lives on `options`; there is no bare `input_file` name in
        # this scope, so referencing it would raise NameError instead of
        # reporting the missing file.
        print >> sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.stat()

Пример #17

0

Показать файл

def main():
    """Command-line entry point: run ``readsNVC`` and plot the result.

    Requires ``-i`` (input BAM/SAM) and ``-o`` (output prefix); ``-x`` is an
    optional flag forwarded as ``nx``.  After ``readsNVC`` returns, Rscript
    is run on ``<out-prefix>.NVC_plot.r`` (presumably written by
    ``readsNVC`` -- confirm in SAM.ParseBAM).  Every early exit uses
    status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Input file in BAM or SAM format.[required]")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option(
        "-x",
        "--nx",
        action="store_true",
        dest="unknown_nucleotide",
        help=
        "Flag option. Presense of this flag tells program to include N,X in output NVC plot [required]"
    )
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        print >> sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    obj.readsNVC(outfile=options.output_prefix,
                 nx=options.unknown_nucleotide)

    r_script = options.output_prefix + ".NVC_plot.r"
    try:
        # Argument list instead of a shell string: the prefix reaches
        # Rscript verbatim even if it contains spaces or shell
        # metacharacters.
        subprocess.call(["Rscript", r_script])
    except OSError:
        # Plotting is best-effort (Rscript may not be installed); report it
        # instead of failing silently.
        print >> sys.stderr, "Cannot generate pdf file from " + r_script

Пример #18

0

Показать файл

Файл: junction_saturation.py Проект: oicr-gsi/RSeQC-GSI

def main():
	"""Command-line entry point: junction saturation analysis plus plot.

	Requires -i (alignment file), -o (output prefix) and -r (reference gene
	model).  Validates the sampling percentiles, calls
	``SAM.ParseBAM(...).saturation_junction`` and then runs Rscript on
	``<out-prefix>.junctionSaturation_plot.r``.  Every validation failure
	prints the help text or an error on stderr and exits with status 0.
	"""
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.[required]")
	parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]")
	parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed fomat. This gene model is used to determine known splicing junctions. [required]")
	parser.add_option("-l","--percentile-floor",action="store",type="int",dest="percentile_low_bound",default=5, help="Sampling starts from this percentile. A integer between 0 and 100. default=%default")
	parser.add_option("-u","--percentile-ceiling",action="store",type="int",dest="percentile_up_bound",default=100, help="Sampling ends at this percentile. A integer between 0 and 100. default=%default")
	parser.add_option("-s","--percentile-step",action="store",type="int",dest="percentile_step",default=5, help="Sampling frequency. Smaller value means more sampling times. A integer between 0 and 100. default=%default")
	parser.add_option("-m","--min-intron",action="store",type="int",dest="minimum_intron_size",default=50, help="Minimum intron size (bp). default=%default")
	parser.add_option("-v","--min-coverage",action="store",type="int",dest="minimum_splice_read",default=1, help="Minimum number of supportting reads to call a junction. default=%default")
	parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")

	(options,args)=parser.parse_args()

	if not (options.output_prefix and options.input_file and options.refgene_bed):
		parser.print_help()
		sys.exit(0)
	if options.percentile_low_bound <0 or options.percentile_low_bound >100:
		print("percentile_low_bound must be larger than 0 and samller than 100", file=sys.stderr)
		sys.exit(0)
	if options.percentile_up_bound <0 or options.percentile_up_bound >100:
		print("percentile_up_bound must be larger than 0 and samller than 100", file=sys.stderr)
		sys.exit(0)
	if options.percentile_up_bound < options.percentile_low_bound:
		print("percentile_up_bound must be larger than percentile_low_bound", file=sys.stderr)
		sys.exit(0)
	# "<=": a step of 0 can never advance from the floor to the ceiling, and
	# the message below already states the step must be larger than 0.
	if options.percentile_step <= 0 or options.percentile_step > options.percentile_up_bound:
		print("percentile_step must be larger than 0 and samller than percentile_up_bound", file=sys.stderr)
		sys.exit(0)
	if not os.path.exists(options.input_file):
		print('\n\n' + options.input_file + " does NOT exists" + '\n', file=sys.stderr)
		sys.exit(0)

	obj = SAM.ParseBAM(options.input_file)
	obj.saturation_junction(outfile=options.output_prefix, refgene=options.refgene_bed, sample_start=options.percentile_low_bound,sample_end=options.percentile_up_bound,sample_step=options.percentile_step,min_intron=options.minimum_intron_size,recur=options.minimum_splice_read, q_cut = options.map_qual)

	r_script = options.output_prefix + '.junctionSaturation_plot.r'
	try:
		# Argument list instead of a shell string: the prefix reaches Rscript
		# verbatim even if it contains spaces or shell metacharacters.
		subprocess.call(["Rscript", r_script])
	except OSError:
		# Raised when Rscript cannot be started; name the full script path
		# (prefix included) so the user can rerun it by hand.
		print("Cannot generate pdf file from " + r_script, file=sys.stderr)

Пример #19

0

Показать файл

def main():
	"""Command-line entry point: run ``readDupRate`` and plot the result.

	Requires -i (alignment file) and -o (output prefix); -u sets the upper
	limit forwarded as ``up_bound``.  After ``readDupRate`` returns, Rscript
	is run on ``<out-prefix>.DupRate_plot.r`` (presumably written by
	``readDupRate`` -- confirm in SAM.ParseBAM).  Every early exit uses
	status 0.
	"""
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.")
	parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s).")
	parser.add_option("-u","--up-limit",action="store",type="int",dest="upper_limit",default=500,help="upper limit of duplicated times. Only used for plotting, default=%default (times)")
	(options,args)=parser.parse_args()

	if not (options.output_prefix and options.input_file):
		parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.input_file):
		print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
		sys.exit(0)

	obj = SAM.ParseBAM(options.input_file)
	obj.readDupRate(outfile=options.output_prefix,up_bound=options.upper_limit)

	r_script = options.output_prefix + ".DupRate_plot.r"
	try:
		# Argument list instead of a shell string: the prefix reaches Rscript
		# verbatim even if it contains spaces or shell metacharacters.
		subprocess.call(["Rscript", r_script])
	except OSError:
		# Plotting is best-effort (Rscript may not be installed); report it
		# instead of failing silently.
		print >>sys.stderr, "Cannot generate pdf file from " + r_script

Пример #20

0

Показать файл

def main():
	"""Command-line entry point: report how reads are stranded.

	Requires -i (alignment file) and -r (reference gene model); both must
	exist on disk.  Calls ``SAM.ParseBAM(...).configure_experiment`` and
	prints, for pair-end or single-end data, the fraction of reads that
	could not be determined and the fractions explained by each of the two
	strand rules.  Every early exit uses status 0.
	"""
	parser = OptionParser("%prog [options]" + "\n", version="%prog " + __version__)
	parser.add_option(
		"-i", "--input-file",
		action="store", type="string", dest="input_file",
		help="Input alignment file in SAM or BAM format")
	parser.add_option(
		"-r", "--refgene",
		action="store", type="string", dest="refgene_bed",
		help="Reference gene model in bed fomat.")
	parser.add_option(
		"-s", "--sample-size",
		action="store", type="int", dest="sample_size", default=200000,
		help="Number of reads sampled from SAM/BAM file. default=%default")
	parser.add_option(
		"-q", "--mapq",
		action="store", type="int", dest="map_qual", default=30,
		help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")

	opts, _unused_args = parser.parse_args()

	# Both the alignment file and the gene model are mandatory.
	if not opts.input_file or not opts.refgene_bed:
		parser.print_help()
		print('\n\n' + __doc__, file=sys.stderr)
		sys.exit(0)

	for path in (opts.input_file, opts.refgene_bed):
		if os.path.exists(path):
			continue
		print('\n\n' + path + " does NOT exists." + '\n', file=sys.stderr)
		sys.exit(0)

	if opts.sample_size < 1000:
		print("Warn: Sample Size too small to give a accurate estimation", file=sys.stderr)

	bam = SAM.ParseBAM(opts.input_file)
	protocol, frac_rule1, frac_rule2, frac_failed = bam.configure_experiment(
		refbed=opts.refgene_bed,
		sample_size=opts.sample_size,
		q_cut=opts.map_qual)
	# A negative "failed" fraction is reported as zero.
	if frac_failed < 0:
		frac_failed = 0.0

	# Pick the banner and the two per-rule templates for the detected protocol.
	if protocol == "PairEnd":
		banner = "\n\nThis is PairEnd Data"
		rule1_template = "Fraction of reads explained by \"1++,1--,2+-,2-+\": %.4f"
		rule2_template = "Fraction of reads explained by \"1+-,1-+,2++,2--\": %.4f"
	elif protocol == "SingleEnd":
		banner = "\n\nThis is SingleEnd Data"
		rule1_template = "Fraction of reads explained by \"++,--\": %.4f"
		rule2_template = "Fraction of reads explained by \"+-,-+\": %.4f"
	else:
		print("Unknown Data type")
		return

	print(banner)
	print("Fraction of reads failed to determine: %.4f" % frac_failed)
	print(rule1_template % frac_rule1)
	print(rule2_template % frac_rule2)
Пример #21

0

Показать файл

Файл: bam2fq.py Проект: UMMS-Biocore/dolphin-bin

def main():
    """Command-line entry point: convert a BAM file to FASTQ.

    Requires -i (input BAM) and -o (output prefix).  ``-s`` selects
    single-end mode (``bam2fq(paired=False)``); otherwise paired mode is
    used.  With ``-c`` the resulting ``<prefix>.fastq`` (single-end) or
    ``<prefix>.R1.fastq`` / ``<prefix>.R2.fastq`` (paired) are compressed
    with the external ``gzip`` command.  Every early exit uses status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Alignment file in BAM format")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output fastq files(s).")
    parser.add_option(
        "-s",
        "--single-end",
        action="store_true",
        dest="single",
        help="Specificy '-s' or '--single-end' for single-end sequencing.")
    parser.add_option(
        "-c",
        "--compress",
        action="store_true",
        dest="gzip",
        help=
        "Specificy '-c' or '--compress' to compress output fastq file(s) using 'gzip' command."
    )
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        print >> sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    if options.single is True:
        obj.bam2fq(prefix=options.output_prefix, paired=False)
        if options.gzip is True:
            # Trailing comma: keep "Done." on the same line as the banner.
            print >> sys.stderr, "run gzip ... ",
            try:
                # Argument lists instead of shell strings: the file name
                # reaches gzip verbatim even if the prefix contains spaces
                # or shell metacharacters.
                subprocess.call(["gzip", options.output_prefix + '.fastq'])
            except OSError as err:
                # Compression is best-effort (gzip may be missing); say so
                # instead of silently leaving the file uncompressed.
                print >> sys.stderr, "Cannot run gzip: " + str(err)
            else:
                print >> sys.stderr, "Done."
    else:
        obj.bam2fq(prefix=options.output_prefix, paired=True)
        if options.gzip is True:
            print >> sys.stderr, "run gzip ..."
            try:
                subprocess.call(["gzip", options.output_prefix + '.R1.fastq'])
                subprocess.call(["gzip", options.output_prefix + '.R2.fastq'])
            except OSError as err:
                print >> sys.stderr, "Cannot run gzip: " + str(err)
            else:
                print >> sys.stderr, "Done."

Пример #22

0

Показать файл

def main():
    """Command-line entry point: infer strandedness and write a strand option.

    Requires -i (alignment file), -r (reference gene model) and -o (output
    file).  Calls ``SAM.ParseBAM(...).configure_experiment`` to obtain the
    fractions of reads explained by each strand rule (``sp1``, ``sp2``),
    runs ``binom_test`` against the unstranded null (p=0.5) and writes a
    two-line CSV (header + one data row, no trailing newline) to the output
    file.  ``strand_option`` is 0 (unstranded), 1 (sp1 >= sp2) or
    2 (sp2 > sp1).  Every early exit uses status 0.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Input alignment file in SAM or BAM format")
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat.")
    parser.add_option(
        "-s",
        "--sample-size",
        action="store",
        type="int",
        dest="sample_size",
        default=200000,
        help="Number of reads sampled from SAM/BAM file. default=%default")
    parser.add_option(
        "-q",
        "--mapq",
        action="store",
        type="int",
        dest="map_qual",
        default=30,
        help=
        "Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default"
    )
    # Options beyond the stock infer_experiment script:
    parser.add_option(
        "-p",
        "--pval",
        action="store",
        type="float",
        dest="pval_threshold",
        default=1e-5,
        help=
        "Binomial p-value for rejecting null hypothesis that experiment was unstranded. default=%default"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="output_file",
                      help="Name of the output file to write the result to.")

    (options, args) = parser.parse_args()

    # -o is checked up front as well: without it open(None, 'w') would only
    # fail after the (expensive) read sampling had already been done.
    if not (options.input_file and options.refgene_bed
            and options.output_file):
        parser.print_help()
        print('\n\n' + __doc__, file=sys.stderr)
        sys.exit(0)
    for f in (options.input_file, options.refgene_bed):
        if not os.path.exists(f):
            print('\n\n' + f + " does NOT exists." + '\n', file=sys.stderr)
            sys.exit(0)
    if options.sample_size < 1000:
        print("Warn: Sample Size too small to give a accurate estimation",
              file=sys.stderr)

    obj = SAM.ParseBAM(options.input_file)
    # Only the two per-rule fractions are used below; the protocol name and
    # the "failed to determine" fraction are not reported by this script.
    (_protocol, sp1, sp2,
     _other) = obj.configure_experiment(refbed=options.refgene_bed,
                                        sample_size=options.sample_size,
                                        q_cut=options.map_qual)

    # sp1 and sp2 are the fractions of reads explained by each strand rule.
    # Assuming "unstranded" is the safe default (downstream counters then
    # skip ambiguous regions), so a stranded protocol is only reported when
    # a binomial test rejects the unstranded null hypothesis (reads equally
    # likely from either strand, p=0.5).  Read counts are estimated from the
    # fractions: n1 = floor(m*sp1), n2 = floor(m*sp2); undetermined reads are
    # ignored, so the number of trials is N = n1 + n2.
    # NOTE(review): m is the *requested* sample size; if fewer reads were
    # actually sampled the counts are over-estimated -- confirm against
    # configure_experiment.
    m = options.sample_size
    n1 = int(m * sp1)
    n2 = int(m * sp2)
    N = n1 + n2  # reads assigned to one strand rule or the other
    pval = binom_test(n1, N, p=0.5)

    if pval < options.pval_threshold:
        # Null rejected: pick the dominant rule.  2 is "reverse stranded" in
        # featureCounts terms (-s 2, e.g. dUTP libraries); 1 is the other
        # stranded orientation.
        strand_option = 2 if sp2 > sp1 else 1
    else:
        # Not enough evidence against the unstranded null.
        strand_option = 0

    header = 'sp1_fraction,sp2_fraction,total_sampled,total_assigned,n1,n2,pval_threshold,pval,strand_option\n'
    # '%.4f' for both fractions: the second field used '%4f' (width 4,
    # default 6 decimals), which made the two columns inconsistent.
    data = '%.4f,%.4f,%d,%d,%d,%d,%.4E,%.4E,%d' % (
        sp1, sp2, m, N, n1, n2, options.pval_threshold, pval, strand_option)
    # Context manager so the file is closed even if a write fails.
    with open(options.output_file, 'w') as fout:
        fout.write(header)
        fout.write(data)

Пример #23

0

Показать файл

Файл: mismatch_profile.py Проект: dsperley/Bioinformatics-Scripts

def main():
    """Command-line entry point: compute a mismatch profile and plot it.

    Requires -i (input BAM), -l (read alignment length) and -o (output
    prefix); -n and -q are forwarded as ``read_num`` and ``q_cut``.  After
    ``mismatchProfile`` returns, Rscript is run on
    ``<out-prefix>.mismatch_profile.r`` (presumably written by
    ``mismatchProfile`` -- confirm in SAM.ParseBAM).  Every early exit uses
    status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input",
                      action="store",
                      type="string",
                      dest="input_bam",
                      help='Input BAM file. [required]')
    parser.add_option(
        "-l",
        "--read-align-length",
        action="store",
        type="int",
        dest="read_alignment_length",
        help=
        "Alignment length of read. It is usually set to the orignial read length. For example, all these cigar strings (\"101M\", \"68M140N33M\", \"53M1D48M\") suggest the read alignment length is 101. [required]"
    )
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option(
        "-n",
        "--read-num",
        action="store",
        type="int",
        default=1000000,
        dest="read_number",
        help=
        "Number of aligned reads with mismatches used to calculate the mismatch profile. default=%default"
    )
    parser.add_option("-q",
                      "--mapq",
                      action="store",
                      type="int",
                      dest="map_qual",
                      default=30,
                      help="Minimum mapping quality. default=%default")
    (options, args) = parser.parse_args()

    if not options.input_bam:
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_bam):
        print >> sys.stderr, '\n\n' + options.input_bam + " does NOT exists" + '\n'
        parser.print_help()
        sys.exit(0)

    if not options.output_prefix:
        print >> sys.stderr, '\n\n You must specify the output prefix'
        parser.print_help()
        sys.exit(0)

    # A length of 0 is rejected here too, the same as a missing -l.
    if not options.read_alignment_length:
        print >> sys.stderr, '\n\n You must specify read alignment length. It is usually the read length.'
        parser.print_help()
        sys.exit(0)

    obj = SAM.ParseBAM(options.input_bam)
    obj.mismatchProfile(read_length=options.read_alignment_length,
                        read_num=options.read_number,
                        q_cut=options.map_qual,
                        outfile=options.output_prefix)

    r_script = options.output_prefix + '.mismatch_profile.r'
    try:
        # Argument list instead of a shell string: the prefix reaches
        # Rscript verbatim even if it contains spaces or shell
        # metacharacters, and a missing Rscript raises OSError so the
        # message below is actually reachable.
        subprocess.call(["Rscript", r_script])
    except OSError:
        print >> sys.stderr, "Cannot generate pdf file from " + r_script

Пример #24

0

Показать файл

def main():
	"""Command-line entry point: count tags per gene-model region.

	Requires -i (alignment file) and -r (reference gene model); both must
	exist on disk.  Every usable read is split into blocks by
	``bam_cigar.fetch_exon``; each block is a "tag" and is assigned, by its
	midpoint, to the first matching region in the priority chain below.
	A summary table (total bases, tag count, tags per kb for each region)
	is printed to stdout.  Every early exit uses status 0.
	"""
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.")
	parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format.")
	(options,args)=parser.parse_args()
		
	if not (options.input_file and options.ref_gene_model):
		parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.ref_gene_model):
		print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
		#parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.input_file):
		print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
		sys.exit(0)		

	# Build the per-region lookup structures from the gene model.
	# The ten *_r values are what foundone() is queried with below; the ten
	# *_base values are printed as "Total_bases" in the final table.
	# NOTE(review): presumably *_r are interval/bitset indexes keyed by
	# chromosome and *_base are region sizes in bp -- confirm in
	# process_gene_model.
	(cds_exon_r, intron_r, utr_5_r, utr_3_r,\
	intergenic_up_1kb_r,intergenic_up_5kb_r,intergenic_up_10kb_r,\
	intergenic_down_1kb_r,intergenic_down_5kb_r,intergenic_down_10kb_r,\
	cds_exon_base,intron_base,utr_5_base,utr_3_base,\
	intergenic_up1kb_base,intergenic_up5kb_base,intergenic_up10kb_base,\
	intergenic_down1kb_base,intergenic_down5kb_base,intergenic_down10kb_base) = process_gene_model(options.ref_gene_model)
	
	# Per-region tag counters.
	intron_read=0
	cds_exon_read=0
	utr_5_read=0
	utr_3_read=0
	
	intergenic_up1kb_read=0
	intergenic_down1kb_read=0
	intergenic_up5kb_read=0
	intergenic_down5kb_read=0
	intergenic_up10kb_read=0
	intergenic_down10kb_read=0
		
	totalReads=0		# reads that pass all filters below
	totalFrags=0		# blocks ("tags") produced from those reads
	unAssignFrags=0		# tags that matched no region or an ambiguous pair
	obj = SAM.ParseBAM(options.input_file)
	
	# Counts of skipped reads; incremented below but not reported anywhere
	# in this function.
	R_qc_fail=0
	R_duplicate=0
	R_nonprimary=0
	R_unmap=0
	
	# Trailing comma (Python 2 print): no newline, so "Finished" continues
	# on the same line.
	print >>sys.stderr, "processing " + options.input_file + " ...",
	try:
		# Loop ends when samfile.next() raises StopIteration (handled below).
		while(1):
			aligned_read = obj.samfile.next()
			if aligned_read.is_qcfail:			#skip QC fail read
				R_qc_fail +=1
				continue
			if aligned_read.is_duplicate:		#skip duplicate read
				R_duplicate +=1
				continue
			if aligned_read.is_secondary:		#skip non primary hit
				R_nonprimary +=1
				continue
			if aligned_read.is_unmapped:		#skip unmap read
				R_unmap +=1
				continue		
			totalReads +=1
			chrom = obj.samfile.getrname(aligned_read.tid)
			# Chromosome names are upper-cased before lookup.
			# NOTE(review): assumes process_gene_model keys its ranges by
			# upper-case names too -- confirm.
			chrom=chrom.upper()
			exons = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
			totalFrags += len(exons)
			
			for exn in exons:
				#print chrom + '\t' + str(exn[1]) + '\t' + str(exn[2])
				# Each block is classified by its midpoint only
				# (exn[1]/exn[2] are used as the block's start/end).
				mid = int(exn[1]) + int((int(exn[2]) - int(exn[1]))/2)
				# Priority chain -- the first matching branch wins:
				#   CDS exon > 5'UTR only > 3'UTR only > (both UTRs: unassigned)
				#   > intron > (both up-10kb and down-10kb: unassigned)
				#   > upstream 1kb/5kb/10kb > downstream 1kb/5kb/10kb > unassigned
				if foundone(chrom,cds_exon_r,mid,mid) > 0:
					cds_exon_read += 1
					continue
				elif foundone(chrom,utr_5_r,mid,mid) >0 and foundone(chrom,utr_3_r,mid,mid) == 0:
					utr_5_read += 1
					continue
				elif foundone(chrom,utr_3_r,mid,mid) >0 and foundone(chrom,utr_5_r,mid,mid) == 0:
					utr_3_read += 1
					continue
				elif foundone(chrom,utr_3_r,mid,mid) >0 and foundone(chrom,utr_5_r,mid,mid) > 0:
					# Midpoint lies in both a 5'UTR and a 3'UTR: ambiguous.
					unAssignFrags +=1
					continue
				elif foundone(chrom,intron_r,mid,mid) > 0:
					intron_read += 1
					continue
				elif foundone(chrom,intergenic_up_10kb_r,mid,mid) >0 and foundone(chrom,intergenic_down_10kb_r,mid,mid) > 0:
					# Midpoint lies in both an upstream and a downstream
					# 10kb window: ambiguous.
					unAssignFrags +=1
					continue					
				# The windows are counted cumulatively: a 1kb hit also counts
				# toward the 5kb and 10kb totals, a 5kb hit toward 10kb.
				elif foundone(chrom,intergenic_up_1kb_r,mid,mid) >0:
					intergenic_up1kb_read += 1
					intergenic_up5kb_read += 1
					intergenic_up10kb_read += 1
				elif foundone(chrom,intergenic_up_5kb_r,mid,mid) >0:
					intergenic_up5kb_read += 1
					intergenic_up10kb_read += 1
				elif foundone(chrom,intergenic_up_10kb_r,mid,mid) >0:
					intergenic_up10kb_read += 1
				
				elif foundone(chrom,intergenic_down_1kb_r,mid,mid) >0:
					intergenic_down1kb_read += 1
					intergenic_down5kb_read += 1
					intergenic_down10kb_read += 1
				elif foundone(chrom,intergenic_down_5kb_r,mid,mid) >0:
					intergenic_down5kb_read += 1
					intergenic_down10kb_read += 1
				elif foundone(chrom,intergenic_down_10kb_r,mid,mid) >0:
					intergenic_down10kb_read += 1	
				else:
					unAssignFrags +=1
	except StopIteration:
		print >>sys.stderr, "Finished\n"				

	# Summary on stdout.
	print "%-30s%d" % ("Total Reads",totalReads)
	print  "%-30s%d" % ("Total Tags",totalFrags)
	print  "%-30s%d" % ("Total Assigned Tags",totalFrags-unAssignFrags)
	
	# Tags/Kb = tag_count * 1000 / (total_bases + 1); the +1 keeps the
	# division defined when a region has zero bases.
	print  "====================================================================="
	print  "%-20s%-20s%-20s%-20s" % ('Group','Total_bases','Tag_count','Tags/Kb')
	print  "%-20s%-20d%-20d%-18.2f" % ('CDS_Exons',cds_exon_base,cds_exon_read,cds_exon_read*1000.0/(cds_exon_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("5'UTR_Exons",utr_5_base,utr_5_read, utr_5_read*1000.0/(utr_5_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("3'UTR_Exons",utr_3_base,utr_3_read, utr_3_read*1000.0/(utr_3_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("Introns",intron_base,intron_read,intron_read*1000.0/(intron_base+1))
	
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_1kb",intergenic_up1kb_base, intergenic_up1kb_read, intergenic_up1kb_read*1000.0/(intergenic_up1kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_5kb",intergenic_up5kb_base, intergenic_up5kb_read, intergenic_up5kb_read*1000.0/(intergenic_up5kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_10kb",intergenic_up10kb_base, intergenic_up10kb_read, intergenic_up10kb_read*1000.0/(intergenic_up10kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_1kb",intergenic_down1kb_base, intergenic_down1kb_read, intergenic_down1kb_read*1000.0/(intergenic_down1kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_5kb",intergenic_down5kb_base, intergenic_down5kb_read, intergenic_down5kb_read*1000.0/(intergenic_down5kb_base+1))	
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_10kb",intergenic_down10kb_base, intergenic_down10kb_read, intergenic_down10kb_read*1000.0/(intergenic_down10kb_base+1))
	print  "====================================================================="

Пример #25

0

Показать файл

Файл: FPKM_count.py Проект: snandiDS/prokseq

def main():
    """Write per-transcript fragment count, FPM and FPKM for a BAM file.

    Command-line entry point.  Required options are the input BAM
    (``-i``; its ``.bai`` index must sit next to it), the output prefix
    (``-o``) and a BED gene model (``-r``).  Results are written to
    ``<prefix>.FPKM.xls``; progress and totals go to stderr.

    The BAM file is read twice:

    1. A full scan counts the total number of fragments and the number
       that overlap the gene ranges returned by ``build_range``; one of
       the two (chosen by ``--only-exonic``) becomes the normalisation
       denominator.
    2. For every line of the BED file the reads overlapping the
       transcript are fetched and counted against that transcript's
       exon blocks.

    Exits with status 0 when required options or files are missing and
    with status 1 on an unrecognised strand rule or a zero fragment
    count (exit codes kept as they were for existing callers).
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help="Alignment file in BAM format (SAM is not supported). [required]")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat. [required]")
    parser.add_option(
        "-d",
        "--strand",
        action="store",
        type="string",
        dest="strand_rule",
        default=None,
        help=
        "How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'.  If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)"
    )
    parser.add_option(
        "-u",
        "--skip-multi-hits",
        action="store_true",
        dest="skip_multi",
        help=
        "How to deal with multiple hit reads. Presence this option renders program to skip multiple hits reads."
    )
    parser.add_option(
        "-e",
        "--only-exonic",
        action="store_true",
        dest="only_exon",
        help=
        "How to count total reads. Presence of this option renders program only used exonic (UTR exons and CDS exons) reads, otherwise use all reads."
    )
    parser.add_option(
        "-q",
        "--mapq",
        action="store",
        type="int",
        dest="map_qual",
        default=30,
        help=
        "Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default"
    )
    parser.add_option(
        "-s",
        "--single-read",
        action="store",
        type="float",
        dest="single_read",
        default=1,
        help=
        "How to count read-pairs that only have one end mapped. 0: ignore it. 0.5: treat it as half fragment. 1: treat it as whole fragment. default=%default"
    )

    (options, args) = parser.parse_args()

    # ---- validate required options and input files
    if not (options.output_prefix and options.input_file
            and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        print >> sys.stderr, "cannot find index file of input BAM file"
        print >> sys.stderr, options.input_file + '.bai' + " does not exists"
        sys.exit(0)
    for path in (options.input_file, options.refgene_bed):
        if not os.path.exists(path):
            print >> sys.stderr, path + " does NOT exists" + '\n'
            sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    OUT = open(options.output_prefix + '.FPKM.xls', 'w')

    # ---- determine strand rule
    # Maps '<read number><mapped strand>' (paired-end, e.g. '1+') or
    # '<mapped strand>' (single-end) to the strand of the parental gene.
    strandRule = {}
    if options.strand_rule is None:  # not strand-specific
        pass
    elif len(options.strand_rule.split(',')) == 4:  # paired-end, strand-specific
        for rule in options.strand_rule.split(','):
            strandRule[rule[0] + rule[1]] = rule[2]
    elif len(options.strand_rule.split(',')) == 2:  # single-end, strand-specific
        for rule in options.strand_rule.split(','):
            strandRule[rule[0]] = rule[1]
    else:
        print >> sys.stderr, "Unknown value of option :'strand_rule' " + options.strand_rule
        sys.exit(1)

    # ---- first pass: count fragments over the whole BAM file
    print >> sys.stderr, "Extract exon regions from  " + options.refgene_bed + '...'
    gene_ranges = build_range(options.refgene_bed)
    print >> sys.stderr, "Counting total fragment ... ",

    def hits_gene(chrom_name, st, end):
        # True when [st, end) overlaps at least one range stored for chrom_name.
        return (chrom_name in gene_ranges) and len(
            gene_ranges[chrom_name].find(st, end)) > 0

    total_frags = 0.0
    exonic_frags = 0.0

    try:
        while (1):
            aligned_read = obj.samfile.next()
            if aligned_read.is_qcfail: continue  # skip low quality
            if aligned_read.is_duplicate: continue  # skip duplicate read
            if aligned_read.is_secondary: continue  # skip non primary hit
            if options.skip_multi:
                if aligned_read.mapq < options.map_qual:
                    continue
            try:
                chrom = obj.samfile.getrname(aligned_read.tid).upper()
            except Exception:
                # No usable reference name for this record; ignore it.
                continue
            read_st = aligned_read.pos
            # Not exactly the end position in case of splicing, insertion, etc.
            read_end = read_st + aligned_read.rlen

            if not aligned_read.is_paired:  # single-end sequencing
                total_frags += 1
                if hits_gene(chrom, read_st, read_end):
                    exonic_frags += 1
                continue

            # Paired-end sequencing: each pair is counted once, via read1.
            if aligned_read.is_read2: continue
            mate_st = aligned_read.pnext
            # The mate's length is approximated by this read's length.
            mate_end = mate_st + aligned_read.rlen

            if aligned_read.is_unmapped:
                if aligned_read.mate_is_unmapped: continue  # both unmapped
                # only read2 mapped
                total_frags += options.single_read
                if hits_gene(chrom, mate_st, mate_end):
                    exonic_frags += options.single_read
            elif aligned_read.mate_is_unmapped:
                # only read1 mapped
                total_frags += options.single_read
                if hits_gene(chrom, read_st, read_end):
                    exonic_frags += options.single_read
            else:
                # both mapped: exonic only when both ends hit a gene range
                total_frags += 1
                if hits_gene(chrom, read_st, read_end) and hits_gene(
                        chrom, mate_st, mate_end):
                    exonic_frags += 1

    except StopIteration:
        print >> sys.stderr, "Done"
    print >> sys.stderr, "Total fragment = %-20s" % (str(total_frags))
    print >> sys.stderr, "Total exonic fragment = %-20s" % (str(exonic_frags))

    if total_frags > 0 and exonic_frags > 0:
        if options.only_exon:
            denominator = exonic_frags
        else:
            denominator = total_frags
    else:
        print >> sys.stderr, "Total tags cannot be 0 or negative number"
        sys.exit(1)

    # ---- second pass: per-transcript counts
    obj = SAM.ParseBAM(options.input_file)  # reopen: the first pass consumed the iterator
    print >> OUT, '\t'.join(('#chrom', 'st', 'end', 'accession', 'mRNA_size',
                             'gene_strand', 'Frag_count', 'FPM', 'FPKM'))

    gene_finished = 0

    # calculate raw count, FPM, FPKM for each gene
    for line in open(options.refgene_bed, 'r'):
        if line.startswith(('#', 'track', 'browser')): continue
        fields = line.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        tx_end = int(fields[2])
        geneName = fields[3]
        gstrand = fields[5].replace(" ", "_")

        # Column 12 holds block starts relative to tx_start, column 11
        # the block sizes.
        exon_starts = [
            tx_start + int(x) for x in fields[11].rstrip(',\n').split(',')
        ]
        exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
        exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]

        mRNA_size = 0.0
        exon_ranges = Intersecter()
        for st, end in zip(exon_starts, exon_ends):
            mRNA_size += (end - st)
            exon_ranges.add_interval(Interval(st, end))

        frag_count_f = 0.0  # fragments assigned to a '+' strand gene
        frag_count_r = 0.0  # fragments assigned to a '-' strand gene
        frag_count_fr = 0.0  # fragments counted without strand information

        # extract reads mapped to the gene region
        try:
            alignedReads = obj.samfile.fetch(chrom, tx_start, tx_end)
        except Exception:
            # Region cannot be fetched from this BAM (e.g. the chromosome
            # is absent from it); skip the transcript.
            continue
        for aligned_read in alignedReads:
            if aligned_read.is_qcfail: continue  # skip low quality
            if aligned_read.is_duplicate: continue  # skip duplicate read
            if aligned_read.is_secondary: continue  # skip non primary hit
            if options.skip_multi:
                if aligned_read.mapq < options.map_qual:
                    continue

            if not aligned_read.is_paired:
                # ---- single-end sequencing
                frag_st = aligned_read.pos
                # Fixed: this used 'read_st', a leftover from the first
                # pass, so the interval end belonged to an unrelated read.
                # Not exactly the end position in case of splicing, etc.
                frag_end = frag_st + aligned_read.rlen
                if len(exon_ranges.find(frag_st, frag_end)) == 0:
                    continue
                weight = 1
                if aligned_read.is_reverse:
                    strand_key = '-'
                else:
                    strand_key = '+'
            else:
                # ---- paired-end sequencing
                frag_st = aligned_read.pos
                frag_end = aligned_read.pnext
                # Keep the pair only when at least one end starts in an exon.
                if len(exon_ranges.find(frag_st, frag_st + 1)) < 1 and len(
                        exon_ranges.find(frag_end, frag_end + 1)) < 1:
                    continue
                if aligned_read.is_read2:  # count each pair once, via read1
                    continue
                if aligned_read.is_unmapped and aligned_read.mate_is_unmapped:
                    continue
                if aligned_read.is_unmapped or aligned_read.mate_is_unmapped:
                    weight = options.single_read  # only one end mapped
                else:
                    weight = 1  # both ends mapped
                if aligned_read.is_reverse:
                    strand_key = '1-'
                else:
                    strand_key = '1+'

            if options.strand_rule is None:
                frag_count_fr += weight
            else:
                # Reads whose key is not covered by the rule are ignored.
                parental = strandRule.get(strand_key)
                if parental == '+':
                    frag_count_f += weight
                elif parental == '-':
                    frag_count_r += weight

        FPM_fr = frag_count_fr * 1000000 / denominator
        FPM_f = frag_count_f * 1000000 / denominator
        FPM_r = frag_count_r * 1000000 / denominator
        if mRNA_size > 0:
            FPKM_fr = frag_count_fr * 1000000000 / (denominator * mRNA_size)
            FPKM_f = frag_count_f * 1000000000 / (denominator * mRNA_size)
            FPKM_r = frag_count_r * 1000000000 / (denominator * mRNA_size)
        else:
            # Transcript with zero exonic length: report 0 instead of
            # raising ZeroDivisionError.
            FPKM_fr = FPKM_f = FPKM_r = 0.0

        if options.strand_rule is None:
            print >> OUT, '\t'.join([
                str(i) for i in (chrom, tx_start, tx_end, geneName, mRNA_size,
                                 gstrand, frag_count_fr, FPM_fr, FPKM_fr)
            ])
        elif gstrand == '+':
            print >> OUT, '\t'.join([
                str(i) for i in (chrom, tx_start, tx_end, geneName, mRNA_size,
                                 gstrand, frag_count_f, FPM_f, FPKM_f)
            ])
        elif gstrand == '-':
            print >> OUT, '\t'.join([
                str(i) for i in (chrom, tx_start, tx_end, geneName, mRNA_size,
                                 gstrand, frag_count_r, FPM_r, FPKM_r)
            ])

        gene_finished += 1
        print >> sys.stderr, " %d transcripts finished\r" % (gene_finished),

    OUT.close()  # flush the results table

Пример #26

0

Показать файл

Файл: inner_distance.py Проект: yaskermezli/rseqc

def main():
    """Compute the mRNA inner-distance profile for an alignment file.

    Command-line entry point.  Parses the options, checks that the
    alignment file and the reference gene model exist, delegates the
    computation to ``SAM.ParseBAM.mRNA_inner_distance`` and finally runs
    ``Rscript`` on ``<prefix>.inner_distance_plot.r``.

    Exits with status 0 (after printing help or a message to stderr)
    when a required option is missing, an input file does not exist or
    the step size is not positive.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Alignment file in BAM or SAM format.")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s)")
    # Fixed: the help text was a copy of the --out-prefix description.
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="ref_gene",
                      help="Reference gene model in bed format.")
    parser.add_option(
        "-l",
        "--lower-bound",
        action="store",
        type="int",
        dest="lower_bound_size",
        default=-250,
        help=
        "Lower bound of inner distance (bp). This option is used for ploting histograme. default=%default"
    )
    parser.add_option(
        "-u",
        "--upper-bound",
        action="store",
        type="int",
        dest="upper_bound_size",
        default=250,
        help=
        "Upper bound of inner distance (bp). This option is used for plotting histogram. default=%default"
    )
    parser.add_option(
        "-s",
        "--step",
        action="store",
        type="int",
        dest="step_size",
        default=5,
        help=
        "Step size (bp) of histograme. This option is used for plotting histogram. default=%default"
    )
    (options, args) = parser.parse_args()

    # ---- validate options and input files
    if not (options.output_prefix and options.input_file and options.ref_gene):
        parser.print_help()
        sys.exit(0)
    for input_file in ([options.input_file, options.ref_gene]):
        if not os.path.exists(input_file):
            print >> sys.stderr, '\n\n' + input_file + " does NOT exists" + '\n'
            parser.print_help()
            sys.exit(0)
    if options.step_size <= 0:
        print >> sys.stderr, "step size is a positive interger"
        sys.exit(0)

    # ---- compute the profile and write the R plotting script
    obj = SAM.ParseBAM(options.input_file)
    obj.mRNA_inner_distance(outfile=options.output_prefix,
                            low_bound=options.lower_bound_size,
                            up_bound=options.upper_bound_size,
                            step=options.step_size,
                            refbed=options.ref_gene)

    # ---- render the plot (best effort)
    r_script = options.output_prefix + '.inner_distance_plot.r'
    try:
        # Argument list instead of a shell string: a prefix containing
        # spaces or shell metacharacters is passed through untouched, and
        # a missing Rscript executable now raises OSError so the message
        # below is actually reachable.
        subprocess.call(["Rscript", r_script])
    except OSError:
        print >> sys.stderr, "Cannot generate pdf file form " + r_script

Пример #27

0

Показать файл

def main():
    """Write tag counts and RPKM per intron, exon and mRNA for a BAM file.

    Command-line entry point.  Required options are the input BAM
    (``-i``; its ``.bai`` index must sit next to it), the output prefix
    (``-o``) and a BED gene model (``-r``).  Results are written to
    ``<prefix>_read_count.xls``; progress and totals go to stderr.

    The BAM file is read twice:

    1. A full scan counts reads, tags (the exon blocks returned by
       ``bam_cigar.fetch_exon``) and tags whose midpoint falls in the
       gene ranges returned by ``build_range``; total or exonic tags
       (``--only-exonic``) become the RPKM denominator.
    2. For every line of the BED file the reads overlapping the
       transcript are fetched, the midpoint of each tag is recorded, and
       the midpoints falling in each intron and exon are counted.

    With ``--strand`` every region gets forward and reverse counts;
    without it a single unstranded count is reported.

    Exits with status 0 when required options or files are missing and
    with status 1 on an unrecognised strand rule or a zero tag count
    (exit codes kept as they were for existing callers).
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help="Alignment file in BAM format (SAM is not supported). [required]")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat. [required]")
    parser.add_option(
        "-d",
        "--strand",
        action="store",
        type="string",
        dest="strand_rule",
        default=None,
        help=
        "How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'.  If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)"
    )
    parser.add_option(
        "-u",
        "--skip-multi-hits",
        action="store_true",
        dest="skip_multi",
        help=
        "How to deal with multiple hit reads. Presence this option renders program to skip multiple hits reads."
    )
    parser.add_option(
        "-e",
        "--only-exonic",
        action="store_true",
        dest="only_exon",
        help=
        "How to count total reads. Presence of this option renders program only used exonic (UTR exons and CDS exons) reads, otherwise use all reads."
    )

    (options, args) = parser.parse_args()

    # ---- validate required options and input files
    if not (options.output_prefix and options.input_file
            and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        print >> sys.stderr, "cannot find index file of input BAM file"
        print >> sys.stderr, options.input_file + '.bai' + " does not exists"
        sys.exit(0)
    for path in (options.input_file, options.refgene_bed):
        if not os.path.exists(path):
            print >> sys.stderr, path + " does NOT exists" + '\n'
            sys.exit(0)

    obj = SAM.ParseBAM(options.input_file)
    OUT = open(options.output_prefix + '_read_count.xls', 'w')

    # ---- determine strand rule
    # Maps '<read number><mapped strand>' (paired-end, e.g. '1+') or
    # '<mapped strand>' (single-end) to the strand of the parental gene.
    strandRule = {}
    if options.strand_rule is None:  # not strand-specific
        pass
    elif len(options.strand_rule.split(',')) == 4:  # paired-end, strand-specific
        for rule in options.strand_rule.split(','):
            strandRule[rule[0] + rule[1]] = rule[2]
    elif len(options.strand_rule.split(',')) == 2:  # single-end, strand-specific
        for rule in options.strand_rule.split(','):
            strandRule[rule[0]] = rule[1]
    else:
        print >> sys.stderr, "Unknown value of option :'strand_rule' " + options.strand_rule
        sys.exit(1)

    def is_multi_hit(read):
        # True when any tag listed in SAM.ParseBAM.multi_hit_tags has a
        # value above 1.  Tags look like (("NM", 1), ("RG", "L1")).
        return any(tag[0] in SAM.ParseBAM.multi_hit_tags and tag[1] > 1
                   for tag in read.tags)

    def midpoint(block):
        # Midpoint of an exon block; block[1] / block[2] are its start / end.
        return block[1] + int((block[2] - block[1]) / 2)

    # ---- first pass: count reads and tags over the whole BAM file
    print >> sys.stderr, "Retrieve exon regions from  " + options.refgene_bed + '...'
    gene_ranges = build_range(options.refgene_bed)
    print >> sys.stderr, "Counting total reads ... ",

    total_reads = 0
    total_tags = 0
    total_exonic_tags = 0

    try:
        while (1):
            aligned_read = obj.samfile.next()
            if aligned_read.is_qcfail: continue  # skip low quality
            if aligned_read.is_duplicate: continue  # skip duplicate read
            if aligned_read.is_secondary: continue  # skip non primary hit
            if aligned_read.is_unmapped: continue  # skip unmapped read
            if options.skip_multi and is_multi_hit(aligned_read):
                continue  # skip multiple-hit read
            total_reads += 1
            chrom = obj.samfile.getrname(aligned_read.tid).upper()
            hit_st = aligned_read.pos
            exon_blocks = bam_cigar.fetch_exon(chrom, hit_st,
                                               aligned_read.cigar)
            total_tags += len(exon_blocks)

            for exn in exon_blocks:
                mid = midpoint(exn)
                if (chrom in gene_ranges) and len(gene_ranges[chrom].find(
                        mid, mid + 1)) > 0:
                    total_exonic_tags += 1

    except StopIteration:
        print >> sys.stderr, "Done"
    print >> sys.stderr, "Total Reads = %-20s" % (str(total_reads))
    print >> sys.stderr, "Total Tags = %-20s" % (str(total_tags))
    print >> sys.stderr, "Total Exon Tags = %-20s" % (str(total_exonic_tags))

    if total_tags > 0 and total_exonic_tags > 0:
        if options.only_exon:
            denominator = total_exonic_tags
        else:
            denominator = total_tags
    else:
        print >> sys.stderr, "Total tags cannot be 0 or negative number"
        sys.exit(1)

    # ---- second pass: per-transcript counts
    obj = SAM.ParseBAM(options.input_file)  # reopen: the first pass consumed the iterator
    stranded = options.strand_rule is not None
    if not stranded:
        OUT.write('#chrom' + '\t' + 'st' + '\t' + 'end' + '\t' + 'accession' +
                  '\t' + 'score' + '\t' + 'gene_strand' + '\t' + 'tag_count' +
                  '\t' + 'RPKM' + '\n')
        row_format = '\t'.join(
            ['%s', '%d', '%d', '%s', '%d', '%s', '%d', '%.3f'])
    else:
        OUT.write('#chrom' + '\t' + 'st' + '\t' + 'end' + '\t' + 'accession' +
                  '\t' + 'score' + '\t' + 'gene_strand' + '\t' +
                  'tag_count_Forward' + '\t' + 'tag_count_Reverse' + '\t' +
                  'RPKM_Forward' + '\t' + 'RPKM_Reverse' + '\n')
        row_format = '\t'.join([
            '%s', '%d', '%d', '%s', '%d', '%s', '%d', '%d', '%.3f', '%.3f'
        ])
    gene_finished = 0

    # calculate raw count, RPKM for each gene
    for line in open(options.refgene_bed, 'r'):
        if line.startswith(('#', 'track', 'browser')): continue
        fields = line.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        tx_end = int(fields[2])
        geneName = fields[3]
        gstrand = fields[5].replace(" ", "_")

        # Column 12 holds block starts relative to tx_start, column 11
        # the block sizes.
        exon_starts = [
            tx_start + int(x) for x in fields[11].rstrip(',\n').split(',')
        ]
        exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
        exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
        intron_starts = exon_ends[:-1]
        intron_ends = exon_starts[1:]

        # One 1-bp interval per tag midpoint, split by assigned gene strand.
        plus_ranges = Intersecter()
        minus_ranges = Intersecter()
        unstrand_ranges = Intersecter()

        try:
            alignedReads = obj.samfile.fetch(chrom, tx_start, tx_end)
        except Exception:
            print >> sys.stderr, "No alignments for " + geneName + ". Skip"
            continue
        for aligned_read in alignedReads:
            if aligned_read.is_qcfail: continue  # skip low quality
            if aligned_read.is_duplicate: continue  # skip duplicate read
            if aligned_read.is_secondary: continue  # skip non primary hit
            if aligned_read.is_unmapped: continue  # skip unmapped read
            if options.skip_multi and is_multi_hit(aligned_read):
                continue  # skip multiple-hit read

            if not stranded:
                target = unstrand_ranges
            else:
                # Key used to decide whether the read belongs to gene(+)
                # or gene(-).  read_id stays empty for single-end reads
                # and for paired reads flagged neither read1 nor read2
                # (it used to be unbound/stale in the latter case).
                read_id = ''
                if aligned_read.is_paired:
                    if aligned_read.is_read1: read_id = '1'
                    if aligned_read.is_read2: read_id = '2'
                if aligned_read.is_reverse: map_strand = '-'
                else: map_strand = '+'
                # .get(): a key not covered by the rule (e.g. paired reads
                # with a single-end rule) used to raise KeyError; such
                # reads are now ignored.
                parental = strandRule.get(read_id + map_strand)
                if parental == '+':
                    target = plus_ranges
                elif parental == '-':
                    target = minus_ranges
                else:
                    continue

            hit_st = aligned_read.pos
            exon_blocks = bam_cigar.fetch_exon(chrom, hit_st,
                                               aligned_read.cigar)
            for block in exon_blocks:
                mid = midpoint(block)
                target.add_interval(Interval(mid, mid + 1))

        # Regions are numbered 5'->3' along the transcript: descending
        # for '-' strand genes, ascending otherwise.  Any strand value
        # other than '-' (e.g. '.') is numbered ascending; previously the
        # counters were left unbound or stale for such lines.
        if gstrand == '-':
            intronNum = len(intron_starts)
            exonNum = len(exon_starts)
            num_step = -1
        else:
            intronNum = 1
            exonNum = 1
            num_step = 1

        mRNA_plus_hits = 0
        mRNA_minus_hits = 0
        mRNA_hits = 0
        mRNA_length = 0

        # assign reads to intron regions
        for st, end in zip(intron_starts, intron_ends):
            # Zero-length regions count as 1 bp to avoid division by
            # zero; max() also covers end < st, where 'size' used to be
            # unbound or stale.
            size = max(end - st, 1)
            name = geneName + "_intron_" + str(intronNum)
            if stranded:
                hits_plus = len(plus_ranges.find(st, end))
                hits_minus = len(minus_ranges.find(st, end))
                hits_plus_rpkm = hits_plus * 1000000000.0 / (size *
                                                             denominator)
                hits_minus_rpkm = hits_minus * 1000000000.0 / (size *
                                                               denominator)
                print >> OUT, row_format % (chrom, st, end, name, 0, gstrand,
                                            hits_plus, hits_minus,
                                            hits_plus_rpkm, hits_minus_rpkm)
            else:
                hits = len(unstrand_ranges.find(st, end))
                hits_rpkm = hits * 1000000000.0 / (size * denominator)
                print >> OUT, row_format % (chrom, st, end, name, 0, gstrand,
                                            hits, hits_rpkm)
            intronNum += num_step

        # assign reads to exon regions; exons also accumulate the mRNA totals
        for st, end in zip(exon_starts, exon_ends):
            size = max(end - st, 1)
            name = geneName + "_exon_" + str(exonNum)
            if stranded:
                hits_plus = len(plus_ranges.find(st, end))
                hits_minus = len(minus_ranges.find(st, end))
                hits_plus_rpkm = hits_plus * 1000000000.0 / (size *
                                                             denominator)
                hits_minus_rpkm = hits_minus * 1000000000.0 / (size *
                                                               denominator)
                print >> OUT, row_format % (chrom, st, end, name, 0, gstrand,
                                            hits_plus, hits_minus,
                                            hits_plus_rpkm, hits_minus_rpkm)
                mRNA_plus_hits += hits_plus
                mRNA_minus_hits += hits_minus
            else:
                hits = len(unstrand_ranges.find(st, end))
                hits_rpkm = hits * 1000000000.0 / (size * denominator)
                print >> OUT, row_format % (chrom, st, end, name, 0, gstrand,
                                            hits, hits_rpkm)
                mRNA_hits += hits
            exonNum += num_step
            mRNA_length += size

        # whole-mRNA summary line
        if stranded:
            mRNA_plus_rpkm = mRNA_plus_hits * 1000000000.0 / (mRNA_length *
                                                              denominator)
            mRNA_minus_rpkm = mRNA_minus_hits * 1000000000.0 / (mRNA_length *
                                                                denominator)
            print >> OUT, row_format % (chrom, tx_start, tx_end,
                                        geneName + "_mRNA", 0, gstrand,
                                        mRNA_plus_hits, mRNA_minus_hits,
                                        mRNA_plus_rpkm, mRNA_minus_rpkm)
        else:
            mRNA_rpkm = mRNA_hits * 1000000000.0 / (mRNA_length * denominator)
            print >> OUT, row_format % (chrom, tx_start, tx_end,
                                        geneName + "_mRNA", 0, gstrand,
                                        mRNA_hits, mRNA_rpkm)

        gene_finished += 1
        print >> sys.stderr, " %d transcripts finished\r" % (gene_finished),

    OUT.close()  # flush the results table

Python SAM примеры использования