def main():
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.")
	parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format.")
	(options,args)=parser.parse_args()
		
	if not (options.input_file and options.ref_gene_model):
		parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.ref_gene_model):
		print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
		#parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.input_file):
		print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
		sys.exit(0)		

	#build bitset
	(cds_exon_r, intron_r, utr_5_r, utr_3_r,\
	intergenic_up_1kb_r,intergenic_up_5kb_r,intergenic_up_10kb_r,\
	intergenic_down_1kb_r,intergenic_down_5kb_r,intergenic_down_10kb_r,\
	cds_exon_base,intron_base,utr_5_base,utr_3_base,\
	intergenic_up1kb_base,intergenic_up5kb_base,intergenic_up10kb_base,\
	intergenic_down1kb_base,intergenic_down5kb_base,intergenic_down10kb_base) = process_gene_model(options.ref_gene_model)
	
	intron_read=0
	cds_exon_read=0
	utr_5_read=0
	utr_3_read=0
	
	intergenic_up1kb_read=0
	intergenic_down1kb_read=0
	intergenic_up5kb_read=0
	intergenic_down5kb_read=0
	intergenic_up10kb_read=0
	intergenic_down10kb_read=0
		
	totalReads=0
	totalFrags=0
	unAssignFrags=0
	obj = SAM.ParseBAM(options.input_file)
	
	R_qc_fail=0
	R_duplicate=0
	R_nonprimary=0
	R_unmap=0
	
	print >>sys.stderr, "processing " + options.input_file + " ...",
	try:
		while(1):
			aligned_read = obj.samfile.next()
			if aligned_read.is_qcfail:			#skip QC fail read
				R_qc_fail +=1
				continue
			if aligned_read.is_duplicate:		#skip duplicate read
				R_duplicate +=1
				continue
			if aligned_read.is_secondary:		#skip non primary hit
				R_nonprimary +=1
				continue
			if aligned_read.is_unmapped:		#skip unmap read
				R_unmap +=1
				continue		
			totalReads +=1
			chrom = obj.samfile.getrname(aligned_read.tid)
			chrom=chrom.upper()
			exons = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
			totalFrags += len(exons)
			
			for exn in exons:
				#print chrom + '\t' + str(exn[1]) + '\t' + str(exn[2])
				mid = int(exn[1]) + int((int(exn[2]) - int(exn[1]))/2)
				if foundone(chrom,cds_exon_r,mid,mid) > 0:
					cds_exon_read += 1
					continue
				elif foundone(chrom,utr_5_r,mid,mid) >0 and foundone(chrom,utr_3_r,mid,mid) == 0:
					utr_5_read += 1
					continue
				elif foundone(chrom,utr_3_r,mid,mid) >0 and foundone(chrom,utr_5_r,mid,mid) == 0:
					utr_3_read += 1
					continue
				elif foundone(chrom,utr_3_r,mid,mid) >0 and foundone(chrom,utr_5_r,mid,mid) > 0:
					unAssignFrags +=1
					continue
				elif foundone(chrom,intron_r,mid,mid) > 0:
					intron_read += 1
					continue
				elif foundone(chrom,intergenic_up_10kb_r,mid,mid) >0 and foundone(chrom,intergenic_down_10kb_r,mid,mid) > 0:
					unAssignFrags +=1
					continue					
				elif foundone(chrom,intergenic_up_1kb_r,mid,mid) >0:
					intergenic_up1kb_read += 1
					intergenic_up5kb_read += 1
					intergenic_up10kb_read += 1
				elif foundone(chrom,intergenic_up_5kb_r,mid,mid) >0:
					intergenic_up5kb_read += 1
					intergenic_up10kb_read += 1
				elif foundone(chrom,intergenic_up_10kb_r,mid,mid) >0:
					intergenic_up10kb_read += 1
				
				elif foundone(chrom,intergenic_down_1kb_r,mid,mid) >0:
					intergenic_down1kb_read += 1
					intergenic_down5kb_read += 1
					intergenic_down10kb_read += 1
				elif foundone(chrom,intergenic_down_5kb_r,mid,mid) >0:
					intergenic_down5kb_read += 1
					intergenic_down10kb_read += 1
				elif foundone(chrom,intergenic_down_10kb_r,mid,mid) >0:
					intergenic_down10kb_read += 1	
				else:
					unAssignFrags +=1
	except StopIteration:
		print >>sys.stderr, "Finished\n"				

	print "%-30s%d" % ("Total Reads",totalReads)
	print  "%-30s%d" % ("Total Tags",totalFrags)
	print  "%-30s%d" % ("Total Assigned Tags",totalFrags-unAssignFrags)
	
	print  "====================================================================="
	print  "%-20s%-20s%-20s%-20s" % ('Group','Total_bases','Tag_count','Tags/Kb')
	print  "%-20s%-20d%-20d%-18.2f" % ('CDS_Exons',cds_exon_base,cds_exon_read,cds_exon_read*1000.0/(cds_exon_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("5'UTR_Exons",utr_5_base,utr_5_read, utr_5_read*1000.0/(utr_5_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("3'UTR_Exons",utr_3_base,utr_3_read, utr_3_read*1000.0/(utr_3_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("Introns",intron_base,intron_read,intron_read*1000.0/(intron_base+1))
	
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_1kb",intergenic_up1kb_base, intergenic_up1kb_read, intergenic_up1kb_read*1000.0/(intergenic_up1kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_5kb",intergenic_up5kb_base, intergenic_up5kb_read, intergenic_up5kb_read*1000.0/(intergenic_up5kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_10kb",intergenic_up10kb_base, intergenic_up10kb_read, intergenic_up10kb_read*1000.0/(intergenic_up10kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_1kb",intergenic_down1kb_base, intergenic_down1kb_read, intergenic_down1kb_read*1000.0/(intergenic_down1kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_5kb",intergenic_down5kb_base, intergenic_down5kb_read, intergenic_down5kb_read*1000.0/(intergenic_down5kb_base+1))	
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_10kb",intergenic_down10kb_base, intergenic_down10kb_read, intergenic_down10kb_read*1000.0/(intergenic_down10kb_base+1))
	print  "====================================================================="
Пример #2
0
def _parse_strand_rule(rule_text):
    """Turn a ``--strand`` value into a lookup dict.

    Four comma-separated items (e.g. ``1++,1--,2+-,2-+``) map the first two
    characters of each item to the third; two items map the first character
    to the second.  ``None`` yields an empty dict.  Any other item count
    prints an error and exits with status 1.
    """
    rules = {}
    if rule_text is None:  # Not strand-specific
        return rules
    items = rule_text.split(',')
    if len(items) == 4:  # pair-end, strand-specific
        for item in items:
            rules[item[0] + item[1]] = item[2]
    elif len(items) == 2:  # single-end, strand-specific
        for item in items:
            rules[item[0]] = item[1]
    else:
        print >> sys.stderr, "Unknown value of option :'strand_rule' " + rule_text
        sys.exit(1)
    return rules


def _skip_alignment(aligned_read, skip_multi):
    """Return True when the alignment must not be counted.

    QC-failed, duplicate, secondary and unmapped alignments are always
    skipped.  With *skip_multi* set, an alignment is also skipped when any
    tag listed in ``SAM.ParseBAM.multi_hit_tags`` has a value above 1.
    """
    if (aligned_read.is_qcfail or aligned_read.is_duplicate
            or aligned_read.is_secondary or aligned_read.is_unmapped):
        return True
    if skip_multi:
        for tag in aligned_read.tags:  # e.g. (("NM", 1), ("RG", "L1"))
            if tag[0] in SAM.ParseBAM.multi_hit_tags and tag[1] > 1:
                return True
    return False


def _block_mid(block):
    """Midpoint of a block whose start/end sit at indexes 1 and 2."""
    return block[1] + int((block[2] - block[1]) / 2)


def _count_denominator(obj, options):
    """Scan the whole alignment file once and return the RPKM denominator.

    The denominator is the number of exonic tags with ``--only-exonic`` and
    the number of all tags otherwise.  Progress and totals go to stderr.
    Exits with status 1 when the total or the exonic tag count is zero.
    """
    print >> sys.stderr, "Retrieve exon regions from  " + options.refgene_bed + '...'
    gene_ranges = build_range(options.refgene_bed)
    print >> sys.stderr, "Counting total reads ... ",

    total_reads = 0
    total_tags = 0
    total_exonic_tags = 0
    try:
        while (1):
            aligned_read = obj.samfile.next()
            if _skip_alignment(aligned_read, options.skip_multi):
                continue
            total_reads += 1
            chrom = obj.samfile.getrname(aligned_read.tid).upper()
            exon_blocks = bam_cigar.fetch_exon(chrom, aligned_read.pos,
                                               aligned_read.cigar)
            total_tags += len(exon_blocks)
            if chrom not in gene_ranges:
                continue
            chrom_ranges = gene_ranges[chrom]
            for exn in exon_blocks:
                mid = _block_mid(exn)
                if len(chrom_ranges.find(mid, mid + 1)) > 0:
                    total_exonic_tags += 1
    except StopIteration:
        print >> sys.stderr, "Done"
    print >> sys.stderr, "Total Reads = %-20s" % (str(total_reads))
    print >> sys.stderr, "Total Tags = %-20s" % (str(total_tags))
    print >> sys.stderr, "Total Exon Tags = %-20s" % (str(total_exonic_tags))

    # NOTE(review): this also aborts when no tag is exonic even if
    # --only-exonic was not requested; kept as in the original.
    if not (total_tags > 0 and total_exonic_tags > 0):
        print >> sys.stderr, "Total tags cannot be 0 or negative number"
        sys.exit(1)
    if options.only_exon:
        return total_exonic_tags
    return total_tags


def _write_region_rows(out, row_fmt, chrom, gene_name, gstrand, label,
                       starts, ends, read_sets, denominator):
    """Write one row per (start, end) region and return the totals.

    Regions are numbered 1..n in file order, or n..1 when *gstrand* is '-'.
    For each set in *read_sets* the row carries the number of stored tag
    midpoints found in the region and ``hits * 1e9 / (size * denominator)``.

    Returns ``(hits_per_set, total_size)`` summed over all regions.
    """
    hit_totals = [0] * len(read_sets)
    total_size = 0
    region_count = len(starts)
    for idx, (st, end) in enumerate(zip(starts, ends)):
        # Any strand other than '-' (including '.') is numbered forward;
        # previously such genes hit an undefined or stale counter.
        if gstrand == '-':
            number = region_count - idx
        else:
            number = idx + 1
        # Clamp to 1 so empty or inverted regions cannot divide by zero or
        # silently reuse the size of the previous region.
        size = max(end - st, 1)
        hits = [len(read_set.find(st, end)) for read_set in read_sets]
        rpkms = [n * 1000000000.0 / (size * denominator) for n in hits]
        name = gene_name + "_" + label + "_" + str(number)
        out.write(row_fmt % tuple([chrom, st, end, name, 0, gstrand]
                                  + hits + rpkms) + '\n')
        hit_totals = [total + n for total, n in zip(hit_totals, hits)]
        total_size += size
    return hit_totals, total_size


def _write_gene_table(out, options, strand_rule, denominator):
    """Write intron, exon and mRNA rows for every gene-model line to *out*.

    Each data line of ``options.refgene_bed`` must have at least 12
    whitespace-separated columns; column 11 (block sizes) and column 12
    (block starts relative to column 2) define the exons.  Alignments are
    pulled per gene with ``samfile.fetch``; genes for which ``fetch`` fails
    are reported on stderr and skipped.
    """
    strand_specific = options.strand_rule is not None
    columns = ['#chrom', 'st', 'end', 'accession', 'score', 'gene_strand']
    if strand_specific:
        columns += ['tag_count_Forward', 'tag_count_Reverse',
                    'RPKM_Forward', 'RPKM_Reverse']
        set_count = 2
    else:
        columns += ['tag_count', 'RPKM']
        set_count = 1
    row_fmt = '\t'.join(['%s', '%d', '%d', '%s', '%d', '%s']
                        + ['%d'] * set_count + ['%.3f'] * set_count)

    # Re-open the alignment file: the counting pass consumed the iterator.
    obj = SAM.ParseBAM(options.input_file)
    out.write('\t'.join(columns) + '\n')

    gene_finished = 0
    with open(options.refgene_bed, 'r') as bed:
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            fields = line.split()
            if not fields:  # blank line
                continue
            chrom = fields[0]
            tx_start = int(fields[1])
            tx_end = int(fields[2])
            geneName = fields[3]
            gstrand = fields[5].replace(" ", "_")

            block_offsets = [int(x) for x in fields[11].rstrip(',\n').split(',')]
            block_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
            exon_starts = [tx_start + offset for offset in block_offsets]
            exon_ends = [st + size for st, size in zip(exon_starts, block_sizes)]
            # Introns are the gaps between consecutive exons.
            intron_starts = exon_ends[:-1]
            intron_ends = exon_starts[1:]

            plus_ranges = Intersecter()
            minus_ranges = Intersecter()
            unstrand_ranges = Intersecter()

            try:
                alignedReads = obj.samfile.fetch(chrom, tx_start, tx_end)
            except Exception:
                # Still best-effort, but no longer swallows
                # KeyboardInterrupt / SystemExit like the bare except did.
                print >> sys.stderr, "No alignments for " + geneName + ". Skip"
                continue

            for aligned_read in alignedReads:
                if _skip_alignment(aligned_read, options.skip_multi):
                    continue

                # '' for single-end reads, and as a defined fallback for a
                # paired read flagged as neither read1 nor read2.
                read_id = ''
                if aligned_read.is_paired:
                    if aligned_read.is_read1:
                        read_id = '1'
                    if aligned_read.is_read2:
                        read_id = '2'
                if aligned_read.is_reverse:
                    map_strand = '-'
                else:
                    map_strand = '+'
                # Decides whether the read belongs to gene(+) or gene(-).
                strand_key = read_id + map_strand

                exon_blocks = bam_cigar.fetch_exon(chrom, aligned_read.pos,
                                                   aligned_read.cigar)
                if strand_specific:
                    gene_side = strand_rule[strand_key]
                    if gene_side == '+':
                        target = plus_ranges
                    elif gene_side == '-':
                        target = minus_ranges
                    else:
                        continue
                else:
                    target = unstrand_ranges
                for block in exon_blocks:
                    mid = _block_mid(block)
                    target.add_interval(Interval(mid, mid + 1))

            if strand_specific:
                read_sets = [plus_ranges, minus_ranges]
            else:
                read_sets = [unstrand_ranges]

            _write_region_rows(out, row_fmt, chrom, geneName, gstrand,
                               "intron", intron_starts, intron_ends,
                               read_sets, denominator)
            mRNA_hits, mRNA_length = _write_region_rows(
                out, row_fmt, chrom, geneName, gstrand, "exon",
                exon_starts, exon_ends, read_sets, denominator)
            mRNA_rpkms = [n * 1000000000.0 / (mRNA_length * denominator)
                          for n in mRNA_hits]
            out.write(row_fmt % tuple(
                [chrom, tx_start, tx_end, geneName + "_mRNA", 0, gstrand]
                + mRNA_hits + mRNA_rpkms) + '\n')

            gene_finished += 1
            print >> sys.stderr, " %d transcripts finished\r" % (gene_finished),


def main():
    """Write per-intron, per-exon and per-mRNA tag counts and RPKM values.

    Reads the alignment file given with ``-i`` (its ``.bai`` index must
    exist) and the gene model given with ``-r``, and writes a tab-separated
    table to ``<out-prefix>_read_count.xls``.  With ``--strand`` the table
    has separate forward/reverse count and RPKM columns.

    Exits with status 0 after printing help when a required option is
    missing, and with status 1 on a missing file, an unparsable strand rule
    or a zero tag count.  The output file is closed on every exit path.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input-file", action="store", type="string",
        dest="input_file",
        help="Alignment file in BAM format (SAM is not supported). [required]")
    parser.add_option(
        "-o", "--out-prefix", action="store", type="string",
        dest="output_prefix",
        help="Prefix of output files(s). [required]")
    parser.add_option(
        "-r", "--refgene", action="store", type="string",
        dest="refgene_bed",
        help="Reference gene model in bed fomat. [required]")
    parser.add_option(
        "-d", "--strand", action="store", type="string",
        dest="strand_rule", default=None,
        help=
        "How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'.  If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)"
    )
    parser.add_option(
        "-u", "--skip-multi-hits", action="store_true",
        dest="skip_multi",
        help=
        "How to deal with multiple hit reads. Presence this option renders program to skip multiple hits reads."
    )
    parser.add_option(
        "-e", "--only-exonic", action="store_true",
        dest="only_exon",
        help=
        "How to count total reads. Presence of this option renders program only used exonic (UTR exons and CDS exons) reads, otherwise use all reads."
    )

    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_file
            and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    # Missing inputs are errors: exit non-zero, like the strand-rule error.
    if not os.path.exists(options.input_file + '.bai'):
        print >> sys.stderr, "cannot find index file of input BAM file"
        print >> sys.stderr, options.input_file + '.bai' + " does not exists"
        sys.exit(1)
    for path in (options.input_file, options.refgene_bed):
        if not os.path.exists(path):
            print >> sys.stderr, path + " does NOT exists" + '\n'
            sys.exit(1)

    obj = SAM.ParseBAM(options.input_file)
    OUT = open(options.output_prefix + '_read_count.xls', 'w')
    try:
        strandRule = _parse_strand_rule(options.strand_rule)
        denominator = _count_denominator(obj, options)
        _write_gene_table(OUT, options, strandRule, denominator)
    finally:
        # Also runs when sys.exit() is raised above.
        OUT.close()
Пример #3
0
def main():
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.")
	parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format.")
	(options,args)=parser.parse_args()
		
	if not (options.input_file and options.ref_gene_model):
		parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.ref_gene_model):
		print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
		#parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.input_file):
		print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
		sys.exit(0)		

	#build bitset
	(cds_exon_r, intron_r, utr_5_r, utr_3_r,\
	intergenic_up_1kb_r,intergenic_up_5kb_r,intergenic_up_10kb_r,\
	intergenic_down_1kb_r,intergenic_down_5kb_r,intergenic_down_10kb_r,\
	cds_exon_base,intron_base,utr_5_base,utr_3_base,\
	intergenic_up1kb_base,intergenic_up5kb_base,intergenic_up10kb_base,\
	intergenic_down1kb_base,intergenic_down5kb_base,intergenic_down10kb_base) = process_gene_model(options.ref_gene_model)
	
	intron_read=0
	cds_exon_read=0
	utr_5_read=0
	utr_3_read=0
	
	intergenic_up1kb_read=0
	intergenic_down1kb_read=0
	intergenic_up5kb_read=0
	intergenic_down5kb_read=0
	intergenic_up10kb_read=0
	intergenic_down10kb_read=0
		
	totalReads=0
	totalFrags=0
	unAssignFrags=0
	obj = SAM.ParseBAM(options.input_file)
	
	R_qc_fail=0
	R_duplicate=0
	R_nonprimary=0
	R_unmap=0
	
	print >>sys.stderr, "processing " + options.input_file + " ...",
	try:
		while(1):
			aligned_read = obj.samfile.next()
			if aligned_read.is_qcfail:			#skip QC fail read
				R_qc_fail +=1
				continue
			if aligned_read.is_duplicate:		#skip duplicate read
				R_duplicate +=1
				continue
			if aligned_read.is_secondary:		#skip non primary hit
				R_nonprimary +=1
				continue
			if aligned_read.is_unmapped:		#skip unmap read
				R_unmap +=1
				continue		
			totalReads +=1
			chrom = obj.samfile.getrname(aligned_read.tid)
			chrom=chrom.upper()
			exons = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
			totalFrags += len(exons)
			
			for exn in exons:
				#print chrom + '\t' + str(exn[1]) + '\t' + str(exn[2])
				mid = int(exn[1]) + int((int(exn[2]) - int(exn[1]))/2)
				if foundone(chrom,cds_exon_r,mid,mid) > 0:
					cds_exon_read += 1
					continue
				elif foundone(chrom,utr_5_r,mid,mid) >0 and foundone(chrom,utr_3_r,mid,mid) == 0:
					utr_5_read += 1
					continue
				elif foundone(chrom,utr_3_r,mid,mid) >0 and foundone(chrom,utr_5_r,mid,mid) == 0:
					utr_3_read += 1
					continue
				elif foundone(chrom,utr_3_r,mid,mid) >0 and foundone(chrom,utr_5_r,mid,mid) > 0:
					unAssignFrags +=1
					continue
				elif foundone(chrom,intron_r,mid,mid) > 0:
					intron_read += 1
					continue
				elif foundone(chrom,intergenic_up_10kb_r,mid,mid) >0 and foundone(chrom,intergenic_down_10kb_r,mid,mid) > 0:
					unAssignFrags +=1
					continue					
				elif foundone(chrom,intergenic_up_1kb_r,mid,mid) >0:
					intergenic_up1kb_read += 1
					intergenic_up5kb_read += 1
					intergenic_up10kb_read += 1
				elif foundone(chrom,intergenic_up_5kb_r,mid,mid) >0:
					intergenic_up5kb_read += 1
					intergenic_up10kb_read += 1
				elif foundone(chrom,intergenic_up_10kb_r,mid,mid) >0:
					intergenic_up10kb_read += 1
				
				elif foundone(chrom,intergenic_down_1kb_r,mid,mid) >0:
					intergenic_down1kb_read += 1
					intergenic_down5kb_read += 1
					intergenic_down10kb_read += 1
				elif foundone(chrom,intergenic_down_5kb_r,mid,mid) >0:
					intergenic_down5kb_read += 1
					intergenic_down10kb_read += 1
				elif foundone(chrom,intergenic_down_10kb_r,mid,mid) >0:
					intergenic_down10kb_read += 1	
				else:
					unAssignFrags +=1
	except StopIteration:
		print >>sys.stderr, "Finished\n"				

	print "%-30s%d" % ("Total Reads",totalReads)
	print  "%-30s%d" % ("Total Tags",totalFrags)
	print  "%-30s%d" % ("Total Assigned Tags",totalFrags-unAssignFrags)
	
	print  "====================================================================="
	print  "%-20s%-20s%-20s%-20s" % ('Group','Total_bases','Tag_count','Tags/Kb')
	print  "%-20s%-20d%-20d%-18.2f" % ('CDS_Exons',cds_exon_base,cds_exon_read,cds_exon_read*1000.0/(cds_exon_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("5'UTR_Exons",utr_5_base,utr_5_read, utr_5_read*1000.0/(utr_5_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("3'UTR_Exons",utr_3_base,utr_3_read, utr_3_read*1000.0/(utr_3_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("Introns",intron_base,intron_read,intron_read*1000.0/(intron_base+1))
	
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_1kb",intergenic_up1kb_base, intergenic_up1kb_read, intergenic_up1kb_read*1000.0/(intergenic_up1kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_5kb",intergenic_up5kb_base, intergenic_up5kb_read, intergenic_up5kb_read*1000.0/(intergenic_up5kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TSS_up_10kb",intergenic_up10kb_base, intergenic_up10kb_read, intergenic_up10kb_read*1000.0/(intergenic_up10kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_1kb",intergenic_down1kb_base, intergenic_down1kb_read, intergenic_down1kb_read*1000.0/(intergenic_down1kb_base+1))
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_5kb",intergenic_down5kb_base, intergenic_down5kb_read, intergenic_down5kb_read*1000.0/(intergenic_down5kb_base+1))	
	print  "%-20s%-20d%-20d%-18.2f" % ("TES_down_10kb",intergenic_down10kb_base, intergenic_down10kb_read, intergenic_down10kb_read*1000.0/(intergenic_down10kb_base+1))
	print  "====================================================================="
Пример #4
0
def _skip_alignment(aligned_read, options):
	"""Return True when *aligned_read* must be ignored by both counting passes.

	QC-failed, duplicate, secondary and unmapped records are always dropped.
	When --skip-multi-hits is given, records whose mapping quality is below
	--mapq are dropped too, so the library-size denominator and the per-gene
	counts are filtered by exactly the same rule.
	"""
	if aligned_read.is_qcfail or aligned_read.is_duplicate:
		return True
	if aligned_read.is_secondary or aligned_read.is_unmapped:
		return True
	if options.skip_multi and aligned_read.mapq < options.map_qual:
		return True
	return False


def _numbered_regions(starts, ends, gstrand):
	"""Pair every (start, end) region with its 1-based number.

	Regions are passed in genomic order. For a '-' strand gene the numbers run
	from len(starts) down to 1; for any other strand value they run up from 1.
	"""
	total = len(starts)
	if gstrand == '-':
		numbers = range(total, 0, -1)
	else:
		numbers = range(1, total + 1)
	return zip(starts, ends, numbers)


def _format_count_row(chrom, st, end, label, gstrand, hits, size, denominator):
	"""Build one tab-separated output line for a region.

	*hits* holds one tag count per reported strand (two values for
	strand-specific data, one otherwise); the matching RPKM columns are
	computed as hits * 1e9 / (size * denominator) and appended after the counts.
	"""
	columns = [chrom, '%d' % st, '%d' % end, label, '0', gstrand]
	columns.extend(['%d' % h for h in hits])
	columns.extend(['%.3f' % (h * 1000000000.0 / (size * denominator)) for h in hits])
	return '\t'.join(columns) + '\n'


def main():
	"""Count tags and compute RPKM for each exon, intron and mRNA of a BED12 gene model.

	Pass 1 walks the whole BAM file to obtain the normalisation denominator
	(all tags, or only exonic tags with --only-exonic). Pass 2 fetches the
	alignments overlapping every transcript of the gene model and writes one
	row per intron, per exon and per mRNA to '<out-prefix>_read_count.xls'.

	Exits with status 0 after printing help when a required option is missing,
	and with status 1 on missing files, an invalid --strand value or an empty
	denominator.
	"""
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM format (SAM is not supported). [required]")
	parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]")
	parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed fomat. [required]")
	parser.add_option("-d","--strand",action="store",type="string",dest="strand_rule",default=None,help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'.  If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)")
	parser.add_option("-u","--skip-multi-hits",action="store_true",dest="skip_multi",help="How to deal with multiple hit reads. Presence this option renders program to skip multiple hits reads.")
	parser.add_option("-e","--only-exonic",action="store_true",dest="only_exon",help="How to count total reads. Presence of this option renders program only used exonic (UTR exons and CDS exons) reads, otherwise use all reads.")
	parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")

	(options,args)=parser.parse_args()

	if not (options.output_prefix and options.input_file and options.refgene_bed):
		parser.print_help()
		sys.exit(0)
	# Check the files themselves first so a missing BAM is not reported as a
	# missing index; all of these are errors, hence the non-zero exit status.
	for path in (options.input_file, options.refgene_bed):
		if not os.path.exists(path):
			print >>sys.stderr, path + " does NOT exists" + '\n'
			sys.exit(1)
	if not os.path.exists(options.input_file + '.bai'):
		print >>sys.stderr, "cannot find index file of input BAM file"
		print >>sys.stderr, options.input_file + '.bai' + " does not exists"
		sys.exit(1)

	obj = SAM.ParseBAM(options.input_file)

	#++++++++++++++++++++++++++++++++++++determine strand rule
	# strandRule maps read_id + mapped strand (e.g. '1+', or just '+' for
	# single-end data) to the strand of the parental gene.
	stranded = options.strand_rule is not None
	strandRule={}
	if stranded:
		tokens = options.strand_rule.split(',')
		if len(tokens) == 4 and all(len(tok) >= 3 for tok in tokens):		#PairEnd, strand-specific
			for tok in tokens:
				strandRule[tok[0]+tok[1]] = tok[2]
		elif len(tokens) == 2 and all(len(tok) >= 2 for tok in tokens):		#singleEnd, strand-specific
			for tok in tokens:
				strandRule[tok[0]] = tok[1]
		else:
			print >>sys.stderr, "Unknown value of option :'strand_rule' " +  options.strand_rule
			sys.exit(1)

	#++++++++++++++++++++++++++++++++++++counting reads
	print >>sys.stderr, "Retrieve exon regions from  "+ options.refgene_bed + '...'
	gene_ranges = build_range( options.refgene_bed)
	print >>sys.stderr, "Counting total reads ... ",

	total_reads =0
	total_tags =0
	total_exonic_tags =0

	try:
		while(1):
			aligned_read = obj.samfile.next()
			if _skip_alignment(aligned_read, options):
				continue
			total_reads +=1
			chrom = obj.samfile.getrname(aligned_read.tid).upper()
			exon_blocks = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
			total_tags += len(exon_blocks)

			if chrom not in gene_ranges:
				continue
			chrom_ranges = gene_ranges[chrom]
			for exn in exon_blocks:
				# a tag is exonic when the midpoint of its block hits an exon
				mid = exn[1] + (exn[2]-exn[1]) // 2
				if len(chrom_ranges.find(mid,mid+1)) >0:
					total_exonic_tags += 1
	except StopIteration:
		print >>sys.stderr, "Done"
	print >>sys.stderr, "Total Reads = %-20s" % (str(total_reads))
	print >>sys.stderr, "Total Tags = %-20s" % (str(total_tags))
	print >>sys.stderr, "Total Exon Tags = %-20s" % (str(total_exonic_tags))

	# Only the count that is actually used as denominator has to be positive.
	if options.only_exon:
		denominator = total_exonic_tags
	else:
		denominator = total_tags
	if denominator <= 0:
		print >>sys.stderr, "Total tags cannot be 0 or negative number"
		sys.exit(1)

	#++++++++++++++++++++++++++++++++++++++++++++++++
	obj = SAM.ParseBAM(options.input_file)		# fresh handle for the per-gene fetches
	header = ['#chrom', 'st', 'end', 'accession', 'score', 'gene_strand']
	if stranded:
		header.extend(['tag_count_Forward', 'tag_count_Reverse', 'RPKM_Forward', 'RPKM_Reverse'])
	else:
		header.extend(['tag_count', 'RPKM'])

	gene_finished=0
	unruled_alignments=0		# alignments whose read/strand key is absent from --strand
	OUT = open(options.output_prefix + '_read_count.xls','w')
	BED = open(options.refgene_bed,'r')
	try:
		OUT.write('\t'.join(header) + '\n')
		#calculate raw count, RPKM for each gene
		for line in BED:
			if line.startswith(('#','track','browser')):continue
			if not line.strip():continue
			fields = line.split()
			try:
				chrom     = fields[0]
				tx_start  = int( fields[1] )
				tx_end    = int( fields[2] )
				geneName  = fields[3]
				gstrand   = fields[5]
				# BED12: column 11 holds block sizes, column 12 block starts
				# relative to tx_start.
				block_sizes = [int(x) for x in fields[10].rstrip(',').split(',')]
				block_starts = [int(x) for x in fields[11].rstrip(',').split(',')]
				if len(block_sizes) != len(block_starts):
					raise ValueError("blockSizes and blockStarts differ in length")
			except (IndexError, ValueError):
				print >>sys.stderr, "[NOTE:input bed must be 12-column] skipped this line: " + line,
				continue
			exon_starts = [tx_start + offset for offset in block_starts]
			exon_ends = [st + sz for st, sz in zip(exon_starts, block_sizes)]
			intron_starts = exon_ends[:-1]
			intron_ends = exon_starts[1:]

			# One interval tree per reported strand: (plus, minus) for
			# strand-specific data, a single tree otherwise.
			if stranded:
				trees = [Intersecter(), Intersecter()]
			else:
				trees = [Intersecter()]

			try:
				alignedReads = obj.samfile.fetch(chrom,tx_start,tx_end)
			except ValueError:
				print >>sys.stderr, "No alignments for " + geneName + ". Skip"
				continue
			for aligned_read in alignedReads:
				if _skip_alignment(aligned_read, options):
					continue

				if stranded:
					read_id = ''								#single end
					if aligned_read.is_paired:					#pair end
						if aligned_read.is_read1:read_id = '1'
						if aligned_read.is_read2:read_id = '2'
					if aligned_read.is_reverse:map_strand = '-'
					else:map_strand = '+'
					#used to determine if a read should assign to gene(+) or gene(-)
					gene_side = strandRule.get(read_id + map_strand)
					if gene_side == '+':
						tree = trees[0]
					elif gene_side == '-':
						tree = trees[1]
					else:
						if gene_side is None:
							unruled_alignments += 1
						continue
				else:
					tree = trees[0]

				exon_blocks = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar)
				for block in exon_blocks:
					mid = block[1] + (block[2] - block[1]) // 2
					tree.add_interval( Interval( mid,mid+1 ) )

			#assign reads to intron regions
			for st,end,num in _numbered_regions(intron_starts,intron_ends,gstrand):
				size = max(end - st, 1)		# never divide by a zero or negative length
				hits = [len(t.find(st,end)) for t in trees]
				OUT.write(_format_count_row(chrom,st,end,geneName + "_intron_" + str(num),gstrand,hits,size,denominator))

			#assign reads to exon regions and accumulate the mRNA totals
			mRNA_hits = [0] * len(trees)
			mRNA_length = 0
			for st,end,num in _numbered_regions(exon_starts,exon_ends,gstrand):
				size = max(end - st, 1)
				hits = [len(t.find(st,end)) for t in trees]
				OUT.write(_format_count_row(chrom,st,end,geneName + "_exon_" + str(num),gstrand,hits,size,denominator))
				mRNA_hits = [total + h for total, h in zip(mRNA_hits, hits)]
				mRNA_length += size
			OUT.write(_format_count_row(chrom,tx_start,tx_end,geneName + "_mRNA",gstrand,mRNA_hits,mRNA_length,denominator))

			gene_finished +=1
			print >>sys.stderr, " %d transcripts finished\r" % (gene_finished),
	finally:
		BED.close()
		OUT.close()

	if unruled_alignments >0:
		print >>sys.stderr, "\nWarning: %d alignment records matched no key of the --strand rule and were not counted" % (unruled_alignments)