def extract_phastcons(bedfile, phas_chrnames, width):
    """Extract phastcons scores from a bed file.

    Every region of the BED file is replaced by a window centred on the
    region's midpoint and extended by ``width / 2`` on each side (regions
    whose window would start below 0 become ``(0, width)``).  The adjusted
    regions are written to a temporary BED file, ``hgWiggle`` is run once
    per chromosome against it, and the scores extracted for the regions are
    folded into a running sum with ``add_scores``.

    Args:
        bedfile: path of the BED file to read.
        phas_chrnames: chromosome names to query; only names that also
            occur in the BED file are processed, in the order given here.
        width: window width; also the length of the returned list.

    Returns:
        A list of ``width`` floats: each accumulated sum divided by the
        total number of score entries extracted.

    Raises:
        ZeroDivisionError: if no score entry was extracted at all (for
            example when none of ``phas_chrnames`` occurs in the BED file).
    """
    info("read bed file...")
    bfhd = open(bedfile)
    tmpfiles = []  # temp paths to delete on exit, whether or not we succeed
    try:
        bed = BedIO.parse_BED(bfhd)

        # keep the caller's chromosome order, restricted to the BED content
        bchrs = set(bed.peaks.keys())
        chrs = [c for c in phas_chrnames if c in bchrs]

        sumscores = [0] * width
        n = 0

        # mkstemp() returns an *open* descriptor together with the path; only
        # the path is needed, so close the descriptor instead of leaking it.
        fd, tmpfname = tempfile.mkstemp(prefix="consplotscore")
        tmpfiles.append(tmpfname)
        os.close(fd)
        fd, tmpbedfname = tempfile.mkstemp(prefix="consplotbed")
        tmpfiles.append(tmpbedfname)
        os.close(fd)

        # fix regions in bed file: take the middle point of each region, then
        # extend left and right by 1/2 width
        for chrom in chrs:
            pchrom = bed.peaks[chrom]
            for i, peak in enumerate(pchrom):
                mid = int((peak[0] + peak[1]) / 2)
                left = int(mid - width / 2)
                right = int(mid + width / 2)
                if left < 0:
                    pchrom[i] = (0, width, 1, 1, 1, 1, 1, 1)
                else:
                    pchrom[i] = (left, right, 1, 1, 1, 1, 1, 1)

        bedfhd = open(tmpbedfname, 'w')
        try:
            bedfhd.write(bed.tobed())
        finally:
            bedfhd.close()

        for chrom in chrs:
            info("extract chromosome %s" % (chrom))
            # hgWiggle writes its output for this chromosome into the
            # (truncated) score temp file
            tmpf = open(tmpfname, 'w')
            try:
                p = subprocess.Popen(
                    [hgWiggle, '-bedFile=%s' % tmpbedfname, chrom],
                    stdout=tmpf)
                p.communicate()
            finally:
                tmpf.close()

            # the handle stays open until the scores are extracted, in case
            # the wiggle objects still read from it
            wigfhd = open(tmpfname)
            try:
                wio = WiggleIO.WiggleIO(wigfhd)
                wtrack = wio.build_wigtrack()
                wtrack.sort()
                scores = bed.extract_wiggle_pv(wtrack)
            finally:
                wigfhd.close()
            add_scores(sumscores, scores)
            n += len(scores)

        # calculate average score
        return [float(x) / n for x in sumscores]
    finally:
        bfhd.close()
        for path in tmpfiles:
            os.unlink(path)
def extract_phastcons(bedfile, phas_chrnames, width):
    """Extract phastcons scores from a bed file.

    Every region of the BED file is replaced by a window centred on the
    region's midpoint and extended by ``width / 2`` on each side (regions
    whose window would start below 0 become ``(0, width)``).  The adjusted
    regions are written to a temporary BED file, ``hgWiggle`` is run once
    per chromosome against it, and the scores extracted for the regions are
    folded into a running sum with ``add_scores``.

    Args:
        bedfile: path of the BED file to read.
        phas_chrnames: chromosome names to query; only names that also
            occur in the BED file are processed, in the order given here.
        width: window width; also the length of the returned list.

    Returns:
        A list of ``width`` floats: each accumulated sum divided by the
        total number of score entries extracted.

    Raises:
        ZeroDivisionError: if no score entry was extracted at all (for
            example when none of ``phas_chrnames`` occurs in the BED file).
    """
    info("read bed file...")
    bfhd = open(bedfile)
    tmpfiles = []  # temp paths to delete on exit, whether or not we succeed
    try:
        bed = BedIO.parse_BED(bfhd)

        # keep the caller's chromosome order, restricted to the BED content
        bchrs = set(bed.peaks.keys())
        chrs = [c for c in phas_chrnames if c in bchrs]

        sumscores = [0] * width
        n = 0

        # mkstemp() returns an *open* descriptor together with the path; only
        # the path is needed, so close the descriptor instead of leaking it.
        fd, tmpfname = tempfile.mkstemp(prefix="consplotscore")
        tmpfiles.append(tmpfname)
        os.close(fd)
        fd, tmpbedfname = tempfile.mkstemp(prefix="consplotbed")
        tmpfiles.append(tmpbedfname)
        os.close(fd)

        # fix regions in bed file: take the middle point of each region, then
        # extend left and right by 1/2 width
        for chrom in chrs:
            pchrom = bed.peaks[chrom]
            for i, peak in enumerate(pchrom):
                mid = int((peak[0] + peak[1]) / 2)
                left = int(mid - width / 2)
                right = int(mid + width / 2)
                if left < 0:
                    pchrom[i] = (0, width, 1, 1, 1, 1, 1, 1)
                else:
                    pchrom[i] = (left, right, 1, 1, 1, 1, 1, 1)

        bedfhd = open(tmpbedfname, 'w')
        try:
            bedfhd.write(bed.tobed())
        finally:
            bedfhd.close()

        for chrom in chrs:
            info("extract chromosome %s" % (chrom))
            # hgWiggle writes its output for this chromosome into the
            # (truncated) score temp file
            tmpf = open(tmpfname, 'w')
            try:
                p = subprocess.Popen(
                    [hgWiggle, '-bedFile=%s' % tmpbedfname, chrom],
                    stdout=tmpf)
                p.communicate()
            finally:
                tmpf.close()

            # the handle stays open until the scores are extracted, in case
            # the wiggle objects still read from it
            wigfhd = open(tmpfname)
            try:
                wio = WiggleIO.WiggleIO(wigfhd)
                wtrack = wio.build_wigtrack()
                wtrack.sort()
                scores = bed.extract_wiggle_pv(wtrack)
            finally:
                wigfhd.close()
            add_scores(sumscores, scores)
            n += len(scores)

        # calculate average score
        return [float(x) / n for x in sumscores]
    finally:
        bfhd.close()
        for path in tmpfiles:
            os.unlink(path)
def _build_option_parser():
    """Build the command-line parser used by main()."""
    usage = "usage: %prog [options] <-r rfile> <-b bed file> <-w wiggle file>(>=2)"
    description = """Draw correlation plot for many wiggle files at regions by a bed file.

Method: It will calculate a value for each region defined in a bed
file based on each wiggle files. The method can be chosen from -m
option.
    """
    optparser = OptionParser(version="%prog 0.1", description=description,
                             usage=usage, add_help_option=False)
    optparser.add_option("-h", "--help", action="help",
                         help="Show this help message and exit.")
    optparser.add_option("-d", "--db", type="str", dest="dbname",
                         help="UCSC db name for the assembly. Default: ce4",
                         default="ce4")
    optparser.add_option("-z", "--imgsize", dest="imgsize", type="int",
                         help="image size. default: 10, minimal: 10",
                         default=10)
    optparser.add_option("-f", "--format", dest="imgformat", type="string",
                         help="image format. PDF or PNG", default='PDF')
    optparser.add_option("-m", "--method", dest="method", type="string",
                         default="median",
                         help="method to process the paired two sets of data in the sampling step. Choices are 'median', 'mean', and 'sample' (just take one point out of a data set). Default: median")
    optparser.add_option("-r", "--rfile", dest="rfile",
                         help="R output file. If not set, do not save R file.")
    optparser.add_option("-b", "--bed", dest="bed", type="string",
                         help="the bed file you want to include in the calculation.")
    optparser.add_option("-w", "--wig", dest="wig", type="string",
                         action="append",
                         help="the wiggle file you want to include in the calculation. This option should be used for at least twice.")
    optparser.add_option("-l", "--wig-label", dest="wiglabel", type="string",
                         action="append",
                         help="the wiggle file labels in the figure. No space is allowed. This option should be used same times as -w option, and please input them in the same order as -w option. default: will use the wiggle file filename as labels.")
    optparser.add_option("--min-score", dest="minscore", type="float",
                         default=0,
                         help="minimum score included in calculation. Points w/ score lower than this will be discarded.")
    optparser.add_option("--max-score", dest="maxscore", type="float",
                         default=10000,
                         help="maximum score included in calculation. Points w/ score larger than this will be discarded.")
    optparser.add_option("-H", "--heatmap", dest="heatmap",
                         action="store_true", default=False,
                         help="If True, a heatmap image will be generated instead of paired scatterplot image.")
    return optparser


def _common_chromosomes(wigfhds):
    """Return the sorted chromosome names declared in every open wiggle file.

    A chromosome is counted once per file, so a file that declares the same
    chromosome on several ``chrom=`` lines does not distort the count.
    """
    chrom_pattern = re.compile(r"chrom=(\w+)")
    filecount = {}
    for wigfhd in wigfhds:
        seen = set()
        for line in wigfhd:
            found = chrom_pattern.search(line)
            if found:
                seen.add(found.group(1))
        for c in seen:
            filecount[c] = filecount.get(c, 0) + 1
    return sorted([c for c in filecount if filecount[c] == len(wigfhds)])


def _r_vector_body(values):
    """Return ``values`` formatted as the comma-separated body of an R c() call."""
    fields = []
    for v in values:
        # NOTE(review): every falsy value is written as NA, which includes a
        # genuine score of 0 as well as None -- confirm this is intended.
        if v:
            fields.append("%f" % v)
        else:
            fields.append("NA")
    return ",".join(fields)


def _write_r_plot_commands(rfhd, options, imgfmt, wiglabel, wigfilenum):
    """Write the R code that binds p0..pN, filters by score and draws the figure."""
    columns = ["p%d" % i for i in range(wigfilenum)]
    rfhd.write("c <- cbind(%s)\n" % ",".join(columns))

    # keep only rows whose value lies within [minscore, maxscore] in every column
    conditions = ["c[,%d]<=%f & c[,%d]>=%f "
                  % (col, options.maxscore, col, options.minscore)
                  for col in range(1, wigfilenum + 1)]
    rfhd.write("c <- c[ " + "& ".join(conditions) + ",]\n")

    if imgfmt == 'PDF':
        rfhd.write('pdf("%s.pdf",width=%d,height=%d)\n'
                   % (options.rfile, options.imgsize, options.imgsize))
    elif imgfmt == 'PNG':
        rfhd.write('png("%s.png",units="in",res=150,width=%d,height=%d)\n'
                   % (options.rfile, options.imgsize, options.imgsize))

    labels = ",".join(['"' + x + '"' for x in wiglabel])
    if options.heatmap:
        rfhd.write('library(gplots)\n')
        rfhd.write('''
m <- cor(c, method="pearson", use="pairwise.complete.obs")
''')
        rfhd.write("rownames(m) <- c(%s)\n" % labels)
        rfhd.write("colnames(m) <- c(%s)\n" % labels)
        rfhd.write('# draw the heatmap using gplots heatmap.2\n')
        rfhd.write('mn <- -1\n')
        rfhd.write('mx <- 1\n')
        rfhd.write('n <- 98\n')
        rfhd.write('bias <- 1\n')
        rfhd.write('mc <- matrix(as.character(round(m, 2)), ncol=dim(m)[2])\n')
        rfhd.write('breaks <- seq(mn, mx, (mx-mn)/(n))\n')
        rfhd.write('cr <- colorRampPalette(colors = c("#2927FF","#FFFFFF","#DF5C5C"), bias=bias)\n')
        rfhd.write('heatmap.2(m, col = cr(n), breaks=breaks, trace="none", cellnote=mc, notecol="black", notecex=1.8, keysize=0.5, density.info="histogram", margins=c(27.0,27.0), cexRow=2.20, cexCol=2.20, revC=T, symm=T)\n')
    else:
        rfhd.write('''
panel.plot <- function( x,y, ... )
{
  par(new=TRUE)
  m <- cbind(x,y)
  plot(m,col=densCols(m),pch=20)
  lines(lowess(m[!is.na(m[,1])&!is.na(m[,2]),]),col="red")
}

panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y,use="complete.obs")
  txt <- format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * abs(r))
}
''')
        rfhd.write('''
pairs(c, lower.panel=panel.plot, upper.panel=panel.cor, labels=c(%s))
''' % (labels))

    rfhd.write("dev.off()\n")


def main():
    """Command-line entry point: write and run an R correlation-plot script.

    Each region of the BED file (-b) is reduced to one value per wiggle file
    (-w, at least two) with the function selected by -m.  The values are
    written to the R script (-r) as vectors p0, p1, ..., followed by R code
    that filters rows by --min-score/--max-score and draws either a pairs()
    scatter-plot matrix or, with -H, a gplots heatmap.2 of the Pearson
    correlations.  The script is finally run with ``Rscript``; the figure is
    written by R to ``<rfile>.pdf`` or ``<rfile>.png``.

    Exits with status 1 on an unrecognized format or method, on missing
    required options, when an input file does not exist, or when the
    chromosome-length lookup fails.
    """
    optparser = _build_option_parser()
    (options, args) = optparser.parse_args()

    imgfmt = options.imgformat.upper()
    if imgfmt not in ('PDF', 'PNG'):
        print("unrecognized format: %s" % imgfmt)
        sys.exit(1)

    method = options.method.lower()
    if method == 'median':
        medfunc = median
    elif method == 'mean':
        medfunc = mean
    elif method == 'sample':
        medfunc = lambda u: u[-1]
    else:
        print("unrecognized method: %s" % (method))
        sys.exit(1)

    # must provide >=2 wiggle files
    if (not options.wig or len(options.wig) < 2
            or not options.rfile or not options.bed):
        optparser.print_help()
        sys.exit(1)

    # wig labels
    if options.wiglabel and len(options.wiglabel) == len(options.wig):
        wiglabel = options.wiglabel
    else:
        # or use the filename
        wiglabel = [os.path.basename(x) for x in options.wig]

    wigfilenum = len(options.wig)

    # check the files
    if not os.path.isfile(options.bed):
        error("%s is not valid!" % options.bed)
        sys.exit(1)
    for f in options.wig:
        if not os.path.isfile(f):
            error("%s is not valid!" % f)
            sys.exit(1)

    wigfhds = [open(f) for f in options.wig]  # file handlers for wiggle files
    bedfhd = None
    rfhd = None
    try:
        info("number of wiggle files: %d" % wigfilenum)

        # get chromosome length info from UCSC
        info("connect to UCSC to get chromosome length information")
        try:
            chrom_len = get_chrom_length(options.dbname)
        except Exception:
            error("Error!")
            sys.exit(1)

        # get the common chromosome list (reported only)
        chroms = _common_chromosomes(wigfhds)
        info("common chromosomes are %s..." % ",".join(chroms))

        # open the R script file handler
        rfhd = open(options.rfile, "w")
        rfhd.write('''
require("RColorBrewer") ## from CRAN
''')

        info("read bed file %s" % os.path.basename(options.bed))
        bedfhd = open(options.bed, 'r')
        bedregion = BedIO.parse_BED(bedfhd)

        # for each wig file, sample...
        for i, wigfhd in enumerate(wigfhds):
            wigfhd.seek(0)  # reset: the file was consumed by the chromosome scan
            info("read wiggle track from wiggle file #%d" % (i + 1))
            bk = WiggleIO.WiggleIO(wigfhd).build_binKeeper(chromLenDict=chrom_len)
            p = bedregion.extract_binkeepers(bk, func=medfunc)
            info("write values to r file")
            rfhd.write("p%d <- c(%s)\n" % (i, _r_vector_body(p)))

        _write_r_plot_commands(rfhd, options, imgfmt, wiglabel, wigfilenum)
    finally:
        # release every handle, also when a step above failed
        for fhd in wigfhds + [bedfhd, rfhd]:
            if fhd is not None:
                fhd.close()

    # try to call R
    try:
        retcode = subprocess.call(['Rscript', options.rfile])
    except OSError:
        # Rscript could not be started; leave the script for manual use
        info("Please check %s" % options.rfile)
    else:
        if retcode != 0:
            info("Please check %s" % options.rfile)
        else:
            # the figure is named after the device opened in the R script
            info("Please check %s" % (options.rfile + '.' + imgfmt.lower()))
def _build_option_parser():
    """Build the command-line parser used by main()."""
    usage = "usage: %prog [options] <-r rfile> <-b bed file> <-w wiggle file>(>=2)"
    description = """Draw correlation plot for many wiggle files at regions by a bed file.

Method: It will calculate a value for each region defined in a bed
file based on each wiggle files. The method can be chosen from -m
option.
    """
    optparser = OptionParser(version="%prog 0.1", description=description,
                             usage=usage, add_help_option=False)
    optparser.add_option("-h", "--help", action="help",
                         help="Show this help message and exit.")
    optparser.add_option("-d", "--db", type="str", dest="dbname",
                         help="UCSC db name for the assembly. Default: ce4",
                         default="ce4")
    optparser.add_option("-z", "--imgsize", dest="imgsize", type="int",
                         help="image size. default: 10, minimal: 10",
                         default=10)
    optparser.add_option("-f", "--format", dest="imgformat", type="string",
                         help="image format. PDF or PNG", default='PDF')
    optparser.add_option("-m", "--method", dest="method", type="string",
                         default="median",
                         help="method to process the paired two sets of data in the sampling step. Choices are 'median', 'mean', and 'sample' (just take one point out of a data set). Default: median")
    optparser.add_option("-r", "--rfile", dest="rfile",
                         help="R output file. If not set, do not save R file.")
    optparser.add_option("-b", "--bed", dest="bed", type="string",
                         help="the bed file you want to include in the calculation.")
    optparser.add_option("-w", "--wig", dest="wig", type="string",
                         action="append",
                         help="the wiggle file you want to include in the calculation. This option should be used for at least twice.")
    optparser.add_option("-l", "--wig-label", dest="wiglabel", type="string",
                         action="append",
                         help="the wiggle file labels in the figure. No space is allowed. This option should be used same times as -w option, and please input them in the same order as -w option. default: will use the wiggle file filename as labels.")
    optparser.add_option("--min-score", dest="minscore", type="float",
                         default=0,
                         help="minimum score included in calculation. Points w/ score lower than this will be discarded.")
    optparser.add_option("--max-score", dest="maxscore", type="float",
                         default=10000,
                         help="maximum score included in calculation. Points w/ score larger than this will be discarded.")
    optparser.add_option("-H", "--heatmap", dest="heatmap",
                         action="store_true", default=False,
                         help="If True, a heatmap image will be generated instead of paired scatterplot image.")
    return optparser


def _common_chromosomes(wigfhds):
    """Return the sorted chromosome names declared in every open wiggle file.

    A chromosome is counted once per file, so a file that declares the same
    chromosome on several ``chrom=`` lines does not distort the count.
    """
    chrom_pattern = re.compile(r"chrom=(\w+)")
    filecount = {}
    for wigfhd in wigfhds:
        seen = set()
        for line in wigfhd:
            found = chrom_pattern.search(line)
            if found:
                seen.add(found.group(1))
        for c in seen:
            filecount[c] = filecount.get(c, 0) + 1
    return sorted([c for c in filecount if filecount[c] == len(wigfhds)])


def _r_vector_body(values):
    """Return ``values`` formatted as the comma-separated body of an R c() call."""
    fields = []
    for v in values:
        # NOTE(review): every falsy value is written as NA, which includes a
        # genuine score of 0 as well as None -- confirm this is intended.
        if v:
            fields.append("%f" % v)
        else:
            fields.append("NA")
    return ",".join(fields)


def _write_r_plot_commands(rfhd, options, imgfmt, wiglabel, wigfilenum):
    """Write the R code that binds p0..pN, filters by score and draws the figure."""
    columns = ["p%d" % i for i in range(wigfilenum)]
    rfhd.write("c <- cbind(%s)\n" % ",".join(columns))

    # keep only rows whose value lies within [minscore, maxscore] in every column
    conditions = ["c[,%d]<=%f & c[,%d]>=%f "
                  % (col, options.maxscore, col, options.minscore)
                  for col in range(1, wigfilenum + 1)]
    rfhd.write("c <- c[ " + "& ".join(conditions) + ",]\n")

    if imgfmt == 'PDF':
        rfhd.write('pdf("%s.pdf",width=%d,height=%d)\n'
                   % (options.rfile, options.imgsize, options.imgsize))
    elif imgfmt == 'PNG':
        rfhd.write('png("%s.png",units="in",res=150,width=%d,height=%d)\n'
                   % (options.rfile, options.imgsize, options.imgsize))

    labels = ",".join(['"' + x + '"' for x in wiglabel])
    if options.heatmap:
        rfhd.write('library(gplots)\n')
        rfhd.write('''
m <- cor(c, method="pearson", use="pairwise.complete.obs")
''')
        rfhd.write("rownames(m) <- c(%s)\n" % labels)
        rfhd.write("colnames(m) <- c(%s)\n" % labels)
        rfhd.write('# draw the heatmap using gplots heatmap.2\n')
        rfhd.write('mn <- -1\n')
        rfhd.write('mx <- 1\n')
        rfhd.write('n <- 98\n')
        rfhd.write('bias <- 1\n')
        rfhd.write('mc <- matrix(as.character(round(m, 2)), ncol=dim(m)[2])\n')
        rfhd.write('breaks <- seq(mn, mx, (mx-mn)/(n))\n')
        rfhd.write('cr <- colorRampPalette(colors = c("#2927FF","#FFFFFF","#DF5C5C"), bias=bias)\n')
        rfhd.write('heatmap.2(m, col = cr(n), breaks=breaks, trace="none", cellnote=mc, notecol="black", notecex=1.8, keysize=0.5, density.info="histogram", margins=c(27.0,27.0), cexRow=2.20, cexCol=2.20, revC=T, symm=T)\n')
    else:
        rfhd.write('''
panel.plot <- function( x,y, ... )
{
  par(new=TRUE)
  m <- cbind(x,y)
  plot(m,col=densCols(m),pch=20)
  lines(lowess(m[!is.na(m[,1])&!is.na(m[,2]),]),col="red")
}

panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y,use="complete.obs")
  txt <- format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * abs(r))
}
''')
        rfhd.write('''
pairs(c, lower.panel=panel.plot, upper.panel=panel.cor, labels=c(%s))
''' % (labels))

    rfhd.write("dev.off()\n")


def main():
    """Command-line entry point: write and run an R correlation-plot script.

    Each region of the BED file (-b) is reduced to one value per wiggle file
    (-w, at least two) with the function selected by -m.  The values are
    written to the R script (-r) as vectors p0, p1, ..., followed by R code
    that filters rows by --min-score/--max-score and draws either a pairs()
    scatter-plot matrix or, with -H, a gplots heatmap.2 of the Pearson
    correlations.  The script is finally run with ``Rscript``; the figure is
    written by R to ``<rfile>.pdf`` or ``<rfile>.png``.

    Exits with status 1 on an unrecognized format or method, on missing
    required options, when an input file does not exist, or when the
    chromosome-length lookup fails.
    """
    optparser = _build_option_parser()
    (options, args) = optparser.parse_args()

    imgfmt = options.imgformat.upper()
    if imgfmt not in ('PDF', 'PNG'):
        print("unrecognized format: %s" % imgfmt)
        sys.exit(1)

    method = options.method.lower()
    if method == 'median':
        medfunc = median
    elif method == 'mean':
        medfunc = mean
    elif method == 'sample':
        medfunc = lambda u: u[-1]
    else:
        print("unrecognized method: %s" % (method))
        sys.exit(1)

    # must provide >=2 wiggle files
    if (not options.wig or len(options.wig) < 2
            or not options.rfile or not options.bed):
        optparser.print_help()
        sys.exit(1)

    # wig labels
    if options.wiglabel and len(options.wiglabel) == len(options.wig):
        wiglabel = options.wiglabel
    else:
        # or use the filename
        wiglabel = [os.path.basename(x) for x in options.wig]

    wigfilenum = len(options.wig)

    # check the files
    if not os.path.isfile(options.bed):
        error("%s is not valid!" % options.bed)
        sys.exit(1)
    for f in options.wig:
        if not os.path.isfile(f):
            error("%s is not valid!" % f)
            sys.exit(1)

    wigfhds = [open(f) for f in options.wig]  # file handlers for wiggle files
    bedfhd = None
    rfhd = None
    try:
        info("number of wiggle files: %d" % wigfilenum)

        # get chromosome length info from UCSC
        info("connect to UCSC to get chromosome length information")
        try:
            chrom_len = get_chrom_length(options.dbname)
        except Exception:
            error("Error!")
            sys.exit(1)

        # get the common chromosome list (reported only)
        chroms = _common_chromosomes(wigfhds)
        info("common chromosomes are %s..." % ",".join(chroms))

        # open the R script file handler
        rfhd = open(options.rfile, "w")
        rfhd.write('''
require("RColorBrewer") ## from CRAN
''')

        info("read bed file %s" % os.path.basename(options.bed))
        bedfhd = open(options.bed, 'r')
        bedregion = BedIO.parse_BED(bedfhd)

        # for each wig file, sample...
        for i, wigfhd in enumerate(wigfhds):
            wigfhd.seek(0)  # reset: the file was consumed by the chromosome scan
            info("read wiggle track from wiggle file #%d" % (i + 1))
            bk = WiggleIO.WiggleIO(wigfhd).build_binKeeper(chromLenDict=chrom_len)
            p = bedregion.extract_binkeepers(bk, func=medfunc)
            info("write values to r file")
            rfhd.write("p%d <- c(%s)\n" % (i, _r_vector_body(p)))

        _write_r_plot_commands(rfhd, options, imgfmt, wiglabel, wigfilenum)
    finally:
        # release every handle, also when a step above failed
        for fhd in wigfhds + [bedfhd, rfhd]:
            if fhd is not None:
                fhd.close()

    # try to call R
    try:
        retcode = subprocess.call(['Rscript', options.rfile])
    except OSError:
        # Rscript could not be started; leave the script for manual use
        info("Please check %s" % options.rfile)
    else:
        if retcode != 0:
            info("Please check %s" % options.rfile)
        else:
            # the figure is named after the device opened in the R script
            info("Please check %s" % (options.rfile + '.' + imgfmt.lower()))