def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
    """
    Draw QQ plots for the p values (and a manhattan plot) from a tab delimited file.

    We may have an interval file or a tabular file - if interval, chromosomes
    look like chr1... so the leading 'chr' is removed to leave chrom numbers.
    The columns we need are copied to a temporary file in the order
    chrom, offset, pvals... and the module level R templates (rcode, rcode2)
    are filled in to read that file - the resulting R code is run with RRun
    and appended to the log for posterity.

    input_fname - path of the tab delimited input
    chrom_col, offset_col - zero based column numbers of chromosome and offset
    pval_cols - list of zero based column numbers holding p values
    title, grey - substituted into the rcode2 template
    ctitle, outdir - passed to RRun as title and outdir
    beTidy - remove the temporary filtered file before returning (or failing)

    returns (rlog,flist) from RRun with the R script text appended to rlog
    """
    def _is_number(text):
        # True if text parses as a number
        try:
            float(text)
        except ValueError:
            return False
        return True

    if debug:
        shown = ('doManQQ',input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir)
        print(' '.join([str(x) for x in shown]))
    wewant = [chrom_col,offset_col] + list(pval_cols)
    ffd,filtered_fname = tempfile.mkstemp(prefix='rgManQQtemp')
    try:
        # fdopen takes ownership of the mkstemp descriptor so it gets closed with the file
        with os.fdopen(ffd,'w') as f:
            with open(input_fname,'r') as inf:
                ohead = inf.readline().strip().split('\t') # see if we have a header
                inf.seek(0) # rewind
                # a data row has numeric offset and p values - the chromosome may be chr1, X...
                # the old test ('%d' % x on strings) always raised so the first row was
                # always taken as a header and headerless files lost their first data row
                numericcols = [offset_col] + list(pval_cols)
                havehead = not all([_is_number(ohead[x]) for x in numericcols])
                if havehead:
                    newhead = [ohead[x] for x in wewant] # use the original head
                else:
                    newhead = ['Chrom','Offset'] + ['pval%d' % (x+1) for x in pval_cols]
                f.write('\t'.join(newhead)) # for R to read
                f.write('\n')
                for i,row in enumerate(inf):
                    if i == 0 and havehead:
                        continue # ignore header
                    sr = row.strip().split('\t')
                    if len(sr) <= 1:
                        continue # blank or single column line
                    if sr[chrom_col].lower().startswith('chr'):
                        # only strip a real prefix - 'chr' elsewhere in the name is left alone
                        sr[chrom_col] = sr[chrom_col][3:]
                    f.write('\t'.join([sr[x] for x in wewant])) # grab cols we need
                    f.write('\n')
        # R columns are 1 based: chrom=1, offset=2 so the p values start at 3
        pvc = 'c(%s)' % ','.join([str(x+3) for x in range(len(pval_cols))])
        rcmd = '%s%s' % (rcode,rcode2 % (filtered_fname,'1','2',pvc,title,grey))
        if debug:
            print('running\n%s\n' % rcmd)
        rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
        rlog.append('## R script=')
        rlog.append(rcmd)
    finally:
        if beTidy:
            os.unlink(filtered_fname)
    return rlog,flist
def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
    """
    Fill in the module level R templates and run them to draw the qq and manhattan plots.

    input_fname, chrom_col, offset_col, pval_cols, title and grey are substituted
    as given into the rcode2 template, which is appended to rcode to make the
    complete R script. ctitle and outdir go to RRun as title and outdir.
    beTidy is accepted but not used here.

    returns (rlog,flist) from RRun - the R script is appended to rlog for posterity
    this can be called externally, I guess...for QC eg?
    """
    if debug:
        shown = ('doManQQ',input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir)
        print(' '.join([str(item) for item in shown]))
    plotcall = rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey)
    rcmd = '%s%s' % (rcode,plotcall)
    if debug:
        print('running\n%s\n' % rcmd)
    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
    # keep the script with the log so the run can be reproduced
    for extra in ('## R script=',rcmd):
        rlog.append(extra)
    return rlog,flist
def makeQQ(dat=[], sample=1.0, maxveclen=4000, fname='fname',title='title',
           xvar='Sample',h=8,w=8,logscale=True):
    """
    Write and run an R script drawing a qq plot of dat against uniform 0-1 quantiles.

    y is data for a qq plot and ends up on the x axis go figure
    if sampling, oversample low values - all the top 1% ?
    assume we have 0-1 p values

    dat - the values to plot - a sorted copy is used so the caller's list is untouched
    sample, maxveclen - points are thinned only if sample < 1.0 and there are
        more than maxveclen values; sample is not otherwise used as a fraction
    fname - name of the pdf that the R script writes
    title - used in the plot title, subtitle and y axis label
    xvar - not used
    h, w - pdf height and width
    logscale - plot -log10 of both axes; values <= 0 are dropped from dat first

    raises ValueError if dat is empty. Returns nothing - the script goes to RRun.
    """
    nrows = len(dat)
    if nrows == 0:
        # used to die with a 'math domain error' from log10(0) - say what is wrong
        raise ValueError('makeQQ needs at least one value to plot')
    R = []
    colour="maroon"
    dat = sorted(dat) # small to large - sorted copy, do not reorder the caller's data
    fn = float(nrows)
    unifx = [x/fn for x in range(1,(nrows+1))]
    if logscale:
        unifx = [-math.log10(x) for x in unifx] # uniform distribution
    if sample < 1.0 and nrows > maxveclen:
        # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
        # oversample part of the distribution
        always = min(1000,nrows//20) # oversample smaller of lowest few hundred items or 5%
        skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
        if skip <= 1:
            skip = 2
        # always oversample first sorted (here lowest) values
        samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
        yvec = [dat[i] for i in samplei]
        xvec = [unifx[i] for i in samplei] # and sample xvec same way
        maint='QQ %s (random %d of %d)' % (title,len(yvec),nrows)
    else:
        yvec = list(dat)
        maint='QQ %s (n=%d)' % (title,nrows)
        xvec = unifx
    if logscale:
        maint = 'Log%s' % maint
        mx = [0,math.log10(nrows)] # if 1000, becomes 3 for the null line
        ylab = '-log10(%s) Quantiles' % title
        xlab = '-log10(Uniform 0-1) Quantiles'
        yvec = [-math.log10(x) for x in yvec if x > 0.0]
    else:
        mx = [0,1]
        ylab = '%s Quantiles' % title
        xlab = 'Uniform 0-1 Quantiles'
    R.append('xvec = c(%s)' % ','.join(['%f' % x for x in xvec]))
    R.append('yvec = c(%s)' % ','.join(['%f' % x for x in yvec]))
    # the null line must follow the scale in use - this was always log10(n)
    # so the line overshot the 0-1 axes when logscale was off
    R.append('mx = c(0,%f)' % mx[1])
    R.append('pdf("%s",h=%d,w=%d)' % (fname,h,w))
    R.append("par(lab=c(10,10,10))")
    R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
    R.append('points(mx,mx,type="l")')
    R.append('grid(col="lightgray",lty="dotted")')
    R.append('dev.off()')
    RRun(rcmd=R,title='makeQQplot',outdir=None)
def doManQQ(input_fname, chrom_col, offset_col, pval_cols, title, grey, ctitle, outdir, beTidy=False):
    """
    Run the qq / manhattan R code on input_fname and return the log and outputs.

    The rcode2 template is filled with input_fname, chrom_col, offset_col,
    pval_cols, title and grey exactly as passed in, then appended to rcode
    to give the R script which RRun runs with title=ctitle and outdir=outdir.
    beTidy is part of the signature but is not used in this function.

    returns (rlog, flist) as given by RRun, with a '## R script=' marker and
    the script itself appended to rlog so the code is saved for posterity
    """
    template_values = (input_fname, chrom_col, offset_col, pval_cols, title, grey)
    if debug:
        echoed = ('doManQQ',) + template_values + (ctitle, outdir)
        print(' '.join([str(value) for value in echoed]))
    rcmd = '%s%s' % (rcode, rcode2 % template_values)
    if debug:
        print('running\n%s\n' % rcmd)
    result = RRun(rcmd=rcmd, title=ctitle, outdir=outdir)
    rlog, flist = result
    rlog.append('## R script=')
    rlog.append(rcmd)
    return rlog, flist
def checkR(myTitle='Rgenetics installation test',outDir='./'):
    """
    Report R packages that will not load, and whether X11() works in R.

    Each required package is loaded in its own RRun call. A run whose first
    log line starts with 'Nonzero exit code =' counts as a failure - the
    package is noted and that run's log is kept. X11() is then tried the
    same way.

    returns a list of log lines and messages - empty if nothing failed
    """
    failtag = 'Nonzero exit code ='
    expressionlibs = ['ArrayExpress','lumi','limma','AffyExpress',"affyQCReport",
                      "GenABEL","simpleaffy","snpMatrix",
                      "affyPLM",'haplo.stats','GEOquery',
                      'arrayQualityMetrics']
    geneticslibs = ['hexbin',]
    report = []
    unloadable = []
    for libname in expressionlibs + geneticslibs: # ensure all R packages are available
        rlog,flist = RRun(['library(%s)' % libname,],outdir=outDir,title=myTitle)
        if rlog[0].startswith(failtag): # fail
            unloadable.append(libname)
            report.extend(rlog)
    # one message per missing package, after the collected logs
    report.extend(['The R package %s cannot be loaded - please install' % x for x in unloadable])
    rlog,flist = RRun(['X11()'],outdir=outDir,title=myTitle)
    if rlog[0].startswith(failtag): # fail
        report.append('X11() failed in R - all graphical (png/jpg/pdf) outputs will fail')
        report.append('Was X11 configured at R compilation; is the virtual frame buffer Xvfb available for headless nodes?')
    return report
def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
    """
    Draw QQ plots for the p values (and a manhattan plot) from a tab delimited file.

    We may have an interval file or a tabular file - if interval, chromosomes
    look like chr1... so the leading 'chr' is removed to leave chrom numbers.
    The columns we need are copied to a temporary file in the order
    chrom, offset, pvals... and the module level R templates (rcode, rcode2)
    are filled in to read that file - the resulting R code is run with RRun
    and appended to the log for posterity.

    input_fname - path of the tab delimited input
    chrom_col, offset_col - zero based column numbers of chromosome and offset
    pval_cols - list of zero based column numbers holding p values
    title, grey - substituted into the rcode2 template
    ctitle, outdir - passed to RRun as title and outdir
    beTidy - remove the temporary filtered file before returning (or failing)

    returns (rlog,flist) from RRun with the R script text appended to rlog
    """
    def _is_number(text):
        # True if text parses as a number
        try:
            float(text)
        except ValueError:
            return False
        return True

    wewant = [chrom_col,offset_col] + list(pval_cols)
    ffd,filtered_fname = tempfile.mkstemp(prefix='rgManQQtemp')
    try:
        # fdopen takes ownership of the mkstemp descriptor so it gets closed with the file
        with os.fdopen(ffd,'w') as f:
            with open(input_fname,'r') as inf:
                ohead = inf.readline().strip().split('\t') # see if we have a header
                inf.seek(0) # rewind
                # a data row has numeric offset and p values - the chromosome may be chr1, X...
                # the old test ('%d' % x on strings) always raised so the first row was
                # always taken as a header and headerless files lost their first data row
                numericcols = [offset_col] + list(pval_cols)
                havehead = not all([_is_number(ohead[x]) for x in numericcols])
                if havehead:
                    newhead = [ohead[x] for x in wewant] # use the original head
                else:
                    newhead = ['Chrom','Offset'] + ['pval%d' % (x+1) for x in pval_cols]
                f.write('\t'.join(newhead)) # for R to read
                f.write('\n')
                for i,row in enumerate(inf):
                    if i == 0 and havehead:
                        continue # ignore header
                    sr = row.strip().split('\t')
                    if len(sr) <= 1:
                        continue # blank or single column line
                    if sr[chrom_col].lower().startswith('chr'):
                        # only strip a real prefix - 'chr' elsewhere in the name is left alone
                        sr[chrom_col] = sr[chrom_col][3:]
                    f.write('\t'.join([sr[x] for x in wewant])) # grab cols we need
                    f.write('\n')
        # R columns are 1 based: chrom=1, offset=2 so the p values start at 3
        pvc = 'c(%s)' % ','.join([str(x+3) for x in range(len(pval_cols))])
        rcmd = '%s%s' % (rcode,rcode2 % (filtered_fname,'1','2',pvc,title,grey))
        rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
        rlog.append('## R script=')
        rlog.append(rcmd)
    finally:
        if beTidy:
            os.unlink(filtered_fname)
    return rlog,flist
def makePlot(eigpca='test.pca',title='test',pdfname='test.pdf',h=8,w=10,nfp=None,rexe=''):
    """
    Plot the first two eigenvectors from an eigenvec file as a pdf and a png.

    the eigenvec file has a # row with the eigenvectors, then subject ids, eigenvecs and lastly
    the subject class
    Rpy not being used here. Write a real R script and run it. Sadly, this means putting numbers
    somewhere - like in the code as monster R vector constructor c(99.3,2.14) strings
    At least you have the data and the analysis in one single place. Highly reproducible little
    piece of research.

    eigpca - path of the eigenvec file; the first two lines are skipped
    title - plot title, also passed to RRun
    pdfname - pdf written by R; the png is written to pdfname + '.png'
    h, w - height and width of both images
    nfp - passed to RRun as outdir
    rexe - not used

    Returns nothing - the R script and the RRun log are written to stdout.
    """
    debug=False
    R = []
    if debug:
        R.append('sessionInfo()')
        R.append("print('dir()=:')")
        R.append('dir()')
        R.append("print('pdfname=%s')" % pdfname)
    gvec = []
    pca1 = []
    pca2 = []
    groups = {}
    glist = [] # list for legend
    ngroup = 1 # increment for each new group encountered for pch vector
    with open(eigpca,'r') as f: # make sure the input gets closed
        for n,row in enumerate(f):
            if n <= 1:
                continue # first two lines are not plotted
            rowlist = row.strip().split()
            if not rowlist:
                continue # a blank line used to crash with an IndexError
            group = rowlist[-1]
            try:
                v1 = float(rowlist[1])
            except ValueError: # unparseable values are plotted at zero
                v1 = 0.0
            try:
                v2 = float(rowlist[2])
            except ValueError:
                v2 = 0.0
            if group not in groups:
                groups[group] = ngroup
                glist.append(group)
                ngroup += 1 # for next group
            gvec.append(groups[group]) # lookup group number
            pca1.append('%f' % v1)
            pca2.append('%f' % v2)
    # now have vectors of group,pca1 and pca2
    llist = [x.encode('ascii') for x in glist] # remove label unicode - eesh (python 2 strings)
    llist = ['"%s"' % x for x in llist] # need to quote for R
    R.append('llist=c(%s)' % ','.join(llist))
    plist = range(2,len(llist)+2) # pch - avoid black circles
    R.append('glist=c(%s)' % ','.join(['%d' % x for x in plist]))
    pgvec = ['%d' % (plist[i-1]) for i in gvec] # plot symbol/colour for each point
    R.append("par(lab=c(10,10,10))") # so our grid is denser than the default 5
    R.append("par(mai=c(1,1,1,0.5))")
    maint = title
    # the same drawing commands are used for the pdf and then the png device
    s = "plot(pca1,pca2,type='p',main='%s', ylab='Second ancestry eigenvector'," % maint
    s += "xlab='First ancestry eigenvector',col=pgvec,cex=0.8,pch=pgvec)"
    drawing = [s,
               'legend("top",legend=llist,pch=glist,col=glist,title="Sample")',
               'grid(nx = 10, ny = 10, col = "lightgray", lty = "dotted")',
               'dev.off()']
    R.append('pdf("%s",h=%d,w=%d)' % (pdfname,h,w))
    R.append("par(lab=c(10,10,10))")
    R.append('pca1 = c(%s)' % ','.join(pca1))
    R.append('pca2 = c(%s)' % ','.join(pca2))
    R.append('pgvec = c(%s)' % ','.join(pgvec))
    R.extend(drawing)
    R.append('png("%s.png",h=%d,w=%d,units="in",res=72)' % (pdfname,h,w))
    R.extend(drawing)
    rlog,flist = RRun(rcmd=R,title=title,outdir=nfp)
    sys.stdout.write('%s\n' % '\n'.join(R))
    sys.stdout.write('%s\n' % (rlog,))
opts.log_file = opts.log or os.path.join(opts.output_dir, opts.input_file + '.log') ext = opts.file_ext if ext == 'fastqsanger': ext = 'fastq' elif ext == 'bwa': ext = 'Bowtie' # Create output folder and save our R script in there. if not os.path.exists(opts.output_dir): os.makedirs(opts.output_dir) r_script_file = os.path.join(opts.output_dir, R_SCRIPT_NAME) r_script = R_SCRIPT % (opts.input_dir, opts.input_file, ext, opts.output_dir) #run_r(R_SCRIPT % ( opts.input_dir, opts.input_file, ext, opts.output_dir),r_script_file,opts.log_file,opts.output_dir) rlog, flist = RRun(rcmd=r_script, outdir=opts.output_dir, title=opts.namejob, tidy=True) # Get file contents. index_file_name = os.path.join(opts.output_dir, HTML_PAGE) try: index_file = open(index_file_name) index_contents = index_file.read() index_file.close() except: index_contents = '## error - is the Bioconductor shortreadqc package installed? ##\n' # Substitute contents in memory. A regexp looking for actual links # would be more correct, but a straight substitution seems ok for # our cases. index_contents = index_contents.replace('./image/', '') index_contents = index_contents.replace( '</body>',