示例#1
0
def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
    """
    Filter a tab-delimited file down to the plotting columns and run the R plots on it.

    A temporary tab-delimited file is written holding only the chromosome,
    offset and p value columns. A leading 'chr' (any case) is removed from the
    chromosome field, so interval style names such as chr1 become 1. The
    module level R text in rcode and rcode2 is then filled in so that it reads
    the temporary file (chromosome in column 1, offset in column 2, p values
    from column 3 on) and the finished script is executed through RRun.
    According to the original notes the R code draws a QQ plot for the p
    values and a Manhattan plot when chromosome and offset are supplied.

    input_fname - path of the tab-delimited input
    chrom_col, offset_col - zero based indexes of the chromosome and offset columns
    pval_cols - list of zero based indexes of the p value columns
    title, grey - substituted into the rcode2 template
    ctitle, outdir - passed to RRun as title and outdir
    beTidy - when true the temporary file is deleted after R has run

    Returns the (rlog, flist) pair from RRun, with the R script text appended
    to rlog. Rows that do not reach every wanted column raise IndexError.
    """
    def isNumber(cell):
        # true when the cell text can be read as a number
        try:
            float(cell)
        except ValueError:
            return False
        return True

    if debug:
        shown = ('doManQQ',input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir)
        print(' '.join(['%s' % (v,) for v in shown]))
    wewant = [chrom_col,offset_col] + list(pval_cols)
    ffd,filtered_fname = tempfile.mkstemp(prefix='rgManQQtemp')
    # write through the descriptor mkstemp already opened - opening the path
    # a second time would leave that descriptor open for the life of the process
    f = os.fdopen(ffd,'w')
    try:
        with open(input_fname,'r') as inf:
            ohead = inf.readline().strip().split('\t') # first row - header or data?
            inf.seek(0) # rewind
            # The cells are always strings, so a '%d' format test can never
            # succeed and every file would look as if it had a header.
            # Instead call the first row a header when its offset or any of
            # its p value cells is not a number. The chromosome cell is not
            # tested because data rows may legitimately hold chr1, X and so on.
            havehead = not all([isNumber(ohead[x]) for x in wewant[1:]])
            if havehead:
                newhead = [ohead[x] for x in wewant] # use the original head
            else:
                newhead = ['Chrom','Offset'] + ['pval%d' % (x+1) for x in pval_cols]
            f.write('\t'.join(newhead)) # for R to read
            f.write('\n')
            for i,row in enumerate(inf):
                if i == 0 and havehead:
                    continue # ignore header
                sr = row.strip().split('\t')
                if len(sr) > 1: # skips blank and single field lines
                    # only a leading chr is a prefix to remove
                    if sr[chrom_col].lower().startswith('chr'):
                        sr[chrom_col] = sr[chrom_col][3:]
                    f.write('\t'.join([sr[x] for x in wewant])) # grab cols we need
                    f.write('\n')
    finally:
        f.close()
    # R counts columns from 1: chrom is 1, offset is 2, p values start at 3
    pvc = 'c(%s)' % (','.join([str(x) for x in range(3,len(pval_cols)+3)]))
    rcmd = '%s%s' % (rcode,rcode2 % (filtered_fname,'1','2',pvc,title,grey))
    if debug:
        print('running\n%s\n' % rcmd)
    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
    rlog.append('## R script=')
    rlog.append(rcmd)
    if beTidy:
        os.unlink(filtered_fname)
    return rlog,flist
示例#2
0
def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
    """
    Fill in the module level R templates for the given file and run them.

    The rcode2 template receives, in order, input_fname, chrom_col,
    offset_col, pval_cols, title and grey, and the result is placed after
    rcode to make the complete script. The script is run through RRun with
    ctitle and outdir. When the module level debug flag is set the arguments
    and the script are printed first.

    Returns the (rlog, flist) pair from RRun, with a marker line and the
    script text appended to rlog. beTidy is accepted but not used here.
    """
    if debug:
        shown = ('doManQQ',input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir)
        print(' '.join(['%s' % (v,) for v in shown]))
    plotcall = rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey)
    script = '%s%s' % (rcode,plotcall)
    if debug:
        print('running\n%s\n' % script)
    rlog,flist = RRun(rcmd=script,title=ctitle,outdir=outdir)
    # keep the code that was run with the log for posterity
    rlog.extend(['## R script=',script])
    return rlog,flist
示例#3
0
def makeQQ(dat=None, sample=1.0, maxveclen=4000, fname='fname',title='title',
           xvar='Sample',h=8,w=8,logscale=True):
    """
    Write and run an R script that draws a QQ plot of dat against uniform quantiles.

    dat - sequence of values, assumed to be p values in the range 0-1; it is
          sorted into a new list so the caller's sequence is left untouched
    sample - any value below 1.0 switches thinning on when there are more
          than maxveclen values; the number itself is not otherwise used
    maxveclen - approximate number of points to keep when thinning
    fname - path of the pdf that R is told to write
    title - used in the plot title, subtitle and axis label
    xvar - accepted but not used
    h, w - pdf height and width passed to R
    logscale - plot -log10 of both axes when true

    When thinning, the lowest values (the smaller of 1000 or 5% of the rows)
    are always kept and after that one value in every skip is taken.

    Returns None. The output of RRun is discarded.
    Raises ValueError when dat is empty.
    """
    colour = "maroon"
    if dat is None:
        dat = []
    dat = sorted(dat) # small to large, on a copy
    nrows = len(dat)
    if nrows == 0:
        # log10(0) below would fail anyway - say why
        raise ValueError('makeQQ needs at least one value to plot')
    fn = float(nrows)
    unifx = [x/fn for x in range(1,(nrows+1))] # uniform distribution
    if logscale:
        unifx = [-math.log10(x) for x in unifx]
    if sample < 1.0 and nrows > maxveclen:
        # too many markers to plot them all in a pdf - thin them out but
        # oversample the low end of the distribution
        always = min(1000,nrows//20) # smaller of the lowest 1000 or 5%
        skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
        if skip <= 1:
            skip = 2
        samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
        yvec = [dat[i] for i in samplei]
        xvec = [unifx[i] for i in samplei] # sample xvec the same way
        maint = 'QQ %s (random %d of %d)' % (title,len(yvec),nrows)
    else:
        yvec = list(dat)
        xvec = unifx
        maint = 'QQ %s (n=%d)' % (title,nrows)
    if logscale:
        maint = 'Log%s' % maint
        mx = [0,math.log10(nrows)] # if 1000, becomes 3 for the null line
        ylab = '-log10(%s) Quantiles' % title
        xlab = '-log10(Uniform 0-1) Quantiles'
        # NOTE(review): values that are not above zero are dropped from yvec
        # only, so yvec can be shorter than xvec - confirm this is intended
        yvec = [-math.log10(x) for x in yvec if x > 0.0]
    else:
        mx = [0,1]
        ylab = '%s Quantiles' % title
        xlab = 'Uniform 0-1 Quantiles'
    R = []
    R.append('xvec = c(%s)' % ','.join(['%f' % x for x in xvec]))
    R.append('yvec = c(%s)' % ','.join(['%f' % x for x in yvec]))
    # the null line must end at the range chosen above; using log10(n) on an
    # untransformed 0-1 plot would run the line far outside the data
    R.append('mx = c(0,%f)' % mx[1])
    R.append('pdf("%s",h=%d,w=%d)' % (fname,h,w))
    R.append("par(lab=c(10,10,10))")
    R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
    R.append('points(mx,mx,type="l")')
    R.append('grid(col="lightgray",lty="dotted")')
    R.append('dev.off()')
    RRun(rcmd=R,title='makeQQplot',outdir=None)
示例#4
0
def doManQQ(input_fname,
            chrom_col,
            offset_col,
            pval_cols,
            title,
            grey,
            ctitle,
            outdir,
            beTidy=False):
    """
    Build the R script from the module level templates and run it on input_fname.

    rcode2 is filled in with input_fname, chrom_col, offset_col, pval_cols,
    title and grey (in that order) and appended to rcode. The script is run
    through RRun using ctitle and outdir. With the module level debug flag
    set, the arguments and the script are printed before running.

    Returns the (rlog, flist) pair from RRun; rlog gains a marker line and
    the script text. beTidy is accepted but not used here.
    """
    if debug:
        argvals = ('doManQQ', input_fname, chrom_col, offset_col, pval_cols,
                   title, grey, ctitle, outdir)
        print(' '.join(['%s' % (v,) for v in argvals]))
    filled = rcode2 % (input_fname, chrom_col, offset_col, pval_cols, title, grey)
    rscript = '%s%s' % (rcode, filled)
    if debug:
        print('running\n%s\n' % rscript)
    rlog, flist = RRun(rcmd=rscript, title=ctitle, outdir=outdir)
    # save the resulting code with the log
    for extra in ('## R script=', rscript):
        rlog.append(extra)
    return rlog, flist
def checkR(myTitle='Rgenetics installation test',outDir='./'):
    """
    Try each required R package and X11 through RRun and collect the problems.

    Every package is loaded in its own RRun call. A run counts as failed when
    the first line of its log starts with 'Nonzero exit code ='. For a failed
    package the log is added to the report, and a 'please install' line for
    each failed package follows once all packages have been tried. X11() is
    tested last.

    Returns the report as a list of lines, empty when nothing failed.
    """
    failmark = 'Nonzero exit code ='

    def failed(log):
        # compare only the start of the first log line with the marker
        return log[0][:len(failmark)] == failmark

    wanted = ['ArrayExpress','lumi','limma','AffyExpress','affyQCReport',
              'GenABEL','simpleaffy','snpMatrix','affyPLM','haplo.stats',
              'GEOquery','arrayQualityMetrics', # expression packages
              'hexbin'] # genetics packages
    report = []
    absent = []
    for pkg in wanted: # ensure all R packages are available
        log,outfiles = RRun(['library(%s)' % pkg],outdir=outDir,title=myTitle)
        if failed(log):
            absent.append(pkg)
            report.extend(log)
    report.extend(['The R package %s cannot be loaded - please install' % pkg for pkg in absent])
    log,outfiles = RRun(['X11()'],outdir=outDir,title=myTitle)
    if failed(log):
        report.extend([
            'X11() failed in R - all graphical (png/jpg/pdf) outputs will fail',
            'Was X11 configured at R compilation; is the virtual frame buffer Xvfb available for headless nodes?',
            ])
    return report
示例#6
0
def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
    """
    Filter a tab-delimited file down to the plotting columns and run the R plots on it.

    A temporary tab-delimited file is written holding only the chromosome,
    offset and p value columns. A leading 'chr' (any case) is removed from the
    chromosome field, so interval style names such as chr1 become 1. The
    module level R text in rcode and rcode2 is then filled in so that it reads
    the temporary file (chromosome in column 1, offset in column 2, p values
    from column 3 on) and the finished script is executed through RRun.
    According to the original notes the R code draws a QQ plot for the p
    values and a Manhattan plot when chromosome and offset are supplied.

    input_fname - path of the tab-delimited input
    chrom_col, offset_col - zero based indexes of the chromosome and offset columns
    pval_cols - list of zero based indexes of the p value columns
    title, grey - substituted into the rcode2 template
    ctitle, outdir - passed to RRun as title and outdir
    beTidy - when true the temporary file is deleted after R has run

    Returns the (rlog, flist) pair from RRun, with the R script text appended
    to rlog. Rows that do not reach every wanted column raise IndexError.
    """
    def isNumber(cell):
        # true when the cell text can be read as a number
        try:
            float(cell)
        except ValueError:
            return False
        return True

    wewant = [chrom_col,offset_col] + list(pval_cols)
    ffd,filtered_fname = tempfile.mkstemp(prefix='rgManQQtemp')
    # write through the descriptor mkstemp already opened - opening the path
    # a second time would leave that descriptor open for the life of the process
    f = os.fdopen(ffd,'w')
    try:
        with open(input_fname,'r') as inf:
            ohead = inf.readline().strip().split('\t') # first row - header or data?
            inf.seek(0) # rewind
            # The cells are always strings, so a '%d' format test can never
            # succeed and every file would look as if it had a header.
            # Instead call the first row a header when its offset or any of
            # its p value cells is not a number. The chromosome cell is not
            # tested because data rows may legitimately hold chr1, X and so on.
            havehead = not all([isNumber(ohead[x]) for x in wewant[1:]])
            if havehead:
                newhead = [ohead[x] for x in wewant] # use the original head
            else:
                newhead = ['Chrom','Offset'] + ['pval%d' % (x+1) for x in pval_cols]
            f.write('\t'.join(newhead)) # for R to read
            f.write('\n')
            for i,row in enumerate(inf):
                if i == 0 and havehead:
                    continue # ignore header
                sr = row.strip().split('\t')
                if len(sr) > 1: # skips blank and single field lines
                    # only a leading chr is a prefix to remove
                    if sr[chrom_col].lower().startswith('chr'):
                        sr[chrom_col] = sr[chrom_col][3:]
                    f.write('\t'.join([sr[x] for x in wewant])) # grab cols we need
                    f.write('\n')
    finally:
        f.close()
    # R counts columns from 1: chrom is 1, offset is 2, p values start at 3
    pvc = 'c(%s)' % (','.join([str(x) for x in range(3,len(pval_cols)+3)]))
    rcmd = '%s%s' % (rcode,rcode2 % (filtered_fname,'1','2',pvc,title,grey))
    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
    rlog.append('## R script=')
    rlog.append(rcmd)
    if beTidy:
        os.unlink(filtered_fname)
    return rlog,flist
def makePlot(eigpca='test.pca',title='test',pdfname='test.pdf',h=8,w=10,nfp=None,rexe=''):
    """
    Plot the first two eigenvectors from an eigenvector file, as a pdf and a png.

    The first two lines of the file are skipped. Every later non blank line
    is split on whitespace: field 1 and field 2 (counting from 0) are the
    plotted values and the last field is the subject class. A value that
    cannot be read as a number is plotted as 0.0. Each class gets its own
    plot symbol and colour, numbered from 2 in order of first appearance.

    Rpy is not used. A real R script is written with the numbers embedded as
    c(...) vector constructors, so data and analysis sit in one place, and
    the script is run through RRun. The script and the RRun log are then
    printed to stdout.

    eigpca - path of the eigenvector file
    title - plot title, also passed to RRun
    pdfname - path of the pdf; the png is written to pdfname + '.png'
    h, w - height and width in inches for both outputs
    nfp - passed to RRun as outdir
    rexe - accepted but not used

    Returns None. A data line with fewer than three fields raises IndexError.
    """
    def asFloat(txt):
        # unreadable numbers are plotted at zero rather than stopping the run
        try:
            return float(txt)
        except ValueError:
            return 0.0

    debug=False
    R = []
    if debug:
        R.append('sessionInfo()')
        R.append("print('dir()=:')")
        R.append('dir()')
        R.append("print('pdfname=%s')" % pdfname)
    gvec = [] # group number of each point
    pca1 = []
    pca2 = []
    groups = {} # group label -> group number, starting at 1
    glist = [] # labels in order of first appearance, for the legend
    with open(eigpca,'r') as f: # closed even if a row is malformed
        for n,row in enumerate(f):
            if n <= 1:
                continue # the first two lines are not data
            rowlist = row.strip().split()
            if not rowlist:
                continue # a blank line carries no point
            v1 = asFloat(rowlist[1])
            v2 = asFloat(rowlist[2])
            group = rowlist[-1]
            if group not in groups:
                groups[group] = len(groups) + 1
                glist.append(group)
            gvec.append(groups[group]) # lookup group number
            pca1.append('%f' % v1)
            pca2.append('%f' % v2)
    # now have vectors of group,pca1 and pca2
    # NOTE(review): encode('ascii') is kept from the Python 2 code, where it
    # rejects non ascii labels; under Python 3 it would yield bytes - confirm
    # before porting
    llist = ['"%s"' % x.encode('ascii') for x in glist] # quoted for R
    R.append('llist=c(%s)' % ','.join(llist))
    plist = range(2,len(llist)+2) # pch - avoid black circles
    R.append('glist=c(%s)' % ','.join(['%d' % x for x in plist]))
    pgvec = ['%d' % (plist[i-1]) for i in gvec] # plot symbol/colour for each point
    R.append("par(lab=c(10,10,10))") # so our grid is denser than the default 5
    R.append("par(mai=c(1,1,1,0.5))")
    maint = title
    # the same drawing commands are sent to the pdf device and then the png device
    s = "plot(pca1,pca2,type='p',main='%s', ylab='Second ancestry eigenvector'," % maint
    s += "xlab='First ancestry eigenvector',col=pgvec,cex=0.8,pch=pgvec)"
    drawing = [s,
               'legend("top",legend=llist,pch=glist,col=glist,title="Sample")',
               'grid(nx = 10, ny = 10, col = "lightgray", lty = "dotted")',
               'dev.off()']
    R.append('pdf("%s",h=%d,w=%d)' % (pdfname,h,w))
    R.append("par(lab=c(10,10,10))")
    R.append('pca1 = c(%s)' % ','.join(pca1))
    R.append('pca2 = c(%s)' % ','.join(pca2))
    R.append('pgvec = c(%s)' % ','.join(pgvec))
    R.extend(drawing)
    R.append('png("%s.png",h=%d,w=%d,units="in",res=72)' % (pdfname,h,w))
    R.extend(drawing)
    rlog,flist = RRun(rcmd=R,title=title,outdir=nfp)
    print('\n'.join(R))
    print(rlog)
 opts.log_file = opts.log or os.path.join(opts.output_dir,
                                          opts.input_file + '.log')
 ext = opts.file_ext
 if ext == 'fastqsanger':
     ext = 'fastq'
 elif ext == 'bwa':
     ext = 'Bowtie'
 # Create output folder and save our R script in there.
 if not os.path.exists(opts.output_dir):
     os.makedirs(opts.output_dir)
 r_script_file = os.path.join(opts.output_dir, R_SCRIPT_NAME)
 r_script = R_SCRIPT % (opts.input_dir, opts.input_file, ext,
                        opts.output_dir)
 #run_r(R_SCRIPT % ( opts.input_dir, opts.input_file, ext, opts.output_dir),r_script_file,opts.log_file,opts.output_dir)
 rlog, flist = RRun(rcmd=r_script,
                    outdir=opts.output_dir,
                    title=opts.namejob,
                    tidy=True)
 # Get file contents.
 index_file_name = os.path.join(opts.output_dir, HTML_PAGE)
 try:
     index_file = open(index_file_name)
     index_contents = index_file.read()
     index_file.close()
 except:
     index_contents = '## error - is the Bioconductor shortreadqc package installed? ##\n'
 # Substitute contents in memory. A regexp looking for actual links
 # would be more correct, but a straight substitution seems ok for
 # our cases.
 index_contents = index_contents.replace('./image/', '')
 index_contents = index_contents.replace(
     '</body>',