def ceQTL_filter_tfs(args, pCefile, hasAdjPval, starts):
    """Wire the processes that restrict ceQTL results to the given TFs.

    Args:
        args: Parsed options; ``args.tfs`` is read. When it is empty no
            filtering is set up.
        pCefile: Process whose output is fed to ``pSortByTF``.
        hasAdjPval: Whether an ``AdjPval`` column is appended to the output
            header.
        starts: List of start processes; the TF-list process is appended
            to it in place.

    Returns:
        ``pCefile`` unchanged when ``args.tfs`` is empty, otherwise the
        ``pFilterTfs`` join process.
    """
    if not args.tfs:
        return pCefile

    # Column names written to the filtered file.
    header = ['Case', 'Pooled', 'Groups', 'Fstat', 'Pval'] + (
        ['AdjPval'] if hasAdjPval else [])

    # NOTE: process variable names are kept as-is; PyPPL may derive the
    # process id from the assigned name.
    if path.isfile(args.tfs[0]):
        # The first item is an existing file: use it directly.
        pTffile = pFile2Proc.copy()
        pTffile.input = [args.tfs[0]]
    else:
        # Otherwise the items are treated as TF names given inline.
        pTffile = pStr2File.copy()
        pTffile.input = [','.join(sorted(args.tfs))]
    starts.append(pTffile)

    pSortByTF.depends = pCefile

    pFilterTfs = pTsvJoin.copy()
    pFilterTfs.depends = pSortByTF, pTffile
    pFilterTfs.input = lambda ch1, ch2: [[ch1.get(), ch2.get()]]

    # Reading options: one value per joined file (sorted ceQTLs, TF list).
    pFilterTfs.args.inopts.cnames = False
    pFilterTfs.args.inopts.skip = [1, 0]
    pFilterTfs.args.inopts.delimit = ['.', '\t']

    # Writing options.
    pFilterTfs.args.outopts.cnames = header
    pFilterTfs.args.outopts.delimit = '.'
    pFilterTfs.args.outopts.headCallback = 'lambda cnames: "\t".join(cnames)'

    # Join on the 2nd field of the ceQTL record vs the 1st field of the TF
    # record, and emit the ceQTL record for each match.
    pFilterTfs.args.match = 'lambda r1, r2: TsvJoin.compare(r1[1], r2[0])'
    pFilterTfs.args.do = 'lambda out, r1, r2: out.write(r1.values())'

    return pFilterTfs
def ceQTL_atsnp(args):
    """Configure and run the atSNP pipeline on a ceQTL file.

    Reads ``args.cefile``, ``args.tflist``, ``args.motifdb``,
    ``args.nthread``, ``args.outfile``, ``args.man`` and ``args.hifile``.
    When ``args.man`` is truthy, a Manhattan-plot branch is added after
    ``pAtSnp``. The pipeline is started from ``pGTMat2Bed`` and
    ``pSortByTF``.
    """
    # --- SNP branch: ceQTL file -> BED -> unique sorted records ----------
    pGTMat2Bed.input = [args.cefile]
    pGTMat2Bed.args.inopts.cnames = False
    pGTMat2Bed.args.inopts.skip = 1
    pGTMat2Bed.args.inopts.delimit = '.'
    pGTMat2Bed.args.name = 'full'
    pGTMat2Bed.args.ncol = 8

    pSortSnp = pSort.copy()
    pSortSnp.depends = pGTMat2Bed
    pSortSnp.args.unique = True

    # --- TF branch: join the TF list against the TF-sorted ceQTL file ----
    pSortByTF.input = [args.cefile]

    pFilterTFs = pTsvJoin.copy()
    pFilterTFs.depends = pSortByTF
    # The TF list goes first, followed by the output of pSortByTF.
    pFilterTFs.input = lambda ch: [ch.insert(0, args.tflist).flatten()]
    pFilterTFs.args.inopts.cnames = False
    pFilterTFs.args.inopts.skip = [0, 1]
    pFilterTFs.args.inopts.delimit = ['\t', '.']
    pFilterTFs.args.outopts.cnames = False
    pFilterTFs.args.match = 'lambda r1, r2: TsvJoin.compare(r1[1], r2[1])'
    pFilterTFs.args.do = 'lambda out, r1, r2: out.write(r1)'

    # --- atSNP itself ------------------------------------------------------
    pAtSnp.depends = pFilterTFs, pSortSnp
    pAtSnp.args.tfmotifs = args.motifdb
    pAtSnp.args.fdr = False
    pAtSnp.args.plot = False
    pAtSnp.args.nthread = args.nthread
    setOutfile(pAtSnp, args.outfile)

    # --- optional Manhattan plot -------------------------------------------
    if args.man:
        pToMan = pTsv.copy()
        pToMan.depends = pAtSnp
        pToMan.args.outopts.cnames = False
        # [chr1, 12496021, rs6541023, 0.04]
        pToMan.args.helper = 'snprec = lambda x: [x[0], int(x[1]) - 1, x[1], x[2], 0, "+"]'
        pToMan.args.row = 'lambda r: snprec(r.Snp.split("_")[:3]) + [r.Pval_Diff]'

        pBedSort.depends = pToMan
        pBedSort.args.chrorder = params.chrorder.value

        pManhattan.depends = pBedSort
        if args.hifile:
            # Attach the highlight file as an extra input column.
            pManhattan.input = lambda ch: ch.cbind(args.hifile)
        pManhattan.args.gsize = params.gsize.value
        setOutfile(pManhattan, args.man)

    PyPPL().start(pGTMat2Bed, pSortByTF).run()
def ceQTL_filter_regs(args, pCefile, hasAdjPval, starts):
    """Wire the processes that restrict ceQTL results to given regions.

    Args:
        args: Parsed options; ``args.regs`` is read. When it is empty no
            filtering is set up.
        pCefile: Process whose output feeds ``pGTMat2Bed`` and
            ``pSortBySnp``.
        hasAdjPval: Whether an ``AdjPval`` column is appended to the output
            header.
        starts: List of start processes; the region-file process is
            appended to it in place.

    Returns:
        ``pCefile`` unchanged when ``args.regs`` is empty, otherwise the
        ``pFilterRegs`` join process.
    """
    if not args.regs:
        return pCefile

    # Column names written to the filtered file.
    header = ['Case', 'Pooled', 'Groups', 'Fstat', 'Pval'] + (
        ['AdjPval'] if hasAdjPval else [])

    # NOTE: process variable names are kept as-is; PyPPL may derive the
    # process id from the assigned name.
    if path.isfile(args.regs[0]):
        # The first item is an existing file: use it directly.
        pRegfile = pFile2Proc.copy()
        pRegfile.input = [args.regs[0]]
    else:
        # Inline regions: turn "chr:start-end" into tab-separated fields.
        bed_records = (
            reg.replace(':', '\t').replace('-', '\t') for reg in args.regs
        )
        pRegfile = pStr2File.copy()
        pRegfile.input = [','.join(bed_records)]
    starts.append(pRegfile)

    # ceQTL file -> BED -> unique records -> intersection with the regions.
    pGTMat2Bed.depends = pCefile
    pGTMat2Bed.args.inopts.cnames = False
    pGTMat2Bed.args.inopts.skip = 1
    pGTMat2Bed.args.inopts.delimit = '.'
    pGTMat2Bed.args.name = 'full'
    pGTMat2Bed.args.ncol = 6

    pUniqueBed = pSort.copy()
    pUniqueBed.depends = pGTMat2Bed
    pUniqueBed.args.unique = True

    pBedIntersect.depends = pUniqueBed, pRegfile

    pSortBySnp.depends = pCefile

    # Join the SNP-sorted ceQTL records against the intersected records.
    pFilterRegs = pTsvJoin.copy()
    pFilterRegs.depends = pSortBySnp, pBedIntersect
    pFilterRegs.input = lambda ch1, ch2: [[ch1.get(), ch2.get()]]

    # Reading options: one value per joined file.
    pFilterRegs.args.inopts.cnames = False
    pFilterRegs.args.inopts.skip = [1, 0]
    pFilterRegs.args.inopts.delimit = ['.', '\t']

    # Writing options.
    pFilterRegs.args.outopts.cnames = header
    pFilterRegs.args.outopts.delimit = '.'
    pFilterRegs.args.outopts.headCallback = 'lambda cnames: "\t".join(cnames)'

    # Match the 1st field of the ceQTL record with the 4th field of the
    # intersect record, and emit the ceQTL record for each match.
    pFilterRegs.args.match = 'lambda r1, r2: TsvJoin.compare(r1[0], r2[3])'
    pFilterRegs.args.do = 'lambda out, r1, r2: out.write(r1.values())'

    return pFilterRegs
def common_samples(gtype, expr, transpose=False):
    """Select common samples between genotype and expression data.

    Args:
        gtype: Genotype file, used as input of the first header process
            and as the first file to select columns from.
        expr: Expression file, used as input of the second header process
            and as the second file to select columns from.
        transpose: When true, a transpose process is chained after the
            column selection and returned as the end process instead.

    Returns:
        A pair ``(starts, ends)``: the list of start processes (the two
        header processes) and a one-element list holding the last process
        of this sub-pipeline.
    """
    # Extract the header of each input file.
    pTsvHeaderGT = pTsvHeader.copy()
    pTsvHeaderGT.input = [gtype]

    pTsvHeaderExpr = pTsvHeader.copy()
    pTsvHeaderExpr.input = [expr]

    # Sort both headers (one job per header) so they can be joined.
    pSortForJoin = pSort.copy()
    pSortForJoin.depends = pTsvHeaderGT, pTsvHeaderExpr
    pSortForJoin.input = lambda ch1, ch2: ch1.rbind(ch2)

    # Join the two sorted headers in a single job.
    pTsvJoinHeader = pTsvJoin.copy()
    pTsvJoinHeader.depends = pSortForJoin
    pTsvJoinHeader.input = lambda ch: [ch.flatten()]
    pTsvJoinHeader.args.inopts.cnames = False
    # Source lines of the helper used by `args.do`. The nesting below is
    # required: the "ID"/"ROWNAME" rows are written once, before the first
    # matched record, and every matched record is written afterwards.
    # Without the nested indentation the `if` has no body and the helper
    # is not valid Python.
    pTsvJoinHeader.args.helper = [
        'rnames_written = False',
        'def write(writer, r1, r2):',
        '    global rnames_written',
        '    if not rnames_written:',
        '        writer.write(["ID"])',
        '        writer.write(["ROWNAME"])',
        '        rnames_written = True',
        '    writer.write(r1)',
    ]
    pTsvJoinHeader.args.do = 'lambda writer, r1, r2: write(writer, r1, r2)'

    # Select the joined columns from both original files (two jobs).
    pTsvColSelectCommonSamples = pTsvColSelect.copy()
    # The first assignment declares the input keys, the second supplies
    # the data. NOTE(review): presumably PyPPL pairs the two assignments;
    # confirm before collapsing them into one.
    pTsvColSelectCommonSamples.input = 'infile:file, colfile:file'
    pTsvColSelectCommonSamples.depends = pTsvJoinHeader
    pTsvColSelectCommonSamples.input = lambda ch: ch.rep_row(2).insert(
        0, [gtype, expr])

    if transpose:
        pTransposeInput = pTranspose.copy()
        pTransposeInput.depends = pTsvColSelectCommonSamples
        return [pTsvHeaderGT, pTsvHeaderExpr], [pTransposeInput]

    return [pTsvHeaderGT, pTsvHeaderExpr], [pTsvColSelectCommonSamples]
def main():
    """Main function.

    Parses the command-line parameters, wires the pChow pipeline and runs
    it. With ``opts.njobs == 1`` a single pChow job is configured and the
    p-value cutoff is passed straight to it. Otherwise the SNP-gene pairs
    are split into chunks, pChow runs per chunk with no cutoff, the
    p-values are adjusted together, and the cutoff is applied last.
    """
    opts = params._parse(dict_wrapper=Diot)

    from bioprocs.stats import pChow, pAdjust
    from bioprocs.tsv import pTranspose, pTsvSplit, pTsvJoin, pTsv
    from bioprocs.common import pSort
    from bioprocs.utils import shell2 as shell
    from procs import pTFT2GeneGroups

    if opts.njobs == 1:
        pTranspose.input = [opts.gtype]
        pTFT2GeneGroups.input = [opts.tft]

        pChow.depends = pTFT2GeneGroups, pTranspose
        pChow.input = lambda ch1, ch2: ch1.insert(
            0, opts.expr, ch2.get(), opts.snpgene)
        pChow.args.nthread = opts.ncores
        pChow.args.pval = opts.pcut
        pChow.args.plot = False
        pChow.output = ('outfile:file:%s, '
                        'outdir:dir:{{i.infile | fn}}.chow'
                        ) % Path(opts.outfile).name
        pChow.config.export_dir = Path(opts.outfile).parent

        start_processes = pTranspose, pTFT2GeneGroups
    else:
        n_sg_pair = shell.wc_l(opts.snpgene).split()[0]

        # sort the snpgene file by gene for splitting by gene later on
        pSortSG = pSort.copy()
        pSortSG.input = [opts.snpgene]
        pSortSG.args.inopts.skip = 0
        pSortSG.args.params.k = 2

        pTsvSplit.depends = pSortSG
        pTsvSplit.args.inopts.cnames = False
        # Records per chunk, so that about opts.njobs chunks are produced.
        pTsvSplit.args.by = math.ceil(float(n_sg_pair) / float(opts.njobs))

        pSortSGBySNP = pSortSG.copy()
        pSortSGBySNP.input = lambda ch: ch.expand()
        pSortSGBySNP.depends = pTsvSplit
        pSortSGBySNP.args.params.k = 1

        # sort genotype file to split
        pSortGT = pSort.copy()
        pSortGT.input = [opts.gtype]
        pSortGT.args.inopts.skip = 1
        pSortGT.args.params.k = 1

        # select gtype type for each set of genes
        pGTSplit = pTsvJoin.copy()
        pGTSplit.depends = pSortGT, pSortSGBySNP
        # size: 1, opts.njobs
        pGTSplit.input = lambda ch1, ch2: ch2.insert(0, ch1).map(list)
        pGTSplit.args.inopts.cnames = [True, False]
        pGTSplit.args.outopts.cnames = 0
        pGTSplit.args.match = 'lambda r1, r2: compare(r1[0], r2[0])'
        pGTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

        pTranspose.depends = pGTSplit

        # sort tft to split
        pSortTFT = pSortSG.copy()
        pSortTFT.input = [opts.tft]

        pTFTSplit = pTsvJoin.copy()
        pTFTSplit.depends = pSortTFT, pTsvSplit
        pTFTSplit.input = (
            lambda ch1, ch2: ch2.expand().insert(0, ch1).map(list))
        pTFTSplit.args.inopts.cnames = False
        pTFTSplit.args.match = 'lambda r1, r2: compare(r1[1], r2[1])'
        pTFTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

        pTFT2GeneGroups.depends = pTFTSplit

        pChow.depends = pTranspose, pTsvSplit, pTFT2GeneGroups
        pChow.input = (lambda ch1, ch2, ch3:
                       ch1.cbind(ch2.expand(), ch3).insert(0, opts.expr))
        pChow.args.plot = False
        pChow.args.fdr = False  # don't do fdr for single job
        pChow.args.pval = 1.1  # all pvalues to calculate adjusted p

        pAdjust.depends = pChow
        pAdjust.input = lambda ch: [ch.outfile.flatten()]
        pAdjust.args.method = 'BH'
        pAdjust.args.pcol = 'Pval'

        # apply pcut
        pTsv.depends = pAdjust
        pTsv.args.inopts.cnames = True
        # Use the float's repr rather than '%f': '%f' keeps only six
        # decimals, so a cutoff such as 1e-8 would be rendered as
        # "0.000000" and every row would be dropped.
        pTsv.args.row = 'lambda row: float(row.Pval) < %r' % float(opts.pcut)
        pTsv.output = 'outfile:file:%s' % Path(opts.outfile).name
        pTsv.config.export_dir = Path(opts.outfile).parent

        start_processes = [pSortSG, pSortGT, pSortTFT]

    PyPPL(forks=opts.njobs).start(start_processes).run(opts.runner)
def splitjob(opts):
    """Pipeline for split job.

    Wires the multi-job pipeline. The SNP-gene pairs are split into
    chunks, the genotype and TFT data are subset per chunk, and pMediation
    or pModeration (chosen by ``opts.type``) runs per chunk with no
    p-value cutoff. The p-values are then adjusted together and the
    ``opts.pcut`` cutoff is applied.

    Args:
        opts: Parsed options; ``type``, ``gtype``, ``expr``, ``snpgene``,
            ``njobs``, ``tft``, ``pcut`` and ``outfile`` are read.

    Returns:
        The list of start processes of the pipeline.
    """
    med = 'med' in opts.type.lower()

    starts, ends = common_samples(opts.gtype, opts.expr)

    n_sg_pair = shell.wc_l(opts.snpgene).split()[0]

    # sort the snpgene file by gene for splitting by gene later on
    pSortSG = pSort.copy()
    pSortSG.input = [opts.snpgene]
    pSortSG.args.inopts.skip = 0
    pSortSG.args.params.k = 2
    starts.append(pSortSG)

    pTsvSplit.depends = pSortSG
    pTsvSplit.args.inopts.cnames = False
    # Records per chunk, so that about opts.njobs chunks are produced.
    pTsvSplit.args.by = math.ceil(float(n_sg_pair) / float(opts.njobs))

    pSortSGBySNP = pSortSG.copy()
    pSortSGBySNP.depends = pTsvSplit
    pSortSGBySNP.input = lambda ch: ch.expand()
    pSortSGBySNP.args.params.k = 1

    # sort genotype file to split
    pSortGT = pSort.copy()
    pSortGT.depends = ends
    # First row of the common-samples output, which common_samples() is
    # called with the genotype file first.
    pSortGT.input = lambda ch: ch.row_at(0)
    pSortGT.args.inopts.skip = 1
    pSortGT.args.params.k = 1

    # select gtype type for each set of genes
    pGTSplit = pTsvJoin.copy()
    pGTSplit.depends = pSortGT, pSortSGBySNP
    # size: 1, opts.njobs
    pGTSplit.input = lambda ch1, ch2: ch2.insert(0, ch1).map(list)
    pGTSplit.args.inopts.cnames = [True, False]
    pGTSplit.args.outopts.cnames = 0
    pGTSplit.args.match = 'lambda r1, r2: compare(r1[0], r2[0])'
    pGTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

    pTranspose.depends = pGTSplit

    pTransposeExpr = pTranspose.copy()
    pTransposeExpr.depends = ends
    # Second row of the common-samples output (the expression file).
    pTransposeExpr.input = lambda ch: ch.row_at(1)
    # NOTE(review): pTransposeExpr has `depends` set yet is also registered
    # as a start process -- confirm this is intended.
    starts.append(pTransposeExpr)

    # sort tft to split
    pSortTFT = pSortSG.copy()
    pSortTFT.input = [opts.tft]
    starts.append(pSortTFT)

    pTFTSplit = pTsvJoin.copy()
    pTFTSplit.depends = pSortTFT, pTsvSplit
    pTFTSplit.input = lambda ch1, ch2: ch2.expand().insert(0, ch1).map(list)
    pTFTSplit.args.inopts.cnames = False
    pTFTSplit.args.match = 'lambda r1, r2: compare(r1[1], r2[1])'
    pTFTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

    pTFTSG2MedCases.depends = pTFTSplit, pTsvSplit
    pTFTSG2MedCases.input = lambda ch1, ch2: ch2.expand().insert(0, ch1)

    pTsvCbind.depends = pTranspose, pTransposeExpr
    pTsvCbind.input = lambda ch1, ch2: ch1.cbind(ch2).map(list)
    pTsvCbind.args.fill = False
    pTsvCbind.args.fn2cname = 'function(fn, cnames) cnames'

    pMed = pMediation if med else pModeration
    pMed.depends = pTsvCbind, pTFTSG2MedCases
    pMed.args.plot = False
    pMed.args.fdr = False
    # Keep every p-value (cutoff above 1) so they can be adjusted together.
    pMed.args.pval = 1.1

    pAdjust.depends = pMed
    pAdjust.input = lambda ch: [ch.outfile.flatten()]
    pAdjust.args.method = 'BH'
    pAdjust.args.pcol = 'Pval'

    # apply pcut
    pTsv.depends = pAdjust
    pTsv.args.inopts.cnames = True
    # Use the float's repr rather than '%f': '%f' keeps only six decimals,
    # so a cutoff such as 1e-8 would be rendered as "0.000000" and every
    # row would be dropped.
    pTsv.args.row = 'lambda row: float(row.Pval) < %r %s' % (
        float(opts.pcut), 'and float(row.PropMed) > 0' if med else '')
    pTsv.output = 'outfile:file:%s' % Path(opts.outfile).name
    pTsv.config.export_dir = Path(opts.outfile).parent

    return starts