Пример #1
0
def ceQTL_filter_tfs(args, pCefile, hasAdjPval, starts):
    """Wire a TF-filtering step after `pCefile` when TFs were requested.

    If `args.tfs` is empty, nothing is configured and `pCefile` is handed
    back untouched. Otherwise a copy of `pTsvJoin` is set up to join the
    output of `pSortByTF` (first input) with a TF list (second input),
    comparing field 1 of the first with field 0 of the second and writing
    the values of the first input's row.

    Args:
        args: Parsed options; only `args.tfs` (a sequence of strings) is
            read. If its first item is an existing file, that file is used
            as the TF list; otherwise the items are sorted, comma-joined
            and passed to a copy of `pStr2File`.
        pCefile: Upstream process; set as the dependency of the
            module-level `pSortByTF`.
        hasAdjPval: When truthy, 'AdjPval' is added to the output header.
        starts: List of start processes; the TF source process is appended
            to it in place.

    Returns:
        `pCefile` when no TFs are given, otherwise the new `pFilterTfs`
        process.
    """
    if not args.tfs:
        return pCefile

    # NOTE(review): process variable names are kept unchanged on purpose;
    # PyPPL presumably derives process ids from them -- confirm before
    # renaming.
    if path.isfile(args.tfs[0]):
        # The TFs are given as a file.
        pTffile = pFile2Proc.copy()
        pTffile.input = [args.tfs[0]]
    else:
        # The TFs are given literally on the command line.
        pTffile = pStr2File.copy()
        pTffile.input = [','.join(sorted(args.tfs))]
    starts.append(pTffile)
    pSortByTF.depends = pCefile

    header = ['Case', 'Pooled', 'Groups', 'Fstat', 'Pval'] + (
        ['AdjPval'] if hasAdjPval else [])

    pFilterTfs = pTsvJoin.copy()
    pFilterTfs.depends = pSortByTF, pTffile
    # Pair the single output of each dependency into one job.
    pFilterTfs.input = lambda ch1, ch2: [[ch1.get(), ch2.get()]]
    # Per-input read options: [sorted ceQTL file, TF list].
    pFilterTfs.args.inopts.cnames = False
    pFilterTfs.args.inopts.skip = [1, 0]
    # NOTE(review): '.' as a field delimiter looks unusual -- confirm.
    pFilterTfs.args.inopts.delimit = ['.', '\t']
    pFilterTfs.args.outopts.cnames = header
    pFilterTfs.args.outopts.delimit = '.'
    pFilterTfs.args.outopts.headCallback = 'lambda cnames: "\t".join(cnames)'
    pFilterTfs.args.match = 'lambda r1, r2: TsvJoin.compare(r1[1], r2[0])'
    pFilterTfs.args.do = 'lambda out, r1, r2: out.write(r1.values())'

    return pFilterTfs
Пример #2
0
def ceQTL_atsnp(args):
    """Configure and run the atSNP pipeline on a ceQTL result file.

    Two branches start from `args.cefile`: `pGTMat2Bed` followed by a
    unique sort (`pSortSnp`), and `pSortByTF` followed by a join against
    `args.tflist` (`pFilterTFs`). Both feed `pAtSnp`. When `args.man` is
    set, the `pAtSnp` output is additionally converted, sorted and passed
    to `pManhattan`.

    Args:
        args: Parsed options. Attributes read here: `cefile`, `tflist`,
            `motifdb`, `nthread`, `outfile`, `man` and `hifile`.

    Returns:
        None. The pipeline is executed as a side effect, and the
        module-level processes used here are mutated.
    """

    # Branch 1: convert the ceQTL file with pGTMat2Bed, skipping its first
    # line and reading it without column names.
    pGTMat2Bed.input = [args.cefile]
    pGTMat2Bed.args.inopts.cnames = False
    pGTMat2Bed.args.inopts.skip = 1
    # NOTE(review): '.' as a field delimiter looks unusual -- confirm.
    pGTMat2Bed.args.inopts.delimit = '.'
    pGTMat2Bed.args.name = 'full'
    pGTMat2Bed.args.ncol = 8

    # Sort the converted file with the `unique` option switched on.
    pSortSnp = pSort.copy()
    pSortSnp.depends = pGTMat2Bed
    pSortSnp.args.unique = True

    # Branch 2: the same ceQTL file goes through pSortByTF.
    pSortByTF.input = [args.cefile]

    # Join the TF list (inserted as the first input) with the output of
    # pSortByTF on field 1 of both; only the first input's row is written.
    pFilterTFs = pTsvJoin.copy()
    pFilterTFs.depends = pSortByTF
    pFilterTFs.input = lambda ch: [ch.insert(0, args.tflist).flatten()]
    pFilterTFs.args.inopts.cnames = False
    # Per-input read options: [TF list, sorted ceQTL file].
    pFilterTFs.args.inopts.skip = [0, 1]
    pFilterTFs.args.inopts.delimit = ['\t', '.']
    pFilterTFs.args.outopts.cnames = False
    pFilterTFs.args.match = 'lambda r1, r2: TsvJoin.compare(r1[1], r2[1])'
    pFilterTFs.args.do = 'lambda out, r1, r2: out.write(r1)'

    # atSNP consumes the joined TF rows and the unique SNP records.
    pAtSnp.depends = pFilterTFs, pSortSnp
    pAtSnp.args.tfmotifs = args.motifdb
    pAtSnp.args.fdr = False
    pAtSnp.args.plot = False
    pAtSnp.args.nthread = args.nthread
    setOutfile(pAtSnp, args.outfile)

    if args.man:
        # Optional Manhattan plot of the atSNP results.
        pToMan = pTsv.copy()
        pToMan.depends = pAtSnp
        pToMan.args.outopts.cnames = False
        # Example of the intended record: [chr1, 12496021, rs6541023, 0.04]
        # `snprec` expands the first three "_"-separated parts of `Snp`
        # into six fields (x[0], int(x[1]) - 1, x[1], x[2], 0, "+");
        # `Pval_Diff` is then appended as the last field.
        pToMan.args.helper = 'snprec = lambda x: [x[0], int(x[1]) - 1, x[1], x[2], 0, "+"]'
        pToMan.args.row = 'lambda r: snprec(r.Snp.split("_")[:3]) + [r.Pval_Diff]'

        pBedSort.depends = pToMan
        pBedSort.args.chrorder = params.chrorder.value

        pManhattan.depends = pBedSort
        if args.hifile:
            # Bind the highlight file as an extra input column.
            pManhattan.input = lambda ch: ch.cbind(args.hifile)
        pManhattan.args.gsize = params.gsize.value
        setOutfile(pManhattan, args.man)

    # Both branches read args.cefile directly, so both are start processes.
    PyPPL().start(pGTMat2Bed, pSortByTF).run()
Пример #3
0
def ceQTL_filter_regs(args, pCefile, hasAdjPval, starts):
    """Wire a region-filtering step after `pCefile` when regions were given.

    If `args.regs` is empty, nothing is configured and `pCefile` is handed
    back untouched. Otherwise the output of `pCefile` is converted with
    `pGTMat2Bed`, sorted with the `unique` option, and passed to
    `pBedIntersect` together with the regions. The result is then joined
    with the output of `pSortBySnp`, comparing field 0 of the latter with
    field 3 of the former and writing the values of the `pSortBySnp` row.

    Args:
        args: Parsed options; only `args.regs` (a sequence of strings) is
            read. If its first item is an existing file, that file is used
            as the region file; otherwise ':' and '-' in each item are
            replaced by tabs, and the items are comma-joined and passed to
            a copy of `pStr2File`.
        pCefile: Upstream process; set as the dependency of the
            module-level `pGTMat2Bed` and `pSortBySnp`.
        hasAdjPval: When truthy, 'AdjPval' is added to the output header.
        starts: List of start processes; the region source process is
            appended to it in place.

    Returns:
        `pCefile` when no regions are given, otherwise the new
        `pFilterRegs` process.
    """
    if not args.regs:
        return pCefile

    # NOTE(review): process variable names are kept unchanged on purpose;
    # PyPPL presumably derives process ids from them -- confirm before
    # renaming.
    if path.isfile(args.regs[0]):
        # The regions are given as a file.
        pRegfile = pFile2Proc.copy()
        pRegfile.input = [args.regs[0]]
    else:
        # The regions are given literally (presumably as chr:start-end);
        # turn the separators into tabs.
        pRegfile = pStr2File.copy()
        tabbed_regions = [
            reg.replace(':', '\t').replace('-', '\t') for reg in args.regs
        ]
        pRegfile.input = [','.join(tabbed_regions)]
    starts.append(pRegfile)

    # Convert the ceQTL output, skipping its first line and reading it
    # without column names.
    pGTMat2Bed.depends = pCefile
    pGTMat2Bed.args.inopts.cnames = False
    pGTMat2Bed.args.inopts.skip = 1
    # NOTE(review): '.' as a field delimiter looks unusual -- confirm.
    pGTMat2Bed.args.inopts.delimit = '.'
    pGTMat2Bed.args.name = 'full'
    pGTMat2Bed.args.ncol = 6

    pUniqueBed = pSort.copy()
    pUniqueBed.depends = pGTMat2Bed
    pUniqueBed.args.unique = True

    pBedIntersect.depends = pUniqueBed, pRegfile
    pSortBySnp.depends = pCefile

    header = ['Case', 'Pooled', 'Groups', 'Fstat', 'Pval'] + (
        ['AdjPval'] if hasAdjPval else [])

    pFilterRegs = pTsvJoin.copy()
    pFilterRegs.depends = pSortBySnp, pBedIntersect
    # Pair the single output of each dependency into one job.
    pFilterRegs.input = lambda ch1, ch2: [[ch1.get(), ch2.get()]]
    # Per-input read options: [sorted ceQTL file, intersected records].
    pFilterRegs.args.inopts.cnames = False
    pFilterRegs.args.inopts.skip = [1, 0]
    pFilterRegs.args.inopts.delimit = ['.', '\t']
    pFilterRegs.args.outopts.cnames = header
    pFilterRegs.args.outopts.delimit = '.'
    pFilterRegs.args.outopts.headCallback = 'lambda cnames: "\t".join(cnames)'
    pFilterRegs.args.match = 'lambda r1, r2: TsvJoin.compare(r1[0], r2[3])'
    pFilterRegs.args.do = 'lambda out, r1, r2: out.write(r1.values())'

    return pFilterRegs
Пример #4
0
def common_samples(gtype, expr, transpose=False):
    """Select common samples between genotype and expression data.

    The headers of both files are extracted, sorted together and joined;
    the joined header is then used as the column file for selecting
    columns from each of the two input files.

    Args:
        gtype: Path of the genotype file.
        expr: Path of the expression file.
        transpose: When true, a transposing process is chained after the
            column selection and returned as the end process instead.

    Returns:
        A pair `(starts, ends)`: the list of start processes (the two
        header extractors) and a one-element list holding the last
        process of this sub-pipeline.
    """
    # NOTE(review): process variable names are kept unchanged on purpose;
    # PyPPL presumably derives process ids from them -- confirm before
    # renaming.
    pTsvHeaderGT = pTsvHeader.copy()
    pTsvHeaderGT.input = [gtype]

    pTsvHeaderExpr = pTsvHeader.copy()
    pTsvHeaderExpr.input = [expr]

    # Both header files go through the same sort process, one per row.
    pSortForJoin = pSort.copy()
    pSortForJoin.depends = pTsvHeaderGT, pTsvHeaderExpr
    pSortForJoin.input = lambda ch1, ch2: ch1.rbind(ch2)

    # Join the two sorted headers. The helper writes "ID" and "ROWNAME"
    # once, before the first joined record, then the first input's row.
    pTsvJoinHeader = pTsvJoin.copy()
    pTsvJoinHeader.depends = pSortForJoin
    pTsvJoinHeader.input = lambda ch: [ch.flatten()]
    pTsvJoinHeader.args.inopts.cnames = False
    pTsvJoinHeader.args.helper = [
        'rnames_written = False',
        'def write(writer, r1, r2):',
        '   global rnames_written',
        '   if not rnames_written:',
        '       writer.write(["ID"])',
        '       writer.write(["ROWNAME"])',
        '       rnames_written = True',
        '   writer.write(r1)',
    ]
    pTsvJoinHeader.args.do = 'lambda writer, r1, r2: write(writer, r1, r2)'

    # One job per input file, each paired with the joined-header file.
    pTsvColSelectCommonSamples = pTsvColSelect.copy()
    pTsvColSelectCommonSamples.input = 'infile:file, colfile:file'
    pTsvColSelectCommonSamples.depends = pTsvJoinHeader
    pTsvColSelectCommonSamples.input = lambda ch: ch.rep_row(2).insert(
        0, [gtype, expr])

    start_procs = [pTsvHeaderGT, pTsvHeaderExpr]
    if not transpose:
        return start_procs, [pTsvColSelectCommonSamples]

    pTransposeInput = pTranspose.copy()
    pTransposeInput.depends = pTsvColSelectCommonSamples
    return start_procs, [pTransposeInput]
Пример #5
0
def main():
    """Main function

    Parse the command-line options and run the Chow-test pipeline.

    With `opts.njobs == 1`, `pChow` is run once on the whole data set,
    using `opts.pcut` as its p-value cutoff, and its output file is
    exported next to `opts.outfile`.

    Otherwise the SNP-gene pair file is sorted and split into chunks of
    `ceil(n_pairs / njobs)` (the `by` argument of `pTsvSplit`). The
    genotype and TFT files are restricted to each chunk by joins, and
    `pChow` runs per chunk with no p-value cutoff (1.1) and no FDR. The
    per-chunk results are merged and adjusted by `pAdjust` (BH), then
    filtered by `opts.pcut` in `pTsv`, whose output is exported next to
    `opts.outfile`.

    Returns:
        None. The pipeline is executed as a side effect.
    """
    opts = params._parse(dict_wrapper=Diot)
    from bioprocs.stats import pChow, pAdjust
    from bioprocs.tsv import pTranspose, pTsvSplit, pTsvJoin, pTsv
    from bioprocs.common import pSort
    from bioprocs.utils import shell2 as shell
    from procs import pTFT2GeneGroups

    if opts.njobs == 1:
        pTranspose.input = [opts.gtype]
        pTFT2GeneGroups.input = [opts.tft]

        pChow.depends = pTFT2GeneGroups, pTranspose
        # Job input order: expr file, gene groups, transposed genotypes,
        # snp-gene pairs.
        pChow.input = lambda ch1, ch2: ch1.insert(0, opts.expr, ch2.get(),
                                                  opts.snpgene)
        pChow.args.nthread = opts.ncores
        pChow.args.pval = opts.pcut
        pChow.args.plot = False
        pChow.output = ('outfile:file:%s, '
                        'outdir:dir:{{i.infile | fn}}.chow') % Path(opts.outfile).name
        pChow.config.export_dir = Path(opts.outfile).parent

        start_processes = pTranspose, pTFT2GeneGroups
    else:
        # First whitespace-separated token of `wc -l` output: line count.
        n_sg_pair = shell.wc_l(opts.snpgene).split()[0]
        # sort the snpgene file by gene for splitting by gene later on
        pSortSG = pSort.copy()
        pSortSG.input = [opts.snpgene]
        pSortSG.args.inopts.skip = 0
        pSortSG.args.params.k = 2

        pTsvSplit.depends = pSortSG
        pTsvSplit.args.inopts.cnames = False
        pTsvSplit.args.by = math.ceil(float(n_sg_pair)/float(opts.njobs))

        # re-sort each chunk by SNP (column 1) for the join with genotypes
        pSortSGBySNP = pSortSG.copy()
        pSortSGBySNP.input = lambda ch: ch.expand()
        pSortSGBySNP.depends = pTsvSplit
        pSortSGBySNP.args.params.k = 1

        # sort genotype file to split
        pSortGT = pSort.copy()
        pSortGT.input = [opts.gtype]
        pSortGT.args.inopts.skip = 1
        pSortGT.args.params.k = 1

        # select gtype type for each set of genes
        pGTSplit = pTsvJoin.copy()
        pGTSplit.depends = pSortGT, pSortSGBySNP
        # size:                  1,  opts.njobs
        pGTSplit.input = lambda ch1, ch2: ch2.insert(0, ch1).map(list)
        pGTSplit.args.inopts.cnames = [True, False]
        pGTSplit.args.outopts.cnames = 0
        pGTSplit.args.match = 'lambda r1, r2: compare(r1[0], r2[0])'
        pGTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

        pTranspose.depends = pGTSplit

        # sort tft to split
        pSortTFT = pSortSG.copy()
        pSortTFT.input = [opts.tft]

        pTFTSplit = pTsvJoin.copy()
        pTFTSplit.depends = pSortTFT, pTsvSplit
        pTFTSplit.input = lambda ch1, ch2: ch2.expand().insert(0, ch1).map(list)
        pTFTSplit.args.inopts.cnames = False
        pTFTSplit.args.match = 'lambda r1, r2: compare(r1[1], r2[1])'
        pTFTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

        pTFT2GeneGroups.depends = pTFTSplit

        pChow.depends = pTranspose, pTsvSplit, pTFT2GeneGroups
        pChow.input = (lambda ch1, ch2, ch3:
                       ch1.cbind(ch2.expand(), ch3).insert(0, opts.expr))
        pChow.args.plot = False
        pChow.args.fdr = False # don't do fdr for single job
        pChow.args.pval = 1.1 # all pvalues to calculate adjusted p

        pAdjust.depends = pChow
        # merge all per-chunk output files into a single job
        pAdjust.input = lambda ch: [ch.outfile.flatten()]
        pAdjust.args.method = 'BH'
        pAdjust.args.pcol = 'Pval'

        # apply pcut
        # NOTE(review): the filter uses the raw `Pval` column, not the
        # adjusted one -- confirm this is intended.
        pTsv.depends = pAdjust
        pTsv.args.inopts.cnames = True
        # Format the cutoff with repr: '%f' keeps only 6 decimals, so a
        # cutoff such as 1e-8 would be rendered as 0.000000 and no row
        # could ever pass the filter.
        pTsv.args.row = 'lambda row: float(row.Pval) < %r' % float(opts.pcut)

        pTsv.output = 'outfile:file:%s' % Path(opts.outfile).name
        pTsv.config.export_dir = Path(opts.outfile).parent

        start_processes = [pSortSG, pSortGT, pSortTFT]

    PyPPL(forks=opts.njobs).start(start_processes).run(opts.runner)
Пример #6
0
def splitjob(opts):
    """Pipeline for split job

    Configure the mediation/moderation pipeline so that the work is
    spread over `opts.njobs` chunks of the SNP-gene pair file.

    The genotype and expression files are first reduced to their common
    samples via `common_samples`. The SNP-gene pairs are sorted and split
    into chunks of `ceil(n_pairs / njobs)` (the `by` argument of
    `pTsvSplit`). Genotypes and TFT records are restricted to each chunk
    by joins. The analysis process (`pMediation` if 'med' occurs in
    `opts.type`, else `pModeration`) runs per chunk without a p-value
    cutoff (1.1) and without FDR. Results are merged and adjusted by
    `pAdjust` (BH), then filtered by `opts.pcut` in `pTsv`, whose output
    is exported next to `opts.outfile`.

    Args:
        opts: Parsed options. Attributes read here: `type`, `gtype`,
            `expr`, `snpgene`, `njobs`, `tft`, `pcut` and `outfile`.

    Returns:
        The list of processes to start the pipeline with. Nothing is run
        here; module-level processes are mutated as a side effect.
    """

    med = 'med' in opts.type.lower()
    starts, ends = common_samples(opts.gtype, opts.expr)

    # First whitespace-separated token of `wc -l` output: line count.
    n_sg_pair = shell.wc_l(opts.snpgene).split()[0]
    # sort the snpgene file by gene for splitting by gene later on
    pSortSG = pSort.copy()
    pSortSG.input = [opts.snpgene]
    pSortSG.args.inopts.skip = 0
    pSortSG.args.params.k = 2
    starts.append(pSortSG)

    pTsvSplit.depends = pSortSG
    pTsvSplit.args.inopts.cnames = False
    pTsvSplit.args.by = math.ceil(float(n_sg_pair) / float(opts.njobs))

    # re-sort each chunk by SNP (column 1) for the join with genotypes
    pSortSGBySNP = pSortSG.copy()
    pSortSGBySNP.depends = pTsvSplit
    pSortSGBySNP.input = lambda ch: ch.expand()
    pSortSGBySNP.args.params.k = 1

    # sort genotype file to split
    # (row 0 of `ends` output: `gtype` is passed first to common_samples)
    pSortGT = pSort.copy()
    pSortGT.depends = ends
    pSortGT.input = lambda ch: ch.row_at(0)
    pSortGT.args.inopts.skip = 1
    pSortGT.args.params.k = 1

    # select gtype type for each set of genes
    pGTSplit = pTsvJoin.copy()
    pGTSplit.depends = pSortGT, pSortSGBySNP
    # size:                  1,  opts.njobs
    pGTSplit.input = lambda ch1, ch2: ch2.insert(0, ch1).map(list)
    pGTSplit.args.inopts.cnames = [True, False]
    pGTSplit.args.outopts.cnames = 0
    pGTSplit.args.match = 'lambda r1, r2: compare(r1[0], r2[0])'
    pGTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

    pTranspose.depends = pGTSplit

    # transpose the expression file (row 1 of `ends` output)
    pTransposeExpr = pTranspose.copy()
    pTransposeExpr.depends = ends
    pTransposeExpr.input = lambda ch: ch.row_at(1)
    # NOTE(review): pTransposeExpr has dependencies yet is registered as
    # a start process, unlike pSortGT above -- confirm this is intended.
    starts.append(pTransposeExpr)

    # sort tft to split
    pSortTFT = pSortSG.copy()
    pSortTFT.input = [opts.tft]
    starts.append(pSortTFT)

    pTFTSplit = pTsvJoin.copy()
    pTFTSplit.depends = pSortTFT, pTsvSplit
    pTFTSplit.input = lambda ch1, ch2: ch2.expand().insert(0, ch1).map(list)
    pTFTSplit.args.inopts.cnames = False
    pTFTSplit.args.match = 'lambda r1, r2: compare(r1[1], r2[1])'
    pTFTSplit.args.do = 'lambda writer, r1, r2: writer.write(r1)'

    pTFTSG2MedCases.depends = pTFTSplit, pTsvSplit
    pTFTSG2MedCases.input = lambda ch1, ch2: ch2.expand().insert(0, ch1)

    # column-bind the per-chunk genotypes with the transposed expression
    pTsvCbind.depends = pTranspose, pTransposeExpr
    pTsvCbind.input = lambda ch1, ch2: ch1.cbind(ch2).map(list)
    pTsvCbind.args.fill = False
    pTsvCbind.args.fn2cname = 'function(fn, cnames) cnames'

    pMed = pMediation if med else pModeration
    pMed.depends = pTsvCbind, pTFTSG2MedCases
    pMed.args.plot = False
    pMed.args.fdr = False
    # keep every p-value so that adjustment sees all tests
    pMed.args.pval = 1.1

    pAdjust.depends = pMed
    # merge all per-chunk output files into a single job
    pAdjust.input = lambda ch: [ch.outfile.flatten()]
    pAdjust.args.method = 'BH'
    pAdjust.args.pcol = 'Pval'

    # apply pcut
    pTsv.depends = pAdjust
    pTsv.args.inopts.cnames = True
    # Format the cutoff with repr: '%f' keeps only 6 decimals, so a
    # cutoff such as 1e-8 would be rendered as 0.000000 and no row could
    # ever pass the filter.
    pTsv.args.row = 'lambda row: float(row.Pval) < %r %s' % (
        float(opts.pcut), 'and float(row.PropMed) > 0' if med else '')

    pTsv.output = 'outfile:file:%s' % Path(opts.outfile).name
    pTsv.config.export_dir = Path(opts.outfile).parent

    return starts