Exemplo n.º 1
0
def gatherCausalStats(Ddata,
                      scenario,
                      selpos=500000,
                      thinSfx='',
                      complikeSfx=None,
                      likesTableSfx='',
                      nonNanStats='ALL',
                      getio=None):
    """Save the causal-SNP rows of one scenario's complike table as a replica statistic.

    The 'normedLocal' complike table located under the scenario's snpStats
    directory is loaded, the rows whose `Pos` equals `selpos` are kept, a
    `replicaNum` column (computed as int(Chrom)) is added, and the result is
    saved as a causalStats.tsv file under the scenario's replicastats directory.

    Params:

       Ddata - root data directory
       scenario - object providing scenDir() and mutPop
       selpos - position of the causal SNP
       thinSfx - suffix appended to the 'snpStats' and 'replicastats' directory names
       complikeSfx, likesTableSfx, nonNanStats - used to build the file name suffixes
       getio - if true, do no work; return a dict describing the input file,
          the output file and a rule name suffix.
    """

    scenDir = scenario.scenDir()
    nonNanSfxs = tuple(MakeSeq(nonNanStats))

    # Input: the per-SNP complike table for this scenario.
    complikeName = AddFileSfx('complike.data/', 'normedLocal', scenario.mutPop,
                              complikeSfx, 'nonNan', *nonNanSfxs)
    complikeFN = os.path.join(Ddata, 'snpStats' + thinSfx, scenDir,
                              complikeName)

    # Output: one row per replica, holding the causal SNP's statistics.
    causalStatsName = AddFileSfx('causalStats.tsv', complikeSfx, likesTableSfx,
                                 'nonNan', *nonNanSfxs)
    causalStatsFN = os.path.join(Ddata, 'replicastats' + thinSfx, scenDir,
                                 causalStatsName)

    if getio:
        return dict(depends_on=complikeFN,
                    creates=causalStatsFN,
                    mediumRuleNameSfx=(scenDir, complikeSfx, likesTableSfx))

    complikeTable = IDotData(complikeFN)
    causalRows = complikeTable[complikeTable.Pos == selpos]
    withReplicaNum = causalRows.addComputedCols(
        newColNames='replicaNum', newColFn=lambda r: int(r.Chrom))
    withReplicaNum.save(causalStatsFN)
Exemplo n.º 2
0
def normalizeColumnsWithinGroups(inFN,
                                 cols,
                                 groupCols,
                                 outFN,
                                 groupsAreContiguous=True,
                                 getio=None):
    """Normalize the specified columns of a table within groups.

    Params:

       inFN - the input table
       cols - the columns to be normalized
       groupCols - the columns that define the groups: rows that have the same combination of values
          in the group columns, are in the same group.
       outFN - the output table
       groupsAreContiguous - if True, rows belonging to the same group must be contiguous in the table;
          if False, no such assumption is made.
       getio - if true, do no work; return a dict describing this step's inputs
          (depends_on), its output (creates) and its splitByCols setting.
    """

    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))

    # The per-group statistics file is the first output that
    # computeMeanStdWithinGroups declares for the same arguments.
    # NOTE(review): Dict() is given local variable *names*; presumably it looks
    # them up in this frame, so these locals must keep their names -- confirm.
    meansFN = GetCreates(computeMeanStdWithinGroups,
                         **Dict('inFN cols groupCols groupsAreContiguous'))[0]

    if getio:
        return dict(depends_on=(inFN, meansFN),
                    creates=outFN,
                    splitByCols={inFN: dict(keyCols=())})

    inFile = IDotData(inFN)
    means = IDotData(meansFN)

    # Normalize using the precomputed per-group statistics and save the result.
    inFile.normalizeColumnsWithinGroups_using_means(
        **Dict('cols groupCols groupsAreContiguous means')).save(outFN)
Exemplo n.º 3
0
def checkTableKey(inFN,
                  cols,
                  comparison='lt',
                  writeCheckedFile=True,
                  tsvOpts={},
                  lineFilter=None,
                  lineFilterCols=(),
                  getio=None):
    """Check that in the given table, record identifiers increase uniformly.

    Params:

       inFN - the table to check
       cols - the columns whose tuple should uniformly increase
       comparison - this comparison must be true between each record and the next.
         the comparison is the name of a routine in the operator module.
       writeCheckedFile - if true, a marker file is written once the check has passed;
         if false, no marker file is declared or written.
       tsvOpts - extra keyword options passed through when loading inFN
         (only read here, never modified)
       lineFilter - optional predicate on records; records for which it returns
         false are skipped and do not take part in the comparison
       lineFilterCols - extra columns to load so that lineFilter can look at them
       getio - if true, do no work; return a dict describing inputs and outputs

    Raises:

       AssertionError - if a checked record does not satisfy the comparison
         with the previous checked record.
    """

    cols = tuple(MakeSeq(cols))
    lineFilterCols = tuple(MakeSeq(lineFilterCols))
    checkedFN = Str('$inFN.checked_${comparison}') + Sfx(*cols)
    if getio:
        return dict(depends_on=inFN,
                    creates=checkedFN if writeCheckedFile else (),
                    attrs=dict(piperun_short=True))

    comparisonFunc = getattr(operator, comparison)
    prevRec = None
    loadCols = cols + lineFilterCols

    nskipped = 0
    nchecked = 0
    for i, r in enumerate(IDotData(inFN, ToLoad=loadCols, **tsvOpts)):
        if lineFilter and not lineFilter(r):
            nskipped += 1
            continue

        thisRec = r[cols] if IsSeq(r) else (r, )
        # Compare only once a previous *kept* record exists.  Testing the line
        # index instead would compare against None whenever the first line(s)
        # of the table were skipped by lineFilter.
        if prevRec is not None and not comparisonFunc(prevRec, thisRec):
            msg = Str(
                'at line $i of $inFN, looking at $cols: $prevRec is not $comparison $thisRec'
            )
            logging.error(msg)
            # Raise explicitly: a bare assert statement is removed under -O.
            raise AssertionError(msg)
        nchecked += 1
        prevRec = thisRec

    dbg('nchecked nskipped')
    # Write the marker only when it was declared as an output in getio mode.
    if writeCheckedFile:
        DumpFile(checkedFN, 'checked ok.')
Exemplo n.º 4
0
def computeMeanFstAndFreqDiffScores(pops,
                                    chrom,
                                    selPop,
                                    sweepDir,
                                    pop2ancFreqFN,
                                    pop2sampleSizeFN,
                                    outMeanFstFN,
                                    outFreqDiffFN,
                                    getio=None):
    """Compute the freqDiff and meanFst scores of selPop against the other pops.

    freqDiff is the derived-allele frequency in selPop minus the derived-allele
    frequency averaged over the comparison populations.  meanFst is the mean,
    over the comparison populations, of fst_onePopPair( selPop, pop ), with
    missing Fst values counted as 0.0.

    Params:

       pops - the populations; selPop is added if it is not among them
       chrom - recorded in the rule attributes
       selPop - the population tested for selection
       sweepDir - not used in the body of this function
       pop2ancFreqFN - table indexed by 'pos' with one ancestral-frequency column per pop
       pop2sampleSizeFN - table indexed by 'pop' with a sampleSize column
       outMeanFstFN, outFreqDiffFN - output files
       getio - if true, do no work; return a dict describing inputs and outputs
    """

    if selPop not in pops:
        pops = tuple(MakeSeq(pops)) + (selPop, )
    cmpPops = [pop for pop in pops if pop != selPop]

    if getio:
        return dict(depends_on=(pop2ancFreqFN, pop2sampleSizeFN),
                    creates=(outMeanFstFN, outFreqDiffFN),
                    attrs=Dict('chrom', pop=pops))

    pop2ancFreq = pd.read_table(pop2ancFreqFN, index_col='pos')
    sampleSizeTable = pd.read_table(pop2sampleSizeFN, index_col='pop')
    pop2sampleSize = sampleSizeTable.sampleSize

    dbg('pop2sampleSize')

    # freqDiff: derived frequency in selPop vs. mean derived frequency elsewhere.
    derFreq = 1.0 - pop2ancFreq[selPop]
    meanAnc = pop2ancFreq[cmpPops].mean(axis=1)
    freqDiff = derFreq - (1.0 - meanAnc)
    freqDiff.name = 'freqDiff'
    freqDiff.to_csv(outFreqDiffFN, sep='\t', header=True)

    # meanFst: pairwise Fst of selPop against each comparison pop, then averaged.
    pop2fst = {}
    for cmpPop in cmpPops:
        pairAncFreqs = np.array((pop2ancFreq[selPop], pop2ancFreq[cmpPop]))
        pairSampleSizes = (pop2sampleSize[selPop], pop2sampleSize[cmpPop])
        pop2fst[cmpPop] = fst_onePopPair(ancFreqs=pairAncFreqs,
                                         sampleSizes=pairSampleSizes)

    fstVals = pd.DataFrame(data=pop2fst, index=pop2ancFreq.index)
    fstVals.fillna(value=0.0, inplace=True)
    fstMean = fstVals.mean(axis=1)
    dbg('fstVals fstMean')
    fstMean.name = 'meanFst'
    fstMean.to_csv(outMeanFstFN, sep='\t', header=True, na_rep='NaN')
Exemplo n.º 5
0
def DefineRulesTo_normalizeColumnsWithinGroups(pr,
                                               inFN,
                                               cols,
                                               groupCols,
                                               groupsAreContiguous=True,
                                               nameSfx='',
                                               outFN=None):
    """Adds rules to create a version of a table with given columns normalized within groups.

    Params:

       pr - the object to which the rules are added
       inFN - the input table
       cols - the columns to be normalized
       groupCols - the columns that define the groups
       groupsAreContiguous - passed through to the rules that are added
       nameSfx - suffix for the names of the rules that are added
       outFN - the output table, passed unchanged to normalizeColumnsWithinGroups.
          NOTE(review): the default of None is passed through as-is; no default
          output name is derived here -- confirm callers always supply outFN.
    """

    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))

    # First the rules producing the per-group mean/std statistics ...
    # NOTE(review): Dict() is given local variable *names*; presumably it reads
    # them from this frame, so these locals must keep their names -- confirm.
    DefineRulesTo_meanStdWithinGroups(
        **Dict('pr inFN cols groupCols groupsAreContiguous nameSfx'))
    # ... then the rule that invokes the normalization itself.
    pr.addInvokeRule(
        invokeFn=normalizeColumnsWithinGroups,
        invokeArgs=Dict('inFN cols groupCols outFN groupsAreContiguous'),
        name='normalizeColumnsWithinGroups' + Sfx(nameSfx))
Exemplo n.º 6
0
def DefineRulesTo_computeMeanStd(pr, inFNs, colNum, outFN, addRuleArgs={}):
    """Add a rule that computes mean and stddev of one column across tsv files.

    The rule is a shell pipeline: strip the header line of each input file, cut
    out column number `colNum`, drop lines containing 'nan' (in any case), and
    feed the remaining values to the tblstats program.  The pipeline's output
    is saved to `outFN`.  `addRuleArgs` are extra keyword arguments for
    pr.addRule (only read here, never modified).
    """

    pipelineStages = [
        'tail -q -n +2 ' + ' '.join(MakeSeq(inFNs)),
        'cut -f %d' % colNum,
        'grep -iv nan',
        '../Operations/Ilya_Operations/tblstats',
    ]
    shellCommand = ' | '.join(pipelineStages)

    pr.addRule(commands=shellCommand,
               depends_on=inFNs,
               saveOutputTo=outFN,
               **addRuleArgs)
Exemplo n.º 7
0
def sortTableOn(inFN, outFN, keyCols, reverse=False, getio=None):
    """Write to outFN a copy of table inFN sorted on the given key column(s).

    With reverse set, the sorted rows are written in the opposite order.
    With getio set, no work is done and a dict describing the input and
    output files is returned instead.
    """

    if getio:
        return dict(depends_on=inFN, creates=outFN)

    table = IDotData(inFN).sortedOn(*MakeSeq(keyCols))
    if reverse:
        # Flip the row order by indexing with the row numbers counted downwards.
        inMemory = table.toDotData()
        lastRow = len(inMemory) - 1
        table = inMemory[range(lastRow, -1, -1)]
    table.save(outFN)
Exemplo n.º 8
0
def plotHistograms(inFN, cols, outFNs=None, getio=None, **kwargs):
    """Plot histograms of the given columns of a table into one .svg file.

    Params:

       inFN - the input table
       cols - the columns whose histograms are plotted; also used as the plot labels
       outFNs - passed to computeHistograms to determine the histogram file names
       getio - if true, do no work; return a dict describing inputs and output
       kwargs - extra keyword arguments passed through to GraphHistograms
    """

    # The histogram files to plot are whatever computeHistograms declares as
    # its outputs for the same arguments.
    # NOTE(review): Dict() is given local variable *names*; presumably it reads
    # them from this frame -- confirm before renaming any of them.
    histFNs = GetCreates(computeHistograms, **Dict('inFN cols outFNs'))
    # The plot file name is derived from inFN: extension replaced by .svg,
    # with a 'hist' suffix added.
    outFN = AddFileSfx(ReplaceFileExt(inFN, '.svg'), 'hist')

    if getio: return dict(depends_on=histFNs, creates=outFN)

    GraphHistograms(histFiles=histFNs,
                    outFile=outFN,
                    labels=tuple(MakeSeq(cols)),
                    **kwargs)
Exemplo n.º 9
0
def gatherCausalRanks(Ddata=None,
                      scenario=None,
                      selpos=500000,
                      thinSfx='',
                      complikeSfx=None,
                      likesTableSfx='',
                      nonNanStats='ALL',
                      cmsFileFN=None,
                      causalRankFN=None,
                      getio=None):
    """For each replica in one scenario, get the rank of the causal SNP by CMS score, and save as a replica statistic.

    For every replica (a group of rows sharing the same `Chrom` value), the
    row at position `selpos` is located; its rank is the number of rows in the
    same replica with a strictly higher `complike` score and finite values in
    all of the `nonNanStats` columns.  One output record
    (replicaNum, causalRank, causalScore) is written per replica that contains
    the causal SNP.

    Params:

       Ddata, scenario - used to derive default file names; required unless
          cmsFileFN is given
       selpos - position of the causal SNP
       thinSfx - suffix of the 'snpStats' directory name
       complikeSfx, likesTableSfx - file name suffixes
       nonNanStats - 'ALL' (all columns of the CMS table), one column name, or
          a sequence of column names that must be finite for a SNP to be counted
       cmsFileFN - the table of CMS scores; derived from Ddata/scenario if not given
       causalRankFN - the output file; derived from Ddata/scenario if not given.
          NOTE(review): the default is built under 'replicastats' without
          thinSfx and requires Ddata -- confirm this is intended.
       getio - if true, do no work; return a dict describing input and output
    """

    assert cmsFileFN or (Ddata and scenario)
    scenDir = scenario.scenDir() if scenario else 'unknown_scenDir'
    if not cmsFileFN:
        snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenDir)
        cmsFileFN = os.path.join(
            snpStatsDir,
            AddFileSfx('complike.data/', scenario.mutPop, complikeSfx,
                       likesTableSfx))

    if not causalRankFN:
        causalRankFN = os.path.join(
            Ddata, 'replicastats', scenDir,
            AddFileSfx('causalRank.tsv', complikeSfx, 'nonNan',
                       *MakeSeq(nonNanStats)))

    if getio:
        return dict(depends_on=cmsFileFN,
                    creates=causalRankFN,
                    mediumRuleNameSfx=(scenDir, complikeSfx))

    cmsScores = IDotData(cmsFileFN)

    # Normalize nonNanStats to a tuple of column names.  Calling .upper() on
    # the raw argument fails for a sequence of names, and iterating a single
    # name would walk over its characters.
    statCols = tuple(MakeSeq(nonNanStats))
    if len(statCols) == 1 and str(statCols[0]).upper() == 'ALL':
        statCols = cmsScores.headings

    with IDotData.openForWrite(
            causalRankFN,
            headings='replicaNum causalRank causalScore') as causalRankFile:
        # Two independent passes over each replica: one to find the causal
        # SNP, one to count the SNPs that outrank it.
        for replicaNum, cmsScores1, cmsScores2 in cmsScores.groupby(
                'Chrom', multiPass=2):
            for r1 in cmsScores1:
                if r1.Pos == selpos:
                    causalScore = r1.complike
                    numHigher = 0
                    for r2 in cmsScores2:
                        if r2.complike > causalScore and all(
                                np.isfinite(r2[c]) for c in statCols):
                            numHigher += 1
                    causalRankFile.writeRecord(int(replicaNum), numHigher,
                                               causalScore)

                # Stop scanning this replica once the causal position is
                # reached or passed.
                if r1.Pos >= selpos: break
Exemplo n.º 10
0
def computeHistograms(inFN, cols, binSizes=None, outFNs=None, getio=None):
    """Compute one histogram per requested column of the input table.

    Params:

       inFN - the input table
       cols - the column(s) to histogram
       binSizes - bin size for each column; .001 for every column if not given
       outFNs - output file for each column; if not given, each is derived from
          inFN with suffixes 'hist' and the column name, in a 'stats' subdir
       getio - if true, do no work; return a dict describing input and outputs
    """

    cols = tuple(MakeSeq(cols))

    if binSizes is None:
        binSizesHere = (.001, ) * len(cols)
    else:
        binSizesHere = tuple(MakeSeq(binSizes))

    outFNsHere = outFNs
    if outFNsHere is None:
        outFNsHere = []
        for col in cols:
            outFNsHere.append(
                AddFileSubdir('stats', AddFileSfx(inFN, 'hist', col)))

    assert len(cols) == len(binSizesHere) == len(outFNsHere)
    if getio:
        return dict(depends_on=inFN, creates=outFNsHere)
    # add histogram combiner

    histogrammers = [Histogrammer(binSize=sz) for sz in binSizesHere]
    table = IDotData(inFN)
    for histogrammer, colName, histFN in zip(histogrammers, cols, outFNsHere):
        histogrammer.addVals(table[colName])
        histogrammer.save(histFN)
Exemplo n.º 11
0
def CreateSimsParams_neutral(Ddata, suffix, inputParamsFiles, getio=None):
    """Write the neutral parameter file.

    The contents of the input parameter files are concatenated, in order, and
    written to <Ddata>/params_neutral<suffix>.  With getio set, no work is done
    and a dict describing the input and output files is returned instead.
    """

    inputParamsFiles = MakeSeq(inputParamsFiles)
    neutralParamsFile = Ddata + '/params_neutral' + suffix

    if getio:
        return dict(depends_on=inputParamsFiles, creates=neutralParamsFile)

    fileContents = map(SlurpFile, inputParamsFiles)
    neutralParams = reduce(concat, fileContents)
    DumpFile(neutralParamsFile, neutralParams)
Exemplo n.º 12
0
def computeSumsWithinGroups(inFN,
                            cols,
                            groupCols,
                            groupsAreContiguous=True,
                            outFN=None,
                            getio=None):
    """For a tsv file, compute sums, sumsquares and counts for each of the given columns within groups
  defined by groupCols.

  Params:

     inFN - the input table
     cols - the column(s) to summarize
     groupCols - the column(s) defining the groups
     groupsAreContiguous - passed through to the summarizing routine
     outFN - the output table; if None, a name is derived from inFN with the suffix
       'sums' followed by the column and group column names, in a 'stats' subdir
     getio - if true, do no work; return a dict describing input, output,
       the splitByCols setting and the combiner for the output

  >>> z = IDotData( names = ( 'a', 'b' ), Records = ( ( 1, 2 ), ( 1, 3 ), ( 2, 4 ), ( 2, 5 ) ) )
  >>> computeSumsWithinGroups( inFN = z, cols = 'b', groupCols = 'a', outFN = sys.stdout )
  ... # doctest: +NORMALIZE_WHITESPACE
  a	b_count	b_sum	b_sumSq	b_numNaN
  1	2	5.0	13.0	0
  2	2	9.0	41.0	0

  """

    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))
    if outFN is None:
        outFN = AddFileSubdir('stats',
                              AddFileSfx(inFN, 'sums', *(cols + groupCols)))

    # Merges several partial summary tables into one; registered below as the
    # combiner for outFN.
    def combiner(inFNs, outFN):
        IDotData.mergeColumnSummaries(iDotDatas=inFNs,
                                      cols=cols,
                                      groupCols=groupCols).save(outFN)

    if getio:
        return dict(depends_on=inFN,
                    creates=outFN,
                    splitByCols={inFN: dict(keyCols=())},
                    combiner={outFN: combiner})

    # NOTE(review): Dict() is given local variable *names*; presumably it reads
    # them from this frame -- confirm before renaming any of them.
    IDotData(inFN).summarizeColumnsWithinGroups(
        **Dict('cols groupCols groupsAreContiguous')).save(outFN)
Exemplo n.º 13
0
def computeMeanStd_binned_tsvs(inFNs,
                               valCol,
                               binCol,
                               binMin,
                               binMax,
                               binStep,
                               outFN,
                               getio=None):
    """Compute binned stats for a set of tables

    Params:

       inFNs - the input table(s)
       valCol - the column holding the values
       binCol - the column holding the quantity by which rows are binned
       binMin, binMax, binStep - bin parameters, passed to computeMeanStd_binned
       outFN - the output table, written tab-separated with index label 'binId'
       getio - if true, do no work; return a dict describing inputs and output
    """

    if getio:
        return dict(depends_on=inFNs,
                    creates=outFN,
                    uses=computeMeanStd_binned)

    # Each input table is read lazily (itertools.imap), loading only the two
    # needed columns and dropping rows with missing values.
    # NOTE(review): Dict() is given local variable *names*; presumably it reads
    # them from this frame -- confirm before renaming any of them.
    computeMeanStd_binned(
        inDatas=itertools.imap(
            lambda f: pd.read_table(f, usecols=(valCol, binCol)).dropna(),
            MakeSeq(inFNs)),
        **Dict('valCol binCol binMin binMax binStep')).to_csv(
            outFN, sep='\t', index_label='binId', na_rep='NaN')
Exemplo n.º 14
0
def joinStats(snpInfoFN, statLikesFNs, likesRatioFN, outFN, getio=None):
    """Join stats into one file

    Params:

       snpInfoFN - SNP info table; its 'SNP pos (bases)' column becomes the index 'pos'
       statLikesFNs - one table, or a sequence of tables, each with a 'pos' column
       likesRatioFN - table with a 'pos' column
       outFN - output table: the outer join of all the above on position,
          written tab-separated with missing values as 'NaN'
       getio - if true, do no work; return a dict describing inputs and output
    """

    # Normalize once, so that a single file name is handled the same way when
    # declaring dependencies and when reading the tables (iterating a bare
    # string would walk over its characters).
    statLikesFNs = tuple(MakeSeq(statLikesFNs))

    if getio:
        return dict(depends_on=(snpInfoFN, likesRatioFN) + statLikesFNs,
                    creates=outFN)

    snpInfo = pd.read_table(snpInfoFN, index_col='SNP pos (bases)')
    snpInfo.index.rename('pos', inplace=True)

    statLikes = [
        pd.read_table(statLikeFN, index_col='pos')
        for statLikeFN in statLikesFNs
    ]
    likesRatio = pd.read_table(likesRatioFN, index_col='pos')

    result = snpInfo.join(statLikes + [likesRatio], how='outer')
    result.info()
    dbg('result.describe()')

    result.to_csv(outFN, sep='\t', na_rep='NaN', header=True)
Exemplo n.º 15
0
def DefineRulesTo_runSims(pr,
                          mutAges,
                          mutPops,
                          mutFreqs,
                          nreplicas,
                          allPops=None,
                          Ddata='../Data/Ilya_Data/sim/sfs/working/pardis2',
                          simsOut='simsOut',
                          suffix='',
                          shortSimTime=True,
                          DdataSeeds='',
                          useGenMap=None,
                          includeNeutral=True,
                          withGeneConvBug=False,
                          withNewCosi=False,
                          withCosi=None,
                          DdataMimic=None):
    """Add, for each scenario and each replica, the rule that runs one simulation.

    Scenarios are obtained from GetScenarios for the given mutation ages,
    populations and frequencies (plus the neutral scenario if includeNeutral).
    For every (scenario, replica) pair one 'RunOneSim' rule is added to `pr`;
    the rule invokes runOneSim.pl.

    Params:

       pr - the object to which rules are added
       mutAges, mutPops, mutFreqs - values defining the selection scenarios
       nreplicas - number of replicas per scenario
       allPops - populations for which hap/pos output files are declared;
          defaults to mutPops
       Ddata - root data directory
       simsOut, suffix - the simulations directory is Ddata/<simsOut><suffix>
       shortSimTime - if true, rules are marked with the piperun_short attribute
       DdataSeeds - if given, per-replica seeds and mutation rate are read from
          replicastats/<scenDir>/simSeeds.tsv under this directory
       useGenMap - if given, a genetic map file passed to the simulator
       withGeneConvBug, withNewCosi, withCosi - add the corresponding
          command-line options
       DdataMimic - if given, the .model and .mut files of the corresponding
          replica under this directory are passed to the simulator.
          Mutually exclusive with DdataSeeds.
    """

    assert not (DdataSeeds and DdataMimic)

    mutPops = MakeSeq(mutPops)
    mutAges = MakeSeq(mutAges)
    mutFreqs = MakeSeq(mutFreqs)

    if allPops is None: allPops = mutPops

    Dsims = Ddata + '/' + simsOut + suffix

    # NOTE(review): Dict() is given local variable *names*; presumably it reads
    # them from this frame, so those locals keep their original names here.
    for scen in GetScenarios(
            **Dict('mutAges mutPops mutFreqs includeNeutral')):
        if DdataSeeds:
            seeds = IDotData(
                os.path.join(DdataSeeds, 'replicastats', scen.scenDir(),
                             'simSeeds.tsv'))

        for replicaNum, seedsLine in zip(
                range(nreplicas),
                seeds if DdataSeeds else itertools.repeat(None, nreplicas)):

            assert not DdataSeeds or seedsLine.replicaNum == replicaNum

            pfx = os.path.join(Dsims, scen.scenDir(),
                               '%d_%s' % (replicaNum, scen.scenName()))
            recombDir = '../Data/Ilya_Data/sim/sfs/working/pardis2'

            # The rest of this function calls scen.isNeutral(); one call site
            # used a different spelling (is_neutral).  Use one accessor
            # consistently.
            scenIsNeutral = scen.isNeutral()

            attrs = Dict('replicaNum', scenDir=scen.scenDir())
            if not scenIsNeutral:
                attrs.update(mutAge=scen.mutAge,
                             mutPop=scen.mutPop,
                             mutFreq=scen.mutFreq)
            else:
                attrs.update(mutAge=0, mutPop=0, mutFreq=0)
            if shortSimTime: attrs['piperun_short'] = True

            mutAge = '%dky' % (0 if scenIsNeutral else scen.mutAge)

            useGenMapFile = os.path.join(
                DdataMimic, 'simsOut', scen.scenDir(), '%d_%s.model' %
                (replicaNum, scen.scenName())) if DdataMimic else ''
            useMutRateFile = os.path.join(
                DdataMimic, 'simsOut', scen.scenDir(), '%d_%s.mut' %
                (replicaNum, scen.scenName())) if DdataMimic else ''

            # --- files the rule creates: pfx + each extension ---
            targetExts = ['.model', '.mut', '.cosiParams']
            if not useGenMap:
                targetExts = targetExts + ['.recombParams']
            targetExts = targetExts + [
                '.%s-%d' % (hapOrPos, pop) for hapOrPos in ('hap', 'pos')
                for pop in allPops
            ]
            if not (withNewCosi or withCosi):
                treeinfoKinds = ('regions.tsv', 'mutlist.tsv', 'nodes.dat')
                if not scenIsNeutral:
                    treeinfoKinds = treeinfoKinds + ('sweepinfo.tsv', )
                # NOTE(review): these entries are already full paths under
                # Dsims, yet pfx is still prepended to them below, exactly as
                # in the original expression.  That looks unintended --
                # confirm what runOneSim.pl writes before changing it.
                targetExts = targetExts + [
                    os.path.join(
                        Dsims, scen.scenDir(), 'treeinfo', '%d_%s.%s' %
                        (replicaNum, scen.scenName(), which))
                    for which in treeinfoKinds
                ]
            targets = [pfx + ext for ext in targetExts]

            # --- files the rule depends on ---
            if scenIsNeutral:
                paramsRelFN = 'params_neutral' + suffix
            else:
                paramsRelFN = 'params%s/%s/params_%s' % (suffix, mutAge,
                                                         scen.scenName())
            sources = [Ddata + '/' + paramsRelFN]
            if useGenMap:
                sources = sources + [useGenMap]
            else:
                sources = sources + [
                    recombDir + '/recParams_bestfit_generic',
                    recombDir + '/autosomes_decode.distr'
                ]
            if DdataMimic:
                sources = sources + [useGenMapFile, useMutRateFile]

            # --- the command line: script with options, then positional args ---
            simCmd = ('perl ../Operations/Ilya_Operations/sim/sfs/working/pardis2/'
                      'runOneSim.pl')
            if DdataSeeds:
                simCmd += (' --coalSeed %ld --recombSeed %ld --useMutRate %s' %
                           (long(seedsLine.coalescentSeed),
                            long(seedsLine.recombSeed),
                            seedsLine.GetStrItem('mutRate')))
            if useGenMap:
                simCmd += ' --useGenMap ' + useGenMap
            if withGeneConvBug:
                simCmd += ' --withGeneConversionBug'
            if withNewCosi:
                simCmd += ' --withNewCosi'
            if withCosi:
                simCmd += ' --withCosi ' + withCosi
            if DdataMimic:
                simCmd += (' --useGenMap ' + useGenMapFile +
                           ' --useMutRateFile ' + useMutRateFile)
            commands = ' '.join((simCmd, scen.scenName(), mutAge,
                                 str(replicaNum), Ddata, Dsims, suffix))

            pr.addRule(targets=targets,
                       sources=sources,
                       commands=commands,
                       name='RunOneSim',
                       attrs=attrs,
                       comment='Adding simulation',
                       mediumRuleNameSfx=(scen.scenName(), mutAge,
                                          replicaNum))
Exemplo n.º 16
0
def DefineRulesTo_fastCMS(pr,
                          pops,
                          chroms,
                          selPop,
                          sweepDir,
                          cmsDir,
                          genomeBuild='hg19'):
    """Define rules to do fast CMS computation.

    Params:

       pr - the PipeRun object to which to add rules

       selPop - testing selection in which pop?
       pops - comparing selPop to which pops?
       chroms - the chromosomes for which rules are defined
       sweepDir - the sweep directory
       cmsDir - the directory under which CMS stats go
       genomeBuild - key into genomeBuild2genMapSfx, selecting the genetic map
          file suffix

    The rules are added in these stages: extract per-pop SNP info for each
    chromosome; gather/compute the raw scores of each statistic per chromosome;
    compute the dihh normalization statistics over all chromosomes and the
    StdDiff scores per chromosome; for each statistic, normalize it (unless it
    is in CMSBins.nonNormedStats) and compute its likelihood ratio per
    chromosome; finally combine the per-statistic likelihood ratios per chromosome.

    NOTE(review): this function passes local variable *names* to Dict(),
    pr.settingAttrs() and dbg(), and formats strings with locals(); presumably
    those look up the locals of this frame, so local names here must not be
    changed without checking those helpers.
    """

    # From here on `pops` always includes selPop.
    pops = list(MakeSeq(pops))
    if selPop not in pops: pops.append(selPop)

    # selPop was already added to pops above, so the check below cannot add
    # anything; allPops is the tuple form of pops.
    allPops = tuple(MakeSeq(pops))
    if selPop not in allPops: allPops += (selPop, )
    # cmpPops is not referenced again by name in this function.
    cmpPops = [pop for pop in allPops if pop != selPop]

    # chrom -> { stat name -> file of raw scores of that stat on that chrom }
    rawScoresFN = {}

    genMapSfx = genomeBuild2genMapSfx[genomeBuild]
    # Stage 1: one rule per (pop, chrom) that extracts allele frequencies
    # into a per-pop SNP info table.
    for pop in allPops:
        for chrom in chroms:
            with pr.settingAttrs('pop chrom'):
                snpInfoFN = os.path.join(
                    sweepDir,
                    'analysis/chr%(chrom)s/snps_%(pop)s.tsv' % locals())
                projDir = os.path.join(sweepDir,
                                       'data/chr%(chrom)s' % locals())
                ancestralImportedFN = os.path.join(projDir,
                                                   'ancestral.tsv.imported')
                genotypesImportedFN = os.path.join(
                    projDir,
                    'genotypes_chr%(chrom)s_%(pop)s_r21_nr_fwd_phased_all.imported'
                    % locals())
                genMapImportedFN = os.path.join(
                    projDir,
                    'genetic_map_chr%(chrom)s_%(genMapSfx)s.txt.imported' %
                    locals())
                # NOTE(review): commandsOld is passed alongside commands;
                # presumably it only records the previous command line -- confirm.
                pr.addRule(
                    name='extractSnpInfo',
                    commands=
                    'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    % locals(),
                    commandsOld=
                    'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep/target/sweep-1.0-SNAPSHOT-jar-with-dependencies.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    % locals(),
                    depends_on=(ancestralImportedFN, genotypesImportedFN,
                                genMapImportedFN),
                    creates=snpInfoFN)

    # chrom -> file of dihh scores for that chrom
    chr2dihhFN = {}

    # Stage 2: per chromosome, rules that gather or compute the raw scores.
    for chrom in chroms:
        with pr.settingAttrs('chrom'):

            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            xpopScoresFN = os.path.join(
                chromDir, AddFileSfx('max_xpop.tsv', chrom_s, selPop, pops))

            pr.addInvokeRule(invokeFn=gatherXPOPscores,
                             invokeArgs=Dict('pops chrom selPop sweepDir',
                                             outFN=xpopScoresFN),
                             attrs=dict(pop=allPops,
                                        stat='max_xpop',
                                        piperun_short=True))

            ihsFN = getFN_ihs_signif(**Dict('sweepDir chrom', pop=selPop))

            ihsScoresFN = os.path.join(
                chromDir, AddFileSfx('iHS.tsv', chrom_s, selPop, pops))
            dihhScoresFN = os.path.join(
                chromDir, AddFileSfx('dihh.tsv', chrom_s, selPop, pops))

            chr2dihhFN[chrom] = dihhScoresFN

            pop2ancFreqFN = os.path.join(
                cmsDir, chrom_s, AddFileSfx('pop2ancFreq.tsv', chrom_s, pops))
            pop2sampleSizeFN = os.path.join(
                cmsDir, chrom_s, AddFileSfx('pop2sampleSize.tsv', chrom_s,
                                            pops))

            # pop -> the SNP info table written by the extractSnpInfo rule above
            pop2snpInfoFN = dict([(pop,
                                   os.path.join(sweepDir, 'analysis',
                                                'chr%(chrom)s' % locals(),
                                                'snps_%(pop)s.tsv' % locals()))
                                  for pop in pops])

            pr.addInvokeRule(
                invokeFn=gather_snp_info,
                invokeArgs=Dict(
                    'pops pop2snpInfoFN pop2ancFreqFN pop2sampleSizeFN'))

            pr.addInvokeRule(
                invokeFn=gather_iHS_scores,
                invokeArgs=Dict(
                    'chrom selPop ihsFN pop2ancFreqFN',
                    #                                                 snpInfoFN = pop2snpInfoFN[ selPop ],
                    ihsOutFN=ihsScoresFN,
                    dihhOutFN=dihhScoresFN),
                attrs=dict(pop=selPop,
                           stat=('iHS', 'StdDiff'),
                           piperun_short=True))

            freqDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('freqDiff.tsv', chrom_s, selPop, pops))
            meanFstScoresFN = os.path.join(
                chromDir, AddFileSfx('meanFst.tsv', chrom_s, selPop, pops))

            pr.addInvokeRule(
                invokeFn=computeMeanFstAndFreqDiffScores,
                invokeArgs=Dict(
                    'chrom selPop sweepDir pops pop2ancFreqFN pop2sampleSizeFN',
                    outMeanFstFN=meanFstScoresFN,
                    outFreqDiffFN=freqDiffScoresFN),
                attrs=dict(pop=allPops,
                           stat=('freqDiff', 'meanFst'),
                           piperun_short=True))

            # The StdDiff file is only named here; the rule creating it is
            # added in the per-chromosome loop further below.
            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))

            rawScoresFN[chrom] = dict(iHS=ihsScoresFN,
                                      StdDiff=StdDiffScoresFN,
                                      meanFst=meanFstScoresFN,
                                      freqDiff=freqDiffScoresFN,
                                      max_xpop=xpopScoresFN)

        # end: with pr.settingAttrs( 'chrom' )
    # end: for chrom in chroms

    #    ihhStdFN = os.path.join( cmsDir, 'dihhstd.tsv' )

    # Stage 3: dihh normalization statistics over all chromosomes, then the
    # StdDiff scores of each chromosome computed from them.
    dihhGlobalStdFN = os.path.join(
        cmsDir, AddFileSfx('dihh_global_std.tsv', selPop, pops))
    dihhBinMeansFN = os.path.join(
        cmsDir, AddFileSfx('dihh_bin_means.tsv', selPop, pops))

    pr.addInvokeRule(invokeFn=normalizeByFreq_getMeanStd_tsv,
                     invokeArgs=dict(
                         iHHDiffFNs=[chr2dihhFN[k] for k in chroms],
                         globalStatFN=dihhGlobalStdFN,
                         binsStatFN=dihhBinMeansFN),
                     name='compute_dihh_meanstd')

    # pr.addInvokeRule( invokeFn = computeMeanStd_binned_tsvs,
    #                   invokeArgs = dict( inFNs = chr2dihhFN.values(), valCol = 'iHHDiff',
    #                                      binCol = 'normingFreqs', binMin = 0.05, binMax = 1.05, binStep = .05,
    #                                      outFN = ihhStdFN ),
    #                   name = 'compute_dihh_std' )

    for chrom in chroms:
        with pr.settingAttrs('chrom'):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            # Same file name as the StdDiff entry recorded in rawScoresFN above.
            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))
            dbg('chrom chr2dihhFN[chrom]')
            pr.addInvokeRule(invokeFn=normalizeByFreq_compute_normed_tsv,
                             invokeArgs=dict(iHHDiffFN=chr2dihhFN[chrom],
                                             globalStatFN=dihhGlobalStdFN,
                                             binsStatFN=dihhBinMeansFN,
                                             StdDiffFN=StdDiffScoresFN))

    # ( chrom, stat ) -> file of likelihood ratios of that stat on that chrom
    statLikesRatioFNs = {}

    # Stage 4: per statistic, normalize (where applicable) and compute the
    # likelihood ratio on each chromosome.
    for stat in CMSBins.CMSstats:
        with pr.settingAttrs(
                stat=stat,
                pop=(selPop, ) if stat in ('iHS', 'StdDiff') else allPops,
                piperun_short=True):
            if stat not in CMSBins.nonNormedStats:
                # Mean and std of this stat are computed over all chromosomes.
                rawFNs = [rawScoresFN[chrom][stat] for chrom in chroms]
                meanStdFN = os.path.join(
                    cmsDir, AddFileSfx('meanStd.tsv', stat, selPop, pops))

                # DefineRulesTo_computeMeanStd( pr, inFNs = rawFNs, colNum = 1,
                #                               outFN = meanStdFN,
                #                               addRuleArgs = \
                #                               dict( name = 'computeMeanStd_for_stat',
                #                                     attrs = dict( chrom = chroms ) ) )

                #                meanStdBzFN = os.path.join( cmsDir, stat + '_meanStdForStat.tsv' )
                pr.addInvokeRule(invokeFn=computeMeanStd,
                                 invokeArgs=dict(inFNs=rawFNs,
                                                 colName=stat,
                                                 outFN=meanStdFN))

            # end: if stat not in CMSBins.nonNormedStats

            for chrom in chroms:
                with pr.settingAttrs('chrom'):
                    statFN = rawScoresFN[chrom][stat]

                    if stat not in CMSBins.nonNormedStats:
                        normedFN = AddFileSfx(statFN, 'normed')

                        DefineRulesTo_normalizeOneColumn(
                            pr,
                            inFN=statFN,
                            meanStdFN=meanStdFN,
                            colName=stat,
                            outFN=normedFN,
                            addRuleArgs=dict(attrs=Dict('chrom')))
                        # The likelihood ratio is computed from the normalized values.
                        statFN = normedFN

                    bins_beg = CMSBins.stat_start[stat]
                    bins_end = CMSBins.stat_end[stat]
                    bins_n = CMSBins.stat_nbin[stat]

                    # The output name is derived from the raw (un-normalized)
                    # score file name.
                    statLikesRatioFN = AddFileSfx(rawScoresFN[chrom][stat],
                                                  'likesRatio')
                    statLikesRatioFNs[(chrom, stat)] = statLikesRatioFN

                    pr.addInvokeRule(
                        invokeFn=computeLikeRatioForStat,
                        invokeArgs=dict(
                            stat=stat,
                            statValsFN=statFN,
                            hitLikesFN=
                            '../Data/Common_Data/sim/likes/hitsLikes_toneutFixed_1.tsv',
                            missLikesFN=
                            '../Data/Common_Data/sim/likes/missLikes_toneutFixed_1.tsv',
                            stat_start=bins_beg,
                            stat_end=bins_end,
                            stat_nbin=bins_n,
                            statLikesRatioFN=statLikesRatioFN))

                # end: with pr.settingAttrs( 'chrom' )
            # end: for chrom in chroms
        # end: with pr.settingAttrs( stat = stat, piperun_short = True )
    # end: for stat in CMSBins.CMSstats

    # Stage 5: per chromosome, combine the likelihood ratios of all statistics.
    for chrom in chroms:
        with pr.settingAttrs(chrom=chrom, stat=CMSBins.CMSstats):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            likesRatioFN = os.path.join(
                chromDir,
                AddFileSfx('likesRatio.tsv', CMSBins.CMSstats, selPop, pops))
            pr.addInvokeRule(invokeFn=addLikesRatios,
                             invokeArgs=dict(
                                 inFNs=[
                                     statLikesRatioFNs[(chrom, stat)]
                                     for stat in CMSBins.CMSstats
                                 ],
                                 colNames=[
                                     colName + 'likeRatio'
                                     for colName in CMSBins.CMSstats
                                 ],
                                 outFN=likesRatioFN))
Exemplo n.º 17
0
def runCmdParallelized(commands,
                       depends_on,
                       creates,
                       comment,
                       splitFunc,
                       joinFunc,
                       saveOutputTo=None,
                       splitFN=None,
                       joinFN=None,
                       name=None,
                       mediumRuleName=None,
                       getio=None):
    """Run the specified command, using parallelization.

    One input file is split into chunks by a sub-pipeline, the command(s) are
    run on each chunk by a second sub-pipeline, and the per-chunk outputs are
    handed to joinFunc.

    Params:

       commands - the command(s) to run; at least one must mention splitFN
       depends_on - the input file(s) of the command
       creates - the output file(s) of the command; None means none
       comment, name, mediumRuleName - returned as part of the getio description
       splitFunc - passed to doSplit to split splitFN into chunks
       joinFunc - called as joinFunc( inFNs = <chunk outputs>, outFN = ... )
       saveOutputTo - if given, each chunk's command output is saved to a
          per-chunk file rather than to a file named in the command
       splitFN - the file to split; defaults to the first of depends_on
       joinFN - the output file to join into; defaults to the first of creates
       getio - if true, do no work; return a dict describing this step
    """
    from Operations.Ilya_Operations.PipeRun.python.PipeRun import PipeRun

    dbg('"IN_RUNCMDPAR_EEEEEE" depends_on creates saveOutputTo')

    if creates is None: creates = ()

    commands = MakeSeq(commands)
    depends_on = MakeSeq(depends_on)
    creates = MakeSeq(creates)

    # gio is only referenced by the dbg() call on the next line.
    # NOTE(review): Dict() and dbg() are given local variable *names*;
    # presumably they read them from this frame -- confirm before renaming.
    gio = Dict('depends_on creates comment name mediumRuleName')
    dbg('gio')
    if getio:
        return Dict(
            'depends_on creates comment name mediumRuleName saveOutputTo',
            uses=(splitFunc, joinFunc))

    splitFN = splitFN or list(MakeSeq(depends_on))[0]
    # When output is captured via saveOutputTo there is no output file name in
    # the command to substitute, so joinFN is set to a random string.
    joinFN = RandomString(12) if saveOutputTo else (
        joinFN or list(MakeSeq(creates))[0])

    # Per-chunk commands are produced by textual substitution of these file
    # names, so the names must occur in at least one command.
    assert any([splitFN in command for command in commands])
    assert saveOutputTo or any([joinFN in command for command in commands])

    logging.info('calling ' + str(splitFunc) + ' to split ' + splitFN)
    # Chunks go under a per-user directory that mirrors the absolute path of
    # splitFN (leading '/' removed).
    outDir = os.path.join('/broad/hptmp', getpass.getuser(), 'par',
                          os.path.abspath(splitFN)[1:])

    # Sub-pipeline 1: split the input file.
    pr = PipeRun(name='splitting', descr='splitting')
    r = pr.addInvokeRule(invokeFn=doSplit,
                         invokeArgs=Dict('splitFunc splitFN outDir'))
    pr.runSubPipeline()

    # The first file created by the split rule lists the chunk file names,
    # one per line.
    chunkFNs = SlurpFileLines(r.creates[0])

    logging.info('finished running ' + str(splitFunc) + ' to split ' + splitFN)
    dbg('"CHUNKS_ARE" chunkFNs')

    # Sub-pipeline 2: one rule per chunk, with the chunk file names substituted
    # for splitFN and joinFN in the commands, inputs and outputs.
    pr = PipeRun(name='parallelizing', descr='parallelizing')
    chunkOutFNs = []
    for chunkFN in chunkFNs:
        chunkOutFN = AddFileSfx(chunkFN, 'out')
        chunkOutFNs.append(chunkOutFN)

        # Debug output only.
        for command in commands:
            dbg('splitFN chunkFN chunkOutFN command command.replace(splitFN,chunkFN)'
                )

        pr.addRule(
            commands=[
                command.replace(splitFN, chunkFN).replace(joinFN, chunkOutFN)
                for command in commands
            ],
            depends_on=[f if f != splitFN else chunkFN for f in depends_on],
            creates=[f if f != joinFN else chunkOutFN for f in creates],
            saveOutputTo=None if saveOutputTo is None else chunkOutFN)

    pr.runSubPipeline()

    # NOTE(review): when saveOutputTo is set, joinFunc is called with
    # outFN=None rather than with saveOutputTo -- confirm how joinFunc treats
    # a None output name.
    joinFunc(inFNs=chunkOutFNs, outFN=None if saveOutputTo else joinFN)