Пример #1
0
def normalizeColumnsWithinGroups(inFN,
                                 cols,
                                 groupCols,
                                 outFN,
                                 groupsAreContiguous=True,
                                 getio=None):
    """Save a copy of a table with the given columns normalized within groups.

    The means table used for the normalization is the first file that
    computeMeanStdWithinGroups reports (through GetCreates) for the same
    inFN / cols / groupCols / groupsAreContiguous.

    Params:

       inFN - the input table
       cols - the column or columns to be normalized
       groupCols - the column or columns that define the groups: rows with
          the same combination of values in these columns are in one group.
       outFN - the output table
       groupsAreContiguous - if True, rows belonging to the same group must
          be contiguous in the table; if False, no such assumption is made.
       getio - if true, only return the description of inputs and outputs
          (depends_on / creates / splitByCols) without doing any work.
    """

    cols, groupCols = tuple(MakeSeq(cols)), tuple(MakeSeq(groupCols))

    # NOTE(review): Dict() is given variable *names*; it presumably looks the
    # values up among this function's locals, so those names must not change.
    meanStdArgs = Dict('inFN cols groupCols groupsAreContiguous')
    meansFN = GetCreates(computeMeanStdWithinGroups, **meanStdArgs)[0]

    if getio:
        return dict(depends_on=(inFN, meansFN),
                    creates=outFN,
                    splitByCols={inFN: dict(keyCols=())})

    table = IDotData(inFN)
    means = IDotData(meansFN)

    normalizeArgs = Dict('cols groupCols groupsAreContiguous means')
    normalized = table.normalizeColumnsWithinGroups_using_means(
        **normalizeArgs)
    normalized.save(outFN)
Пример #2
0
def DefineRulesTo_RunSimsOnly(
        pr,
        mutAges=AllAges,
        mutPops=AllPops,
        mutFreqs=AllFreqs,
        nreplicas=100,
        Ddata='../Data/Shari_Data_3/sim',
        simsOut='simsOut5',
        suffix='',
        inputParamsFiles='../Data/Ilya_Data/sim2/neutralParams.txt',
        DdataSeeds='',
        useGenMap=None,
        scen2alternateNeutralParams={},
        withGeneConvBug=False,
        withNewCosi=False,
        withCosi=None,
        DdataMimic=None):
    """Define rules for running simulations (without any later analysis).

    Two groups of rules are added to pr, in this order: the rules from
    DefineRulesTo_CreateSimulationParams, then the rules from
    DefineRulesTo_runSims.  All other arguments are forwarded by name to
    whichever of the two accepts them.

    Parameters:

       pr - the pipeline the rules are added to.
       mutAges, mutPops, mutFreqs - parameters defining the selection scenario.

    """

    # NOTE(review): Dict() receives parameter names; it presumably reads the
    # values from this function's locals, so the names must stay as they are.
    paramRuleArgs = Dict(
        'pr mutAges mutPops mutFreqs Ddata suffix inputParamsFiles '
        'scen2alternateNeutralParams')
    DefineRulesTo_CreateSimulationParams(**paramRuleArgs)

    simRuleArgs = Dict(
        'pr mutAges mutPops mutFreqs nreplicas Ddata simsOut suffix DdataSeeds useGenMap withGeneConvBug '
        'withNewCosi withCosi DdataMimic')
    DefineRulesTo_runSims(**simRuleArgs)
Пример #3
0
def DefineRulesTo_CreateSimulationParams(
        pr,
        mutAges,
        mutPops,
        mutFreqs,
        Ddata='../Data/Ilya_Data/sim/sfs/working/pardis2',
        suffix='',
        inputParamsFiles=None,
        scen2alternateNeutralParams={}):
    """Create simulation parameters for cross( mutAges, mutPops, mutFreqs ).

    Adds to pr one rule invoking CreateSimsParams_neutral, followed by one
    rule invoking CreateSimsParams_selection for every combination of
    ( mutAge, mutPop, mutFreq ).

    Parameters:

       pr - the pipeline the rules are added to.
       mutAges, mutPops, mutFreqs - iterated in nested loops (ages outermost),
          so each should be re-iterable.
       Ddata - root data directory.
       suffix - suffix inserted into the default parameter file names.
       inputParamsFiles - input parameter files; when None, defaults to
          Ddata/simParamsBef<suffix>.txt and Ddata/simParamsAft<suffix>.txt.
       scen2alternateNeutralParams - forwarded to CreateSimsParams_selection.
    """

    dbg('"YYYYYYYYYYYYY" inputParamsFiles')
    # Identity test: only a missing argument triggers the default (an `==`
    # comparison would misbehave for array-like arguments).
    if inputParamsFiles is None:
        # The tuple is parenthesized so the comprehension parses on Python 3
        # as well as Python 2.
        inputParamsFiles = [
            Ddata + '/simParams' + befAft + suffix + '.txt'
            for befAft in ('Bef', 'Aft')
        ]
    dbg('"ZZZZZZZZZZZZZ" inputParamsFiles')

    pr.addInvokeRule(invokeFn=CreateSimsParams_neutral,
                     invokeArgs=Dict('Ddata suffix inputParamsFiles'))
    # NOTE(review): Dict() below names the loop variables, so mutAge / mutPop /
    # mutFreq must remain plain locals of this function.
    for mutAge in mutAges:
        for mutPop in mutPops:
            for mutFreq in mutFreqs:
                pr.addInvokeRule(
                    invokeFn=CreateSimsParams_selection,
                    invokeArgs=dict(
                        mutAges=(mutAge, ),
                        mutPops=(mutPop, ),
                        mutFreqs=(mutFreq, ),
                        **Dict('Ddata suffix scen2alternateNeutralParams')),
                    mediumRuleNameSfx=(mutAge, mutPop, mutFreq),
                    attrs=Dict('mutAge mutPop mutFreq'))
Пример #4
0
def DefineRulesTo_RunSimsAndSweep(pr,
                                  Ddata,
                                  simsOut='simsOut',
                                  mutAges=AllAges,
                                  mutPops=AllPops,
                                  mutFreqs=AllFreqs,
                                  nreplicas=100,
                                  suffix='',
                                  thinning='',
                                  thinExt='',
                                  tests=('lrh', 'ihs', 'xpop'),
                                  doRunSims=True,
                                  doRunThinning=True,
                                  doRunSweep=True,
                                  inputParamsFiles=[],
                                  acceptExistingSimConfigFiles=False,
                                  pop2name=pop2name,
                                  runImportsLocally=True,
                                  doOnlyStages=None,
                                  DdataSeeds='',
                                  setOptions=(),
                                  appendOptions=(),
                                  scen2alternateNeutralParams={},
                                  useGenMap=None,
                                  powerSfx='',
                                  withGeneConvBug=False,
                                  withNewCosi=False,
                                  withCosi=None,
                                  DdataMimic=None):
    """Define rules for running simulations and doing Sweep analyses of them.

    Up to three stages are defined, in this order, each one optional:

       doRunSims      -> DefineRulesTo_RunSimsOnly
       doRunThinning  -> DefineRulesTo_DoThinning
       doRunSweep     -> DefineRulesTo_RunSweepOnSims

    The remaining arguments are forwarded by name to the stages that take them.

    Parameters:

       pr - the pipeline the rules are added to.
       mutAges, mutPops, mutFreqs - parameters defining the selection scenario.

    """

    # NOTE(review): Dict() receives parameter names and presumably reads their
    # values from this function's locals; keep the parameter names intact.
    dbg('"Running_doRunSims"')
    if doRunSims:
        runSimsArgs = Dict(
            'pr mutAges mutPops mutFreqs nreplicas Ddata simsOut suffix '
            'inputParamsFiles DdataSeeds useGenMap scen2alternateNeutralParams '
            'withGeneConvBug withNewCosi withCosi DdataMimic')
        DefineRulesTo_RunSimsOnly(**runSimsArgs)

    dbg('"Running_doThinning"')
    if doRunThinning:
        thinningArgs = Dict(
            'pr mutAges mutPops mutFreqs nreplicas Ddata simsOut thinning '
            'thinExt suffix')
        DefineRulesTo_DoThinning(**thinningArgs)

    dbg('"Running_doRunSweep"')
    if doRunSweep:
        sweepArgs = Dict(
            'pr mutAges mutPops mutFreqs nreplicas Ddata simsOut thinning '
            'suffix tests inputParamsFiles thinExt setOptions appendOptions '
            'acceptExistingSimConfigFiles pop2name runImportsLocally '
            'doOnlyStages powerSfx')
        DefineRulesTo_RunSweepOnSims(**sweepArgs)

    dbg('"FINISHED_runSimsAndSweep"')
Пример #5
0
def gatherXPOPscores(pops, chrom, selPop, sweepDir, outFN, getio=None):
    """Gather xpop scores into a convenient form.

    For every population in pops other than selPop, the xpop significance
    file of the (selPop, pop) comparison is loaded; at each SNP position the
    larger of the left/right 'AllEHH logratio Deviation' values is taken
    (sign-flipped when the file lists the comparison population as 'Pop 1').
    The per-comparison tables are inner-joined on position and the maximum
    across comparisons is written to outFN as a tab-separated table with
    index 'pos' and one column 'max_xpop'.

    Parameters:

       pops - populations to compare against; selPop itself is skipped.
       chrom - chromosome; checked against the 'Chrom' column of each file.
       selPop - the putatively selected population.
       sweepDir - forwarded to getFN_xpop_signif to locate the input files.
       outFN - the output file.
       getio - if true, only return the description of inputs and outputs.

    Raises:

       TypeError - from reduce() if no comparison population remains.
    """

    # reduce() is a builtin only on Python 2; functools.reduce is the same
    # function and is available on both Python 2.6+ and Python 3.
    from functools import reduce

    pops = [p for p in pops if p != selPop]

    # Looked up once, outside the comprehension.
    # NOTE(review): Dict() presumably reads the named locals of its caller;
    # calling it here keeps that lookup in the function's own scope.
    signifFNArgs = Dict('sweepDir chrom')
    pop2FN = dict([(pop,
                    getFN_xpop_signif(pop1=selPop, pop2=pop, **signifFNArgs))
                   for pop in pops])

    if getio:
        return dict(depends_on=list(pop2FN.values()),
                    creates=outFN,
                    attrs=Dict('chrom', pop=pops, piperun_short=True))

    def LoadComparison(pop):
        """Load comparison with one pop"""

        # Read just the first row to learn which way round the file lists the
        # two populations.
        d0 = pd.read_csv(pop2FN[pop],
                         sep='\t',
                         usecols=('Pop 1', 'Pop 2', 'Chrom'),
                         nrows=1)
        dbg('d0')

        assert str(d0.loc[0, 'Chrom']) == str(chrom)
        assert (d0.loc[0, 'Pop 1'] == selPop and d0.loc[0, 'Pop 2']
                == pop) or (d0.loc[0, 'Pop 1'] == pop
                            and d0.loc[0, 'Pop 2'] == selPop)

        # The deviations are negated when the comparison population comes
        # first in the file.
        flip = (d0.loc[0, 'Pop 1'] == pop)

        d = pd.read_csv(pop2FN[pop],
                        sep='\t',
                        usecols=('SNP pos (bases)',
                                 'L AllEHH logratio Deviation',
                                 'R AllEHH logratio Deviation'),
                        index_col='SNP pos (bases)',
                        na_values=('-', ))
        d.info()

        if flip:
            d['L AllEHH logratio Deviation'] *= -1
            d['R AllEHH logratio Deviation'] *= -1

        return pd.DataFrame.from_dict({pop: d.max(axis=1)})

    # end: def LoadComparison( pop )

    joined = reduce(lambda d1, d2: d1.join(d2, how='inner'),
                    list(map(LoadComparison, pops)))
    # DataFrame.max() has no `columns` argument (current pandas rejects the
    # unknown keyword); the result is a Series that is named just below.
    comparisons = joined.max(axis=1)
    comparisons.index.name = 'pos'
    comparisons.name = 'max_xpop'
    comparisons.to_csv(outFN, sep='\t', header=True)
Пример #6
0
def DefineRulesTo_gatherCausalSnpGdPos(pr,
                                       Ddata,
                                       mutAges=AllAges,
                                       mutPops=AllPops,
                                       mutFreqs=AllFreqs,
                                       thinSfx=''):
    """Define rules to gather the causal SNP genetic position in every replica.

    One rule invoking gatherCausalSnpGdPos is added to pr for each scenario
    returned by GetSelectionScenarios for the given mutAges / mutFreqs /
    mutPops.
    """

    scenarios = GetSelectionScenarios(**Dict('mutAges mutFreqs mutPops'))

    # NOTE(review): the loop variable is named in the Dict() string below, so
    # it has to stay `scenario`.
    for scenario in scenarios:
        invokeArgs = Dict('Ddata scenario thinSfx')
        pr.addInvokeRule(invokeFn=gatherCausalSnpGdPos,
                         invokeArgs=invokeArgs)
Пример #7
0
def DefineRulesTo_extractGeneticMapsFromSims(pr,
                                             Ddata,
                                             mutAges=AllAges,
                                             mutPops=AllPops,
                                             mutFreqs=AllFreqs):
    """Define rules to extract genetic map from Sweep import of cosi simulations.

    For every scenario returned by GetScenarios, one 'ExtractGeneticMap' rule
    is added to pr.  The rule depends on the scenario's project file
    (Ddata/data/<scenDir>/project) and creates
    Ddata/snpStats/<scenDir>/gdMap.tsv.
    """

    for scen in GetScenarios(**Dict('mutAges mutPops mutFreqs')):

        # The three locals below are referenced as $projFile, $thePop and
        # $outFile inside the command string, so their names are significant.
        # NOTE(review): presumably pr.addRule substitutes $names from the
        # caller's variables -- confirm against PipeRun.
        projFile = os.path.join(Ddata, 'data', scen.scenDir(), 'project')

        # NOTE(review): the population is always taken from mutPops[0],
        # independent of the scenario being processed -- confirm this is
        # intended for scenarios selected in a different population.
        thePop = scen.scenName() + ':' + popName[mutPops[0]]

        outFile = os.path.join(Ddata, 'snpStats', scen.scenDir(), 'gdMap.tsv')

        pr.addRule(
            commands=
            '../Other/Ilya_Other/sweep/scripts/run-sweep ExtractGeneticMap '
            '$projFile $thePop $outFile',
            depends_on=projFile,
            creates=outFile,
            name='ExtractGeneticMap',
            mediumRuleNameSfx=scen.scenDir(),
            comment=
            'Extract genetic map for all replicas in one simulation scenario')
Пример #8
0
def DefineRulesTo_meanStdWithinGroups(pr,
                                      inFN,
                                      cols,
                                      groupCols,
                                      groupsAreContiguous=True,
                                      nameSfx=''):
    """Adds rules to compute per-group sums and per-group mean/std of a table.

    Two invoke rules are added to pr, in this order: one for
    computeSumsWithinGroups and one for computeMeanStdWithinGroups.  Both get
    the same inFN / cols / groupCols / groupsAreContiguous arguments, and
    each rule is named after its function with Sfx(nameSfx) appended.
    """

    stages = (
        ('computeSumsWithinGroups', computeSumsWithinGroups),
        ('computeMeanStdWithinGroups', computeMeanStdWithinGroups),
    )

    for stageName, stageFn in stages:
        pr.addInvokeRule(
            invokeFn=stageFn,
            invokeArgs=Dict('inFN cols groupCols groupsAreContiguous'),
            name=stageName + Sfx(nameSfx))
Пример #9
0
def computeMeanFstAndFreqDiffScores(pops,
                                    chrom,
                                    selPop,
                                    sweepDir,
                                    pop2ancFreqFN,
                                    pop2sampleSizeFN,
                                    outMeanFstFN,
                                    outFreqDiffFN,
                                    getio=None):
    """Compute meanFst and freqDiff scores.

    freqDiff is the derived-allele frequency in selPop minus the derived
    frequency obtained from the mean ancestral frequency over the comparison
    populations.  meanFst is the mean, over the comparison populations, of
    fst_onePopPair( selPop, pop ), with missing Fst values counted as 0.
    Each score is written to its own tab-separated file.

    Parameters:

       pops - the populations; selPop is appended if it is not among them.
       chrom - chromosome; only recorded in the rule attributes.
       selPop - the population compared against all the others.
       sweepDir - not used by this function.
       pop2ancFreqFN - table indexed by 'pos', one ancestral-frequency
          column per population.
       pop2sampleSizeFN - table indexed by 'pop' with a 'sampleSize' column.
       outMeanFstFN, outFreqDiffFN - the output files.
       getio - if true, only return the description of inputs and outputs.
    """

    if selPop not in pops:
        pops = tuple(MakeSeq(pops)) + (selPop, )
    cmpPops = [pop for pop in pops if pop != selPop]

    if getio:
        return dict(depends_on=(pop2ancFreqFN, pop2sampleSizeFN),
                    creates=(outMeanFstFN, outFreqDiffFN),
                    attrs=Dict('chrom', pop=pops))

    pop2ancFreq = pd.read_table(pop2ancFreqFN, index_col='pos')
    sampleSizeTable = pd.read_table(pop2sampleSizeFN, index_col='pop')
    pop2sampleSize = sampleSizeTable.sampleSize

    dbg('pop2sampleSize')

    # --- freqDiff ---
    selDerFreq = 1.0 - pop2ancFreq[selPop]
    meanCmpAncFreq = pop2ancFreq[cmpPops].mean(axis=1)
    freqDiff = selDerFreq - (1.0 - meanCmpAncFreq)
    freqDiff.name = 'freqDiff'
    freqDiff.to_csv(outFreqDiffFN, sep='\t', header=True)

    # --- meanFst ---
    pop2fst = {}
    for cmpPop in cmpPops:
        freqPair = np.array((pop2ancFreq[selPop], pop2ancFreq[cmpPop]))
        sizePair = (pop2sampleSize[selPop], pop2sampleSize[cmpPop])
        pop2fst[cmpPop] = fst_onePopPair(ancFreqs=freqPair,
                                         sampleSizes=sizePair)

    fstVals = pd.DataFrame(data=pop2fst, index=pop2ancFreq.index)
    # Undefined Fst values count as zero in the mean.
    fstVals.fillna(value=0.0, inplace=True)
    fstMean = fstVals.mean(axis=1)
    dbg('fstVals fstMean')
    fstMean.name = 'meanFst'
    fstMean.to_csv(outMeanFstFN, sep='\t', header=True, na_rep='NaN')
Пример #10
0
def computeLikeRatioForStat(stat,
                            statValsFN,
                            hitLikesFN,
                            missLikesFN,
                            stat_start,
                            stat_end,
                            stat_nbin,
                            statLikesRatioFN,
                            getio=None):
    """Compute likelihood ratios for one stat and save them next to its values.

    The column `stat` is read from the values table and from the hit and
    miss likelihood tables; the range [stat_start, stat_end] is cut into
    stat_nbin equal bins, and computeLikeRatioForStat_do does the actual
    computation.  The output table holds the columns 'pos', <stat>,
    <stat>likeRatio, <stat>Bin and <stat>Bin2.

    If getio is true, only the description of inputs and outputs is returned.
    """

    if getio:
        return dict(depends_on=(statValsFN, hitLikesFN, missLikesFN),
                    creates=statLikesRatioFN,
                    uses=computeLikeRatioForStat_do)

    statVals = pd.read_table(statValsFN)
    hitLikes = pd.read_table(hitLikesFN)[stat]
    missLikes = pd.read_table(missLikesFN)[stat]

    # stat_nbin bins need stat_nbin + 1 edges
    bins = np.linspace(stat_start, stat_end, stat_nbin + 1)

    # NOTE(review): hitLikes / missLikes / bins are named in the Dict() string
    # and must keep these names.
    results = computeLikeRatioForStat_do(statVals=statVals[stat],
                                         **Dict('hitLikes missLikes bins'))

    ratioCol = stat + 'likeRatio'
    binCol = stat + 'Bin'
    bin2Col = stat + 'Bin2'
    statVals[ratioCol], statVals[binCol], statVals[bin2Col] = results

    statVals.to_csv(statLikesRatioFN,
                    sep='\t',
                    columns=('pos', stat, ratioCol, binCol, bin2Col),
                    index=False,
                    na_rep='NaN')
Пример #11
0
def normalizeInBins_tsv(inDataFN,
                        valCol,
                        binCol,
                        binMin,
                        binMax,
                        binStep,
                        binsFN,
                        outFN,
                        normedCol,
                        getio=None):
    """Normalize data within bins, using previously computed bin means.

    The bin statistics table (binsFN) supplies the per-bin means and, via its
    counts / sums / sumsSq columns, one standard deviation common to all bins
    (square root of the overall mean of squares minus the squared overall
    mean).  normalizeInBins computes the normalized values, which are appended
    to the input table as the last column, named normedCol, and the result is
    saved to outFN as a tab-separated file.

    If getio is true, only the description of inputs and outputs is returned.
    """

    if getio:
        return dict(depends_on=(inDataFN, binsFN),
                    creates=outFN,
                    uses=normalizeInBins)

    inData = pd.read_table(inDataFN)
    binStats = pd.read_table(binsFN)

    # NOTE(review): the names below appear in the dbg() and Dict() strings,
    # which presumably look them up among the locals; do not rename them.
    binMeans = binStats.means
    totCount = float(binStats.counts.sum())
    totMean = binStats.sums.sum() / totCount
    meanOfSquares = binStats.sumsSq.sum() / totCount
    commonStd = np.sqrt(meanOfSquares - totMean * totMean)
    dbg('"CCCCCCCC" commonStd binMeans totCount totMean binStats.sums.sum() binStats.sumsSq.sum()'
        )

    normalizeArgs = Dict(
        'inData valCol binCol binMin binMax binStep binMeans commonStd')
    normed = normalizeInBins(**normalizeArgs)

    # insert() at the end: fails rather than overwrites if normedCol exists
    lastPosition = len(inData.columns)
    inData.insert(lastPosition, normedCol, normed)
    inData.to_csv(outFN, sep='\t', na_rep='NaN', index=False)
Пример #12
0
def DefineRulesTo_gatherCausalRanks(pr,
                                    Ddata,
                                    mutPops=AllPops,
                                    mutFreqs=AllFreqs,
                                    mutAges=AllAges,
                                    selpos=500000,
                                    complikeSfx=None,
                                    nonNanStats='ALL'):
    """Define rules to gather causal ranks and causal stats per scenario.

    For each scenario returned by GetSelectionScenarios, two invoke rules are
    added to pr: first one for gatherCausalRanks, then one for
    gatherCausalStats, both with the same arguments.
    """

    scenarios = GetSelectionScenarios(mutAges=mutAges,
                                      mutPops=mutPops,
                                      mutFreqs=mutFreqs)

    # NOTE(review): `scenario` is named in the Dict() string below and must
    # keep that name.
    for scenario in scenarios:
        for gatherFn in (gatherCausalRanks, gatherCausalStats):
            pr.addInvokeRule(
                invokeFn=gatherFn,
                invokeArgs=Dict(
                    'Ddata scenario selpos complikeSfx nonNanStats'))
Пример #13
0
def DefineRulesTo_normalizeColumnsWithinGroups(pr,
                                               inFN,
                                               cols,
                                               groupCols,
                                               groupsAreContiguous=True,
                                               nameSfx='',
                                               outFN=None):
    """Adds rules to create a version of a table with given columns normalized within groups.

    First the rules from DefineRulesTo_meanStdWithinGroups are added, then a
    rule invoking normalizeColumnsWithinGroups, named
    'normalizeColumnsWithinGroups' + Sfx(nameSfx).  cols and groupCols may be
    given as a single value or a sequence; they are converted to tuples.
    """

    cols, groupCols = tuple(MakeSeq(cols)), tuple(MakeSeq(groupCols))

    # NOTE(review): Dict() is given parameter names and presumably reads the
    # (already normalized) values from this function's locals.
    meanStdRuleArgs = Dict(
        'pr inFN cols groupCols groupsAreContiguous nameSfx')
    DefineRulesTo_meanStdWithinGroups(**meanStdRuleArgs)

    ruleName = 'normalizeColumnsWithinGroups' + Sfx(nameSfx)
    normalizeArgs = Dict('inFN cols groupCols outFN groupsAreContiguous')
    pr.addInvokeRule(invokeFn=normalizeColumnsWithinGroups,
                     invokeArgs=normalizeArgs,
                     name=ruleName)
Пример #14
0
def DefineRulesTo_normalizeOneColumn(pr,
                                     inFN,
                                     colName,
                                     meanStdFN,
                                     outFN,
                                     addRuleArgs={}):
    """Define rules to normalize one column.

    Adds to pr a single rule invoking normalizeOneColumn with inFN, colName,
    meanStdFN and outFN.  The entries of addRuleArgs are passed to
    pr.addInvokeRule as extra keyword arguments.
    """

    # NOTE(review): Dict() is given parameter names and presumably reads the
    # values from this function's locals.
    invokeArgs = Dict('inFN colName meanStdFN outFN')

    pr.addInvokeRule(invokeFn=normalizeOneColumn,
                     invokeArgs=invokeArgs,
                     **addRuleArgs)
Пример #15
0
def plotHistograms(inFN, cols, outFNs=None, getio=None, **kwargs):
    """Plot the histograms of the given columns of a table into one .svg file.

    The histogram files are those that computeHistograms reports (through
    GetCreates) for inFN / cols / outFNs.  The plot is written to inFN with
    its extension replaced by '.svg' and the suffix 'hist' added.  Extra
    keyword arguments are passed on to GraphHistograms.

    If getio is true, only the description of inputs and outputs is returned.
    """

    histFNs = GetCreates(computeHistograms, **Dict('inFN cols outFNs'))

    svgFN = ReplaceFileExt(inFN, '.svg')
    outFN = AddFileSfx(svgFN, 'hist')

    if getio:
        return dict(depends_on=histFNs, creates=outFN)

    labels = tuple(MakeSeq(cols))
    GraphHistograms(histFiles=histFNs,
                    outFile=outFN,
                    labels=labels,
                    **kwargs)
Пример #16
0
def DefineRulesTo_MergeSims( pr, Ddata, simsOut = 'simsOut', thinSfx = '', thinExt = '',
			     nreplicas = 100, mutAges = AllAges, mutPops = AllPops, mutFreqs = AllFreqs,
			     pop2name = pop2name, limitToPop = None ):
	"""Define rules to merge per-SNP data for each SNP into one file.

	NOTE: this function is currently disabled -- the unconditional
	`assert False` below raises AssertionError on every call (unless Python
	runs with assertions stripped, e.g. -O).

	The code after the assert adds, for each scenario from GetScenarios, one
	rule invoking mergePosFilesOneSim and then one rule invoking mergeSims
	per putative selected population: every population in mutPops for a
	neutral scenario, only the scenario's own mutPop otherwise.
	"""

	# Deliberately disabled; everything below is normally unreachable.
	assert False
	
	# Make sure the data directory ends with a slash.
	if not Ddata.endswith('/'): Ddata += '/'
    
	for scenario in GetScenarios( mutAges, mutPops, mutFreqs ):
		pr.addInvokeRule( invokeFn = mergePosFilesOneSim,
				  invokeArgs = Dict( 'Ddata simsOut thinSfx thinExt '
						     'scenario nreplicas pop2name' ) )

		# `scenario` and `putativeMutPop` are named in the Dict() strings, so
		# the loop variable names are significant.
		for putativeMutPop in ( mutPops if scenario.isNeutral() else (scenario.mutPop,) ):

			       
			pr.addInvokeRule( invokeFn = mergeSims,
					  invokeArgs = Dict( 'Ddata simsOut thinSfx thinExt '
							     'scenario nreplicas putativeMutPop pop2name limitToPop' ) )
Пример #17
0
def computeMeanStdWithinGroups(inFN,
                               cols,
                               groupCols,
                               groupsAreContiguous=True,
                               outFN=None,
                               getio=None):
    """Add columns representing mean and std within each group.

    The per-group sums are read from the first file that
    computeSumsWithinGroups reports (through GetCreates) for the same inputs;
    mean/std columns are added for `cols` and the result is saved to outFN.

    Parameters:

       inFN - the input table.
       cols - the column or columns to summarize (single value or sequence).
       groupCols - the column or columns defining the groups (single value
          or sequence).
       groupsAreContiguous - forwarded to computeSumsWithinGroups.
       outFN - the output file; when None, it is derived from inFN by adding
          the suffix 'meanStd' plus all column names, inside a 'stats'
          subdirectory.
       getio - if true, only return the description of inputs and outputs.

    Returns:

       Whatever the final save() call returns (or the getio description).
    """

    # Normalize to tuples, as computeSumsWithinGroups does.  Without this,
    # `cols + groupCols` below concatenates two strings (which are then
    # unpacked character by character into the file suffix) or fails when one
    # argument is a list and the other a tuple.
    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))

    sumsFN = GetCreates(computeSumsWithinGroups,
                        **Dict('inFN cols groupCols groupsAreContiguous'))[0]
    if outFN is None:
        outFN = AddFileSubdir('stats',
                              AddFileSfx(inFN, 'meanStd', *(cols + groupCols)))
    if getio:
        return dict(depends_on=sumsFN,
                    creates=outFN,
                    attrs=dict(piperun_short=True))

    return IDotData(sumsFN).addMeanStdCols(cols=cols).save(outFN)
Пример #18
0
def DefineRulesTo_MergeSims(pr,
                            mutAges,
                            mutPops,
                            mutFreqs,
                            noNeutral,
                            nreplicas,
                            Ddata,
                            simsOut,
                            thinExt='.thin',
                            thinSfx=''):
    """Pipeline generator: for each scenario, create a rule to merge SNP info for all SNPs in each replica within that scenario,
    into a single table.

    Parameters:

       pr - the pipeline the rules are added to.
       mutAges, mutPops, mutFreqs - passed positionally to the scenario
          generator.
       noNeutral - if true, scenarios come from GetSelectionScenarios,
          otherwise from GetScenarios.
       nreplicas, Ddata, simsOut, thinExt, thinSfx - forwarded to mergeSims.
    """

    getScenarios = GetSelectionScenarios if noNeutral else GetScenarios

    # NOTE(review): `scenario` is named in the Dict() string below and must
    # keep that name.
    for scenario in getScenarios(mutAges, mutPops, mutFreqs):
        # Single-argument print(...) is valid on both Python 2 and 3.  The
        # two spaces reproduce what the old `print 'text ', scenario`
        # statement emitted (its own trailing space plus the separator).
        print('generating rule for scenario  %s' % (scenario, ))
        pr.addInvokeRule(
            invokeFn=mergeSims,
            invokeArgs=Dict(
                'scenario nreplicas Ddata simsOut thinExt thinSfx'))
Пример #19
0
def computeMeanStd_binned_tsvs(inFNs,
                               valCol,
                               binCol,
                               binMin,
                               binMax,
                               binStep,
                               outFN,
                               getio=None):
    """Compute binned stats for a set of tables.

    Each input file is read lazily, one at a time, keeping only the columns
    valCol and binCol and dropping rows with missing values.  The tables are
    handed to computeMeanStd_binned, whose result is written to outFN as a
    tab-separated file with the index labelled 'binId'.

    Parameters:

       inFNs - one input file name or a sequence of them.
       valCol, binCol - the value column and the column used for binning.
       binMin, binMax, binStep - forwarded to computeMeanStd_binned.
       outFN - the output file.
       getio - if true, only return the description of inputs and outputs.
    """

    if getio:
        return dict(depends_on=inFNs,
                    creates=outFN,
                    uses=computeMeanStd_binned)

    # A generator expression keeps the loading lazy, as itertools.imap did,
    # but also exists on Python 3 (where itertools.imap was removed).
    inDatas = (pd.read_table(inFN, usecols=(valCol, binCol)).dropna()
               for inFN in MakeSeq(inFNs))

    binnedStats = computeMeanStd_binned(
        inDatas=inDatas, **Dict('valCol binCol binMin binMax binStep'))
    binnedStats.to_csv(outFN, sep='\t', index_label='binId', na_rep='NaN')
Пример #20
0
def computeSumsWithinGroups(inFN,
                            cols,
                            groupCols,
                            groupsAreContiguous=True,
                            outFN=None,
                            getio=None):
    """For a tsv file, compute sums, sumsquares and counts for each of the given columns within groups
  defined by groupCols.

  Params:

     inFN - the input table.
     cols - the column or columns to summarize (single value or sequence).
     groupCols - the column or columns defining the groups (single value or
        sequence).
     groupsAreContiguous - forwarded to summarizeColumnsWithinGroups.
     outFN - the output file; when None, it is derived from inFN by adding
        the suffix 'sums' plus all column names, inside a 'stats' subdirectory.
     getio - if true, only return the description of inputs and outputs,
        including how the input may be split and how partial outputs are
        combined.

  >>> z = IDotData( names = ( 'a', 'b' ), Records = ( ( 1, 2 ), ( 1, 3 ), ( 2, 4 ), ( 2, 5 ) ) )
  >>> computeSumsWithinGroups( inFN = z, cols = 'b', groupCols = 'a', outFN = sys.stdout )
  ... # doctest: +NORMALIZE_WHITESPACE
  a	b_count	b_sum	b_sumSq	b_numNaN
  1	2	5.0	13.0	0
  2	2	9.0	41.0	0

  """

    # Accept a single column name as well as a sequence of names.
    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))
    if outFN is None:
        outFN = AddFileSubdir('stats',
                              AddFileSfx(inFN, 'sums', *(cols + groupCols)))

    def combiner(inFNs, outFN):
        # Merges the column summaries of several partial results into outFN.
        # NOTE(review): presumably called by the pipeline framework on the
        # outputs produced for the pieces of a split input -- confirm.
        IDotData.mergeColumnSummaries(iDotDatas=inFNs,
                                      cols=cols,
                                      groupCols=groupCols).save(outFN)

    if getio:
        return dict(depends_on=inFN,
                    creates=outFN,
                    splitByCols={inFN: dict(keyCols=())},
                    combiner={outFN: combiner})

    IDotData(inFN).summarizeColumnsWithinGroups(
        **Dict('cols groupCols groupsAreContiguous')).save(outFN)
Пример #21
0
def DefineRulesTo_RunSweepOnSims(pr,
                                 Ddata,
                                 simsOut,
                                 thinExt='',
                                 thinning='',
                                 suffix='',
                                 mutAges=AllAges,
                                 mutPops=AllPops,
                                 mutFreqs=AllFreqs,
                                 nreplicas=100,
                                 pop2name=pop2name,
                                 tests=('lrh', 'ihs', 'xpop'),
                                 acceptExistingSimConfigFiles=False,
                                 setOptions=(),
                                 appendOptions=(),
                                 inputParamsFiles=[],
                                 runImportsLocally=True,
                                 noImports=False,
                                 powerSfx='',
                                 doOnlyStages=None):
    """Define rules for doing Sweep analyses of simulations.

    A small helper pipeline is built and run immediately (runForced); for
    each test it runs sim_analysis_pipe.pl, which writes a pipeline
    definition to an .xml file.  Those files are then merged into pr.

    Parameters:

       pr - the pipeline that receives the generated analysis rules.
       Ddata - root data directory.
       simsOut - name of the simulations directory under Ddata.
       thinExt - passed as --thin-ext, but only when `thinning` is empty.
       thinning, suffix - concatenated (suffix first) into the simulation
          suffix passed as --sim-suffix and used in directory names.
       mutAges, mutPops, mutFreqs - parameters defining the selection scenario.
       tests - the tests to define analysis rules for.
       acceptExistingSimConfigFiles - if false, a WriteSimulationInfo rule
          is added to the helper pipeline.
       setOptions, appendOptions - sequences of ( name, value ) pairs passed
          as --set-option / --append-option.
       runImportsLocally, noImports, doOnlyStages - turned into the
          corresponding command-line switches.
       powerSfx - passed as --powerSfx; defaults to Sfx(*mutAges) when empty.
       nreplicas, pop2name, inputParamsFiles - forwarded to
          WriteSimulationInfo.

    """

    # Define the rules to do Sweep analyses of the simulations.
    # These rules are created by running a Perl script, sim_analysis_pipe.pl .
    # Each invocation of the script defines rules for one type of test.
    # The rules for each test are saved to an .xml file, and then all these files are
    # merged into pr.

    # Rather than explicitly invoking the script three times we define a small pipeline to do this.
    # The output of this pipeline is a pipeline definition for analyzing simulation results by each of the tests,
    # saved to an .xml file.

    simPipeline = PipeRun(name='DefineSims',
                          descr='Define simulation and analysis pipeline')

    if not powerSfx: powerSfx = Sfx(*mutAges)

    if not acceptExistingSimConfigFiles:
        simPipeline.addInvokeRule(
            invokeFn=WriteSimulationInfo,
            invokeArgs=Dict('Ddata mutAges mutPops mutFreqs nreplicas suffix '
                            'inputParamsFiles pop2name powerSfx'))

    # NOTE(review): the rule strings below reference locals as $Ddata, $test,
    # $simSfx, $suffix, $powerSfx, $testPipeline and $simsOut; presumably
    # addRule substitutes them from this function's variables, so these names
    # must not change.
    test2pipeline = {}
    for test in tests:

        simSfx = ''
        if suffix: simSfx += suffix
        if thinning: simSfx += thinning

        thinExtHere = '' if thinning else thinExt

        testPipeline = os.path.join('Ilya_Temp',
                                    AddFileSfx('p.xml', pr.name, test))
        test2pipeline[test] = testPipeline

        if thinning:
            simPipeline.addRule(
                comment="Copy config file",
                targets="$Ddata/power_$test$simSfx/config$powerSfx.txt",
                sources="$Ddata/power_$test$suffix/config.txt",
                commands="cp $Ddata/power_$test$suffix/config.txt "
                "$Ddata/power_$test$simSfx/config$powerSfx.txt",
                mediumRuleName='copy_config_' + test,
                name='copy_config')

        setOptionArgs = ''.join([
            ' --set-option ' + setOption[0] + ' ' + setOption[1]
            for setOption in setOptions
        ])
        appendOptionArgs = ''.join([
            ' --append-option ' + appendOption[0] + " '" + appendOption[1] +
            "'" for appendOption in appendOptions
        ])

        simPipeline.addRule(
            targets=testPipeline,
            sources=[
                '../Other/Ilya_Other/sweep/sims/scripts/sim_analysis_pipe.pl',
                "$Ddata/power_$test$simSfx/config$powerSfx.txt",
                "$Ddata/config$suffix/sims$powerSfx.txt",
                "$Ddata/config$suffix/scenarios$powerSfx.txt",
                "$Ddata/config$suffix/pops$powerSfx.txt"
            ],
            commands=
            "../Other/Ilya_Other/sweep/sims/scripts/sim_analysis_pipe.pl "
            " --only-write-pipeline $testPipeline " +
            ('--run-imports-locally ' if runImportsLocally else '') +
            ('--no-imports ' if noImports else '') +
            (('--do-only-stages ' + doOnlyStages +
              ' ') if doOnlyStages else '') +
            ('--sim-suffix ' + simSfx if simSfx else '') +
            # The leading space matters: without it the switch is glued to
            # the value of --sim-suffix whenever simSfx is non-empty.
            (' --powerSfx ' + powerSfx if powerSfx else '') +
            (' --thin-ext ' + thinExtHere if thinExtHere else '') +
            setOptionArgs + appendOptionArgs +
            " --target-test $test $Ddata/$simsOut$simSfx $Ddata/power_$test$simSfx",
            name='simanal',
            mediumRuleName='simanal_$test$simSfx',
            comment='Define rules for analyzing simulations')

    dbg('"RUNNING_simPipeline"')
    simPipeline.runForced(aftRuleDelay=5)
    dbg('"DONE_RUNNING_simPipeline"')

    for test in tests:
        pr.addPipelineFromFile(test2pipeline[test])
Пример #22
0
def DefineRulesTo_runSims(pr,
                          mutAges,
                          mutPops,
                          mutFreqs,
                          nreplicas,
                          allPops=None,
                          Ddata='../Data/Ilya_Data/sim/sfs/working/pardis2',
                          simsOut='simsOut',
                          suffix='',
                          shortSimTime=True,
                          DdataSeeds='',
                          useGenMap=None,
                          includeNeutral=True,
                          withGeneConvBug=False,
                          withNewCosi=False,
                          withCosi=None,
                          DdataMimic=None):
    """Define one 'RunOneSim' rule per ( scenario, replica ).

    For every scenario returned by GetScenarios and every replica number in
    range( nreplicas ), a rule is added to pr that runs runOneSim.pl.

    Parameters:

       pr - the pipeline the rules are added to.
       mutAges, mutPops, mutFreqs - define the scenarios (single values or
          sequences).
       nreplicas - number of replicas per scenario.
       allPops - populations for which .hap-<pop> / .pos-<pop> targets are
          declared; defaults to mutPops.
       Ddata, simsOut, suffix - simulations go under Ddata/<simsOut><suffix>.
       shortSimTime - if true, the rule gets the attribute piperun_short.
       DdataSeeds - if given, per-replica seeds and mutation rate are read
          from <DdataSeeds>/replicastats/<scenDir>/simSeeds.tsv and passed
          on the command line.  Mutually exclusive with DdataMimic.
       useGenMap - if given, passed as --useGenMap and used as a source in
          place of the recombination parameter files.
       includeNeutral - forwarded to GetScenarios.
       withGeneConvBug, withNewCosi, withCosi - turned into command-line
          switches; with withNewCosi or withCosi the treeinfo targets are
          not declared.
       DdataMimic - if given, the .model and .mut files of the matching
          replica under <DdataMimic>/simsOut are used as sources and passed
          as --useGenMap / --useMutRateFile.
    """

    assert not (DdataSeeds and DdataMimic)

    # NOTE(review): mutAges / mutPops / mutFreqs / includeNeutral are named in
    # the Dict() string below and must keep these names.
    mutPops = MakeSeq(mutPops)
    mutAges = MakeSeq(mutAges)
    mutFreqs = MakeSeq(mutFreqs)

    if allPops is None: allPops = mutPops

    Dsims = Ddata + '/' + simsOut + suffix

    for scen in GetScenarios(
            **Dict('mutAges mutPops mutFreqs includeNeutral')):
        if DdataSeeds:
            seeds = IDotData(
                os.path.join(DdataSeeds, 'replicastats', scen.scenDir(),
                             'simSeeds.tsv'))

        for replicaNum, seedsLine in zip(
                range(nreplicas),
                seeds if DdataSeeds else itertools.repeat(None, nreplicas)):

            assert not DdataSeeds or seedsLine.replicaNum == replicaNum

            pfx = os.path.join(Dsims, scen.scenDir(),
                               '%d_%s' % (replicaNum, scen.scenName()))
            recombDir = '../Data/Ilya_Data/sim/sfs/working/pardis2'

            attrs = Dict('replicaNum', scenDir=scen.scenDir())
            # isNeutral() is the spelling used everywhere else in this loop;
            # the former is_neutral() call here was inconsistent with it.
            if not scen.isNeutral():
                attrs.update(mutAge=scen.mutAge,
                             mutPop=scen.mutPop,
                             mutFreq=scen.mutFreq)
            else:
                attrs.update(mutAge=0, mutPop=0, mutFreq=0)
            if shortSimTime: attrs['piperun_short'] = True

            mutAge = '%dky' % (0 if scen.isNeutral() else scen.mutAge)

            useGenMapFile = os.path.join(
                DdataMimic, 'simsOut', scen.scenDir(), '%d_%s.model' %
                (replicaNum, scen.scenName())) if DdataMimic else ''
            useMutRateFile = os.path.join(
                DdataMimic, 'simsOut', scen.scenDir(), '%d_%s.mut' %
                (replicaNum, scen.scenName())) if DdataMimic else ''

            # NOTE(review): the treeinfo entries below are complete paths
            # (os.path.join(Dsims, ...)), yet they sit inside the list of
            # extensions and so are appended to pfx like the plain
            # extensions.  Left unchanged -- confirm whether these targets
            # are meant to be prefixed.
            pr.addRule(
                targets=[
                    pfx + ext for ext in (
                        ['.model', '.mut', '.cosiParams'] +
                        (['.recombParams'] if not useGenMap else []) + [
                            '.%s-%d' % (hapOrPos, pop)
                            for hapOrPos in ('hap', 'pos') for pop in allPops
                        ] + ([] if (withNewCosi or withCosi) else [
                            os.path.join(
                                Dsims, scen.scenDir(), 'treeinfo',
                                '%d_%s.%s' %
                                (replicaNum, scen.scenName(), which))
                            for which in
                            ('regions.tsv', 'mutlist.tsv', 'nodes.dat') +
                            (() if scen.isNeutral() else ('sweepinfo.tsv', ))
                        ]))
                ],
                sources=[
                    Ddata + '/' +
                    ('params_neutral' + suffix if scen.isNeutral() else
                     'params%s/%s/params_%s' %
                     (suffix, mutAge, scen.scenName()))
                ] + ([useGenMap] if useGenMap else [
                    recombDir + '/recParams_bestfit_generic',
                    recombDir + '/autosomes_decode.distr'
                ]) + ([useGenMapFile, useMutRateFile] if DdataMimic else []),
                commands=' '.join((
                    'perl ../Operations/Ilya_Operations/sim/sfs/working/pardis2/'
                    'runOneSim.pl' +
                    # int() replaces the Python 2-only long(); on Python 2 it
                    # still yields a long for large values.
                    (' --coalSeed %ld --recombSeed %ld --useMutRate %s' %
                     (int(seedsLine.coalescentSeed),
                      int(seedsLine.recombSeed),
                      seedsLine.GetStrItem('mutRate')) if DdataSeeds else '') +
                    (' --useGenMap ' + useGenMap if useGenMap else '') +
                    (' --withGeneConversionBug' if withGeneConvBug else '') +
                    (' --withNewCosi' if withNewCosi else '') +
                    ((' --withCosi ' + withCosi) if withCosi else '') +
                    ((' --useGenMap ' + useGenMapFile + ' --useMutRateFile ' +
                      useMutRateFile) if DdataMimic else ''),
                    scen.scenName(), mutAge, str(replicaNum), Ddata, Dsims,
                    suffix)),
                name='RunOneSim',
                attrs=attrs,
                comment='Adding simulation',
                mediumRuleNameSfx=(scen.scenName(), mutAge, replicaNum))
Пример #23
0
def runCmdParallelized(commands,
                       depends_on,
                       creates,
                       comment,
                       splitFunc,
                       joinFunc,
                       saveOutputTo=None,
                       splitFN=None,
                       joinFN=None,
                       name=None,
                       mediumRuleName=None,
                       getio=None):
    """Run command(s) on chunks of one input file, then join the chunk outputs.

    Params:

       commands - a command line or a sequence of command lines; at least one
          must mention splitFN and, unless saveOutputTo is given, joinFN.
       depends_on - input file(s) of the commands.
       creates - output file(s) of the commands; None is treated as no outputs.
       comment, name, mediumRuleName - passed through in the getio description.
       splitFunc - handed to doSplit together with splitFN and the chunk dir.
       joinFunc - called as joinFunc(inFNs=<chunk outputs>, outFN=...) at the end.
       saveOutputTo - when set, each chunk rule is asked to save its output to
          the chunk output file, and joinFunc is called with outFN=None.
       splitFN - the file to split; defaults to the first of depends_on.
       joinFN - the file to assemble; defaults to the first of creates.
       getio - if true, only return the rule description and do no work.
    """
    from Operations.Ilya_Operations.PipeRun.python.PipeRun import PipeRun

    # NOTE: dbg() and Dict() are given *names* of local variables, so the
    # locals mentioned in their string arguments must keep these exact names.
    dbg('"IN_RUNCMDPAR_EEEEEE" depends_on creates saveOutputTo')

    if creates is None:
        creates = ()

    commands = MakeSeq(commands)
    depends_on = MakeSeq(depends_on)
    creates = MakeSeq(creates)

    gio = Dict('depends_on creates comment name mediumRuleName')
    dbg('gio')
    if getio:
        return Dict(
            'depends_on creates comment name mediumRuleName saveOutputTo',
            uses=(splitFunc, joinFunc))

    # Pick the file to split and the file to join when not given explicitly.
    if not splitFN:
        splitFN = list(MakeSeq(depends_on))[0]
    if saveOutputTo:
        joinFN = RandomString(12)
    elif not joinFN:
        joinFN = list(MakeSeq(creates))[0]

    # The per-chunk commands are made by textual substitution, so the file
    # names must actually occur in at least one command.
    assert any([splitFN in cmd for cmd in commands])
    assert saveOutputTo or any([joinFN in cmd for cmd in commands])

    splitDescr = str(splitFunc) + ' to split ' + splitFN
    logging.info('calling ' + splitDescr)

    userName = getpass.getuser()
    # Drop the first character of the absolute path before joining it
    # under the per-user scratch directory.
    relSplitPath = os.path.abspath(splitFN)[1:]
    outDir = os.path.join('/broad/hptmp', userName, 'par', relSplitPath)

    # Stage 1: split the input in a sub-pipeline of its own.
    splitPipe = PipeRun(name='splitting', descr='splitting')
    splitRule = splitPipe.addInvokeRule(
        invokeFn=doSplit, invokeArgs=Dict('splitFunc splitFN outDir'))
    splitPipe.runSubPipeline()

    # The first file created by the split rule lists the chunk file names.
    chunkFNs = SlurpFileLines(splitRule.creates[0])

    logging.info('finished running ' + splitDescr)
    dbg('"CHUNKS_ARE" chunkFNs')

    # Stage 2: one rule per chunk, all run in a second sub-pipeline.
    chunkPipe = PipeRun(name='parallelizing', descr='parallelizing')
    chunkOutFNs = []
    for chunkFN in chunkFNs:
        chunkOutFN = AddFileSfx(chunkFN, 'out')
        chunkOutFNs.append(chunkOutFN)

        for command in commands:
            dbg('splitFN chunkFN chunkOutFN command command.replace(splitFN,chunkFN)'
                )

        chunkCommands = [
            cmd.replace(splitFN, chunkFN).replace(joinFN, chunkOutFN)
            for cmd in commands
        ]
        chunkDeps = [chunkFN if dep == splitFN else dep for dep in depends_on]
        chunkCreates = [
            chunkOutFN if out == joinFN else out for out in creates
        ]
        chunkSaveTo = chunkOutFN if saveOutputTo is not None else None

        chunkPipe.addRule(commands=chunkCommands,
                          depends_on=chunkDeps,
                          creates=chunkCreates,
                          saveOutputTo=chunkSaveTo)

    chunkPipe.runSubPipeline()

    # Stage 3: combine the per-chunk outputs.
    joinFunc(inFNs=chunkOutFNs, outFN=None if saveOutputTo else joinFN)
Пример #24
0
def mergeSims(scenario, Ddata, posFileFN=None, simsOut='simsOut', nreplicas=100, thinExt='', thinSfx='',
              putativeMutPop=None, outFile=None,
              pop2name=pop2name, statsSfx='', ihsSfx='',
              limitToPop=None,
              getio=None):
    """Gather per-SNP information for all replicas of a scenario into one table.

    Merges the stacked SNP position/frequency table with the XP-EHH
    significance tables (one per population pair) and the iHS significance
    table of the putatively selected population, and saves the selected
    columns as a single table in which each line gives info for one SNP.

    Input params:

       scenario - an object of class Scenario, indicating the simulation scenario
          (either neutral or a selection scenario) from which all replicas were
          simulated.  Its scenDir() and mutPop are used here.
       Ddata - the directory under which the simulations and the Sweep analysis
          results live.  Under this directory we expect to find:
             iHS analysis results, under power_ihs<thinSfx>/<scenDir>/
             XP-EHH analysis results, under power_xpop<thinSfx>/<scenDir>/
             per-SNP statistics, under snpStats<thinSfx>/<scenDir>/
       posFileFN - the stacked position table; defaults to the
          'mergedPosStacked.tsv' file (with statsSfx, putativeMutPop and ihsSfx
          added as file suffixes) under the snpStats directory.
       simsOut - accepted for backward compatibility; not used by this function.
       nreplicas - the number of replicas simulated under this scenario.
          Each replica represents a chromosome region, with a set of SNPs on it.
          Rows of the iHS and XP-EHH tables are taken only while Chrom < nreplicas.
       thinExt - the extension appended to simulation files that describe the
          SNPs in the simulated replica.  Not used by this function.
       thinSfx - the suffix appended to the power_ihs, power_xpop and snpStats
          directory names.  When we analyze the same simulations after applying
          different thinning scenarios, the analyses for each thinning scenario
          go into a separate set of directories.
       putativeMutPop - the population in which, we think, selection is
          occurring, aka "putatively selected population".  Defaults to
          scenario.mutPop.
       outFile - the output table; defaults to the 'merged.tsv' file (with
          statsSfx, putativeMutPop and ihsSfx added as file suffixes) under the
          snpStats directory.
       pop2name - map from population number to population name.
       statsSfx, ihsSfx - suffixes used in the input and output file names.
       limitToPop - if given, only population pairs that include this
          population name are used.
       getio - if true, only return the description of the inputs and outputs
          of this operation, and do no work.

    Output:

       A table with one line per SNP and the following columns:

          replicaNum - the replica number from the position table.
          CHROM_POS <minPop> - physical (basepair) position of the SNP within
             its replica, where <minPop> is the smallest population number.
             Note that one merged file contains SNPs for a set of replicas
             (all for the same scenario), so there could be multiple SNPs with
             the same position.
          FREQ1 <pop> - derived allele frequency in population <pop>, for each
             population number in pop2name.
          L AllEHH logratio Deviation <popPair>,
          R AllEHH logratio Deviation <popPair> - XP-EHH score to the left and
             to the right of the SNP for each population pair, normalized to
             the neutral background.
          SNP pos (cM) <aPopPair>, SNP pos (bases) <aPopPair>, Chrom <aPopPair> -
             genetic map position, physical position and replica of the SNP as
             given in the XP-EHH table of one population pair
             (European_WestAfrican when available, else the first pair).
          Chrom - the replica from which this SNP comes; can be nan.
          SNP pos (bases) - physical (basepair) position of this SNP within its replica.
          SNP pos (cM) - genetic map position of this SNP within its replica.
          Both iHH_A - sum of iHH_A for both directions from this SNP.
          Both iHH_D - sum of iHH_D for both directions from this SNP.
          Both iHS - the unstandardised iHS, binned by derived allele frequency
             and normalized within the bin.

       The Left/Right iHH_A, Left/Right iHH_D and 'Both Unstandardised iHS'
       columns are read from the iHS table but are not written to the output.
    """

    if not Ddata.endswith('/'):
        Ddata += '/'

    assert nreplicas > 0
    dbg('pop2name')

    scenDir = scenario.scenDir()

    if putativeMutPop is None:
        putativeMutPop = scenario.mutPop

    ihsSignifFN = os.path.join(Ddata, 'power_ihs' + thinSfx, scenDir,
                               'ihs_sig_' + pop2name[putativeMutPop] + ihsSfx + '.tsv')

    popNames = sorted(pop2name.values())
    popNums = sorted(pop2name)
    minPopNum = popNums[0]

    posFileKeyCols = ('replicaNum', 'CHROM_POS %d' % minPopNum)
    xpopIhsKeyCols = ('Chrom', 'SNP pos (bases)')

    # All unordered pairs of population names, in sorted order, optionally
    # restricted to the pairs that involve limitToPop.
    popPairs = ['%s_%s' % (pop1, pop2)
                for idx, pop1 in enumerate(popNames)
                for pop2 in popNames[idx + 1:]
                if limitToPop is None or limitToPop in (pop1, pop2)]

    xpopSignifFNs = [os.path.join(Ddata, 'power_xpop' + thinSfx, scenDir,
                                  'xpop_significance_' + popPair + '.tsv')
                     for popPair in popPairs]

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenDir)

    mergedData = outFile or os.path.join(
        snpStatsDir, AddFileSfx('merged.tsv', statsSfx, putativeMutPop, ihsSfx))

    fileDescrs = {
        mergedData: (
            'Various per-snp statistics for SNPs in scenario $scenario, replicas 0-$nreplicas, '
            'assuming selection in ' + pop2name[putativeMutPop],
            (('CHROM_POS 1', 'physical (basepair) position of the SNP within its replica. '
              'Note that one merged file contains SNPs for a set of replicas (all for the same scenario), '
              'so there could be multiple SNPs with the same position.  The replica number '
              'is given in the Chrom column. '),
             ('FREQ1 1', 'derived allele frequency in pop 1 ( European )'),
             ('R AllEHH logratio Deviation European_WestAfrican', 'XP-EHH score to the R of the SNP, '
              'between European and WestAfrican pops, normalized to the neutral background.'),
             ('SNP pos (cM) European_WestAfrican', 'genetic map SNP position'),
             ('SNP pos (bases) European_WestAfrican', 'physical SNP position'),
             ('Chrom European_WestAfrican', 'chromosome (or replica number)'),
             ('Chrom', 'chromosome (or replica number)'),
             ('SNP pos (bases)', 'physical SNP position'),
             ('SNP pos (cM)', 'genetic map SNP position'),
             ('Both iHH_A', 'sum of iHH_A scores for both sides'),
             ('Both iHH_D', 'sum of iHH_D scores for both sides'),
             ('Both iHS', 'sum of iHS scores for both sides'),
             # The key used to be ' Left iHH_D' (stray leading space), which
             # did not match the actual column name.
             ('Left iHH_D', 'iHH_D score to the left of the SNP'),
             ('Right iHH_D', 'iHH_D score to the right of the SNP'),
             ('Left iHH_A', 'iHH_A score to the left of the SNP'),
             ('Right iHH_A', 'iHH_A score to the right of the SNP'),
             ('Both Unstandardised iHS', 'sum of unstandardized iHS scores for both sides')))}

    if posFileFN is None:
        posFileFN = os.path.join(
            snpStatsDir, AddFileSfx('mergedPosStacked.tsv', statsSfx, putativeMutPop, ihsSfx))

    if getio:
        # The position table is keyed by (replica, position); the iHS and
        # XP-EHH tables are keyed by (Chrom, SNP pos (bases)).
        splitByCols = {posFileFN: dict(keyCols=posFileKeyCols)}
        for signifFN in [ihsSignifFN] + xpopSignifFNs:
            splitByCols[signifFN] = dict(keyCols=xpopIhsKeyCols)
        return dict(depends_on=[posFileFN, ihsSignifFN] + xpopSignifFNs,
                    creates=mergedData,
                    splitByCols=splitByCols,
                    mediumRuleNameSfx=(scenDir, putativeMutPop),
                    fileDescrs=fileDescrs,
                    attrs=Dict('putativeMutPop nreplicas pop2name'))

    def dashFixer(v):
        """Map the '-' placeholder used in the input tables to nan."""
        return v if v != '-' else np.nan

    def chkReplica(r, n=nreplicas):
        """Tell whether row r belongs to one of the first n replicas."""
        return r.Chrom < n

    ihsAll = IDotData(ihsSignifFN, valueFixer=dashFixer)
    ihsAll = ihsAll[('Chrom', 'SNP pos (bases)', 'SNP pos (cM)', 'Both iHH_A', 'Both iHH_D', 'Both iHS',
                     'Left iHH_D', 'Right iHH_D', 'Left iHH_A', 'Right iHH_A', 'Both Unstandardised iHS')]
    ihsAll = ihsAll.takewhile(chkReplica)

    xpopCols = ('Chrom', 'SNP pos (bases)', 'SNP pos (cM)',
                'L AllEHH logratio Deviation', 'R AllEHH logratio Deviation')

    xpopSignif = tuple([IDotData(xpopSignifFN, valueFixer=dashFixer)[xpopCols].takewhile(chkReplica)
                        for xpopSignifFN in xpopSignifFNs])

    posCols = ['CHROM_POS %d' % minPopNum] + ['FREQ1 %d' % popNum for popNum in popNums]

    # One position table, then one XP-EHH table per population pair, then the
    # iHS table; the per-table argument tuples below follow the same order.
    nSignifTables = len(popPairs) + 1
    result = IDotData.merge(iDotDatas=(IDotData(posFileFN),) + xpopSignif + (ihsAll,),
                            cols=(posFileKeyCols,) + (xpopIhsKeyCols,) * nSignifTables,
                            blanks=(None,) + (np.nan,) * nSignifTables,
                            suffixes=['pos'] + [' %s' % popPair for popPair in popPairs] + [''])

    aPopPair = 'European_WestAfrican' if 'European_WestAfrican' in popPairs else popPairs[0]
    useCols = ['replicaNum'] + posCols + \
        ['%s AllEHH logratio Deviation %s' % (side, popPair)
         for popPair in popPairs for side in ('L', 'R')] + \
        ['SNP pos (cM) ' + aPopPair,
         'SNP pos (bases) ' + aPopPair,
         'Chrom ' + aPopPair,
         'Chrom',
         'SNP pos (bases)',
         'SNP pos (cM)',
         'Both iHH_A',
         'Both iHH_D',
         'Both iHS']

    if len(popPairs) == 1:
        # With a single population pair the XP-EHH score columns come out of
        # the merge without the pair suffix; add it so useCols matches.
        result = result.renameCols({'L AllEHH logratio Deviation': 'L AllEHH logratio Deviation ' + aPopPair,
                                    'R AllEHH logratio Deviation': 'R AllEHH logratio Deviation ' + aPopPair})

    result[useCols].save(mergedData)

    logging.info('Finished mergeSims()')
Пример #25
0
def gather_iHS_scores(selPop,
                      chrom,
                      ihsFN,
                      pop2ancFreqFN,
                      ihsOutFN,
                      dihhOutFN,
                      getio=None):
    """Write the iHS scores and the iHH differences for one pop and chromosome.

    Params:

       selPop - the population whose iHS table is being read; also the name of
          the column taken from the ancestral-frequency table.
       chrom - the chromosome expected in the iHS table.
       ihsFN - tab-separated iHS table with columns 'Population', 'Chrom',
          'SNP pos (bases)', 'Ancestral Freq', 'Both iHS', 'Both iHH_D' and
          'Both iHH_A'; '-' is read as a missing value.
       pop2ancFreqFN - tab-separated table with a 'pos' column and one column
          named after selPop.
       ihsOutFN - output table with column 'iHS', indexed by 'pos'.
       dihhOutFN - output table with columns 'iHHDiff' (Both iHH_D minus
          Both iHH_A) and 'normingFreqs' (one minus the selPop column of
          pop2ancFreqFN), indexed by 'pos'.
       getio - if true, only return the description of the inputs and outputs
          of this operation, and do no work.
    """

    if getio:
        return dict(depends_on=(ihsFN, pop2ancFreqFN),
                    creates=(ihsOutFN, dihhOutFN),
                    attrs=Dict('chrom', pop=selPop, piperun_short=True))

    # Read just the first data row to check that the file is for the
    # requested chromosome and population.  (dbg() is given the variable
    # name, so d0 must keep its name.)
    d0 = pd.read_csv(ihsFN, sep='\t', usecols=('Population', 'Chrom'), nrows=1)
    dbg('d0')

    chromInFile = str(d0.loc[0, 'Chrom'])
    popInFile = d0.loc[0, 'Population']
    assert chromInFile == str(chrom)
    assert popInFile == selPop

    scores = pd.read_csv(ihsFN,
                         sep='\t',
                         usecols=('SNP pos (bases)', 'Ancestral Freq',
                                  'Both iHS', 'Both iHH_D', 'Both iHH_A'),
                         index_col='SNP pos (bases)',
                         na_values=('-', ))
    scores.index.name = 'pos'

    ancFreqCol = selPop + '_ancFreq'
    pop2ancFreq = pd.read_table(pop2ancFreqFN,
                                index_col='pos',
                                usecols=('pos', selPop))
    pop2ancFreq = pop2ancFreq.rename(columns={selPop: ancFreqCol})

    # Right join: the output has one row per position of the frequency table.
    scores = scores.join(pop2ancFreq, how='right', sort=True)
    ancFreq = scores[ancFreqCol]

    ihsTable = pd.DataFrame(data={'iHS': scores['Both iHS']})
    ihsTable.to_csv(ihsOutFN, sep='\t', header=True, na_rep='NaN')

    ihhDiff = scores['Both iHH_D'] - scores['Both iHH_A']
    ihhTable = pd.DataFrame(data={'iHHDiff': ihhDiff,
                                  'normingFreqs': 1.0 - ancFreq})
    ihhTable.to_csv(dihhOutFN, sep='\t', header=True, na_rep='NaN')
Пример #26
0
def DefineRulesTo_fastCMS(pr,
                          pops,
                          chroms,
                          selPop,
                          sweepDir,
                          cmsDir,
                          genomeBuild='hg19'):
    """Define rules to do fast CMS computation.

    The rules added here, in order:

       - per (pop, chrom): extract the SNP info table with Sweep's
         ExtractAlleleFreqs;
       - per chrom: gather the max_xpop scores, the SNP info of all pops,
         the iHS scores and iHH differences of selPop, and the meanFst and
         freqDiff scores;
       - once: compute the global and per-bin statistics of the iHH
         differences over all chromosomes;
       - per chrom: compute the StdDiff table from the iHH differences;
       - per stat in CMSBins.CMSstats: compute mean/std over all chromosomes
         and normalize (unless the stat is in CMSBins.nonNormedStats), then
         compute the likelihood-ratio table of the stat for each chromosome;
       - per chrom: pass the per-stat likelihood-ratio tables to addLikesRatios.

    Params:

       pr - the PipeRun object to which to add rules

       pops - comparing selPop to which pops?  selPop is added if missing.
       chroms - the chromosomes for which to define the rules
       selPop - testing selection in which pop?
       sweepDir - the sweep directory
       cmsDir - the directory under which CMS stats go
       genomeBuild - key into genomeBuild2genMapSfx; selects the suffix used
          in the name of the imported genetic map file.

    Note: many strings below are filled in with '% locals()', and Dict() /
    pr.settingAttrs() are given names of local variables, so the local
    variable names in this function must not be changed.
    """

    # Make sure the tested population is part of the population list.
    pops = list(MakeSeq(pops))
    if selPop not in pops: pops.append(selPop)

    allPops = tuple(MakeSeq(pops))
    # selPop was already appended to pops above, so this guard is not
    # expected to fire.
    if selPop not in allPops: allPops += (selPop, )
    # NOTE(review): cmpPops is not used in the visible part of this function.
    cmpPops = [pop for pop in allPops if pop != selPop]

    # chrom -> { stat name -> file with the raw scores of that stat }
    rawScoresFN = {}

    genMapSfx = genomeBuild2genMapSfx[genomeBuild]
    # One 'extractSnpInfo' rule per (pop, chrom), creating snps_<pop>.tsv
    # under the chromosome's analysis directory.
    for pop in allPops:
        for chrom in chroms:
            with pr.settingAttrs('pop chrom'):
                snpInfoFN = os.path.join(
                    sweepDir,
                    'analysis/chr%(chrom)s/snps_%(pop)s.tsv' % locals())
                projDir = os.path.join(sweepDir,
                                       'data/chr%(chrom)s' % locals())
                ancestralImportedFN = os.path.join(projDir,
                                                   'ancestral.tsv.imported')
                genotypesImportedFN = os.path.join(
                    projDir,
                    'genotypes_chr%(chrom)s_%(pop)s_r21_nr_fwd_phased_all.imported'
                    % locals())
                genMapImportedFN = os.path.join(
                    projDir,
                    'genetic_map_chr%(chrom)s_%(genMapSfx)s.txt.imported' %
                    locals())
                # commandsOld carries the same command with a different jar
                # path; how addRule treats it is not visible here.
                pr.addRule(
                    name='extractSnpInfo',
                    commands=
                    'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    % locals(),
                    commandsOld=
                    'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep/target/sweep-1.0-SNAPSHOT-jar-with-dependencies.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    % locals(),
                    depends_on=(ancestralImportedFN, genotypesImportedFN,
                                genMapImportedFN),
                    creates=snpInfoFN)

    # chrom -> file with the iHH differences written by gather_iHS_scores
    chr2dihhFN = {}

    # Per-chromosome rules that gather the raw scores of each statistic.
    for chrom in chroms:
        with pr.settingAttrs('chrom'):

            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            xpopScoresFN = os.path.join(
                chromDir, AddFileSfx('max_xpop.tsv', chrom_s, selPop, pops))

            pr.addInvokeRule(invokeFn=gatherXPOPscores,
                             invokeArgs=Dict('pops chrom selPop sweepDir',
                                             outFN=xpopScoresFN),
                             attrs=dict(pop=allPops,
                                        stat='max_xpop',
                                        piperun_short=True))

            ihsFN = getFN_ihs_signif(**Dict('sweepDir chrom', pop=selPop))

            ihsScoresFN = os.path.join(
                chromDir, AddFileSfx('iHS.tsv', chrom_s, selPop, pops))
            dihhScoresFN = os.path.join(
                chromDir, AddFileSfx('dihh.tsv', chrom_s, selPop, pops))

            chr2dihhFN[chrom] = dihhScoresFN

            pop2ancFreqFN = os.path.join(
                cmsDir, chrom_s, AddFileSfx('pop2ancFreq.tsv', chrom_s, pops))
            pop2sampleSizeFN = os.path.join(
                cmsDir, chrom_s, AddFileSfx('pop2sampleSize.tsv', chrom_s,
                                            pops))

            # Same file names as the snpInfoFN created by the
            # 'extractSnpInfo' rules above.
            pop2snpInfoFN = dict([(pop,
                                   os.path.join(sweepDir, 'analysis',
                                                'chr%(chrom)s' % locals(),
                                                'snps_%(pop)s.tsv' % locals()))
                                  for pop in pops])

            pr.addInvokeRule(
                invokeFn=gather_snp_info,
                invokeArgs=Dict(
                    'pops pop2snpInfoFN pop2ancFreqFN pop2sampleSizeFN'))

            pr.addInvokeRule(
                invokeFn=gather_iHS_scores,
                invokeArgs=Dict(
                    'chrom selPop ihsFN pop2ancFreqFN',
                    #                                                 snpInfoFN = pop2snpInfoFN[ selPop ],
                    ihsOutFN=ihsScoresFN,
                    dihhOutFN=dihhScoresFN),
                attrs=dict(pop=selPop,
                           stat=('iHS', 'StdDiff'),
                           piperun_short=True))

            freqDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('freqDiff.tsv', chrom_s, selPop, pops))
            meanFstScoresFN = os.path.join(
                chromDir, AddFileSfx('meanFst.tsv', chrom_s, selPop, pops))

            pr.addInvokeRule(
                invokeFn=computeMeanFstAndFreqDiffScores,
                invokeArgs=Dict(
                    'chrom selPop sweepDir pops pop2ancFreqFN pop2sampleSizeFN',
                    outMeanFstFN=meanFstScoresFN,
                    outFreqDiffFN=freqDiffScoresFN),
                attrs=dict(pop=allPops,
                           stat=('freqDiff', 'meanFst'),
                           piperun_short=True))

            # The StdDiff file itself is created by the
            # normalizeByFreq_compute_normed_tsv rule defined further below.
            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))

            rawScoresFN[chrom] = dict(iHS=ihsScoresFN,
                                      StdDiff=StdDiffScoresFN,
                                      meanFst=meanFstScoresFN,
                                      freqDiff=freqDiffScoresFN,
                                      max_xpop=xpopScoresFN)

        # end: with pr.settingAttrs( 'chrom' )
    # end: for chrom in chroms

    #    ihhStdFN = os.path.join( cmsDir, 'dihhstd.tsv' )

    # Statistics of the iHH differences, computed once over all chromosomes.
    dihhGlobalStdFN = os.path.join(
        cmsDir, AddFileSfx('dihh_global_std.tsv', selPop, pops))
    dihhBinMeansFN = os.path.join(
        cmsDir, AddFileSfx('dihh_bin_means.tsv', selPop, pops))

    pr.addInvokeRule(invokeFn=normalizeByFreq_getMeanStd_tsv,
                     invokeArgs=dict(
                         iHHDiffFNs=[chr2dihhFN[k] for k in chroms],
                         globalStatFN=dihhGlobalStdFN,
                         binsStatFN=dihhBinMeansFN),
                     name='compute_dihh_meanstd')

    # pr.addInvokeRule( invokeFn = computeMeanStd_binned_tsvs,
    #                   invokeArgs = dict( inFNs = chr2dihhFN.values(), valCol = 'iHHDiff',
    #                                      binCol = 'normingFreqs', binMin = 0.05, binMax = 1.05, binStep = .05,
    #                                      outFN = ihhStdFN ),
    #                   name = 'compute_dihh_std' )

    # Per chromosome: compute the StdDiff table from the iHH differences
    # and the statistics files defined just above.
    for chrom in chroms:
        with pr.settingAttrs('chrom'):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))
            dbg('chrom chr2dihhFN[chrom]')
            pr.addInvokeRule(invokeFn=normalizeByFreq_compute_normed_tsv,
                             invokeArgs=dict(iHHDiffFN=chr2dihhFN[chrom],
                                             globalStatFN=dihhGlobalStdFN,
                                             binsStatFN=dihhBinMeansFN,
                                             StdDiffFN=StdDiffScoresFN))

    # (chrom, stat) -> file with the likelihood ratios of that stat
    statLikesRatioFNs = {}

    for stat in CMSBins.CMSstats:
        # iHS and StdDiff rules are attributed to selPop only; the rules of
        # the other stats are attributed to all pops.
        with pr.settingAttrs(
                stat=stat,
                pop=(selPop, ) if stat in ('iHS', 'StdDiff') else allPops,
                piperun_short=True):
            if stat not in CMSBins.nonNormedStats:
                # Mean and std of this stat over all chromosomes.
                rawFNs = [rawScoresFN[chrom][stat] for chrom in chroms]
                meanStdFN = os.path.join(
                    cmsDir, AddFileSfx('meanStd.tsv', stat, selPop, pops))

                # DefineRulesTo_computeMeanStd( pr, inFNs = rawFNs, colNum = 1,
                #                               outFN = meanStdFN,
                #                               addRuleArgs = \
                #                               dict( name = 'computeMeanStd_for_stat',
                #                                     attrs = dict( chrom = chroms ) ) )

                #                meanStdBzFN = os.path.join( cmsDir, stat + '_meanStdForStat.tsv' )
                pr.addInvokeRule(invokeFn=computeMeanStd,
                                 invokeArgs=dict(inFNs=rawFNs,
                                                 colName=stat,
                                                 outFN=meanStdFN))

            # end: if stat not in CMSBins.nonNormedStats

            for chrom in chroms:
                with pr.settingAttrs('chrom'):
                    statFN = rawScoresFN[chrom][stat]

                    if stat not in CMSBins.nonNormedStats:
                        normedFN = AddFileSfx(statFN, 'normed')

                        DefineRulesTo_normalizeOneColumn(
                            pr,
                            inFN=statFN,
                            meanStdFN=meanStdFN,
                            colName=stat,
                            outFN=normedFN,
                            addRuleArgs=dict(attrs=Dict('chrom')))
                        # From here on use the normalized values of the stat.
                        statFN = normedFN

                    bins_beg = CMSBins.stat_start[stat]
                    bins_end = CMSBins.stat_end[stat]
                    bins_n = CMSBins.stat_nbin[stat]

                    # The output is named after the raw scores file, even
                    # when the normalized file is the one used as input.
                    statLikesRatioFN = AddFileSfx(rawScoresFN[chrom][stat],
                                                  'likesRatio')
                    statLikesRatioFNs[(chrom, stat)] = statLikesRatioFN

                    pr.addInvokeRule(
                        invokeFn=computeLikeRatioForStat,
                        invokeArgs=dict(
                            stat=stat,
                            statValsFN=statFN,
                            hitLikesFN=
                            '../Data/Common_Data/sim/likes/hitsLikes_toneutFixed_1.tsv',
                            missLikesFN=
                            '../Data/Common_Data/sim/likes/missLikes_toneutFixed_1.tsv',
                            stat_start=bins_beg,
                            stat_end=bins_end,
                            stat_nbin=bins_n,
                            statLikesRatioFN=statLikesRatioFN))

                # end: with pr.settingAttrs( 'chrom' )
            # end: for chrom in chroms
        # end: with pr.settingAttrs( stat = stat, piperun_short = True )
    # end: for stat in CMSBins.CMSstats

    # Per chromosome: hand the likelihood-ratio files of all stats to
    # addLikesRatios, which writes likesRatioFN.
    for chrom in chroms:
        with pr.settingAttrs(chrom=chrom, stat=CMSBins.CMSstats):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            likesRatioFN = os.path.join(
                chromDir,
                AddFileSfx('likesRatio.tsv', CMSBins.CMSstats, selPop, pops))
            pr.addInvokeRule(invokeFn=addLikesRatios,
                             invokeArgs=dict(
                                 inFNs=[
                                     statLikesRatioFNs[(chrom, stat)]
                                     for stat in CMSBins.CMSstats
                                 ],
                                 colNames=[
                                     colName + 'likeRatio'
                                     for colName in CMSBins.CMSstats
                                 ],
                                 outFN=likesRatioFN))