Example #1
def StartPushers(use_args=None, as_daemons=False):
    parser = optparse.OptionParser()
    parser.add_option('-P',
                      '--num-result-pushers',
                      type='int',
                      dest='numResultPushers',
                      help='create NUMPROCS parallel result pushers',
                      metavar='NUMPROCS',
                      default=1)
    parser.add_option('-Q',
                      '--queues',
                      dest='queues',
                      default=os.path.join('..', 'Other', 'queues',
                                           getpass.getuser()),
                      help='push results for the specified QUEUES',
                      metavar='QUEUES')
    parser.add_option('-S',
                      '--sleepInterval',
                      type='int',
                      help='between checks, sleep for SEC seconds',
                      metavar='SEC',
                      default=20)
    dbg('use_args')
    (options, args) = parser.parse_args(
        args=sys.argv[1:] if use_args is None else list(use_args))
    dbg('options args')
    assert not args

    for i in range(options.numResultPushers):
        p = multiprocessing.Process(target=PushResults, args=(options, ))
        allPushers.append(p)
        p.daemon = as_daemons
        p.start()
        time.sleep(min(1.0, random.normalvariate(4.0, 1.0)))
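A hypothetical invocation (assuming the surrounding resultPusher module, with its PushResults worker and allPushers list, is importable) that starts two daemonized pushers polling every 30 seconds:

# Illustrative only; the option names match the parser defined above.
StartPushers(use_args=['-P', '2', '--sleepInterval', '30'], as_daemons=True)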
Example #2
def computeLikeRatioForStat_do(statVals, hitLikes, missLikes, bins):
    """Compute likes ratio"""

    # Precompute the likelihood ratio corresponding to each bin

    dbg('statVals hitLikes missLikes bins')

    indNaN = hitLikes != 1e-10
    missingVal = np.log(np.min(hitLikes[indNaN] / missLikes[indNaN]))

    CLR = [(np.log(hitLike / missLike) if hitLike != 1e-10 else missingVal)
           if hitLike != 0.0 else np.nan
           for hitLike, missLike in zip(hitLikes, missLikes)]
    CLR = np.array([CLR[0]] + CLR + [CLR[-1]])

    binIds = np.digitize(statVals.values, bins)
    st_binSize = (bins[1] - bins[0])
    st_nbins = len(bins) - 1
    binIds2 = np.where(
        np.isfinite(statVals.values),
        np.clip(((statVals.values - bins[0]) / st_binSize).astype(np.int16), 0,
                st_nbins - 1), len(hitLikes)) + 1

    return np.where(np.isnan(statVals.values),
                    np.repeat(np.nan,
                              len(statVals)), CLR[binIds]), binIds, binIds2
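A minimal, self-contained sketch (not part of the cms codebase; the bin edges and likelihood values below are made up) of the binned log-likelihood-ratio idea this function implements: each statistic value is mapped to a bin and scored as log(hitLike/missLike) for that bin, while NaN inputs stay NaN.

import numpy as np
import pandas as pd

bins = np.linspace(-3.0, 3.0, 7)                    # 7 edges -> 6 bins (hypothetical)
hitLikes = np.array([0.02, 0.05, 0.10, 0.20, 0.40, 0.23])
missLikes = np.array([0.25, 0.20, 0.20, 0.15, 0.10, 0.10])

clr = np.log(hitLikes / missLikes)                  # per-bin log likelihood ratio
clr = np.concatenate(([clr[0]], clr, [clr[-1]]))    # pad ends so out-of-range bins reuse edge values

statVals = pd.Series([-2.5, 0.1, 2.9, np.nan])
binIds = np.digitize(statVals.values, bins)         # 0..len(bins), matching the padded clr
print(np.where(np.isnan(statVals.values), np.nan, clr[binIds]))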
Example #3
def computeMeanFstAndFreqDiffScores(pops,
                                    chrom,
                                    selPop,
                                    sweepDir,
                                    pop2ancFreqFN,
                                    pop2sampleSizeFN,
                                    outMeanFstFN,
                                    outFreqDiffFN,
                                    getio=None):
    """Compute meanFst and freqDiff scores"""

    if selPop not in pops: pops = tuple(MakeSeq(pops)) + (selPop, )
    cmpPops = [pop for pop in pops if pop != selPop]

    if getio:
        return dict(depends_on=(pop2ancFreqFN, pop2sampleSizeFN),
                    creates=(outMeanFstFN, outFreqDiffFN),
                    attrs=Dict('chrom', pop=pops))

    #    pop2ancFreq.to_csv( 'befdrop.tsv', sep = '\t' )
    #    pop2ancFreq.fillna( value = 1.0, inplace = True )

    #    pop2ancFreq.to_csv( 'aftdrop.tsv', sep = '\t' )

    pop2ancFreq = pd.read_table(pop2ancFreqFN, index_col='pos')
    pop2sampleSize = pd.read_table(pop2sampleSizeFN,
                                   index_col='pop').sampleSize

    dbg('pop2sampleSize')

    #pop2snpInfo.to_csv( 'test.tsv', sep = '\t', header = True )

    derFreq = 1.0 - pop2ancFreq[selPop]
    cmpAncFreqs = pop2ancFreq[[pop for pop in pops if pop != selPop]]
    meanAnc = cmpAncFreqs.mean(axis=1)
    freqDiff = derFreq - (1.0 - meanAnc)
    freqDiff.name = 'freqDiff'
    freqDiff.to_csv(outFreqDiffFN, sep='\t', header=True)

    # compute meanFst

    #    dbg( '"vvvvvvvvvvvw" selPop pop2ancFreq[selPop] pop2ancFreq["JPT+CHB"] pop2ancFreq["YRI"]' )
    #    dbg( 'selPop pop2sampleSize[selPop] pop2sampleSize["JPT+CHB"] pop2sampleSize["YRI"]' )
    d = dict([(pop,
               fst_onePopPair(ancFreqs=np.array(
                   (pop2ancFreq[selPop], pop2ancFreq[pop])),
                              sampleSizes=(pop2sampleSize[selPop],
                                           pop2sampleSize[pop])))
              for pop in cmpPops])
    fstVals = pd.DataFrame(data=d, index=pop2ancFreq.index)
    #    spc = fst_onePopPair( ancFreqs = np.array( ( pop2ancFreq[ 'BEB' ], pop2ancFreq[ 'ASN' ] ) ),
    #                          sampleSizes = ( pop2sampleSize[ 'BEB' ], pop2sampleSize[ 'ASN' ] ) )
    #    dbg( '"ddddddddddd" fstVals.loc[526736] spc' )
    #    dbg( 'fstVals' )
    fstVals.fillna(value=0.0, inplace=True)
    #fstVals.to_csv( 'fstvals.tsv', sep = '\t', header = True, na_rep = 'NaN' )
    fstMean = fstVals.mean(axis=1)
    dbg('fstVals fstMean')
    fstMean.name = 'meanFst'
    fstMean.to_csv(outMeanFstFN, sep='\t', header=True, na_rep='NaN')
Example #4
    def LoadComparison(pop):
        """Load comparison with one pop"""

        d0 = pd.read_csv(pop2FN[pop],
                         sep='\t',
                         usecols=('Pop 1', 'Pop 2', 'Chrom'),
                         nrows=1)
        dbg('d0')

        assert str(d0.loc[0, 'Chrom']) == str(chrom)
        assert (d0.loc[0, 'Pop 1'] == selPop and d0.loc[0, 'Pop 2']
                == pop) or (d0.loc[0, 'Pop 1'] == pop
                            and d0.loc[0, 'Pop 2'] == selPop)

        flip = (d0.loc[0, 'Pop 1'] == pop)

        d = pd.read_csv(pop2FN[pop],
                        sep='\t',
                        usecols=('SNP pos (bases)',
                                 'L AllEHH logratio Deviation',
                                 'R AllEHH logratio Deviation'),
                        index_col='SNP pos (bases)',
                        na_values=('-', ))
        d.info()

        if flip:
            d['L AllEHH logratio Deviation'] *= -1
            d['R AllEHH logratio Deviation'] *= -1

        return pd.DataFrame.from_dict({pop: d.max(axis=1)})
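A self-contained sketch (with made-up numbers and a made-up pop label) of what the flip and the final max(axis=1) do above: when the file lists the comparison population as 'Pop 1', both XP-EHH deviation columns change sign, and each SNP is then scored by the larger of its left/right deviations.

import pandas as pd

d = pd.DataFrame({'L AllEHH logratio Deviation': [1.2, -0.4],
                  'R AllEHH logratio Deviation': [-0.3, 2.1]},
                 index=[1000, 2000])          # index plays the role of 'SNP pos (bases)'
flip = True                                   # the comparison pop appeared as 'Pop 1'
if flip:
    d *= -1                                   # equivalent to negating both deviation columns
print(pd.DataFrame.from_dict({'YRI': d.max(axis=1)}))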
Example #5
def normalizeInBins_tsv(inDataFN,
                        valCol,
                        binCol,
                        binMin,
                        binMax,
                        binStep,
                        binsFN,
                        outFN,
                        normedCol,
                        getio=None):
    """Normalize data within bins, using previously computed bin means"""

    if getio:
        return dict(depends_on=(inDataFN, binsFN),
                    creates=outFN,
                    uses=normalizeInBins)

    inData = pd.read_table(inDataFN)
    binStats = pd.read_table(binsFN)
    binMeans = binStats.means
    totCount = float(binStats.counts.sum())
    totMean = binStats.sums.sum() / totCount
    commonStd = np.sqrt(binStats.sumsSq.sum() / totCount - totMean * totMean)
    dbg('"CCCCCCCC" commonStd binMeans totCount totMean binStats.sums.sum() binStats.sumsSq.sum()'
        )
    normed = normalizeInBins(**Dict(
        'inData valCol binCol binMin binMax binStep binMeans commonStd'))
    inData.insert(len(inData.columns), normedCol, normed)
    inData.to_csv(outFN, sep='\t', na_rep='NaN', index=False)
Example #6
def DefineRulesTo_CreateSimulationParams(
        pr,
        mutAges,
        mutPops,
        mutFreqs,
        Ddata='../Data/Ilya_Data/sim/sfs/working/pardis2',
        suffix='',
        inputParamsFiles=None,
        scen2alternateNeutralParams={}):
    """Create simulation parameters for cross( mutAges, mutPops, mutFreqs ).
	"""

    dbg('"YYYYYYYYYYYYY" inputParamsFiles')
    if inputParamsFiles == None:
        inputParamsFiles = [
            Ddata + '/simParams' + befAft + suffix + '.txt'
            for befAft in ('Bef', 'Aft')
        ]
    dbg('"ZZZZZZZZZZZZZ" inputParamsFiles')

    pr.addInvokeRule(invokeFn=CreateSimsParams_neutral,
                     invokeArgs=Dict('Ddata suffix inputParamsFiles'))
    for mutAge in mutAges:
        for mutPop in mutPops:
            for mutFreq in mutFreqs:
                pr.addInvokeRule(
                    invokeFn=CreateSimsParams_selection,
                    invokeArgs=dict(
                        mutAges=(mutAge, ),
                        mutPops=(mutPop, ),
                        mutFreqs=(mutFreq, ),
                        **Dict('Ddata suffix scen2alternateNeutralParams')),
                    mediumRuleNameSfx=(mutAge, mutPop, mutFreq),
                    attrs=Dict('mutAge mutPop mutFreq'))
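The triple nested loop above enumerates the cross product of the scenario parameters; a small, self-contained illustration (with made-up values) of the same cross( mutAges, mutPops, mutFreqs ) enumeration:

import itertools

mutAges, mutPops, mutFreqs = (10, 50), (1, 4), (0.2, 0.8)
for mutAge, mutPop, mutFreq in itertools.product(mutAges, mutPops, mutFreqs):
    print(mutAge, mutPop, mutFreq)    # one simulation-parameter rule per combination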
Example #7
def evalSpatialLoc( Ddata, thinSfx, scenario, putativeMutPop, nreplicas, complikeSfx = '',
                    likesTableSfx = '', selpos = 500000, whichSpatialLoc = 'Spline',
                    getio = None ):
    """Evaluate spatial localization.  Compute relevant replica statistic.  For each replica,
    compute: whether the localized intervals include the causal SNP; statistics about the
    localized intervals; the position of the causal SNP relative to the localized intervals."""

    assert not scenario.isNeutral()
    
    snpStatsDir = os.path.join( Ddata, 'snpStats'+ thinSfx, scenario.scenDir() )
    replicaStatsDir = os.path.join( Ddata, 'replicastats'+ thinSfx, scenario.scenDir() )
    if putativeMutPop == None: putativeMutPop = scenario.mutPop
    sfxs = ( putativeMutPop, complikeSfx, likesTableSfx )

    intervalsListFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervals%sList.tsv' % whichSpatialLoc, *sfxs ) )
    causalGdPosFN = os.path.join( replicaStatsDir, 'causalGdPos.tsv' )
    spatialLocEvalFN = os.path.join( replicaStatsDir, AddFileSfx( 'spatialLocEval%s.tsv' % whichSpatialLoc, *sfxs ) )

    if getio: return dict( depends_on = ( intervalsListFN, causalGdPosFN ),
                           creates = spatialLocEvalFN,
                           mediumRuleNameSfx = scenario.scenDir(),
                           name = 'evalSpatialLoc_' + whichSpatialLoc )

    with IDotData.openForWrite( spatialLocEvalFN,
                                'replicaNum numIntervals totLenBp totLenGd causalIncluded '
                                'distanceToIntervalBoundaryBp distanceToIntervalBoundaryGd' ) as spatialLocEvalFile:

        for ( replicaNum2, replicaIntervals ), ( replicaNum3, causalGdPos, replicaNum4 ) in \
                itertools.izip( IDotData( intervalsListFN ).groupby( 'replicaNum' ),
                                IDotData.merge( iDotDatas = ( IDotData( causalGdPosFN ),
                                                              IDotData( intervalsListFN ).replicaNum.removeDups() ),
                                                cols = ( 'replicaNum', 'replicaNum' ) ) ):

            replicaNum2, replicaNum3, replicaNum4  = map( int, ( replicaNum2, replicaNum3, replicaNum4 ) )
            if not replicaNum2 == replicaNum3 == replicaNum4:
                dbg( 'replicaNum2 replicaNum3 replicaNum4 intervalsListFN complikeFN causalGdPosFN spatialLocEvalFN' )
            assert replicaNum2 == replicaNum3 == replicaNum4

            causalIncluded = False
            totLenBp = 0
            totLenGd = 0.0
            for replicaInterval in replicaIntervals:
                dbg( 'replicaInterval causalGdPos' )
                #assert bool( replicaInterval.bpFrom <= selpos <= replicaInterval.bpTo ) == bool( replicaInterval.gdFrom <= causalPos_gd <= replicaInterval.gdTo )
                assert ( replicaInterval.gdFrom <= causalGdPos <= replicaInterval.gdTo ) == ( replicaInterval.bpFrom <= selpos <= replicaInterval.bpTo )
                if replicaInterval.gdFrom <= causalGdPos <= replicaInterval.gdTo:
                    causalIncluded = True

                totLenGd += ( replicaInterval.gdTo - replicaInterval.gdFrom )
                totLenBp += ( replicaInterval.bpTo - replicaInterval.bpFrom )

            spatialLocEvalFile.writeRecord( replicaNum2, len( replicaIntervals ),
                                            totLenBp, totLenGd, int( causalIncluded ),
                                            np.min( ( np.min( np.abs( replicaIntervals.bpFrom - selpos ) ),
                                                      np.min( np.abs( replicaIntervals.bpTo - selpos ) ) ) ),
                                            np.min( ( np.min( np.abs( replicaIntervals.gdFrom - causalGdPos ) ),
                                                      np.min( np.abs( replicaIntervals.gdTo - causalGdPos ) ) ) ) )
Example #8
File: DotData.py Project: quank/cms
	def withoutNans( self, cols = None ):
		"""Return a new DotData obtained from this one by removing rows that have nan values
		in any of the given columns"""

		if cols == None: cols = self.dtype.names

		dbg( '#zip( cols, map( type, [ numpy.isnan( self[ c ] ) for c in cols ] ) )' )
		
		return self[ numpy.invert( reduce( operator.or_, [ numpy.isnan( self[ c ] ) for c in cols ] ) ) ]
Example #9
File: fastcms.py Project: quank/cms
def gather_iHS_scores( selPop, chrom, ihsFN, pop2ancFreqFN, ihsOutFN, dihhOutFN, getio = None ):
    """Gather iHS scores"""

    if getio: return dict( depends_on = ( ihsFN, pop2ancFreqFN ), creates = ( ihsOutFN, dihhOutFN ), 
                           attrs = Dict( 'chrom', pop = selPop, piperun_short = True ) )

    d0 = pd.read_csv( ihsFN, sep = '\t',
                      usecols = ( 'Population', 'Chrom' ), nrows = 1 )
    dbg( 'd0' )

    assert str( d0.loc[ 0, 'Chrom' ] ) == str( chrom )
    assert d0.loc[ 0, 'Population'] == selPop
        
    d = pd.read_csv( ihsFN, sep = '\t',
                     usecols = ( 'SNP pos (bases)', 'Ancestral Freq', 'Both iHS', 'Both iHH_D', 'Both iHH_A' ),
                     index_col = 'SNP pos (bases)',
                     na_values = ( '-', ) )
    
    d.index.name = 'pos'

    pop2ancFreq = pd.read_table( pop2ancFreqFN, index_col = 'pos', usecols = ( 'pos', selPop, ) )
#    snp2ancFreq = pd.read_table( snpInfoFN, index_col = 'SNP pos (bases)',
#                                 usecols = ( 'SNP pos (bases)', 'Ancestral Freq' ) )
#    snp2ancFreq.dropna( inplace = True )
    # dbg( 'len(pop2ancFreq) len(snp2ancFreq) pop2ancFreq.index.difference(snp2ancFreq.index)' )
    # dbg( 'len(pop2ancFreq) len(snp2ancFreq) snp2ancFreq.index.difference(pop2ancFreq.index)' )
    # dbg( 'np.all(pop2ancFreq.index.values==snp2ancFreq.index.values)' )
    # dbg( 'np.sum(pop2ancFreq.index.values==snp2ancFreq.index.values)' )
    # dbg( 'len(pop2ancFreq.index.values) len(snp2ancFreq.index.values)' )
#    pop2ancFreq.index.name = 'pos'
#    dbg( '3 pop2ancFreq selPop pop2ancFreq.columns' )
    pop2ancFreq.rename( columns = { selPop : selPop + '_ancFreq' }, inplace = True )
#    dbg( '4 pop2ancFreq' )

#    print "ii:", pop2ancFreq.info()
#    pop2ancFreq.to_csv( 'pf.tsv', sep = '\t', header = True, na_rep = 'NaN' )
#    dbg( '1 d' )
    
    d = d.join( pop2ancFreq, how = 'right', sort = True )
    
#    dbg( '2 d' )

#    af1 = d['Ancestral Freq']
    af2 = d[selPop + '_ancFreq']
#    dbg( '"GGGGGGGGGG" (af1-af2).max() (af1.isnull()==af2.isnull()).all()' )

    d_iHS = pd.DataFrame( data = dict( iHS = d[ 'Both iHS' ] ) )
    d_iHS.to_csv( ihsOutFN, sep = '\t', header = True, na_rep = 'NaN' )

#    dihh = subs.normalizeByFreq( rawVals = ( d[ 'Both iHH_D' ] - d[ 'Both iHH_A' ] ).values,
#                                 ancfreq = 1.0 - af2.values )
    d_iHH = pd.DataFrame( data = dict( iHHDiff = d[ 'Both iHH_D' ] - d[ 'Both iHH_A' ],
                                       normingFreqs = 1.0 - af2 ) )
    
    d_iHH.to_csv( dihhOutFN, sep = '\t', header = True, na_rep = 'NaN' )
Example #10
File: DotData.py Project: venkoyoung/cms
    def withoutNans(self, cols=None):
        """Return a new DotData obtained from this one by removing rows that have nan values
		in any of the given columns"""

        if cols == None: cols = self.dtype.names

        dbg('#zip( cols, map( type, [ numpy.isnan( self[ c ] ) for c in cols ] ) )'
            )

        return self[numpy.invert(
            reduce(operator.or_, [numpy.isnan(self[c]) for c in cols]))]
Example #11
File: fastcms.py Project: quank/cms
def computeMeanFstAndFreqDiffScores( pops, chrom, selPop, sweepDir,
                                     pop2ancFreqFN, pop2sampleSizeFN, outMeanFstFN, outFreqDiffFN, getio = None ):
    """Compute meanFst and freqDiff scores"""

    if selPop not in pops: pops = tuple( MakeSeq( pops ) ) + ( selPop, )
    cmpPops = [ pop for pop in pops if pop != selPop ]


    if getio: return dict( depends_on = ( pop2ancFreqFN, pop2sampleSizeFN ),
                           creates = ( outMeanFstFN, outFreqDiffFN ),
                           attrs = Dict( 'chrom', pop = pops ) )    

#    pop2ancFreq.to_csv( 'befdrop.tsv', sep = '\t' )
#    pop2ancFreq.fillna( value = 1.0, inplace = True )
    
#    pop2ancFreq.to_csv( 'aftdrop.tsv', sep = '\t' )

    pop2ancFreq = pd.read_table( pop2ancFreqFN, index_col = 'pos' )
    pop2sampleSize = pd.read_table( pop2sampleSizeFN, index_col = 'pop' ).sampleSize

    dbg( 'pop2sampleSize' )

    #pop2snpInfo.to_csv( 'test.tsv', sep = '\t', header = True )

    derFreq = 1.0 - pop2ancFreq[ selPop ]
    cmpAncFreqs = pop2ancFreq[ [ pop for pop in pops if pop != selPop ] ]
    meanAnc = cmpAncFreqs.mean( axis = 1 )
    freqDiff = derFreq - ( 1.0 - meanAnc )
    freqDiff.name = 'freqDiff'
    freqDiff.to_csv( outFreqDiffFN, sep = '\t', header = True )

    # compute meanFst

#    dbg( '"vvvvvvvvvvvw" selPop pop2ancFreq[selPop] pop2ancFreq["JPT+CHB"] pop2ancFreq["YRI"]' )
#    dbg( 'selPop pop2sampleSize[selPop] pop2sampleSize["JPT+CHB"] pop2sampleSize["YRI"]' )
    d = dict([ ( pop, fst_onePopPair( ancFreqs = np.array( ( pop2ancFreq[ selPop ], pop2ancFreq[ pop ] ) ),
                                      sampleSizes = ( pop2sampleSize[ selPop ], pop2sampleSize[ pop ] ) ) )
               for pop in cmpPops ])
    fstVals = pd.DataFrame( data = d, index = pop2ancFreq.index )
#    spc = fst_onePopPair( ancFreqs = np.array( ( pop2ancFreq[ 'BEB' ], pop2ancFreq[ 'ASN' ] ) ),
#                          sampleSizes = ( pop2sampleSize[ 'BEB' ], pop2sampleSize[ 'ASN' ] ) )
#    dbg( '"ddddddddddd" fstVals.loc[526736] spc' )
#    dbg( 'fstVals' )
    fstVals.fillna( value = 0.0, inplace = True )
    #fstVals.to_csv( 'fstvals.tsv', sep = '\t', header = True, na_rep = 'NaN' )
    fstMean = fstVals.mean( axis = 1 )
    dbg( 'fstVals fstMean' )
    fstMean.name = 'meanFst'
    fstMean.to_csv( outMeanFstFN, sep = '\t', header = True, na_rep = 'NaN' )
Example #12
def checkTableKey(inFN,
                  cols,
                  comparison='lt',
                  writeCheckedFile=True,
                  tsvOpts={},
                  lineFilter=None,
                  lineFilterCols=(),
                  getio=None):
    """Check that in the given table, record identifiers increase uniformly.

  Params:

     cols - the columns whose tuple should uniformly increase
     comparison - this comparison must be true between each record and the next.
       the comparison is the name of a routine in the operator module.
  """

    cols = tuple(MakeSeq(cols))
    lineFilterCols = tuple(MakeSeq(lineFilterCols))
    checkedFN = Str('$inFN.checked_${comparison}') + Sfx(*cols)
    if getio:
        return dict(depends_on=inFN,
                    creates=checkedFN if writeCheckedFile else (),
                    attrs=dict(piperun_short=True))

    comparisonFunc = getattr(operator, comparison)
    prevRec = None
    loadCols = cols + lineFilterCols

    nskipped = 0
    nchecked = 0
    for i, r in enumerate(IDotData(inFN, ToLoad=loadCols, **tsvOpts)):
        if lineFilter and not lineFilter(r):
            nskipped += 1
            continue

        thisRec = r[cols] if IsSeq(r) else (r, )
        if i > 0 and not comparisonFunc(prevRec, thisRec):
            logging.error(
                Str('at line $i of $inFN, looking at $cols: $prevRec is not $comparison $thisRec'
                    ))
            assert False
        else:
            nchecked += 1
        prevRec = thisRec

    dbg('nchecked nskipped')
    DumpFile(checkedFN, 'checked ok.')
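The comparison='lt' default relies on Python's element-wise tuple ordering; a tiny self-contained illustration (with made-up key values) of the check performed between consecutive records:

import operator

comparisonFunc = getattr(operator, 'lt')   # same lookup as in checkTableKey
prevRec, thisRec = (1, 100), (1, 250)      # e.g. (chrom, position) key tuples
print(comparisonFunc(prevRec, thisRec))    # True: the key strictly increases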
Example #13
def DefineRulesTo_RunSimsAndSweep(pr,
                                  Ddata,
                                  simsOut='simsOut',
                                  mutAges=AllAges,
                                  mutPops=AllPops,
                                  mutFreqs=AllFreqs,
                                  nreplicas=100,
                                  suffix='',
                                  thinning='',
                                  thinExt='',
                                  tests=('lrh', 'ihs', 'xpop'),
                                  doRunSims=True,
                                  doRunThinning=True,
                                  doRunSweep=True,
                                  inputParamsFiles=[],
                                  acceptExistingSimConfigFiles=False,
                                  pop2name=pop2name,
                                  runImportsLocally=True,
                                  doOnlyStages=None,
                                  DdataSeeds='',
                                  setOptions=(),
                                  appendOptions=(),
                                  scen2alternateNeutralParams={},
                                  useGenMap=None,
                                  powerSfx='',
                                  withGeneConvBug=False,
                                  withNewCosi=False,
                                  withCosi=None,
                                  DdataMimic=None):
    """Define rules for running simulations and doing Sweep analyses of them.

    Parameters:

       mutAges, mutPops, mutFreqs - parameters defining the selection scenario.
    
    """

    dbg('"Running_doRunSims"')
    if doRunSims:
        DefineRulesTo_RunSimsOnly(**Dict(
            'pr mutAges mutPops mutFreqs nreplicas Ddata simsOut suffix '
            'inputParamsFiles DdataSeeds useGenMap scen2alternateNeutralParams '
            'withGeneConvBug withNewCosi withCosi DdataMimic'))
    dbg('"Running_doThinning"')
    if doRunThinning:
        DefineRulesTo_DoThinning(**Dict(
            'pr mutAges mutPops mutFreqs nreplicas Ddata simsOut thinning '
            'thinExt suffix'))
    dbg('"Running_doRunSweep"')
    if doRunSweep:
        DefineRulesTo_RunSweepOnSims(**Dict(
            'pr mutAges mutPops mutFreqs nreplicas Ddata simsOut thinning '
            'suffix tests inputParamsFiles thinExt setOptions appendOptions '
            'acceptExistingSimConfigFiles pop2name runImportsLocally '
            'doOnlyStages powerSfx'))
    dbg('"FINISHED_runSimsAndSweep"')
Example #14
File: tsvutils.py Project: quank/cms
def normalizeInBins_tsv( inDataFN, valCol, binCol, binMin, binMax, binStep, binsFN, outFN,
                         normedCol,
                         getio = None):
    """Normalize data within bins, using previously computed bin means"""

    if getio: return dict( depends_on = ( inDataFN, binsFN ), creates = outFN, uses = normalizeInBins )

    inData = pd.read_table( inDataFN )
    binStats = pd.read_table( binsFN )
    binMeans = binStats.means
    totCount = float( binStats.counts.sum() )
    totMean = binStats.sums.sum() / totCount
    commonStd = np.sqrt( binStats.sumsSq.sum() / totCount - totMean * totMean )
    dbg( '"CCCCCCCC" commonStd binMeans totCount totMean binStats.sums.sum() binStats.sumsSq.sum()' )
    normed = normalizeInBins( **Dict( 'inData valCol binCol binMin binMax binStep binMeans commonStd' ) )
    inData.insert( len( inData.columns ), normedCol, normed )
    inData.to_csv( outFN, sep = '\t', na_rep = 'NaN', index = False )
Example #15
File: prun_par.py Project: quank/cms
def runCmdParallelized( commands, depends_on, creates, comment, 
                        splitFunc, joinFunc, saveOutputTo = None, splitFN = None, joinFN = None,
                        name = None, mediumRuleName = None, getio = None ):
    """Run the specified command, using parallelization."""
    from Operations.Ilya_Operations.PipeRun.python.PipeRun import PipeRun

    dbg( '"IN_RUNCMDPAR_EEEEEE" depends_on creates saveOutputTo' )

    if creates is None: creates = ()
    
    commands = MakeSeq( commands )
    depends_on = MakeSeq( depends_on )
    creates = MakeSeq( creates )

    gio = Dict( 'depends_on creates comment name mediumRuleName' )
    dbg( 'gio' )
    if getio: return Dict( 'depends_on creates comment name mediumRuleName saveOutputTo',
                           uses = ( splitFunc, joinFunc ) )

    splitFN = splitFN or list( MakeSeq( depends_on ) )[0]
    joinFN = RandomString(12) if saveOutputTo else ( joinFN or list( MakeSeq( creates ) )[0] )

    assert any([ splitFN in command for command in commands ])
    assert saveOutputTo or any([ joinFN in command for command in commands ])


    logging.info( 'calling ' + str( splitFunc ) + ' to split ' + splitFN )
    outDir = os.path.join( '/broad/hptmp', getpass.getuser(), 'par', os.path.abspath( splitFN )[1:] )

    pr = PipeRun( name = 'splitting', descr = 'splitting' )
    r = pr.addInvokeRule( invokeFn = doSplit, invokeArgs = Dict( 'splitFunc splitFN outDir' ) )
    pr.runSubPipeline()
    
    chunkFNs = SlurpFileLines( r.creates[0] )

    logging.info( 'finished running ' + str( splitFunc ) + ' to split ' + splitFN )
    dbg( '"CHUNKS_ARE" chunkFNs' )

    pr = PipeRun( name = 'parallelizing', descr = 'parallelizing' )
    chunkOutFNs = []
    for chunkFN in chunkFNs:
        chunkOutFN = AddFileSfx( chunkFN, 'out' )
        chunkOutFNs.append( chunkOutFN )

        for command in commands:
            dbg( 'splitFN chunkFN chunkOutFN command command.replace(splitFN,chunkFN)' )
        
        pr.addRule( commands = [ command.replace( splitFN, chunkFN ).replace( joinFN, chunkOutFN ) for command in commands ],
                    depends_on = [ f if f != splitFN else chunkFN for f in depends_on ],
                    creates = [ f if f != joinFN else chunkOutFN for f in creates ],
                    saveOutputTo = None if saveOutputTo is None else chunkOutFN )

    pr.runSubPipeline()

    joinFunc( inFNs = chunkOutFNs, outFN = None if saveOutputTo else joinFN )
Example #16
File: fastcms.py Project: quank/cms
def normalizeByFreq_compute_normed(rawVals,ancfreq, StdDev, expectation):

    Frequency = np.arange(0.05, 1.05, 0.05)
    #print Frequency

    der_freq = 1 - ancfreq
    normVal = np.repeat( np.nan, len( rawVals ) )

    # Bookkeeping

    dbg( 'StdDev' )
    dbg( 'der_freq' )
    for i in range(len(Frequency)):
        idx = ((Frequency[i] - der_freq) < .05) & ( (Frequency[i] - der_freq) >= 0 ) & np.isfinite( rawVals )
        normVal[ idx ] = (rawVals[ idx ] - expectation[i])/StdDev
#        dbg( '"KKKKK" i Frequency[i] expectation[i] idx.nonzero() rawVals[idx] normVal[idx]' )
    
    return normVal
Example #17
File: fastcms.py Project: quank/cms
def joinStats( snpInfoFN, statLikesFNs, likesRatioFN, outFN, getio = None ):
    """Join stats into one file"""

    if getio:
        return dict( depends_on = ( snpInfoFN, likesRatioFN ) + tuple( MakeSeq( statLikesFNs ) ),
                     creates = outFN )

    snpInfo = pd.read_table( snpInfoFN, index_col = 'SNP pos (bases)' )
    snpInfo.index.rename( 'pos', inplace = True )

    statLikes = [ pd.read_table( statLikeFN, index_col = 'pos' ) for statLikeFN in statLikesFNs ]
    likesRatio = pd.read_table( likesRatioFN, index_col = 'pos' )

    result = snpInfo.join( statLikes + [ likesRatio ], how = 'outer' )
    result.info()
    dbg( 'result.describe()' )
    
    result.to_csv( outFN, sep = '\t', na_rep = 'NaN', header = True )
Example #18
File: resultPusher.py Project: quank/cms
def StartPushers( use_args = None, as_daemons = False ):
    parser = optparse.OptionParser()
    parser.add_option( '-P', '--num-result-pushers', type='int', dest = 'numResultPushers',
                       help='create NUMPROCS parallel result pushers', metavar='NUMPROCS',
                       default=1 )
    parser.add_option( '-Q', '--queues', dest = 'queues',
                       default = os.path.join( '..', 'Other', 'queues', getpass.getuser() ),
                       help='push results for the specified QUEUES', metavar='QUEUES' )
    parser.add_option( '-S', '--sleepInterval', type='int',
                       help = 'between checks, sleep for SEC seconds', metavar = 'SEC', default = 20 )
    dbg( 'use_args' )
    (options, args) = parser.parse_args( args = sys.argv[1:] if use_args is None else list( use_args ) )
    dbg( 'options args' )
    assert not args
    
    for i in range( options.numResultPushers ):
        p = multiprocessing.Process( target = PushResults, args = ( options, ) )
        allPushers.append( p )
        p.daemon = as_daemons
        p.start()
        time.sleep( min( 1.0, random.normalvariate( 4.0, 1.0 ) ) )
Example #19
def normalizeByFreq_compute_normed(rawVals, ancfreq, StdDev, expectation):

    Frequency = np.arange(0.05, 1.05, 0.05)
    #print Frequency

    der_freq = 1 - ancfreq
    normVal = np.repeat(np.nan, len(rawVals))

    # Bookkeeping

    dbg('StdDev')
    dbg('der_freq')
    for i in range(len(Frequency)):
        idx = ((Frequency[i] - der_freq) < .05) & (
            (Frequency[i] - der_freq) >= 0) & np.isfinite(rawVals)
        normVal[idx] = (rawVals[idx] - expectation[i]) / StdDev


#        dbg( '"KKKKK" i Frequency[i] expectation[i] idx.nonzero() rawVals[idx] normVal[idx]' )

    return normVal
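A hedged usage sketch with synthetic inputs (assuming the function above is in scope, and using a no-op stand-in for the project's dbg helper): raw scores are normalized against the expected value for their derived-allele-frequency bin, divided by a common standard deviation.

import numpy as np

def dbg(expr):                                   # no-op stand-in for the project's dbg(), for this sketch only
    pass

rawVals = np.array([1.0, 2.5, np.nan, 0.7])
ancfreq = np.array([0.9, 0.55, 0.3, 0.12])       # derived freqs 0.1, 0.45, 0.7, 0.88
expectation = np.linspace(0.0, 1.9, 20)          # made-up per-bin expected values (20 bins of width 0.05)
print(normalizeByFreq_compute_normed(rawVals, ancfreq, StdDev=0.5, expectation=expectation))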
Example #20
File: fastcms.py Project: venkoyoung/cms
def joinStats(snpInfoFN, statLikesFNs, likesRatioFN, outFN, getio=None):
    """Join stats into one file"""

    if getio:
        return dict(depends_on=(snpInfoFN, likesRatioFN) +
                    tuple(MakeSeq(statLikesFNs)),
                    creates=outFN)

    snpInfo = pd.read_table(snpInfoFN, index_col='SNP pos (bases)')
    snpInfo.index.rename('pos', inplace=True)

    statLikes = [
        pd.read_table(statLikeFN, index_col='pos')
        for statLikeFN in statLikesFNs
    ]
    likesRatio = pd.read_table(likesRatioFN, index_col='pos')

    result = snpInfo.join(statLikes + [likesRatio], how='outer')
    result.info()
    dbg('result.describe()')

    result.to_csv(outFN, sep='\t', na_rep='NaN', header=True)
Example #21
def DefineRulesTo_CreateSimulationParams( pr, mutAges, mutPops, mutFreqs,
					  Ddata = '../Data/Ilya_Data/sim/sfs/working/pardis2',
					  suffix = '', inputParamsFiles = None,
					  scen2alternateNeutralParams = {} ):

	"""Create simulation parameters for cross( mutAges, mutPops, mutFreqs ).
	"""

	dbg( '"YYYYYYYYYYYYY" inputParamsFiles' )
	if inputParamsFiles == None:
		inputParamsFiles = [ Ddata + '/simParams' + befAft + suffix + '.txt' for befAft in 'Bef', 'Aft' ]
	dbg( '"ZZZZZZZZZZZZZ" inputParamsFiles' )

	pr.addInvokeRule( invokeFn = CreateSimsParams_neutral, invokeArgs = Dict( 'Ddata suffix inputParamsFiles' ) )
	for mutAge in mutAges:
		for mutPop in mutPops:
			for mutFreq in mutFreqs:
				pr.addInvokeRule( invokeFn = CreateSimsParams_selection,
						  invokeArgs = dict( mutAges = ( mutAge, ), mutPops = (mutPop,),
								     mutFreqs = (mutFreq,),
								     **Dict( 'Ddata suffix scen2alternateNeutralParams' ) ),
						  mediumRuleNameSfx = ( mutAge, mutPop, mutFreq ),
                                                  attrs = Dict( 'mutAge mutPop mutFreq' ) )
Example #22
File: fastcms.py Project: quank/cms
    def LoadComparison( pop ):
        """Load comparison with one pop"""

        d0 = pd.read_csv( pop2FN[ pop ], sep = '\t',
                          usecols = ( 'Pop 1', 'Pop 2', 'Chrom' ), nrows = 1 )
        dbg( 'd0' )

        assert str( d0.loc[ 0, 'Chrom' ] ) == str( chrom )
        assert ( d0.loc[ 0, 'Pop 1'] == selPop and d0.loc[0, 'Pop 2'] == pop ) or ( d0.loc[0,'Pop 1'] == pop and d0.loc[0,'Pop 2'] == selPop )

        flip = ( d0.loc[0,'Pop 1'] == pop )
        
        d = pd.read_csv( pop2FN[ pop ], sep = '\t',
                         usecols = ( 'SNP pos (bases)', 'L AllEHH logratio Deviation', 'R AllEHH logratio Deviation' ),
                         index_col = 'SNP pos (bases)',
                         na_values = ( '-', ) )
        d.info()

        if flip:
            d[ 'L AllEHH logratio Deviation' ] *= -1
            d[ 'R AllEHH logratio Deviation' ] *= -1

        return pd.DataFrame.from_dict({ pop : d.max( axis = 1 ) })
Example #23
File: tsvutils.py Project: quank/cms
def checkTableKey( inFN, cols, comparison = 'lt', writeCheckedFile = True,
                   tsvOpts = {}, lineFilter = None, lineFilterCols = (), getio = None ):
  """Check that in the given table, record identifiers increase uniformly.

  Params:

     cols - the columns whose tuple should uniformly increase
     comparison - this comparison must be true between each record and the next.
       the comparison is the name of a routine in the operator module.
  """

  cols = tuple( MakeSeq( cols ) )
  lineFilterCols = tuple( MakeSeq( lineFilterCols ) )
  checkedFN = Str( '$inFN.checked_${comparison}' ) + Sfx( *cols )
  if getio: return dict( depends_on = inFN, creates = checkedFN if writeCheckedFile else (),
                         attrs = dict( piperun_short = True ) )

  comparisonFunc = getattr( operator, comparison )
  prevRec = None
  loadCols = cols + lineFilterCols

  nskipped = 0
  nchecked = 0
  for i, r in enumerate( IDotData( inFN, ToLoad = loadCols, **tsvOpts ) ):
    if lineFilter and not lineFilter( r ):
      nskipped += 1
      continue
    
    thisRec = r[ cols ] if IsSeq( r ) else ( r, )
    if i > 0 and not comparisonFunc( prevRec, thisRec ):
      logging.error( Str( 'at line $i of $inFN, looking at $cols: $prevRec is not $comparison $thisRec' ) )
      assert False
    else: nchecked += 1
    prevRec = thisRec

  dbg( 'nchecked nskipped' )
  DumpFile( checkedFN, 'checked ok.' )
Example #24
File: fastcms.py Project: quank/cms
def computeLikeRatioForStat_do( statVals, hitLikes, missLikes, bins ):
    """Compute likes ratio"""

    # Precompute the likelihood ratio corresponding to each bin
    
    dbg( 'statVals hitLikes missLikes bins' )
         
    indNaN = hitLikes != 1e-10
    missingVal = np.log( np.min( hitLikes[indNaN] / missLikes[indNaN] ) )

    
    CLR = [ ( np.log( hitLike / missLike ) if hitLike != 1e-10 else missingVal ) if hitLike != 0.0 else np.nan
            for hitLike, missLike in zip( hitLikes, missLikes ) ]
    CLR = np.array( [ CLR[0] ] + CLR + [ CLR[-1] ] )

    binIds = np.digitize( statVals.values, bins )
    st_binSize = ( bins[1] - bins[0] )
    st_nbins = len( bins ) -1
    binIds2 = np.where( np.isfinite( statVals.values ),
                        np.clip( ( ( statVals.values - bins[0] ) / st_binSize ).astype( np.int16 ), 0, st_nbins-1 ),
                        len( hitLikes ) ) + 1

    return np.where( np.isnan( statVals.values ),
                     np.repeat( np.nan, len( statVals ) ), CLR[ binIds ] ), binIds, binIds2
Example #25
File: fastcms.py Project: quank/cms
def computeMeanStd( inFNs, colName, outFN, getio = None ):
    """Compute mean and std using blaze"""
    if getio: return dict( depends_on = inFNs, creates = outFN )

    filenames = inFNs
    dbg( 'inFNs' )

    sk = StatKeeper()
    for f in filenames:
        dbg( 'f' )
        d = pd.read_table( f )
        dbg( 'f len(d)' )
        sk.addVals( d[ colName ].values )

    pd.DataFrame( dict( stat = 'mean std count numNaNs'.split(),
                        val = ( sk.getMean(), sk.getStd(), sk.getCount(), sk.getNumNaNs() ) ) ).to_csv( outFN,
                                                                                                        sep = '\t',
                                                                                                        index = False,
                                                                                                        na_rep = 'NaN' )
Example #26
def computeMeanStd(inFNs, colName, outFN, getio=None):
    """Compute mean and std using blaze"""
    if getio: return dict(depends_on=inFNs, creates=outFN)

    filenames = inFNs
    dbg('inFNs')

    sk = StatKeeper()
    for f in filenames:
        dbg('f')
        d = pd.read_table(f)
        dbg('f len(d)')
        sk.addVals(d[colName].values)

    pd.DataFrame(
        dict(stat='mean std count numNaNs'.split(),
             val=(sk.getMean(), sk.getStd(), sk.getCount(),
                  sk.getNumNaNs()))).to_csv(outFN,
                                            sep='\t',
                                            index=False,
                                            na_rep='NaN')
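The project's StatKeeper class is not shown in this listing; a minimal stand-in with the same four accessors (illustrative only, skipping non-finite values when accumulating) might look like this:

import numpy as np

class MiniStatKeeper(object):
    """Illustrative stand-in for StatKeeper: accumulates count, sum and sum of squares, tracking NaNs."""
    def __init__(self):
        self.n = 0; self.s = 0.0; self.s2 = 0.0; self.nNaN = 0
    def addVals(self, vals):
        vals = np.asarray(vals, dtype=float)
        good = vals[np.isfinite(vals)]           # drop NaN/inf from the running sums
        self.nNaN += int(np.isnan(vals).sum())
        self.n += good.size
        self.s += good.sum()
        self.s2 += (good * good).sum()
    def getCount(self): return self.n
    def getNumNaNs(self): return self.nNaN
    def getMean(self): return self.s / self.n
    def getStd(self):
        m = self.getMean()
        return np.sqrt(self.s2 / self.n - m * m)

sk = MiniStatKeeper()
sk.addVals([1.0, 2.0, np.nan, 3.0])
print(sk.getMean(), sk.getStd(), sk.getCount(), sk.getNumNaNs())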
Example #27
def gather_iHS_scores(selPop,
                      chrom,
                      ihsFN,
                      pop2ancFreqFN,
                      ihsOutFN,
                      dihhOutFN,
                      getio=None):
    """Gather iHS scores"""

    if getio:
        return dict(depends_on=(ihsFN, pop2ancFreqFN),
                    creates=(ihsOutFN, dihhOutFN),
                    attrs=Dict('chrom', pop=selPop, piperun_short=True))

    d0 = pd.read_csv(ihsFN, sep='\t', usecols=('Population', 'Chrom'), nrows=1)
    dbg('d0')

    assert str(d0.loc[0, 'Chrom']) == str(chrom)
    assert d0.loc[0, 'Population'] == selPop

    d = pd.read_csv(ihsFN,
                    sep='\t',
                    usecols=('SNP pos (bases)', 'Ancestral Freq', 'Both iHS',
                             'Both iHH_D', 'Both iHH_A'),
                    index_col='SNP pos (bases)',
                    na_values=('-', ))

    d.index.name = 'pos'

    pop2ancFreq = pd.read_table(pop2ancFreqFN,
                                index_col='pos',
                                usecols=(
                                    'pos',
                                    selPop,
                                ))
    #    snp2ancFreq = pd.read_table( snpInfoFN, index_col = 'SNP pos (bases)',
    #                                 usecols = ( 'SNP pos (bases)', 'Ancestral Freq' ) )
    #    snp2ancFreq.dropna( inplace = True )
    # dbg( 'len(pop2ancFreq) len(snp2ancFreq) pop2ancFreq.index.difference(snp2ancFreq.index)' )
    # dbg( 'len(pop2ancFreq) len(snp2ancFreq) snp2ancFreq.index.difference(pop2ancFreq.index)' )
    # dbg( 'np.all(pop2ancFreq.index.values==snp2ancFreq.index.values)' )
    # dbg( 'np.sum(pop2ancFreq.index.values==snp2ancFreq.index.values)' )
    # dbg( 'len(pop2ancFreq.index.values) len(snp2ancFreq.index.values)' )
    #    pop2ancFreq.index.name = 'pos'
    #    dbg( '3 pop2ancFreq selPop pop2ancFreq.columns' )
    pop2ancFreq.rename(columns={selPop: selPop + '_ancFreq'}, inplace=True)
    #    dbg( '4 pop2ancFreq' )

    #    print "ii:", pop2ancFreq.info()
    #    pop2ancFreq.to_csv( 'pf.tsv', sep = '\t', header = True, na_rep = 'NaN' )
    #    dbg( '1 d' )

    d = d.join(pop2ancFreq, how='right', sort=True)

    #    dbg( '2 d' )

    #    af1 = d['Ancestral Freq']
    af2 = d[selPop + '_ancFreq']
    #    dbg( '"GGGGGGGGGG" (af1-af2).max() (af1.isnull()==af2.isnull()).all()' )

    d_iHS = pd.DataFrame(data=dict(iHS=d['Both iHS']))
    d_iHS.to_csv(ihsOutFN, sep='\t', header=True, na_rep='NaN')

    #    dihh = subs.normalizeByFreq( rawVals = ( d[ 'Both iHH_D' ] - d[ 'Both iHH_A' ] ).values,
    #                                 ancfreq = 1.0 - af2.values )
    d_iHH = pd.DataFrame(data=dict(iHHDiff=d['Both iHH_D'] - d['Both iHH_A'],
                                   normingFreqs=1.0 - af2))

    d_iHH.to_csv(dihhOutFN, sep='\t', header=True, na_rep='NaN')
Example #28
def mergeSims( scenario, Ddata, posFileFN = None, simsOut = 'simsOut', nreplicas = 100, thinExt = '', thinSfx = '',
	       putativeMutPop = None, outFile = None,
	       pop2name = pop2name, statsSfx = '', ihsSfx = '',
               limitToPop = None,
	       getio = None ):
	"""Gathers per-SNP information, for all replicas of a given scenario, and outputs it in a single DotData where each line
	gives info for one SNP.

	Specifically, reads simulation and Sweep output, collects columns needed for composite likelihood test (chrom, base pair position, genetic
	distance, anc frequencies for 3 populations, xpop for each pair, and ihs, iHH_A and iHH_D for selected population)

	Input params:

	   scenario - an object of class Scenario, indicating the simulation scenario (either neutral or a selection scenario)
	       from which all replicas were simulated.
	   nreplicas - the number of replicas simulated under this scenario.
	      Each replica represents a chromosome region, with a set of SNPs on it.
	   
	   Ddata - the directory under which the simulations and the Sweep analysis results live.
	     Under this directory we expect to find:
	         iHS analysis results, under power_ihs/
		 XP-EHH analysis results, under power_xpop
		 simulation output giving SNP positions

	   thinExt - the extension appended to simulation files that describe the SNPs in the simulated replica.
	      Sometimes we create simulations and then thin them under different thinning models (to simulate SNP ascertainment
	      by the various stages of HapMap); these differently thinned versions of the same simulations might be stored in
	      simulation files with different extensions.

	   thinSfx - the suffix appended to the power_ihs and power_xpop directory names, telling where to find iHS and XP-EHH
	      analyses of the simulations.   When we analyze the same simulations after applying different thinning scenarios,
	      the iHS and XP-EHH analyses for each thinning scenario go into a separate set of directories.


	   putativeMutPop - the population in which, we think, selection is occurring, aka "putatively selected population".
	      In practice, when localizing a given region, we will usually suspect that selection has occurred in a particular
	      population.   When doing a genome-wide scan, we can do several scans assuming each population in turn to be
	      the selected population, and find regions selected in that population.

        Output params:

	    Ddata - under Ddata writes a DotData named merged_scenName.data, where each line gives info
	        for one SNP, with the following columns (type of data is float unless stated otherwise):

	        CHROM_POS 1 - physical (basepair) position of the SNP within its replica.
	           Note that one merged file contains SNPs for a set of replicas (all for the same scenario),
		   so there could be multiple SNPs with the same position.  The replica number
		   is given in the Chrom column.
		FREQ1 1 - derived allele frequency in pop 1 ( European )
		FREQ1 4 - derived allele frequency in pop 4 ( EastAsian )
		FREQ1 5 - derived allele frequency in pop 5 ( WestAfrican )

		R AllEHH logratio Deviation European_WestAfrican - XP-EHH score to the right of the SNP,
		   between European and WestAfrican pops, normalized to the neutral background.
		   Analogously for the next five columns:
		L AllEHH logratio Deviation European_WestAfrican
		R AllEHH logratio Deviation EastAsian_European
		L AllEHH logratio Deviation EastAsian_European
		R AllEHH logratio Deviation EastAsian_WestAfrican
		L AllEHH logratio Deviation EastAsian_WestAfrican

		SNP pos (cM) European_WestAfrican - genetic map position of this SNP, within its replica.
		   (the European_WestAfrican suffix is irrelevant).
		SNP pos (bases) European_WestAfrican - physical (basepair) position of this SNP within its replica.
		   (the European_WestAfrican suffix is irrelevant).
		Chrom European_WestAfrican - the replica from which this SNP comes; can be nan.
		   (the European_WestAfrican suffix is irrelevant)
		Chrom - the replica from which this SNP comes; can be nan
		SNP pos (bases) - physical (basepair) position of this SNP within its replica.
		SNP pos (cM) - genetic map position of this SNP within its replica
		Both iHH_A - sum of iHH_A for both directions from this SNP
		Both iHH_D - sum of iHH_D for both directions from this SNP
		Both iHS - the value in 'Both Unstandardised iHS' (below), but binned by derived allele frequency
		   and normalized within the bin.
		Left iHH_D - iHH_D to the left of the SNP (the raw integral value).  analogously for the next three.
		Right iHH_D
		Left iHH_A
		Right iHH_A
		Both Unstandardised iHS - log( (iHH_A_left + iHH_A_right) / ( iHH_D_left + iHH_D_right ) )
		   ( see also 'Both iHS' column for the standardized iHS score )
	
	"""

	if not Ddata.endswith('/'): Ddata += '/'

	assert nreplicas > 0
	dbg( 'pop2name' )

	SimDir = os.path.join( Ddata, simsOut + thinSfx )

	scenName = scenario.scenName()
	scenDir = scenario.scenDir()

	if putativeMutPop == None: putativeMutPop = scenario.mutPop
	
	ihsSignifFN = os.path.join( Ddata, 'power_ihs' + thinSfx, scenDir,
				    'ihs_sig_' + pop2name[ putativeMutPop ] + ihsSfx + '.tsv' )

	popNames = sorted( pop2name.values() )
	popNums = sorted( pop2name.keys() )
	minPopNum = popNums[ 0 ]

	posFileKeyCols = ( 'replicaNum', 'CHROM_POS %d' % minPopNum )
	xpopIhsKeyCols = ('Chrom', 'SNP pos (bases)')
	
	popPairs = [ '%s_%s' % ( popNames[ pop1idx ], popNames[ pop2idx ] )
		     for pop1idx in range( len( popNames ) ) for pop2idx in range( pop1idx+1, len( popNames ) )
                     if limitToPop is None or limitToPop in ( popNames[ pop1idx ], popNames[ pop2idx ] )  ]
	
	xpopSignifFNs = [ os.path.join( Ddata, 'power_xpop' + thinSfx, scenDir, 'xpop_significance_' + popPair + '.tsv' )
			  for popPair in popPairs ]

	snpStatsDir = os.path.join( Ddata, 'snpStats' + thinSfx, scenario.scenDir() )
    
	mergedData = outFile if outFile else os.path.join( snpStatsDir, AddFileSfx( 'merged.tsv', statsSfx, putativeMutPop, ihsSfx ) )

	fileDescrs = \
	{ mergedData :
		  ( 'Various per-snp statistics for SNPs in scenario $scenario, replicas 0-$nreplicas, '
		    'assuming selection in ' + pop2name[ putativeMutPop ],
		    ( ( 'CHROM_POS 1', 'physical (basepair) position of the SNP within its replica. '
			'Note that one merged file contains SNPs for a set of replicas (all for the same scenario), '
			'so there could be multiple SNPs with the same position.  The replica number '
			'is given in the Chrom column. ' ), 
		      ( 'FREQ1 1', 'derived allele frequency in pop 1 ( European )' ),
		      ( 'R AllEHH logratio Deviation European_WestAfrican', 'XP-EHH score to the R of the SNP, '
			'between European and WestAfrican pops, normalized to the neutral background.' ),
		      ( 'SNP pos (cM) European_WestAfrican', 'genetic map SNP position' ),
		      ( 'SNP pos (bases) European_WestAfrican', 'physical SNP position' ),
		      ( 'Chrom European_WestAfrican', 'chromosome (or replica number)' ),
		      ( 'Chrom', 'chromosome (or replica number)' ),
		      ( 'SNP pos (bases)', 'physical SNP position' ),
		      ( 'SNP pos (cM)', 'genetic map SNP position' ),
		      ( 'Both iHH_A', 'sum of iHH_A scores for both sides' ),
		      ( 'Both iHH_D', 'sum of iHH_D scores for both sides' ),
		      ( 'Both iHS', 'sum of iHS scores for both sides' ),
		      ( ' Left iHH_D', 'iHH_D score to the left of the SNP' ),
		      ( 'Right iHH_D', 'iHH_D score to the right of the SNP' ),
		      ( 'Left iHH_A', 'iHH_A score to the left of the SNP' ),
		      ( 'Right iHH_A', 'iHH_A score to the right of the SNP' ), 
		      ( 'Both Unstandardised iHS', 'sum of unstandardized iHS scores for both sides' ) ) ) }

	if posFileFN is None: posFileFN = os.path.join( Ddata, 'snpStats' + thinSfx, scenario.scenDir(),
							AddFileSfx( 'mergedPosStacked.tsv', statsSfx, putativeMutPop, ihsSfx ) )
	
	if getio: return dict( depends_on = [ posFileFN, ihsSignifFN ] + xpopSignifFNs, creates = mergedData,
			       splitByCols = dict([ ( posFileFN, dict( keyCols = posFileKeyCols ) ) ]
						  + [ ( signifFN, dict( keyCols = xpopIhsKeyCols ) ) for signifFN in [ ihsSignifFN ] + xpopSignifFNs ] ),
			       mediumRuleNameSfx = ( scenario.scenDir(), putativeMutPop ),
			       fileDescrs = fileDescrs,
                               attrs = Dict( 'putativeMutPop nreplicas pop2name' ) )

	dashFixer = lambda v: v if v != '-' else np.nan

	ihsAll = IDotData(ihsSignifFN, valueFixer = dashFixer)
	ihsAll = ihsAll[('Chrom','SNP pos (bases)','SNP pos (cM)','Both iHH_A','Both iHH_D','Both iHS',
			 'Left iHH_D','Right iHH_D','Left iHH_A','Right iHH_A','Both Unstandardised iHS')]
	def chkReplica( r, n = nreplicas ): return r.Chrom < n 
		
	ihsAll = ihsAll.takewhile( chkReplica )
	
	xpopCols = ('Chrom','SNP pos (bases)','SNP pos (cM)','L AllEHH logratio Deviation','R AllEHH logratio Deviation')

	xpopSignif = tuple( [ IDotData(xpopSignifFN, valueFixer = dashFixer)[ xpopCols].takewhile( chkReplica )
			      for xpopSignifFN in xpopSignifFNs ] )
	
	posCols = ['CHROM_POS %d' % minPopNum ] + ['FREQ1 %d' % popNum for popNum in popNums]

	result = IDotData.merge( iDotDatas =  ( IDotData( posFileFN ), ) + xpopSignif + ( ihsAll, ),
				 cols = (posFileKeyCols,) +
					 (xpopIhsKeyCols,) * ( len( popPairs ) + 1 ),
				 blanks = (None,) + (np.nan,) * ( len( popPairs ) + 1 ),
				 suffixes = ['pos'] + [ ' %s' % popPair for popPair in popPairs ] + [ '' ] )

	aPopPair = 'European_WestAfrican' if 'European_WestAfrican' in popPairs else popPairs[0]
	useCols = [ 'replicaNum' ] + posCols + \
	    [ '%s AllEHH logratio Deviation %s' % ( side, popPair ) for popPair in popPairs for side in ( 'L', 'R' ) ] + \
	    [ 'SNP pos (cM) ' + aPopPair,
	      'SNP pos (bases) ' + aPopPair,
	      'Chrom ' + aPopPair,
	      'Chrom',
	      'SNP pos (bases)',
	      'SNP pos (cM)',
	      'Both iHH_A',
	      'Both iHH_D',
	      'Both iHS' ]

	if len( popPairs ) == 1:
		result = result.renameCols( { 'L AllEHH logratio Deviation' : 'L AllEHH logratio Deviation ' + aPopPair,
					      'R AllEHH logratio Deviation' : 'R AllEHH logratio Deviation ' + aPopPair } )
					      
	result[ useCols ].save( mergedData )
	
	logging.info( 'Finished mergeSims()' )
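The 'Both Unstandardised iHS' column described in the docstring above is just the log ratio of the summed ancestral and derived iHH integrals; with made-up numbers:

import numpy as np

iHH_A_left, iHH_A_right = 0.8, 1.1     # hypothetical integrals
iHH_D_left, iHH_D_right = 0.3, 0.4
unstandardised_iHS = np.log((iHH_A_left + iHH_A_right) / (iHH_D_left + iHH_D_right))
print(unstandardised_iHS)              # later binned by derived freq and standardized to give 'Both iHS'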
Example #29
def fst_onePopPair(ancFreqs, sampleSizes):
        """Compute fst between two pops, for each SNP, given the ancestral freq in each pop and the sample sizes.
        """

        dbg( 'ancFreqs sampleSizes' )

        n = sampleSizes
        n_tot = n[0] + n[1]
#        nanc =  np.ceil([ ancFreqs[0] * sampleSizes[0], ancFreqs[1] * sampleSizes[1] ])
        nanc =  ( np.array([ ancFreqs[0] * sampleSizes[0], ancFreqs[1] * sampleSizes[1] ]) + .5 ).astype( int )

        dbg( 'nanc' )

        f1 = ancFreqs
        f2 = 1 - ancFreqs

        dbg( 'f1 f2 f1.shape f2.shape' )

        # Use Weir-Hill estimator for Fst

        pmean = (nanc[0] + nanc[1]) / n_tot

        nic = n[0] - (n[0]*n[0])/n_tot
        njc = n[1] - (n[1]*n[1])/n_tot
        dbg( 'pmean.shape nic.shape njc.shape nic njc' )
        #assert nic == njc == 60
        nc = nic + njc
        #print nc, f1[0], f1[1]
        #print f2[0], f2[1]
        msp = n[0] * (f1[0] - pmean) * (f1[0] - pmean) \
            + n[1] * (f1[1] - pmean) * (f1[1] - pmean)
        msg = ((n[0] * f1[0]* f2[0]) + (n[1] * f1[1] * f2[1])) \
            / (n[0] - 1 + n[1] - 1)
        dbg( 'msp.shape msg.shape' )
        num = msp - msg
        denom = msp + (msg*(nc - 1))
        #print msp, msg
        #print num, denom
        dbg( 'num.shape denom.shape' )
        denom[ denom == 0 ] = np.nan
        an_fst = num / denom

        dbg( 'an_fst.shape' )

        ipop = 0
        jpop = 1
        dbg( '"KKKKKKKKKKKKK" pmean nanc[ipop] nanc[jpop] n[ipop] n[jpop] nic njc nc msp msg num denom an_fst' )
        
        return an_fst
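A self-contained scalar sketch (not project code) of the same Weir-Hill estimator for a single SNP; unlike the function above it skips the rounding of ancestral counts and the dbg calls.

import numpy as np

def fst_weir_hill(p1, p2, n1, n2):
    """Fst for one SNP given ancestral allele freqs p1, p2 and sample sizes n1, n2 (illustrative)."""
    n_tot = n1 + n2
    pmean = (p1 * n1 + p2 * n2) / n_tot                  # pooled ancestral frequency
    nc = (n1 - n1 * n1 / n_tot) + (n2 - n2 * n2 / n_tot)
    msp = n1 * (p1 - pmean) ** 2 + n2 * (p2 - pmean) ** 2
    msg = (n1 * p1 * (1 - p1) + n2 * p2 * (1 - p2)) / (n1 - 1 + n2 - 1)
    denom = msp + msg * (nc - 1)
    return np.nan if denom == 0 else (msp - msg) / denom

print(fst_weir_hill(0.9, 0.4, 120, 120))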
Example #30
def SetStopSignal(sigNum, stkFrm):
    logging.info('Setting stop signal to stop pushers')
    stopSignal[0] = True
    dbg('"aftset" stopSignal')
Example #31
def runCmdParallelized(commands,
                       depends_on,
                       creates,
                       comment,
                       splitFunc,
                       joinFunc,
                       saveOutputTo=None,
                       splitFN=None,
                       joinFN=None,
                       name=None,
                       mediumRuleName=None,
                       getio=None):
    """Run the specified command, using parallelization."""
    from Operations.Ilya_Operations.PipeRun.python.PipeRun import PipeRun

    dbg('"IN_RUNCMDPAR_EEEEEE" depends_on creates saveOutputTo')

    if creates is None: creates = ()

    commands = MakeSeq(commands)
    depends_on = MakeSeq(depends_on)
    creates = MakeSeq(creates)

    gio = Dict('depends_on creates comment name mediumRuleName')
    dbg('gio')
    if getio:
        return Dict(
            'depends_on creates comment name mediumRuleName saveOutputTo',
            uses=(splitFunc, joinFunc))

    splitFN = splitFN or list(MakeSeq(depends_on))[0]
    joinFN = RandomString(12) if saveOutputTo else (
        joinFN or list(MakeSeq(creates))[0])

    assert any([splitFN in command for command in commands])
    assert saveOutputTo or any([joinFN in command for command in commands])

    logging.info('calling ' + str(splitFunc) + ' to split ' + splitFN)
    outDir = os.path.join('/broad/hptmp', getpass.getuser(), 'par',
                          os.path.abspath(splitFN)[1:])

    pr = PipeRun(name='splitting', descr='splitting')
    r = pr.addInvokeRule(invokeFn=doSplit,
                         invokeArgs=Dict('splitFunc splitFN outDir'))
    pr.runSubPipeline()

    chunkFNs = SlurpFileLines(r.creates[0])

    logging.info('finished running ' + str(splitFunc) + ' to split ' + splitFN)
    dbg('"CHUNKS_ARE" chunkFNs')

    pr = PipeRun(name='parallelizing', descr='parallelizing')
    chunkOutFNs = []
    for chunkFN in chunkFNs:
        chunkOutFN = AddFileSfx(chunkFN, 'out')
        chunkOutFNs.append(chunkOutFN)

        for command in commands:
            dbg('splitFN chunkFN chunkOutFN command command.replace(splitFN,chunkFN)'
                )

        pr.addRule(
            commands=[
                command.replace(splitFN, chunkFN).replace(joinFN, chunkOutFN)
                for command in commands
            ],
            depends_on=[f if f != splitFN else chunkFN for f in depends_on],
            creates=[f if f != joinFN else chunkOutFN for f in creates],
            saveOutputTo=None if saveOutputTo is None else chunkOutFN)

    pr.runSubPipeline()

    joinFunc(inFNs=chunkOutFNs, outFN=None if saveOutputTo else joinFN)
Example #32
def evalSpatialLoc(Ddata,
                   thinSfx,
                   scenario,
                   putativeMutPop,
                   nreplicas,
                   complikeSfx='',
                   likesTableSfx='',
                   selpos=500000,
                   whichSpatialLoc='Spline',
                   getio=None):
    """Evaluate spatial localization.  Compute relevant replica statistic.  For each replica,
    compute: whether the localized intervals include the causal SNP; statistics about the
    localized intervals; the position of the causal SNP relative to the localized intervals."""

    assert not scenario.isNeutral()

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())
    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())
    if putativeMutPop == None: putativeMutPop = scenario.mutPop
    sfxs = (putativeMutPop, complikeSfx, likesTableSfx)

    intervalsListFN = os.path.join(
        replicaStatsDir,
        AddFileSfx('intervals%sList.tsv' % whichSpatialLoc, *sfxs))
    causalGdPosFN = os.path.join(replicaStatsDir, 'causalGdPos.tsv')
    spatialLocEvalFN = os.path.join(
        replicaStatsDir,
        AddFileSfx('spatialLocEval%s.tsv' % whichSpatialLoc, *sfxs))

    if getio:
        return dict(depends_on=(intervalsListFN, causalGdPosFN),
                    creates=spatialLocEvalFN,
                    mediumRuleNameSfx=scenario.scenDir(),
                    name='evalSpatialLoc_' + whichSpatialLoc)

    with IDotData.openForWrite(
            spatialLocEvalFN,
            'replicaNum numIntervals totLenBp totLenGd causalIncluded '
            'distanceToIntervalBoundaryBp distanceToIntervalBoundaryGd'
    ) as spatialLocEvalFile:

        for ( replicaNum2, replicaIntervals ), ( replicaNum3, causalGdPos, replicaNum4 ) in \
                zip( IDotData( intervalsListFN ).groupby( 'replicaNum' ),
                                IDotData.merge( iDotDatas = ( IDotData( causalGdPosFN ),
                                                              IDotData( intervalsListFN ).replicaNum.removeDups() ),
                                                cols = ( 'replicaNum', 'replicaNum' ) ) ):

            replicaNum2, replicaNum3, replicaNum4 = list(
                map(int, (replicaNum2, replicaNum3, replicaNum4)))
            if not replicaNum2 == replicaNum3 == replicaNum4:
                dbg('replicaNum2 replicaNum3 replicaNum4 intervalsListFN causalGdPosFN spatialLocEvalFN')
            assert replicaNum2 == replicaNum3 == replicaNum4

            causalIncluded = False
            totLenBp = 0
            totLenGd = 0.0
            for replicaInterval in replicaIntervals:
                dbg('replicaInterval causalGdPos')
                #assert bool( replicaInterval.bpFrom <= selpos <= replicaInterval.bpTo ) == bool( replicaInterval.gdFrom <= causalPos_gd <= replicaInterval.gdTo )
                assert (replicaInterval.gdFrom <= causalGdPos <=
                        replicaInterval.gdTo) == (replicaInterval.bpFrom <=
                                                  selpos <=
                                                  replicaInterval.bpTo)
                if replicaInterval.gdFrom <= causalGdPos <= replicaInterval.gdTo:
                    causalIncluded = True

                totLenGd += (replicaInterval.gdTo - replicaInterval.gdFrom)
                totLenBp += (replicaInterval.bpTo - replicaInterval.bpFrom)

            spatialLocEvalFile.writeRecord(
                replicaNum2, len(replicaIntervals), totLenBp, totLenGd,
                int(causalIncluded),
                np.min((np.min(np.abs(replicaIntervals.bpFrom - selpos)),
                        np.min(np.abs(replicaIntervals.bpTo - selpos)))),
                np.min((np.min(np.abs(replicaIntervals.gdFrom - causalGdPos)),
                        np.min(np.abs(replicaIntervals.gdTo - causalGdPos)))))
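
A toy version of the per-replica evaluation above, with made-up intervals and a made-up causal position: given the localized intervals, report whether the causal SNP is covered, the total interval length, and the distance to the nearest interval boundary.

import numpy as np

def eval_intervals(intervals, causal_pos):
    starts = np.array([a for a, b in intervals], dtype=float)
    ends = np.array([b for a, b in intervals], dtype=float)
    included = bool(np.any((starts <= causal_pos) & (causal_pos <= ends)))
    tot_len = float(np.sum(ends - starts))
    dist_to_boundary = float(min(np.min(np.abs(starts - causal_pos)),
                                 np.min(np.abs(ends - causal_pos))))
    return included, tot_len, dist_to_boundary

print(eval_intervals([(480000, 520000), (600000, 640000)], causal_pos=500000))
# -> (True, 80000.0, 20000.0)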
Example #33
0
def PushResults(options):
    """Push finished results"""

    logging.info(
        'Starting result pusher as process %d on host %s with options %s' %
        (os.getpid(), GetHostName(), options))

    # check that all queues exist
    assert all(
        os.path.exists(queue) and os.path.isdir(queue)
        for queue in options.queues.split(os.pathsep))

    stopSignal = [False]

    def SetStopSignal(sigNum, stkFrm):
        logging.info('Setting stop signal to stop pushers')
        stopSignal[0] = True
        dbg('"aftset" stopSignal')

    signal.signal(signal.SIGUSR2, SetStopSignal)

    while not stopSignal[0]:

        for queue in options.queues.split(os.pathsep):

            logging.info('Pushing results in ' + queue + '...')

            # find an unclaimed task in this queue, and try to claim it
            try:
                taskDirs = [f for f in os.listdir(queue) if f.startswith('mq')]
            except EnvironmentError as e:
                logging.info('Error getting list of tasks in queue ' + queue +
                             ': ' + str(e))
                # sleep a bit -- maybe it's some transient condition that will resolve itself
                time.sleep(60 + random.normalvariate(3.0, 1.0))
                continue

            for taskDir in taskDirs:
                fullTaskDir = os.path.join(queue, taskDir)

                try:
                    pushingFN = os.path.join(fullTaskDir, 'pushing.dat')
                    submithostFN = os.path.join(fullTaskDir, 'submithost.txt')
                    if os.path.exists( os.path.join( fullTaskDir, 'completed.dat' ) ) \
                            and os.path.exists( submithostFN ) and GetSubmitHost( submithostFN ) == GetHostName() \
                            and not os.path.exists( pushingFN ):
                        try:
                            fd = os.open(pushingFN,
                                         os.O_CREAT | os.O_EXCL | os.O_WRONLY)
                        except EnvironmentError as e:
                            if e.errno not in (errno.EEXIST, errno.EACCES,
                                               errno.EAGAIN):
                                raise
                            # another resultPusher beat us to this task -- go and check other tasks
                            continue

                        os.write(
                            fd,
                            'result being pushed by process %d on host %s' %
                            (os.getpid(), GetHostName()))
                        os.close(fd)

                        taskDescr = ''

                        try:
                            attrsFN = os.path.join(fullTaskDir, 'attrs.tsv')
                            if os.path.exists(attrsFN):
                                taskDescr += ' output in ' + GetTaskAttr(
                                    attrsFN, 'piperun_outputSavedTo')
                        except EnvironmentError as e:
                            logging.info('Could not read attrs for task in ' +
                                         fullTaskDir + ': ' + str(e))

                        try:
                            infoFNs = [
                                os.path.join(fullTaskDir, f)
                                for f in ('command.dat', 'attrs.tsv',
                                          'claimed.dat')
                            ]
                            infoContents = '\n'.join([
                                SlurpFile(f)
                                if os.path.exists(f) else 'missing file: ' + f
                                for f in infoFNs
                            ])

                            thePipe = os.open(
                                os.path.join(fullTaskDir, 'getresult'),
                                os.O_WRONLY | os.O_NONBLOCK)
                            exitCodeReadOk = False
                            writeOk = False
                            closeOk = False
                            exitCode = 'UNKNOWN'
                            try:
                                exitCode = SlurpFile(
                                    os.path.join(fullTaskDir,
                                                 'exitCode.dat')).strip()
                                exitCodeReadOk = True
                                os.write(thePipe, exitCode)
                                writeOk = True
                            finally:
                                os.close(thePipe)
                                closeOk = True

                            logging.info(
                                'Pushed result in ' + fullTaskDir + ': ' +
                                ('nonzero ' if exitCode != '0' else '') +
                                'exit code ' + exitCode + taskDescr)

                            if not writeOk or not closeOk or not exitCodeReadOk:
                                dbg('exitCodeReadOk writeOk closeOk')
                            if exitCodeReadOk and exitCode != '0':
                                logging.info(infoContents)

                        except EnvironmentError as e:
                            logging.info('The task at ' + fullTaskDir +
                                         ' seems to have been orphaned: ' +
                                         e.strerror)

                except EnvironmentError as e:
                    logging.info('Error processing queue ' + queue + ' task ' +
                                 taskDir + ': ' + str(e))
                    # sleep a bit -- maybe it's some transient condition that will resolve itself
                    time.sleep(60 + random.normalvariate(3.0, 1.0))

        # if we pushed at least something, go back and try again.  if not, wait.
        time.sleep(options.sleepInterval + random.normalvariate(3.0, 1.0))
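
The heart of PushResults is the claim protocol: the first pusher to create pushing.dat with O_CREAT|O_EXCL owns the task, and everyone else gets EEXIST and moves on. A minimal sketch of just that step (the marker file name is illustrative):

import errno, os

def try_claim(task_dir, marker='pushing.dat'):
    try:
        fd = os.open(os.path.join(task_dir, marker),
                     os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except OSError as e:
        if e.errno == errno.EEXIST:
            return False  # another pusher already claimed this task
        raise
    os.write(fd, b'claimed by pid %d' % os.getpid())
    os.close(fd)
    return True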
Example #34
0
parser.add_argument( '--test-name', help = 'name of the test; this is the name of the directory under tests/ where the test is saved' )
parser.add_argument( '--bootstrap-count', type = int, default = 1000, help = 'number of bootstrap iters' )
parser.add_argument( '--cosi-binary', default = './coalescent', help = 'cosi binary to run' )

print('calling parser')
args = parser.parse_args()
print('parser done')
# do a reference run

print('generating reference')
SystemSucceed( ' '.join(map( str, (args.cosi_binary, '-p', '1_simple.cosiParams', '-n', 100, '-m' ))) + ' | sample_stats_extra > ref.tsv' )
refData = DotData( SVPath = 'ref.tsv' )
min_p = np.ones( len( refData.dtype.names ) )
max_D = np.repeat( -np.inf, len( refData.dtype.names ) )
for i in range( 10 ):
    dbg( 'i' )
    refFN = 'reftest%d.tsv' % i
    SystemSucceed( ' '.join(map( str, ( args.cosi_binary, '-p', '0_simple.cosiParams', '-n', 100, '-m' ))) + ' | sample_stats_extra > ' + refFN )
    z = DotData( SVPath = refFN )
    for colNum, col in enumerate( z.dtype.names ):
        ks_D, ks_p = stats.ks_2samp( refData[ col ], z[ col ] )
        min_p[ colNum ] = np.min(( min_p[ colNum ], ks_p ))
        max_D[ colNum ] = np.max(( max_D[ colNum ], ks_D ))
    dbg( 'i min_p max_D' )
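
The regression check above reduces to a two-sample Kolmogorov-Smirnov test per summary-statistic column; a minimal standalone version of that comparison (synthetic data, not the cosi output):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
ref = rng.normal(0.0, 1.0, size=1000)  # stand-in for the reference run's column
new = rng.normal(0.0, 1.0, size=1000)  # stand-in for a fresh simulation's column
ks_D, ks_p = stats.ks_2samp(ref, new)
print('D=%.3f p=%.3f' % (ks_D, ks_p))  # small D / large p => distributions agree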
Example #35
0
File: fastcms.py Project: quank/cms
def DefineRulesTo_fastCMS( pr, pops, chroms, selPop, sweepDir, cmsDir, genomeBuild = 'hg19' ):
    """Define rules to do fast CMS computation.

    Params:

       pr - the PipeRun object to which to add rules

       selPop - testing selection in which pop?
       pops - comparing selPop to which pops?
       sweepDir - the sweep directory
       cmsDir - the directory under which CMS stats go
    """

    pops = list( MakeSeq( pops ) )
    if selPop not in pops: pops.append( selPop )

    allPops = tuple( MakeSeq( pops ) )
    if selPop not in allPops: allPops += ( selPop, )
    cmpPops = [ pop for pop in allPops if pop != selPop ]

    rawScoresFN = {}

    genMapSfx = genomeBuild2genMapSfx[ genomeBuild ]
    for pop in allPops:
        for chrom in chroms:
            with pr.settingAttrs( 'pop chrom' ):
                snpInfoFN = os.path.join( sweepDir, 'analysis/chr%(chrom)s/snps_%(pop)s.tsv' % locals() )
                projDir = os.path.join( sweepDir, 'data/chr%(chrom)s' % locals() )
                ancestralImportedFN = os.path.join( projDir, 'ancestral.tsv.imported' )
                genotypesImportedFN = os.path.join( projDir, 'genotypes_chr%(chrom)s_%(pop)s_r21_nr_fwd_phased_all.imported' % locals() )
                genMapImportedFN = os.path.join( projDir, 'genetic_map_chr%(chrom)s_%(genMapSfx)s.txt.imported' % locals() )
                pr.addRule( name = 'extractSnpInfo',
                            commands = 'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s' % locals(),

                            commandsOld = 'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep/target/sweep-1.0-SNAPSHOT-jar-with-dependencies.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s' % locals(),                            
                            depends_on = ( ancestralImportedFN, genotypesImportedFN, genMapImportedFN ),
                            creates = snpInfoFN )

    chr2dihhFN = {}

    for chrom in chroms:
        with pr.settingAttrs( 'chrom' ):
      
            chrom_s = 'chr' + str( chrom )
            chromDir = os.path.join( cmsDir, chrom_s )

            xpopScoresFN = os.path.join( chromDir, AddFileSfx( 'max_xpop.tsv', chrom_s, selPop, pops ) )

            pr.addInvokeRule( invokeFn = gatherXPOPscores,
                              invokeArgs = Dict( 'pops chrom selPop sweepDir', outFN = xpopScoresFN ),
                              attrs = dict( pop = allPops, stat = 'max_xpop', piperun_short = True ) )

            ihsFN = getFN_ihs_signif( **Dict( 'sweepDir chrom', pop = selPop ) )
                                      
            ihsScoresFN = os.path.join( chromDir, AddFileSfx( 'iHS.tsv', chrom_s, selPop, pops ) )
            dihhScoresFN = os.path.join( chromDir, AddFileSfx( 'dihh.tsv', chrom_s, selPop, pops ) )

            chr2dihhFN[ chrom ] = dihhScoresFN

            pop2ancFreqFN = os.path.join( cmsDir, chrom_s, AddFileSfx( 'pop2ancFreq.tsv', chrom_s, pops ) )
            pop2sampleSizeFN = os.path.join( cmsDir, chrom_s, AddFileSfx( 'pop2sampleSize.tsv', chrom_s, pops ) )

            pop2snpInfoFN = dict([ ( pop, os.path.join( sweepDir, 'analysis', chrom_s,
                                                        'snps_%(pop)s.tsv' % locals() ) )
                                   for pop in pops ])

            pr.addInvokeRule( invokeFn = gather_snp_info,
                              invokeArgs = Dict( 'pops pop2snpInfoFN pop2ancFreqFN pop2sampleSizeFN' ) )

            pr.addInvokeRule( invokeFn = gather_iHS_scores,
                              invokeArgs = Dict( 'chrom selPop ihsFN pop2ancFreqFN',
#                                                 snpInfoFN = pop2snpInfoFN[ selPop ],
                                                 ihsOutFN = ihsScoresFN, dihhOutFN = dihhScoresFN ),
                              attrs = dict( pop = selPop, stat = ( 'iHS', 'StdDiff' ), piperun_short = True ) )


            freqDiffScoresFN = os.path.join( chromDir, AddFileSfx( 'freqDiff.tsv', chrom_s, selPop, pops ) )
            meanFstScoresFN = os.path.join( chromDir, AddFileSfx( 'meanFst.tsv', chrom_s, selPop, pops ) )

            pr.addInvokeRule( invokeFn = computeMeanFstAndFreqDiffScores,
                              invokeArgs = Dict( 'chrom selPop sweepDir pops pop2ancFreqFN pop2sampleSizeFN',
                                                 outMeanFstFN = meanFstScoresFN,
                                                 outFreqDiffFN = freqDiffScoresFN ),
                              attrs = dict( pop = allPops, stat = ( 'freqDiff', 'meanFst' ), piperun_short = True ) )

            StdDiffScoresFN = os.path.join( chromDir, AddFileSfx( 'StdDiff.tsv', chrom_s, selPop, pops ) )

            rawScoresFN[ chrom ] = dict( iHS = ihsScoresFN, StdDiff = StdDiffScoresFN, meanFst = meanFstScoresFN,
                                         freqDiff = freqDiffScoresFN,
                                         max_xpop = xpopScoresFN )

        # end: with pr.settingAttrs( 'chrom' )
    # end: for chrom in chroms

    #    ihhStdFN = os.path.join( cmsDir, 'dihhstd.tsv' )

    dihhGlobalStdFN = os.path.join( cmsDir, AddFileSfx( 'dihh_global_std.tsv', selPop, pops ) )
    dihhBinMeansFN = os.path.join( cmsDir, AddFileSfx( 'dihh_bin_means.tsv', selPop, pops ) )

    pr.addInvokeRule( invokeFn = normalizeByFreq_getMeanStd_tsv,
                      invokeArgs = dict( iHHDiffFNs = [ chr2dihhFN[k] for k in chroms ],
                                         globalStatFN = dihhGlobalStdFN, binsStatFN = dihhBinMeansFN ),
                      name = 'compute_dihh_meanstd' )
    
    # pr.addInvokeRule( invokeFn = computeMeanStd_binned_tsvs,
    #                   invokeArgs = dict( inFNs = chr2dihhFN.values(), valCol = 'iHHDiff',
    #                                      binCol = 'normingFreqs', binMin = 0.05, binMax = 1.05, binStep = .05,
    #                                      outFN = ihhStdFN ),
    #                   name = 'compute_dihh_std' )

    for chrom in chroms:
        with pr.settingAttrs( 'chrom' ):
            chrom_s = 'chr' + str( chrom )
            chromDir = os.path.join( cmsDir, chrom_s )
            
            StdDiffScoresFN = os.path.join( chromDir, AddFileSfx( 'StdDiff.tsv', chrom_s, selPop, pops ) )
            dbg( 'chrom chr2dihhFN[chrom]' )
            pr.addInvokeRule( invokeFn = normalizeByFreq_compute_normed_tsv,
                              invokeArgs = dict( iHHDiffFN = chr2dihhFN[ chrom ],
                                                 globalStatFN = dihhGlobalStdFN,
                                                 binsStatFN = dihhBinMeansFN,
                                                 StdDiffFN = StdDiffScoresFN ) )

    statFNs = {}
    statLikesRatioFNs = {}

    for stat in CMSBins.CMSstats:
        with pr.settingAttrs( stat = stat, pop = ( selPop, ) if stat in ( 'iHS', 'StdDiff' ) else allPops, piperun_short = True ):
            if stat not in CMSBins.nonNormedStats:
                rawFNs = [ rawScoresFN[ chrom ][ stat ] for chrom in chroms ]
                meanStdFN = os.path.join( cmsDir, AddFileSfx( 'meanStd.tsv', stat, selPop, pops ) )

                # DefineRulesTo_computeMeanStd( pr, inFNs = rawFNs, colNum = 1,
                #                               outFN = meanStdFN,
                #                               addRuleArgs = \
                #                               dict( name = 'computeMeanStd_for_stat',
                #                                     attrs = dict( chrom = chroms ) ) )

#                meanStdBzFN = os.path.join( cmsDir, stat + '_meanStdForStat.tsv' )
                pr.addInvokeRule( invokeFn = computeMeanStd,
                                  invokeArgs = dict( inFNs = rawFNs, colName = stat, outFN = meanStdFN ) )
                
            # end: if stat not in CMSBins.nonNormedStats

            for chrom in chroms:
                with pr.settingAttrs( 'chrom' ):
                    statFN = rawScoresFN[ chrom ][ stat ]

                    if stat not in CMSBins.nonNormedStats:
                        normedFN = AddFileSfx( statFN, 'normed' )
                        
                        DefineRulesTo_normalizeOneColumn( pr, inFN = statFN,
                                                          meanStdFN = meanStdFN,
                                                          colName = stat,
                                                          outFN = normedFN,
                                                          addRuleArgs = dict( attrs = Dict( 'chrom' ) ) )
                        statFN = normedFN
                        
                    bins_beg = CMSBins.stat_start[ stat ]
                    bins_end = CMSBins.stat_end[ stat ]
                    bins_n = CMSBins.stat_nbin[ stat ]

                    statFNs[ ( chrom, stat ) ] = statFN

                    statLikesRatioFN = AddFileSfx( rawScoresFN[ chrom ][ stat ], 'likesRatio' )
                    statLikesRatioFNs[ ( chrom, stat ) ] = statLikesRatioFN
                    
                    pr.addInvokeRule( invokeFn = computeLikeRatioForStat,
                                      invokeArgs = dict( stat = stat,
                                                         statValsFN = statFN,
                                                         hitLikesFN = '../Data/Common_Data/sim/likes/hitsLikes_toneutFixed_1.tsv',
                                                         missLikesFN = '../Data/Common_Data/sim/likes/missLikes_toneutFixed_1.tsv',
                                                         stat_start = bins_beg,
                                                         stat_end = bins_end,
                                                         stat_nbin = bins_n,
                                                         statLikesRatioFN = statLikesRatioFN ) )

                                      
                # end: with pr.settingAttrs( 'chrom' )
            # end: for chrom in chroms
        # end: with pr.settingAttrs( stat = stat, piperun_short = True )
    # end: for stat in CMSBins.CMSstats

    for chrom in chroms:
        with pr.settingAttrs( chrom = chrom, stat = CMSBins.CMSstats ):
            chrom_s = 'chr' + str( chrom )
            chromDir = os.path.join( cmsDir, chrom_s )
            
            likesRatioFN = os.path.join( chromDir, AddFileSfx( 'likesRatio.tsv', CMSBins.CMSstats, selPop, pops ) )
            pr.addInvokeRule( invokeFn = addLikesRatios,
                              invokeArgs = dict( inFNs = [ statLikesRatioFNs[ ( chrom, stat ) ] for stat in CMSBins.CMSstats ],
                                                 colNames = [ colName + 'likeRatio' for colName in CMSBins.CMSstats ],
                                                 outFN = likesRatioFN ) )

            joinStatsFN = os.path.join( chromDir, AddFileSfx( 'joinStats.tsv', CMSBins.CMSstats, selPop, pops ) )
            snpInfoFN = os.path.join( sweepDir, 'analysis/chr%(chrom)s/snps_%(selPop)s.tsv' % locals() )
            pr.addInvokeRule( invokeFn = joinStats,
                              invokeArgs = dict( snpInfoFN = snpInfoFN,

                                                 statLikesFNs = [ statLikesRatioFNs[ ( chrom, stat ) ] for stat in CMSBins.CMSstats ],
                                                 likesRatioFN = likesRatioFN,
                                                 outFN = joinStatsFN ),
                              attrs = dict( stat = CMSBins.CMSstats, chrom = chrom ) )
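
The addInvokeRule calls above appear to rely on the getio convention visible in evalSpatialLoc above: calling the target function with getio=True returns its inputs and outputs without doing any work, so the pipeline can wire up dependencies. A hedged sketch of a function written to that convention (PipeRun internals are assumed, not shown; the file names are hypothetical):

def myStat(inFN, outFN, getio=None):
    """Sum the values in inFN and write the total to outFN."""
    if getio:
        return dict(depends_on=inFN, creates=outFN)
    with open(inFN) as inp, open(outFN, 'w') as out:
        out.write(str(sum(float(line) for line in inp)))

io = myStat(inFN='scores.tsv', outFN='scores.sum.tsv', getio=True)
print(io['depends_on'], io['creates'])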
Example #36
0
File: tsvutils.py Project: quank/cms
def computeMeanStd_binned( inDatas, valCol, binCol, binMin, binMax, binStep ):
  """Compute binned stats for a set of tables"""

  binCount = int( ( binMax - binMin ) / binStep )
  dbg( 'binCount' )
  sums = np.zeros( binCount )
  sumsSq = np.zeros_like( sums )
  counts = np.zeros_like( sums )
  bins = np.arange( binMin, binMax, binStep )
  for d_idx, d in enumerate( inDatas ):
    dbg( 'd_idx d binStep' )
    dbg( 'd[binCol]' )

    binColValues = 1.0 - ( 1.0 - d[ binCol ].values )

    for i in range( binCount ):
#        binBot = bins[i]
        binTop = bins[i]
        theIdx = ( (binTop - binColValues) < binStep ) & ( ( binTop - binColValues ) > 0 )
#        theIdx = ( binBot < d[ binCol ].values ) & ( d[ binCol ].values <= binTop )
 #       DotData( names = ('rows',), Columns = theIdx.nonzero() ).saveToSV( 'nz%02d.tsv' % i )
        #rowsStr = ','.join(map(str,list(theIdx.nonzero())))
        #print 'binnedRows=', rowsStr
        hereVals = d[ theIdx ][ valCol ]
#        DotData( names = ( 'temp', ), Columns = ( hereVals, ) ).saveToSV( 'temp2%2d.tsv' % i )
        
        dbg( '"BEF" theIdx.sum() i bins[i] len(hereVals)' )
        counts[i] += len( hereVals )
        sums[i] += np.sum( hereVals )
        sumsSq[i] += np.sum( hereVals * hereVals )
#        dbg( '"AFT" i bins[i] bins[i+1] len(hereVals)' )

    if False:
        # fast version
        binsHere = np.digitize( d[ binCol ], bins ) - 1
        dbg( 'len(binsHere) binsHere' )
        np.clip( binsHere, 0, binCount-1, out = binsHere );
        dbg( 'binsHere' )

        counts += np.bincount( binsHere, minlength = binCount )
        sums += np.bincount( binsHere, weights = d[ valCol ], minlength = binCount )
        sumsSq += np.bincount( binsHere, weights = d[ valCol ] * d[ valCol ], minlength = binCount )

  countsOrig = counts.astype( int )
  counts[ counts == 0 ] = np.nan
  means = sums / counts
  stds = sumsSq / counts - means * means   # note: as written this is the variance; take the sqrt for the standard deviation

  return pd.DataFrame( dict( binBeg = bins - binStep,
                             binEnd = bins,
                             counts = countsOrig, sums = sums, sumsSq = sumsSq,
                             means = means, stds = stds ) )
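
The disabled "fast version" inside the loop above can stand on its own: assign each value to a bin with np.digitize, then accumulate counts, sums, and sums of squares with np.bincount. A vectorized sketch on plain numpy arrays rather than the tables used above (bins are assumed uniform):

import numpy as np

def binned_mean_std(vals, bin_by, bin_min, bin_max, bin_step):
    bins = np.arange(bin_min, bin_max, bin_step)
    bin_count = len(bins)
    idx = np.clip(np.digitize(bin_by, bins) - 1, 0, bin_count - 1)
    counts = np.bincount(idx, minlength=bin_count).astype(float)
    sums = np.bincount(idx, weights=vals, minlength=bin_count)
    sums_sq = np.bincount(idx, weights=vals * vals, minlength=bin_count)
    with np.errstate(invalid='ignore', divide='ignore'):
        means = sums / counts
        variances = sums_sq / counts - means * means
    return counts, means, np.sqrt(variances)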
Example #37
0
def localizeSpatiallyBySplineFitting( Ddata, scenario, nreplicas, thinSfx = '',
                                      putativeMutPop = None, complikeSfx = '', likesTableSfx = '',
                                      confidence = .9, minBins = 20, nbins = 200, smoothing = 0.0,
                                      getio = None ):
    """For each replica within a given scenario,
    localize the selected SNP spatially, by fitting a spline to a (smoothed version of) the CMS scores, dividing the
    region into bins, and finding the set of bins that cover 90% (or the specified fraction) of the area under the spline.

    Params:

       confidence - the spatially localized region will (hopefully) have this probability of containing the causal SNP;
         here, specifically, this means we'll include bins in the region that collectively cover this fraction of area
         under the posterior density curve.

       minBins - the region will include at least this many of the highest-average bins.
       
    """

    snpStatsDir = os.path.join( Ddata, 'snpStats'+ thinSfx, scenario.scenDir() )
    replicaStatsDir = os.path.join( Ddata, 'replicastats'+ thinSfx, scenario.scenDir() )
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = ( putativeMutPop, complikeSfx, likesTableSfx )
    complikeFN = os.path.join( snpStatsDir, AddFileSfx( 'complike.data/', *sfxs ) )

    intervalsListFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineList.tsv', *sfxs ) )
    intervalsStatsFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineStats.tsv', *sfxs ) )

    posteriorSplineFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineSpline.tsv', *sfxs ) )
    binInfoFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineBinInfo.tsv', *sfxs ) )

    if getio:
        return dict( depends_on = complikeFN,
                     creates = ( intervalsListFN, intervalsStatsFN, posteriorSplineFN, binInfoFN ),
                     mediumRuleNameSfx = ( scenario.scenDir(), ) + sfxs,
                     fileDescrs = { intervalsListFN:
                                        'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                                    ' For each replica this table has one or more lines, giving intervals in that replica.',
                                    intervalsStatsFN: 'Per-replica statistic about confidence intervals in that replica',
                                    posteriorSplineFN: 'The results of interpolating posterior density as a spline',
                                    binInfoFN: 'Information about individual bins under the spline' } )
    
    complike = IDotData( complikeFN ).filter( lambda r: all( np.isfinite( ( r.iHS, r.meanFst, r.max_xpop ) ) ) )

    binInfoHeadings = 'replicaNum binNum binStart binEnd included binCenters binAvgCMS binMaxCMS binIntegral '\
        'binIntegralNormed binRank'

    nbins_orig = nbins

    with contextlib.nested( IDotData.openForWrite( posteriorSplineFN, 'replicaNum gdPos complikeExp' ),
                            IDotData.openForWrite( binInfoFN, binInfoHeadings ),
                            IDotData.openForWrite( intervalsListFN, 'replicaNum gdFrom gdTo gdSize bpFrom bpTo '
                                                   'bpSize binsInInterval binsArea binsMax binsMaxFrac binsAvg '
                                                   'binsMinRank binsMaxRank' ),
                            IDotData.openForWrite( intervalsStatsFN, 'replicaNum numSegments gdTotLen bpTotLen' ) ) \
                            as ( posteriorSplineFile, binInfoFile, intervalsListFile, intervalsStatsFile ):

        for replicaNum, complikeForReplica in complike.groupby( 'Chrom' ):

            gdMin = np.min( complikeForReplica.gdPos )
            gdMax = np.max( complikeForReplica.gdPos )
            binSize = ( gdMax - gdMin ) / nbins_orig
            binLefts = np.arange( gdMin, gdMax, binSize )
            binCenters = binLefts + binSize / 2
            binRights = binLefts + binSize
            dbg( 'len(complikeForReplica) replicaNum len(binLefts) len(binRights) len(binCenters)' )
            nbins = len( binLefts )

            #
            # Compute the mean and max CMS score in each gdPos bin
            #
            binAvgCMS = np.zeros( nbins )
            binMaxCMS = np.zeros( nbins )
            binNums = np.zeros( nbins, dtype = int )

            for bin, valsInBin in \
                    complikeForReplica.addCol( 'bin',
                                               map( functools.partial( min, nbins-1 ),
                                                    map( int,
                                                         ( complikeForReplica.gdPos - gdMin ) / binSize ))).groupby('bin'):

                binNums[ bin ] = bin
                if valsInBin:
                    binAvgCMS[ bin ] = np.mean( valsInBin.complikeExp )
                    binMaxCMS[ bin ] = max( valsInBin.complikeExp )
                else:
                    binAvgCMS[ bin ] = binAvgCMS[ bin-1 ]
                    binMaxCMS[ bin ] = binMaxCMS[ bin-1 ]

            # fit a spline to the function ( binCenter, binAvgCMS ), approximating the posterior probability of a SNP being
            # causal.
            posteriorDensitySpline = interpolate.splrep( binCenters, binAvgCMS, s = smoothing )

            splineX = np.arange( gdMin, gdMax, binSize / 2 )
            for x, y in zip( splineX, interpolate.splev( splineX, posteriorDensitySpline ) ):
                posteriorSplineFile.writeRecord( replicaNum, x, y )

            # compute the integral under the interpolated function, for each bin.
            binIntegral = np.zeros( nbins )
            for binLeft, binRight, binNum in zip( binLefts, binRights, binNums ):
                binIntegral[ binNum ] = interpolate.splint( binLeft, binRight, posteriorDensitySpline )

            # normalize the integral value above each bin so that the total posterior density of being causal
            # integrates to 1.0
            binIntegralNormed = binIntegral / sum( binIntegral )

            binsByIntegralSize = binIntegral.argsort()[::-1]

            binRank = np.zeros( nbins )
            for i in range( nbins ):
                binRank[ binsByIntegralSize[ i ] ] = i

            binsToUse = np.zeros( nbins, dtype = bool )
            binsIncluded = 0
            fractionCovered = 0.0
            for bin in binsByIntegralSize:
                if fractionCovered >= confidence and binsIncluded >= minBins: break
                binsToUse[ bin ] = True
                binsIncluded += 1
                fractionCovered += binIntegralNormed[ bin ]

            # list the confidence intervals for this replica,
            # merging adjacent intervals.

            gd2bp = interpolate.interp1d( complikeForReplica.gdPos, complikeForReplica.Pos )
            gdMax = max( complikeForReplica.gdPos )

            binInfo = IDotData( names = binInfoHeadings, 
                                Columns = ( itertools.repeat( replicaNum, nbins ),
                                            range( nbins ),
                                            binLefts, binRights, binsToUse,
                                            binCenters, binAvgCMS, binMaxCMS, binIntegral, binIntegralNormed, binRank
                                            ) )

            binInfoFile.writeRecords( binInfo )


            binsOverallMaxCMS = max( binInfo.binMaxCMS )

            gdTotLen = 0.0
            bpTotLen = 0
            numSegments = 0
            for included, bins in binInfo.groupby( 'included' ):
                if included:
                    gdFrom = min( bins.binStart )
                    gdTo = min( max( bins.binEnd ), gdMax )
                    bpFrom = int( gd2bp( gdFrom ) )
                    bpTo = int( gd2bp( gdTo ) )

                    gdTotLen += ( gdTo - gdFrom )
                    bpTotLen += ( bpTo - bpFrom )
                    numSegments += 1

                    intervalsListFile.writeRecord( replicaNum, gdFrom, gdTo, gdTo - gdFrom, bpFrom, bpTo, bpTo - bpFrom,
                                                   len( bins ), sum( bins.binIntegralNormed ),
                                                   max( bins.binMaxCMS ), max( bins.binMaxCMS ) / binsOverallMaxCMS,
                                                   np.mean( bins.binAvgCMS ),
                                                   min( bins.binRank ), max( bins.binRank ) )

            assert numSegments > 0 and bpTotLen > 0 and gdTotLen > 0.0
            intervalsStatsFile.writeRecord( replicaNum, numSegments, gdTotLen, bpTotLen )
            dbg( 'replicaNum numSegments gdTotLen bpTotLen' )
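
The core of the spline-based localization above, reduced to a standalone sketch: fit a smoothing spline to the binned scores, integrate it over each bin, and keep the highest-integral bins until they cover the requested fraction of the total area (bin centers are assumed uniformly spaced; the minBins handling is omitted):

import numpy as np
from scipy import interpolate

def pick_bins(bin_centers, bin_avg_score, confidence=0.9, smoothing=0.0):
    spline = interpolate.splrep(bin_centers, bin_avg_score, s=smoothing)
    half = (bin_centers[1] - bin_centers[0]) / 2.0
    integrals = np.array([interpolate.splint(c - half, c + half, spline)
                          for c in bin_centers])
    normed = integrals / integrals.sum()
    keep, covered = np.zeros(len(bin_centers), dtype=bool), 0.0
    for b in integrals.argsort()[::-1]:
        if covered >= confidence:
            break
        keep[b] = True
        covered += normed[b]
    return keep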
Example #38
0
def localizeSpatiallyByWindows(Ddata,
                               scenario,
                               nreplicas,
                               thinSfx='',
                               putativeMutPop=None,
                               complikeSfx='',
                               likesTableSfx='',
                               threshold=.5,
                               numSNP=1,
                               minGdInEachDir=.05,
                               fromReplica=None,
                               toReplica=None,
                               getio=None):
    """
    Spatially localize the selected variant for all replicas within a given scenario.
    The approach is to start with the highest-scoring SNP, and move left and right from it in fixed-size windows
    for as long as the windows contain at least 'numSNP' snps with score at least 'threshold'.

    Adapted from Operations.Shari_Operations.localize.hapmap_regions_0615.plotRegions() .
    """

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())
    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = (putativeMutPop, complikeSfx, likesTableSfx)
    complikeFN = os.path.join(snpStatsDir, AddFileSfx('complike.data/', *sfxs))

    intervalsListFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsWindowsList.tsv', *sfxs))

    if getio:
        return dict(
            depends_on=complikeFN,
            creates=intervalsListFN,
            mediumRuleNameSfx=(scenario.scenDir(), ) + sfxs,
            fileDescrs={
                intervalsListFN:
                'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                ' For each replica this table has one or more lines, giving intervals in that replica.'
            })

    #complike = IDotData( complikeFN ).filter( lambda r: all( np.isfinite( ( r.iHS, r.meanFst, r.max_xpop ) ) ) )
    complike = IDotData(complikeFN)

    with IDotData.openForWrite(
            intervalsListFN,
            'replicaNum gdFrom gdTo gdSize bpFrom bpTo bpSize numPositiveBins '
            'numSnpsOver_0_2 maxSNP_Pos maxSNP_lik') as intervalsListFile:

        for replicaNum, complikeForReplica in complike.groupby('Chrom'):

            if fromReplica is not None and replicaNum < fromReplica: continue
            if toReplica is not None and replicaNum > toReplica: break

            X = complikeForReplica.toDotData()

            minPos = np.min(X.gdPos)
            maxPos = np.max(X.gdPos)
            bins = np.arange(0, 1, .01)

            ind = X.complikeExp.argsort()
            lik = X.complikeExp[ind]
            maxlik = np.mean(lik[-5:])
            #maxlik = mean(lik[-5:])
            like = X.complikeExp / maxlik

            Y = X[ind]
            maxSNP = Y.Pos[-1]

            maxScore = Y.complikeExp[-1]
            relPos = X.gdPos - Y.gdPos[-1]
            X = X.hstack(
                DotData(Columns=[like, relPos],
                        names=['scaled_like', 'relPos']))

            topGdPos = Y.gdPos[-1]
            minPos = Y.Pos[-1]
            maxPos = Y.Pos[-1]
            minGdPos = topGdPos
            maxGdPos = topGdPos
            numPositiveBins = 0

            dbg('replicaNum minPos maxPos minGdPos maxGdPos')

            for dir in -1, +1:
                for bin in bins:
                    Z = X[np.abs(X.relPos - dir * bin) <= .02]

                    dbg('dir bin len(Z)')

                    if len(Z) == 0: continue
                    top = Z[Z.scaled_like > threshold]

                    dbg('len(top)')

                    if len(top) <= numSNP and np.abs(
                            topGdPos - top.gdPos) > minGdInEachDir:
                        break
                    if len(top) == 0: top = Z
                    if dir == -1:
                        minPos = np.min(top.Pos)
                        minGdPos = np.min(top.gdPos)
                    else:
                        maxPos = np.max(top.Pos)
                        maxGdPos = np.max(top.gdPos)
                    numPositiveBins += 1

            ind = np.all([X.Pos > minPos, X.Pos < maxPos], axis=0)
            peak = X[ind]

            intervalsListFile.writeRecord(replicaNum, minGdPos, maxGdPos,
                                          maxGdPos - minGdPos, minPos, maxPos,
                                          maxPos - minPos, numPositiveBins,
                                          sum(peak.scaled_like > .2), maxSNP,
                                          maxScore)
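
A stripped-down version of the window-growing idea above, on plain numpy arrays: start at the top-scoring SNP and extend left and right in fixed-size windows for as long as each window still holds at least numSNP SNPs whose scaled score exceeds the threshold (the window size and the minGdInEachDir check are simplified here):

import numpy as np

def localize_by_windows(pos, score, threshold=0.5, num_snp=1, win=0.02):
    scaled = score / np.mean(np.sort(score)[-5:])  # scale by mean of the top 5 scores
    peak = pos[np.argmax(score)]
    lo = hi = peak
    for direction in (-1, +1):
        for k in range(1, int(1.0 / win)):
            center = peak + direction * k * win
            in_win = np.abs(pos - center) <= win
            if np.sum(scaled[in_win] > threshold) < num_snp:
                break
            if direction == -1:
                lo = min(lo, np.min(pos[in_win]))
            else:
                hi = max(hi, np.max(pos[in_win]))
    return lo, hi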
Example #39
0
def likeVal(stat,hitsLikes,missLikes,stat_start,stat_end,nbin,lik,ind=None, defaultNumSnps = 10000,
            useDefaultNumSnps = True):
    """For eah SNP, calculate loglikelihood of that SNP being causal based on one statistic,
    given the SNP's bin for that statistic.

    Input params:

       stat - for each SNP, its statistic value (typically, normalized).
       hitsLikes/missLikes - for each bin, what fraction of causal/non-causal SNPs has
          the statistic in that bin?
       stat_start, stat_end, nbin - bin boundaries and number of bins for the statistic

       ind - optionally indicates which snps should be skipped.   if given,
          for snps for which ind[i] is false no likelihood value is computed
          (and lik[i] is set to nan).

    Output params:

       lik - for each SNP, log of the probability that the SNP is causal given what bin this statistic
          for this SNP falls into.  the lik array is allocated by the caller, and filled-in
          by this function.

    Used by: complike()
       
    """

    # if the (optional) indicator of which SNPs to compute likelihood for is not given,
    # compute likelihood for all SNPs.
    if ind is None or len(ind) < 1:
        ind = ones(len(stat))

    bin_size = float(stat_end - stat_start) / nbin
    # var: p_causal - probability that a SNP chosen randomly from the replica is causal.
    #    within a replica (region), exactly one SNP is causal, so this is 1 / ( total number of SNPs ).
    #    For Bayes' rule, this is the prior probability that a SNP is causal, before we look at the value
    #    of the statistic.
    #p_causal = 1. / len(stat)

    numSnps = defaultNumSnps if useDefaultNumSnps else len( stat )
    dbg( 'numSnps' )
    p_causal = 1./numSnps if numSnps > 0 else np.nan
    p_noncausal = 1 - p_causal
    dbg( 'p_causal p_noncausal' )

    for i in range(len(stat)):

        if not ind[i]:
            p_causal_bin = 0

        elif not isfinite(stat[i]):
            p_causal_bin = 1. / nbin
        
        else:


            bin = int((stat[i] - stat_start)/bin_size)

            if bin >= nbin:
                bin = nbin - 1
            if bin < 0:
                bin = 0

                

            # P( causal | stat_in_this_bin ) = ( P( stat_in_this_bin | causal ) * P( causal ) ) / P( stat_in_this_bin )

            # here, hitsLikes[bin] gives P( stat_in_this_bin | causal )
        
            num = hitsLikes[bin]*p_causal
            # var: denom - probability that a SNP, whether causal or not, has this statistic in this bin
            denom = hitsLikes[bin] * p_causal + missLikes[bin]*p_noncausal
            assert denom != 0

            #print num,denom

            p_causal_bin = num / denom

            if np.abs( stat[i] - 4.6387211 ) < 1e-5:
                dbg( '"here!" bin stat[i] hitsLikes[bin] p_causal num denom p_causal_bin' )

        # endif: whether we have the statistic value for this snp,
        #    and whether we are asked to compute the SNP's likelihood of being causal
        
        likscore = log(p_causal_bin) if p_causal_bin > 0.0 else nan
        lik[i] = likscore
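
A worked instance of the per-bin Bayes update above: with prior p_causal = 1/numSnps, the posterior that a SNP is causal given its bin is hitsLikes[bin]*p_causal / (hitsLikes[bin]*p_causal + missLikes[bin]*(1 - p_causal)); the numbers below are made up for illustration.

import numpy as np

hit_frac, miss_frac = 0.08, 0.01  # fraction of causal / non-causal SNPs falling in this bin
p_causal = 1.0 / 10000            # prior, with defaultNumSnps = 10000
num = hit_frac * p_causal
denom = hit_frac * p_causal + miss_frac * (1.0 - p_causal)
print(np.log(num / denom))        # the log-posterior stored in lik[i]; about -7.13 here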
Example #40
0
File: runner.py Project: quank/cms
def RunTasks( options ):
    """Take tasks from the specified queue directory, and run them.

    Params:

       options - see command-line parameter definition in main() below.
       
    """

    if haveParamiko: Random.atfork()

    startClock = time.time()

    logging.info( 'Starting runner (process id %d on host %s) with options %s'
                  % ( os.getpid(), GetHostName(), options ) )

    stopSignal = [ False ]

    def SetStopSignal( sigNum, stkFrm ):
        logging.info( 'Setting stop signal to stop runners' )
        stopSignal[ 0 ] = True
        dbg( '"aftset" stopSignal' )

    signal.signal( signal.SIGUSR1, SetStopSignal )

    fs = RemoteFileSystem( remote = options.remote, pw = options.password, pkey = options.pkey ) \
        if options.remote else LocalFileSystem()

    # check that all queues exist
    assert all( fs.exists( queue ) and fs.isdir( queue )
                for queue in options.queues.split( fs.pathsep ) )

    # register a cleanup routine so that, if we claim a task and then
    # crash midway through it, our lock on the task is erased, so that
    # another runner can pick up the task.  Note that if the _task_ fails
    # with an error code, that's fine -- we just report the error code
    # to the mqsub.sh script instance that submitted the task.
    # The cleanup here happens only if the runner crashes before receiving
    # a proper exit code from the task.
    fileToErase = [ None ]
    @atexit.register
    def DoErase( eraseWhat = fileToErase ):
        if eraseWhat[0] and fs.exists(eraseWhat[0]):
            #print 'runner AtExit: removing ' + eraseWhat[0]
            fs.remove( eraseWhat[0] )

    # var: lastFinish - time when a task last finished.
    lastFinish = time.time()

    queues = options.queues.split( fs.pathsep )
    lastQueueModTime = [ None ] * len( queues )

    skipDirs = set([ 'newtask.dat' ] )

    numTasksRun = 0
    numProcsAvail = int( os.getenv( 'LSB_DJOB_NUMPROC', 1 ) )
    dbg( 'numProcsAvail' )

    for queue in queues:
        EnsureDirExists( os.path.join( queue, 'succ' ) )
        EnsureDirExists( os.path.join( queue, 'fail' ) )
    
    while not stopSignal[0]:

        ranCommand = False

        if options.maxRunHours > 0 and ( time.time() - startClock  ) / 3600.0 > options.maxRunHours:
            logging.info( 'Runner exiting after CPU time of %s hours' % ( ( time.time() - startClock ) / 3600.0 ) )
            return

        if stopSignal[ 0 ]:
            logging.info( 'Runner stopped by stop signal' )
            return
        else: dbg( '"chkstop" stopSignal' )

        dbg( 'queues' )
        for queueNum, queue in enumerate( queues ):
            dbg( 'queueNum queue' )

            # do a quick check to see if any tasks have been added to the queue since we last checked
            newTaskFN = os.path.join( queue, 'newtask.dat' )
            try:
                curQueueModTime = fs.stat( newTaskFN ).st_mtime
                if curQueueModTime == lastQueueModTime[ queueNum ]: continue
                lastQueueModTime[ queueNum ] = curQueueModTime
            except EnvironmentError as e:
                if os.path.exists( newTaskFN ):
                    logging.warning( 'ERROR CHECKING FOR NEW TASKS in queue %s: %s' % ( queue, e ) )
                pass
            
            # find an unclaimed task in this queue, and try to claim it
            taskDirs = sorted( fs.listdir( queue ) )
            dbg( 'len(taskDirs)' )
            #random.shuffle( taskDirs )
            dbg( 'os.environ.get("MQ_FIRST_DIR")' )
            if 'MQ_FIRST_DIR' in os.environ and os.environ[ 'MQ_FIRST_DIR' ] in taskDirs:
            
                taskDirs = [ os.environ[ 'MQ_FIRST_DIR' ] ] + taskDirs
                logging.info( 'putting specified dir first' )

            for taskDir in taskDirs:

                if taskDir in skipDirs: continue

                if options.maxRunHours > 0 and ( ( time.time() - startClock ) / 3600.0 ) > options.maxRunHours:
                    logging.info( 'Runner exiting after CPU time of %s hours' % ( ( time.time() - startClock ) / 3600.0 ) )
                    return

                if stopSignal[ 0 ]:
                    logging.info( 'Runner stopped by stop signal' )
                    return
                else: dbg( '"chkstop" stopSignal' )

                try:

                    while fs.path.exists( os.path.join( queue, 'noclaim.dat' ) ):
                        time.sleep( 60 + random.normalvariate( 10.0, 5.0 ) ) 
                
                    fullTaskDir = fs.path.join( queue, taskDir )
                    claimedFN = fs.path.join( fullTaskDir, options.claimedFN )

                    attrsFN = fs.path.join( fullTaskDir, 'attrs.tsv' )
                    cwdFN = fs.path.join( fullTaskDir, 'submitdir.txt' )

                    failedCond = []
                    
                    def saveVal( name, val, fc = failedCond ):
                        if not val: fc.append( name )
                        return val

                    if saveVal( 'ready', fs.path.exists( fs.path.join( fullTaskDir, options.readyFN ) ) ) \
                            and saveVal( 'not claimed', not fs.path.exists( claimedFN ) ) \
                            and saveVal( 'relocatable', ( not options.remote or \
                                                              all([ not f.startswith( '/' ) for which in ( 'sources', 'targets' )  \
                                          for f in fs.SlurpFile( fs.path.join( fullTaskDir, which + '.lst' ) ) ]) ) ) \
                            and saveVal( 'memOk', GetMemReq( fs, attrsFN ) <= options.maxMem ) \
                            and saveVal( 'minMemOk', options.minMem == 0 or GetMemReq( fs, attrsFN ) >= options.minMem ) \
                            and saveVal( 'minProc', GetProcReq( fs, attrsFN ) >= options.minProc ) \
                            and saveVal( 'maxProc', GetProcReq( fs, attrsFN ) <= numProcsAvail ) \
                            and saveVal( 'local', ( options.local_tasks or not GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                            and saveVal( 'onlyLocal',
                                         ( not options.only_local_tasks or GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                            and saveVal( 'short', ( not options.runOnlyShort or GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) )  \
                            and saveVal( 'long', ( not options.runOnlyLong or not GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) )  \
                            and saveVal( 'notRequeued', ( not options.noRequeuedTasks or not fs.path.exists( fs.path.join( fullTaskDir, 'requeued.dat' ) ) ) ) \
                            and saveVal( 'notFromHost', ( not options.onlyFromHost or socket.getfqdn() == options.onlyFromHost ) ) \
                            and saveVal( 'notFromPipeline', ( not options.onlyFromPipelineId or  \
                                                                  GetTaskAttr( fs, attrsFN, 'piperun_pipelineId' ) == options.onlyFromPipelineId ) ):

                        # try to claim the task
                        try:
                            fd = fs.open(claimedFN, os.O_CREAT|os.O_EXCL|os.O_WRONLY)
                        except EnvironmentError:
                            # another runner beat us to this task -- go and check other tasks
                            logging.info( 'another job beat us to claiming ' + fullTaskDir )
                            continue

                        try:
                            fs.write( fd, 'locked by process %d on host %s\n' % ( os.getpid(), GetHostName() ) )
                            for v in os.environ.keys():
                                fs.write( fd, '%s=%s\n' % ( v, os.environ[v] ) )
                        finally:
                            fs.close( fd )
                        # Tell our cleanup code to release this task if we crash.
                        fileToErase[0] = claimedFN
                        # get the command to run the task
                        theCMD = fs.SlurpFile( os.path.join( fullTaskDir, 'command.dat' ) ).strip()
                        theCmdDir = fs.SlurpFile( os.path.join( fullTaskDir, 'submitdir.txt' ) ).strip()
                        theCmdEnvFN = os.path.join( fullTaskDir, 'submitenv.txt' )

                        if options.remote:
                            assert have_fcntl
                            SystemSucceed( 'mkdir -p ' + os.path.join( options.localDataDir, fs.root[1:] ) )
                            for needDir in 'Operations', 'Classes', 'System', 'Other':
                                needDirFull = os.path.join( options.localDataDir, fs.root[1:], '..', needDir )
                                if not os.path.exists( needDirFull ):
                                    os.symlink( os.path.realpath( os.path.join( '..', needDir ) ), needDirFull )

                            # copy source files

                            # get exclusive locks on the source files
                            srcFiles = sorted( set( fs.SlurpFile( os.path.join( fs.root, fullTaskDir, 'sources.lst' ) ).rstrip( '\n' ).split( '\n' ) ) )
                            srcLockIds = []
                            srcLockFiles = []
                            for srcFile in srcFiles:
                                lockFile = os.path.join( options.localDataDir, 'mqlocks', srcFile[1:] )
                                if lockFile.endswith('/'): lockFile = lockFile[:-1]
                                lockFile += '.lock'
                                SystemSucceed( 'mkdir -p ' + os.path.dirname( lockFile ) )
                                gotLock = False
                                while not gotLock:
                                    try:
                                        openMode = os.O_CREAT|os.O_EXCL|os.O_WRONLY
                                        logging.info( 'opening ' + lockFile + ' with mode ' + str( openMode ) )
                                        lockId = os.open( lockFile, openMode )
                                        gotLock = True
                                    except EnvironmentError:
                                        logging.info( 'Could not create ' + lockFile + ' , waiting...' )
                                        time.sleep( 10 + random.normalvariate( 3.0, 1.0 ) )
                                fcntl.lockf( lockId, fcntl.LOCK_EX )
                                srcLockIds.append( lockId )
                                srcLockFiles.append( lockFile )
                                logging.info( 'Got lock on ' + lockFile )

                            SystemSucceed( 'rsync -zprv --files-from=:' + os.path.join( fs.root, fullTaskDir, 'sources.lst' ) +
                                           ' ' + fs.username + '@' + fs.hostname + ':/ ' + options.localDataDir )

                            for srcLockId, srcLockFile in list( zip( srcLockIds, srcLockFiles ) )[::-1]:
                                fcntl.lockf( srcLockId, fcntl.LOCK_UN )
                                os.close( srcLockId )
                                SystemSucceed( 'rm -rf ' + srcLockFile )

                            targets = fs.SlurpFile( os.path.join( fs.root, fullTaskDir, 'targets.lst' ) ).rstrip( '\n' ).split('\n')
                            targetDirs = set( map( os.path.dirname, filter( None, map( str.strip, targets ) ) ) )
                            dbg( '"DDDDDD" targetDirs' )

                            for targetDir in targetDirs:
                                assert targetDir.startswith( '/' )
                                tdir = os.path.join( options.localDataDir, targetDir[1:] )
                                SystemSucceed( 'mkdir -p ' + tdir + ' ' + os.path.join( tdir, 'makeinfo' ) )

                            theCMD = 'cd ' + os.path.join( options.localDataDir, fs.root[1:] ) + ' && ' + theCMD

                        logging.info( 'Under ' + claimedFN + ' RUNNING: ' + theCMD )
                        # Actually run the task; get its exit code
                        save_cwd = os.getcwd()
                        try:
                            os.chdir( theCmdDir )
                            logging.info( 'CWD=' + os.getcwd() )

                            runScriptFN = os.path.join( fullTaskDir, 'run.sh' )

                            with open( runScriptFN, 'w' ) as out:
                                out.write( '#!/usr/bin/env bash\n' )
                                out.write( 'set -e -o pipefail\n' )
                                with open( theCmdEnvFN ) as envFile:
                                    for line in envFile:
                                        if '=' not in line or line.startswith('module='): break
                                        equalIdx = line.index( '=' )
                                        envVarName = line[ :equalIdx+1 ]
                                        if not ( re.search( r'\W', envVarName ) or envVarName.startswith( 'LSB_' ) or \
                                           envVarName.startswith( 'LSF_' ) or \
                                           envVarName.startswith( 'LS_' ) or envVarName.startswith( 'SLURM' ) or \
                                           envVarName in \
                                           ( 'SYS_TYPE', 'MACHTYPE', 'VENDOR', 'OSTYPE',
                                             'DOMAINNAME', 'HOSTTYPE', 'SHORTHOST', 'SSH_TTY',
                                             'HOST', 'HOSTNAME', 'REMOTEHOST', 'STY' ) ):
                                            out.write( 'export ' + envVarName + "'" + line[ equalIdx+1: -1 ] + "'\n" )
                                out.write( theCMD )

                            os.chmod( runScriptFN, stat.S_IXUSR | stat.S_IRWXU )

                            try:
                                exitCode = os.system( runScriptFN )
                            except ( KeyboardInterrupt, SystemExit ):
                                interruptedFN = os.path.join( fullTaskDir, 'interrupted.dat' )
                                DumpFile( interruptedFN, 'interrupted' );
                                raise
                        finally:
                            os.chdir( save_cwd )
                        logging.info( 'Under ' + claimedFN + ' FINISHED RUNNING: ' + theCMD )
                        logging.info( 'Got exit code %d' % exitCode )

                        if options.remote:
                            # copy the target files and the output log back to the correct dirs on the remote system

                            # first, make sure the files all exist, and are no longer being written to.

                            time.sleep( options.aftTaskDelay )

                            os.system( 'rsync -zprv --files-from=:' + os.path.join( fs.root, fullTaskDir, 'targets.lst' ) +
                                       ' ' + options.localDataDir + ' ' + fs.username + '@' + fs.hostname + ':/' )

                        # If we succeeded in running the task (whether the task itself failed or not),
                        # tell the cleanup code to NOT release this task if we crash.
                        fileToErase[0] = None
                        # Tell the task submitter script that we are done, and what the task's
                        # exit code was.

                        if os.path.exists( os.path.join( fullTaskDir, 'nmq.dat' ) ):

                            time.sleep(3)
                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ), os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            fs.close(fd)
                            
                            try:
                                shutil.move( fullTaskDir, os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) )
                            except EnvironmentError as e:
                                logging.warning( 'Error moving ' + fullTaskDir + ' to ' + os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) + ' : ' + str( e ) )
                        else:
                            exitCodeFN = os.path.join( fullTaskDir, 'exitCode.dat' )
                            fd = fs.open( exitCodeFN, os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            bytesWritten = fs.write( fd, str( exitCode ) )
                            fs.close( fd )

                            time.sleep(3)
                            logging.info( 'Wrote exit code %s to file %s (%s bytes)' % ( exitCode, exitCodeFN, bytesWritten ) )

                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ), os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            fs.close(fd)

                        # Record that we actually ran a task here.
                        ranCommand = True
                        lastFinish = time.time()
                        numTasksRun += 1

                    else:
                        logging.info( 'did not take task ' + taskDir + ' ; reason: ' + str( failedCond ) );

                except:
                    excInfo = sys.exc_info()
                    logging.warning( 'Error trying to grab task from ' + taskDir + ' (%s), skipping...'
                                     % str( excInfo ) )
                    traceback.print_exc()

        dbg( 'ranCommand lastFinish time.time()-lastFinish' )
        if not ranCommand:
            waitTimeHere = time.time() - lastFinish
            if ( numTasksRun > 0 and options.maxWaitTime > 0 and waitTimeHere > options.maxWaitTime ) \
                    or ( numTasksRun == 0 and options.maxFirstWaitTime > 0 and waitTimeHere > options.maxFirstWaitTime ) :
                logging.info( 'Runner exiting after idle time of %s' % waitTimeHere )
                return
            time.sleep( options.taskCheckInterval + random.normalvariate( 3.0, 1.0 ) )
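The block above reports completion to the submitter in one of two ways: for nmq-style tasks it creates completed.dat and moves the task directory into succ/ or fail/, otherwise it writes exitCode.dat first and creates completed.dat last. Below is a minimal sketch of that handshake using plain os calls on a local filesystem; the helper name is hypothetical, and the real code goes through the fs abstraction, which may be remote.

import os, shutil

def signal_task_done(task_dir, queue_dir, exit_code, use_nmq=False):
    """Record a task's exit code the way the runner above does (local-FS sketch)."""
    if use_nmq:
        # nmq-style tasks: completion marker, then move into succ/ or fail/.
        fd = os.open(os.path.join(task_dir, 'completed.dat'),
                     os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.close(fd)
        shutil.move(task_dir,
                    os.path.join(queue_dir, 'succ' if exit_code == 0 else 'fail'))
    else:
        # Default: write the exit code, then create completed.dat last, so the
        # submitter never sees completed.dat without an exit code present.
        fd = os.open(os.path.join(task_dir, 'exitCode.dat'),
                     os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.write(fd, str(exit_code).encode())
        os.close(fd)
        fd = os.open(os.path.join(task_dir, 'completed.dat'),
                     os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.close(fd)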
Example #41
0
def DefineRulesTo_RunSweepOnSims(pr,
                                 Ddata,
                                 simsOut,
                                 thinExt='',
                                 thinning='',
                                 suffix='',
                                 mutAges=AllAges,
                                 mutPops=AllPops,
                                 mutFreqs=AllFreqs,
                                 nreplicas=100,
                                 pop2name=pop2name,
                                 tests=('lrh', 'ihs', 'xpop'),
                                 acceptExistingSimConfigFiles=False,
                                 setOptions=(),
                                 appendOptions=(),
                                 inputParamsFiles=[],
                                 runImportsLocally=True,
                                 noImports=False,
                                 powerSfx='',
                                 doOnlyStages=None):
    """Define rules for running simulations and doing Sweep analyses of them.

    Parameters:

       mutAges, mutPops, mutFreqs - parameters defining the selection scenario.
    
    """

    # Define the rules to do Sweep analyses of the simulations.
    # These rules are created by running a Perl script, sim_analysis_pipe.pl .
    # Each invocation of the script defines rules for one type of test.
    # The rules for each test are saved to an .xml file, and then all these files are
    # merged into pr.

    # Rather than explicitly invoking the script three times we define a small pipeline to do this.
    # The output of this pipeline is a pipeline definition for analyzing simulation results by each of the tests,
    # saved to an .xml file.

    simPipeline = PipeRun(name='DefineSims',
                          descr='Define simulation and analysis pipeline')

    if not powerSfx: powerSfx = Sfx(*mutAges)

    if not acceptExistingSimConfigFiles:
        simPipeline.addInvokeRule(
            invokeFn=WriteSimulationInfo,
            invokeArgs=Dict('Ddata mutAges mutPops mutFreqs nreplicas suffix '
                            'inputParamsFiles pop2name powerSfx'))

    test2pipeline = {}
    for test in tests:

        simSfx = ''
        if suffix: simSfx += suffix
        if thinning: simSfx += thinning

        thinExtHere = '' if thinning else thinExt

        testPipeline = os.path.join('Ilya_Temp',
                                    AddFileSfx('p.xml', pr.name, test))
        test2pipeline[test] = testPipeline

        if thinning:
            simPipeline.addRule(
                comment="Copy config file",
                targets="$Ddata/power_$test$simSfx/config$powerSfx.txt",
                sources="$Ddata/power_$test$suffix/config.txt",
                commands="cp $Ddata/power_$test$suffix/config.txt "
                "$Ddata/power_$test$simSfx/config$powerSfx.txt",
                mediumRuleName='copy_config_' + test,
                name='copy_config')

        simPipeline.addRule(
            targets=testPipeline,
            sources=[
                '../Other/Ilya_Other/sweep/sims/scripts/sim_analysis_pipe.pl',
                "$Ddata/power_$test$simSfx/config$powerSfx.txt",
                "$Ddata/config$suffix/sims$powerSfx.txt",
                "$Ddata/config$suffix/scenarios$powerSfx.txt",
                "$Ddata/config$suffix/pops$powerSfx.txt"
            ],
            commands=
            "../Other/Ilya_Other/sweep/sims/scripts/sim_analysis_pipe.pl "
            " --only-write-pipeline $testPipeline " +
            ('--run-imports-locally ' if runImportsLocally else '') +
            ('--no-imports ' if noImports else '') +
            (('--do-only-stages ' + doOnlyStages +
              ' ') if doOnlyStages else '') +
            ('--sim-suffix ' + simSfx if simSfx else '') +
            ('--powerSfx ' + powerSfx if powerSfx else '') +
            (' --thin-ext ' + thinExtHere if thinExtHere else '') +
            reduce(operator.concat, [
                ' --set-option ' + setOption[0] + ' ' + setOption[1]
                for setOption in setOptions
            ], '') + reduce(operator.concat, [
                ' --append-option ' + appendOption[0] + " '" +
                appendOption[1] + "'" for appendOption in appendOptions
            ], '') +
            " --target-test $test $Ddata/$simsOut$simSfx $Ddata/power_$test$simSfx",
            name='simanal',
            mediumRuleName='simanal_$test$simSfx',
            comment='Define rules for analyzing simulations')

    dbg('"RUNNING_simPipeline"')
    simPipeline.runForced(aftRuleDelay=5)
    dbg('"DONE_RUNNING_simPipeline"')

    for test in tests:
        pr.addPipelineFromFile(test2pipeline[test])
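The sim_analysis_pipe.pl command line above is assembled by folding the --set-option and --append-option pairs onto a base string with reduce(operator.concat, ...). A small self-contained illustration of that option-string construction; the option names and values here are made up:

import operator
from functools import reduce

setOptions = (('nreplicas', '100'), ('seed', '42'))
appendOptions = (('extra-flags', '--fast'),)

cmd = 'sim_analysis_pipe.pl --only-write-pipeline p.xml'
cmd += reduce(operator.concat,
              [' --set-option %s %s' % opt for opt in setOptions], '')
cmd += reduce(operator.concat,
              [" --append-option %s '%s'" % opt for opt in appendOptions], '')
print(cmd)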
Example #42
0
def fst_onePopPair(ancFreqs, sampleSizes):
        """Compute fst between two pops, for each SNP, given the ancestral freq in each pop and the sample sizes.
        """

        dbg( 'ancFreqs sampleSizes' )

        n = sampleSizes
        n_tot = n[0] + n[1]
#        nanc =  np.ceil([ ancFreqs[0] * sampleSizes[0], ancFreqs[1] * sampleSizes[1] ])
        nanc =  ( np.array([ ancFreqs[0] * sampleSizes[0], ancFreqs[1] * sampleSizes[1] ]) + .5 ).astype( int )

        dbg( 'nanc' )

        f1 = ancFreqs
        f2 = 1 - ancFreqs

        dbg( 'f1 f2 f1.shape f2.shape' )

        # Use Weir-Hill estimator for Fst

        pmean = (nanc[0] + nanc[1]) / n_tot

        nic = n[0] - (n[0]*n[0])/n_tot
        njc = n[1] - (n[1]*n[1])/n_tot
        dbg( 'pmean.shape nic.shape njc.shape nic njc' )
        #assert nic == njc == 60
        nc = nic + njc
        #print nc, f1[0], f1[1]
        #print f2[0], f2[1]
        msp = n[0] * (f1[0] - pmean) * (f1[0] - pmean) \
            + n[1] * (f1[1] - pmean) * (f1[1] - pmean)
        msg = ((n[0] * f1[0]* f2[0]) + (n[1] * f1[1] * f2[1])) \
            / (n[0] - 1 + n[1] - 1)
        dbg( 'msp.shape msg.shape' )
        num = msp - msg
        denom = msp + (msg*(nc - 1))
        #print msp, msg
        #print num, denom
        dbg( 'num.shape denom.shape' )
        denom[ denom == 0 ] = np.nan
        an_fst = num / denom

        dbg( 'an_fst.shape' )

        ipop = 0
        jpop = 1
        dbg( '"KKKKKKKKKKKKK" pmean nanc[ipop] nanc[jpop] n[ipop] n[jpop] nic njc nc msp msg num denom an_fst' )
        
        return an_fst
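A quick way to sanity-check fst_onePopPair is to call it with toy per-SNP ancestral frequencies and sample sizes for two populations. A minimal usage sketch, assuming the function above is in scope and stubbing out the project's dbg() tracer:

import numpy as np

dbg = lambda names: None   # stub for the project's dbg() tracer

# Two populations, three SNPs: ancestral allele frequency per SNP in each pop.
ancFreqs = np.array([[0.10, 0.50, 0.90],
                     [0.40, 0.55, 0.20]])
sampleSizes = np.array([120, 120])

fst = fst_onePopPair(ancFreqs, sampleSizes)
print(fst)   # one Weir-Hill Fst estimate per SNP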
Example #43
0
def localizeSpatiallyByWindows(Ddata, scenario, nreplicas, thinSfx = '', putativeMutPop = None, complikeSfx = '',
                               likesTableSfx = '',
                               threshold = .5, numSNP = 1,
                               minGdInEachDir = .05,
                               fromReplica = None, toReplica = None, getio = None):
    """
    Spatially localize the selected variant for all replicas within a given scenario.
    The approach is to start with the highest-scoring SNP, and move left and right from it in fixed-size windows
    for as long as the windows contain at least 'numSNP' snps with score at least 'threshold'.

    Adapted from Operations.Shari_Operations.localize.hapmap_regions_0615.plotRegions() .
    """

    snpStatsDir = os.path.join( Ddata, 'snpStats'+ thinSfx, scenario.scenDir() )
    replicaStatsDir = os.path.join( Ddata, 'replicastats'+ thinSfx, scenario.scenDir() )
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = ( putativeMutPop, complikeSfx, likesTableSfx )
    complikeFN = os.path.join( snpStatsDir, AddFileSfx( 'complike.data/', *sfxs ) )

    intervalsListFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsWindowsList.tsv', *sfxs ) )

    if getio:
        return dict( depends_on = complikeFN,
                     creates = intervalsListFN,
                     mediumRuleNameSfx = ( scenario.scenDir(), ) + sfxs,
                     fileDescrs =
                     { intervalsListFN:
                           'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                       ' For each replica this table has one or more lines, giving intervals in that replica.' } )
    

    #complike = IDotData( complikeFN ).filter( lambda r: all( np.isfinite( ( r.iHS, r.meanFst, r.max_xpop ) ) ) )
    complike = IDotData( complikeFN )

    with IDotData.openForWrite( intervalsListFN,
                                'replicaNum gdFrom gdTo gdSize bpFrom bpTo bpSize numPositiveBins '
                                'numSnpsOver_0_2 maxSNP_Pos maxSNP_lik' ) as intervalsListFile:

        for replicaNum, complikeForReplica in complike.groupby( 'Chrom' ):


            if fromReplica is not None and replicaNum < fromReplica: continue
            if toReplica is not None and replicaNum > toReplica: break

            X = complikeForReplica.toDotData()

            minPos = np.min(X.gdPos)
            maxPos = np.max(X.gdPos)
            bins = np.arange(0,1,.01)

            ind = X.complikeExp.argsort()
            lik = X.complikeExp[ind]
            maxlik = np.mean(lik[-5:])
            #maxlik = mean(lik[-5:])
            like = X.complikeExp / maxlik

            Y = X[ind]
            maxSNP = Y.Pos[-1]

            maxScore = Y.complikeExp[-1]
            relPos = X.gdPos - Y.gdPos[-1]
            X = X.hstack(DotData(Columns = [like,relPos],names=['scaled_like','relPos']))

            topGdPos = Y.gdPos[-1]
            minPos = Y.Pos[-1]
            maxPos = Y.Pos[-1]
            minGdPos = topGdPos
            maxGdPos = topGdPos
            numPositiveBins = 0

            dbg( 'replicaNum minPos maxPos minGdPos maxGdPos' )

            for dir in -1, +1:
                for bin in bins:
                    Z = X[np.abs(X.relPos - dir*bin) <= .02]

                    dbg( 'dir bin len(Z)' )

                    if len(Z) == 0: continue
                    top = Z[Z.scaled_like > threshold]

                    dbg( 'len(top)' )

                    if len(top) <= numSNP and np.abs( topGdPos - top.gdPos ) > minGdInEachDir: break
                    if len( top ) == 0: top = Z
                    if dir == -1:
                        minPos = np.min(top.Pos)
                        minGdPos = np.min(top.gdPos)
                    else:
                        maxPos = np.max(top.Pos)
                        maxGdPos = np.max(top.gdPos)
                    numPositiveBins += 1

                    
            ind = np.all([X.Pos > minPos, X.Pos < maxPos],axis=0)
            peak = X[ind]
            
            intervalsListFile.writeRecord( replicaNum, minGdPos, maxGdPos, maxGdPos - minGdPos,
                                           minPos, maxPos, maxPos - minPos,
                                           numPositiveBins, sum(peak.scaled_like > .2),
                                           maxSNP, maxScore )
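The windowed localization above starts at the top-scoring SNP and extends the region left and right in fixed-size genetic-distance windows, stopping in a direction once a window no longer holds more than numSNP SNPs whose scaled score exceeds threshold (and the search has moved at least minGdInEachDir from the peak). A stripped-down sketch of the same idea on plain numpy arrays, with the file I/O and column handling omitted and all thresholds illustrative:

import numpy as np

def localize_by_windows(gd_pos, scores, threshold=0.5, num_snp=1,
                        bin_width=0.01, min_gd_each_dir=0.05):
    """Return (gd_from, gd_to): a region grown around the top-scoring SNP."""
    scaled = scores / np.mean(np.sort(scores)[-5:])   # scale by mean of top 5 scores
    top_gd = gd_pos[np.argmax(scores)]
    rel_pos = gd_pos - top_gd

    gd_min = gd_max = top_gd
    for direction in (-1, +1):
        for b in np.arange(0, 1, bin_width):
            in_window = np.abs(rel_pos - direction * b) <= 2 * bin_width
            if not in_window.any():
                continue
            good = in_window & (scaled > threshold)
            if good.sum() <= num_snp and b > min_gd_each_dir:
                break                    # too few strong SNPs this far out: stop
            sel = gd_pos[good if good.any() else in_window]
            if direction == -1:
                gd_min = min(gd_min, sel.min())
            else:
                gd_max = max(gd_max, sel.max())
    return gd_min, gd_max

# Toy data: 1000 SNPs with a score peak near gdPos 0.6.
gd = np.sort(np.random.rand(1000))
cms = np.exp(-((gd - 0.6) ** 2) / 0.002) + 0.05 * np.random.rand(1000)
print(localize_by_windows(gd, cms))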
Example #44
0
def localizeSpatiallyBySplineFitting(Ddata,
                                     scenario,
                                     nreplicas,
                                     thinSfx='',
                                     putativeMutPop=None,
                                     complikeSfx='',
                                     likesTableSfx='',
                                     confidence=.9,
                                     minBins=20,
                                     nbins=200,
                                     smoothing=0.0,
                                     getio=None):
    """For each replica within a given scenario,
    localize the selected SNP spatially, by fitting a spline to a (smoothed version of) the CMS scores, dividing the
    region into bins, and finding the set of bins that cover 90% (or specified fraction of) area under the spline.

    Params:

       confidence - the spatially localized region will (hopefully) have this probability of containing the causal SNP;
         here, specifically, this means we'll include bins in the region that collectively cover this fraction of area
         under the posterior density curve.

       minBins - the region will include at least this many of the highest-average bins.
       
    """

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())
    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = (putativeMutPop, complikeSfx, likesTableSfx)
    complikeFN = os.path.join(snpStatsDir, AddFileSfx('complike.data/', *sfxs))

    intervalsListFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsSplineList.tsv', *sfxs))
    intervalsStatsFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsSplineStats.tsv', *sfxs))

    posteriorSplineFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsSplineSpline.tsv', *sfxs))
    binInfoFN = os.path.join(replicaStatsDir,
                             AddFileSfx('intervalsSplineBinInfo.tsv', *sfxs))

    if getio:
        return dict(
            depends_on=complikeFN,
            creates=(intervalsListFN, intervalsStatsFN, posteriorSplineFN,
                     binInfoFN),
            mediumRuleNameSfx=(scenario.scenDir(), ) + sfxs,
            fileDescrs={
                intervalsListFN:
                'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                ' For each replica this table has one or more lines, giving intervals in that replica.',
                intervalsStatsFN:
                'Per-replica statistic about confidence intervals in that replica',
                posteriorSplineFN:
                'The results of interpolating posterior density as a spline',
                binInfoFN:
                'Information about individual bins under the spline'
            })

    complike = IDotData(complikeFN).filter(
        lambda r: all(np.isfinite((r.iHS, r.meanFst, r.max_xpop))))

    binInfoHeadings = 'replicaNum binNum binStart binEnd included binCenters binAvgCMS binMaxCMS binIntegral '\
        'binIntegralNormed binRank'

    nbins_orig = nbins

    with IDotData.openForWrite( posteriorSplineFN, 'replicaNum gdPos complikeExp' ) as posteriorSplineFile, \
         IDotData.openForWrite( binInfoFN, binInfoHeadings ) as binInfoFile, \
         IDotData.openForWrite( intervalsListFN, 'replicaNum gdFrom gdTo gdSize bpFrom bpTo '
                                'bpSize binsInInterval binsArea binsMax binsMaxFrac binsAvg '
                                'binsMinRank binsMaxRank' ) as intervalsListFile, \
         IDotData.openForWrite( intervalsStatsFN, 'replicaNum numSegments gdTotLen bpTotLen' ) as intervalsStatsFile:

        for replicaNum, complikeForReplica in complike.groupby('Chrom'):

            gdMin = np.min(complikeForReplica.gdPos)
            gdMax = np.max(complikeForReplica.gdPos)
            binSize = (gdMax - gdMin) / nbins_orig
            binLefts = np.arange(gdMin, gdMax, binSize)
            binCenters = binLefts + binSize / 2
            binRights = binLefts + binSize
            dbg('len(complikeForReplica) replicaNum len(binLefts) len(binRights) len(binCenters)'
                )
            nbins = len(binLefts)

            #
            # Compute the mean and max CMS score in each gdPos bin
            #
            binAvgCMS = np.zeros(nbins)
            binMaxCMS = np.zeros(nbins)
            binNums = np.zeros(nbins, dtype=int)

            for bin, valsInBin in \
                    complikeForReplica.addCol( 'bin',
                                               list(map( functools.partial( min, nbins-1 ),
                                                    list(map( int,
                                                         ( complikeForReplica.gdPos - gdMin ) / binSize ))))).groupby('bin'):

                binNums[bin] = bin
                if valsInBin:
                    binAvgCMS[bin] = np.mean(valsInBin.complikeExp)
                    binMaxCMS[bin] = max(valsInBin.complikeExp)
                else:
                    binAvgCMS[bin] = binAvgCMS[bin - 1]
                    binMaxCMS[bin] = binMaxCMS[bin - 1]

            # fit a spline to the function ( binCenter, binAvgCMS ), approximating the posterior probability of a SNP being
            # causal.
            posteriorDensitySpline = interpolate.splrep(binCenters,
                                                        binAvgCMS,
                                                        s=smoothing)

            splineX = np.arange(gdMin, gdMax, binSize / 2)
            for x, y in zip(splineX,
                            interpolate.splev(splineX,
                                              posteriorDensitySpline)):
                posteriorSplineFile.writeRecord(replicaNum, x, y)

            # compute the integral under the interpolated function, for each bin.
            binIntegral = np.zeros(nbins)
            for binLeft, binRight, binNum in zip(binLefts, binRights, binNums):
                binIntegral[binNum] = interpolate.splint(
                    binLeft, binRight, posteriorDensitySpline)

            # normalize the integral value above each bin so that the total posterior density of being causal
            # integrates to 1.0
            binIntegralNormed = binIntegral / sum(binIntegral)

            binsByIntegralSize = binIntegral.argsort()[::-1]

            binRank = np.zeros(nbins)
            for i in range(nbins):
                binRank[binsByIntegralSize[i]] = i

            binsToUse = np.zeros(nbins, dtype=bool)
            binsIncluded = 0
            fractionCovered = 0.0
            for bin in binsByIntegralSize:
                if fractionCovered >= confidence and binsIncluded >= minBins:
                    break
                binsToUse[bin] = True
                binsIncluded += 1
                fractionCovered += binIntegralNormed[bin]

            # list the confidence intervals for this replica,
            # merging adjacent intervals.

            gd2bp = interpolate.interp1d(complikeForReplica.gdPos,
                                         complikeForReplica.Pos)
            gdMax = max(complikeForReplica.gdPos)

            binInfo = IDotData(names=binInfoHeadings,
                               Columns=(itertools.repeat(replicaNum, nbins),
                                        list(range(nbins)), binLefts,
                                        binRights, binsToUse, binCenters,
                                        binAvgCMS, binMaxCMS, binIntegral,
                                        binIntegralNormed, binRank))

            binInfoFile.writeRecords(binInfo)

            binsOverallMaxCMS = max(binInfo.binMaxCMS)

            gdTotLen = 0.0
            bpTotLen = 0
            numSegments = 0
            for included, bins in binInfo.groupby('included'):
                if included:
                    gdFrom = min(bins.binStart)
                    gdTo = min(max(bins.binEnd), gdMax)
                    bpFrom = int(gd2bp(gdFrom))
                    bpTo = int(gd2bp(gdTo))

                    gdTotLen += (gdTo - gdFrom)
                    bpTotLen += (bpTo - bpFrom)
                    numSegments += 1

                    intervalsListFile.writeRecord(
                        replicaNum, gdFrom, gdTo, gdTo - gdFrom, bpFrom, bpTo,
                        bpTo - bpFrom, len(bins), sum(bins.binIntegralNormed),
                        max(bins.binMaxCMS),
                        max(bins.binMaxCMS) / binsOverallMaxCMS,
                        np.mean(bins.binAvgCMS), min(bins.binRank),
                        max(bins.binRank))

            assert numSegments > 0 and bpTotLen > 0 and gdTotLen > 0.0
            intervalsStatsFile.writeRecord(replicaNum, numSegments, gdTotLen,
                                           bpTotLen)
            dbg('replicaNum numSegments gdTotLen bpTotLen')
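The spline-based localization above bins the region, fits a spline through the per-bin average CMS score, integrates it per bin with interpolate.splint, and keeps the highest-integral bins until they cover the requested fraction of the total area. A compact sketch of just that bin-selection step on synthetic data (the real code also maps genetic to physical coordinates and writes the results through IDotData files):

import numpy as np
from scipy import interpolate

# Synthetic CMS-like scores along a genetic map, peaked near gdPos 0.6.
gd = np.linspace(0.0, 1.0, 500)
scores = np.exp(-((gd - 0.6) ** 2) / 0.005) + 0.05 * np.random.rand(gd.size)

nbins = 50
edges = np.linspace(gd.min(), gd.max(), nbins + 1)
centers = (edges[:-1] + edges[1:]) / 2
bin_idx = np.clip(np.digitize(gd, edges) - 1, 0, nbins - 1)
bin_avg = np.array([scores[bin_idx == b].mean() for b in range(nbins)])

# Spline through (bin center, bin average), then the integral under it per bin.
spline = interpolate.splrep(centers, bin_avg, s=0.0)
bin_integral = np.array([interpolate.splint(lo, hi, spline)
                         for lo, hi in zip(edges[:-1], edges[1:])])
normed = bin_integral / bin_integral.sum()

# Greedily include bins by decreasing integral until 90% of the area is covered
# (and at least 20 bins are in, mirroring the confidence/minBins parameters).
order = np.argsort(bin_integral)[::-1]
covered, included = 0.0, []
for b in order:
    if covered >= 0.9 and len(included) >= 20:
        break
    included.append(b)
    covered += normed[b]
print(sorted(included), covered)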
Example #45
0
def DefineRulesTo_fastCMS(pr,
                          pops,
                          chroms,
                          selPop,
                          sweepDir,
                          cmsDir,
                          genomeBuild='hg19'):
    """Define rules to do fast CMS computation.

    Params:

       pr - the PipeRun object to which to add rules

       selPop - testing selection in which pop?
       pops - comparing selPop to which pops?
       sweepDir - the sweep directory
       cmsDir - the directory under which CMS stats go
    """

    pops = list(MakeSeq(pops))
    if selPop not in pops: pops.append(selPop)

    allPops = tuple(MakeSeq(pops))
    if selPop not in allPops: allPops += (selPop, )
    cmpPops = [pop for pop in allPops if pop != selPop]

    rawScoresFN = {}

    genMapSfx = genomeBuild2genMapSfx[genomeBuild]
    for pop in allPops:
        for chrom in chroms:
            with pr.settingAttrs('pop chrom'):
                snpInfoFN = os.path.join(
                    sweepDir,
                    'analysis/chr%(chrom)s/snps_%(pop)s.tsv' % locals())
                projDir = os.path.join(sweepDir,
                                       'data/chr%(chrom)s' % locals())
                ancestralImportedFN = os.path.join(projDir,
                                                   'ancestral.tsv.imported')
                genotypesImportedFN = os.path.join(
                    projDir,
                    'genotypes_chr%(chrom)s_%(pop)s_r21_nr_fwd_phased_all.imported'
                    % locals())
                genMapImportedFN = os.path.join(
                    projDir,
                    'genetic_map_chr%(chrom)s_%(genMapSfx)s.txt.imported' %
                    locals())
                pr.addRule(
                    name='extractSnpInfo',
                    commands=
                    'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    % locals(),
                    commandsOld=
                    'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep/target/sweep-1.0-SNAPSHOT-jar-with-dependencies.jar edu.mit.broad.sweep.Main ExtractAlleleFreqs %(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    % locals(),
                    depends_on=(ancestralImportedFN, genotypesImportedFN,
                                genMapImportedFN),
                    creates=snpInfoFN)

    chr2dihhFN = {}

    for chrom in chroms:
        with pr.settingAttrs('chrom'):

            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            xpopScoresFN = os.path.join(
                chromDir, AddFileSfx('max_xpop.tsv', chrom_s, selPop, pops))

            pr.addInvokeRule(invokeFn=gatherXPOPscores,
                             invokeArgs=Dict('pops chrom selPop sweepDir',
                                             outFN=xpopScoresFN),
                             attrs=dict(pop=allPops,
                                        stat='max_xpop',
                                        piperun_short=True))

            ihsFN = getFN_ihs_signif(**Dict('sweepDir chrom', pop=selPop))

            ihsScoresFN = os.path.join(
                chromDir, AddFileSfx('iHS.tsv', chrom_s, selPop, pops))
            dihhScoresFN = os.path.join(
                chromDir, AddFileSfx('dihh.tsv', chrom_s, selPop, pops))

            chr2dihhFN[chrom] = dihhScoresFN

            pop2ancFreqFN = os.path.join(
                cmsDir, chrom_s, AddFileSfx('pop2ancFreq.tsv', chrom_s, pops))
            pop2sampleSizeFN = os.path.join(
                cmsDir, chrom_s, AddFileSfx('pop2sampleSize.tsv', chrom_s,
                                            pops))

            pop2snpInfoFN = dict([(pop,
                                   os.path.join(sweepDir, 'analysis',
                                                'chr%(chrom)s' % locals(),
                                                'snps_%(pop)s.tsv' % locals()))
                                  for pop in pops])

            pr.addInvokeRule(
                invokeFn=gather_snp_info,
                invokeArgs=Dict(
                    'pops pop2snpInfoFN pop2ancFreqFN pop2sampleSizeFN'))

            pr.addInvokeRule(
                invokeFn=gather_iHS_scores,
                invokeArgs=Dict(
                    'chrom selPop ihsFN pop2ancFreqFN',
                    #                                                 snpInfoFN = pop2snpInfoFN[ selPop ],
                    ihsOutFN=ihsScoresFN,
                    dihhOutFN=dihhScoresFN),
                attrs=dict(pop=selPop,
                           stat=('iHS', 'StdDiff'),
                           piperun_short=True))

            freqDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('freqDiff.tsv', chrom_s, selPop, pops))
            meanFstScoresFN = os.path.join(
                chromDir, AddFileSfx('meanFst.tsv', chrom_s, selPop, pops))

            pr.addInvokeRule(
                invokeFn=computeMeanFstAndFreqDiffScores,
                invokeArgs=Dict(
                    'chrom selPop sweepDir pops pop2ancFreqFN pop2sampleSizeFN',
                    outMeanFstFN=meanFstScoresFN,
                    outFreqDiffFN=freqDiffScoresFN),
                attrs=dict(pop=allPops,
                           stat=('freqDiff', 'meanFst'),
                           piperun_short=True))

            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))

            rawScoresFN[chrom] = dict(iHS=ihsScoresFN,
                                      StdDiff=StdDiffScoresFN,
                                      meanFst=meanFstScoresFN,
                                      freqDiff=freqDiffScoresFN,
                                      max_xpop=xpopScoresFN)

        # end: with pr.settingAttrs( 'chrom' )
    # end: for chrom in chroms

    #    ihhStdFN = os.path.join( cmsDir, 'dihhstd.tsv' )

    dihhGlobalStdFN = os.path.join(
        cmsDir, AddFileSfx('dihh_global_std.tsv', selPop, pops))
    dihhBinMeansFN = os.path.join(
        cmsDir, AddFileSfx('dihh_bin_means.tsv', selPop, pops))

    pr.addInvokeRule(invokeFn=normalizeByFreq_getMeanStd_tsv,
                     invokeArgs=dict(
                         iHHDiffFNs=[chr2dihhFN[k] for k in chroms],
                         globalStatFN=dihhGlobalStdFN,
                         binsStatFN=dihhBinMeansFN),
                     name='compute_dihh_meanstd')

    # pr.addInvokeRule( invokeFn = computeMeanStd_binned_tsvs,
    #                   invokeArgs = dict( inFNs = chr2dihhFN.values(), valCol = 'iHHDiff',
    #                                      binCol = 'normingFreqs', binMin = 0.05, binMax = 1.05, binStep = .05,
    #                                      outFN = ihhStdFN ),
    #                   name = 'compute_dihh_std' )

    for chrom in chroms:
        with pr.settingAttrs('chrom'):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))
            dbg('chrom chr2dihhFN[chrom]')
            pr.addInvokeRule(invokeFn=normalizeByFreq_compute_normed_tsv,
                             invokeArgs=dict(iHHDiffFN=chr2dihhFN[chrom],
                                             globalStatFN=dihhGlobalStdFN,
                                             binsStatFN=dihhBinMeansFN,
                                             StdDiffFN=StdDiffScoresFN))

    statLikesRatioFNs = {}

    for stat in CMSBins.CMSstats:
        with pr.settingAttrs(
                stat=stat,
                pop=(selPop, ) if stat in ('iHS', 'StdDiff') else allPops,
                piperun_short=True):
            if stat not in CMSBins.nonNormedStats:
                rawFNs = [rawScoresFN[chrom][stat] for chrom in chroms]
                meanStdFN = os.path.join(
                    cmsDir, AddFileSfx('meanStd.tsv', stat, selPop, pops))

                # DefineRulesTo_computeMeanStd( pr, inFNs = rawFNs, colNum = 1,
                #                               outFN = meanStdFN,
                #                               addRuleArgs = \
                #                               dict( name = 'computeMeanStd_for_stat',
                #                                     attrs = dict( chrom = chroms ) ) )

                #                meanStdBzFN = os.path.join( cmsDir, stat + '_meanStdForStat.tsv' )
                pr.addInvokeRule(invokeFn=computeMeanStd,
                                 invokeArgs=dict(inFNs=rawFNs,
                                                 colName=stat,
                                                 outFN=meanStdFN))

            # end: if stat not in CMSBins.nonNormedStats

            for chrom in chroms:
                with pr.settingAttrs('chrom'):
                    statFN = rawScoresFN[chrom][stat]

                    if stat not in CMSBins.nonNormedStats:
                        normedFN = AddFileSfx(statFN, 'normed')

                        DefineRulesTo_normalizeOneColumn(
                            pr,
                            inFN=statFN,
                            meanStdFN=meanStdFN,
                            colName=stat,
                            outFN=normedFN,
                            addRuleArgs=dict(attrs=Dict('chrom')))
                        statFN = normedFN

                    bins_beg = CMSBins.stat_start[stat]
                    bins_end = CMSBins.stat_end[stat]
                    bins_n = CMSBins.stat_nbin[stat]

                    statLikesRatioFN = AddFileSfx(rawScoresFN[chrom][stat],
                                                  'likesRatio')
                    statLikesRatioFNs[(chrom, stat)] = statLikesRatioFN

                    pr.addInvokeRule(
                        invokeFn=computeLikeRatioForStat,
                        invokeArgs=dict(
                            stat=stat,
                            statValsFN=statFN,
                            hitLikesFN=
                            '../Data/Common_Data/sim/likes/hitsLikes_toneutFixed_1.tsv',
                            missLikesFN=
                            '../Data/Common_Data/sim/likes/missLikes_toneutFixed_1.tsv',
                            stat_start=bins_beg,
                            stat_end=bins_end,
                            stat_nbin=bins_n,
                            statLikesRatioFN=statLikesRatioFN))

                # end: with pr.settingAttrs( 'chrom' )
            # end: for chrom in chroms
        # end: with pr.settingAttrs( stat = stat, piperun_short = True )
    # end: for stat in CMSBins.CMSstats

    for chrom in chroms:
        with pr.settingAttrs(chrom=chrom, stat=CMSBins.CMSstats):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            likesRatioFN = os.path.join(
                chromDir,
                AddFileSfx('likesRatio.tsv', CMSBins.CMSstats, selPop, pops))
            pr.addInvokeRule(invokeFn=addLikesRatios,
                             invokeArgs=dict(
                                 inFNs=[
                                     statLikesRatioFNs[(chrom, stat)]
                                     for stat in CMSBins.CMSstats
                                 ],
                                 colNames=[
                                     colName + 'likeRatio'
                                     for colName in CMSBins.CMSstats
                                 ],
                                 outFN=likesRatioFN))
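DefineRulesTo_fastCMS builds its rule graph with pr.addInvokeRule(invokeFn=..., invokeArgs=...); as the examples above show (e.g. computeMeanFstAndFreqDiffScores and the localize* functions), each invoked function accepts a getio argument and, when it is set, returns a description of its inputs and outputs instead of doing any work. A minimal hedged sketch of that convention, using a hypothetical step and a toy driver standing in for the real PipeRun machinery:

import os

def computeToyStat(inFN, outFN, getio=None):
    """A pipeline step following the getio convention used above (hypothetical)."""
    if getio:
        # Describe the rule instead of running it.
        return dict(depends_on=(inFN,), creates=(outFN,))
    with open(inFN) as f, open(outFN, 'w') as out:
        out.write(str(sum(float(line) for line in f)))

def run_if_needed(invokeFn, **invokeArgs):
    """Toy driver: rerun a step only when a target is missing or older than a
    source; the real PipeRun object tracks this across the whole rule graph."""
    io = invokeFn(getio=True, **invokeArgs)
    newest_src = max(os.path.getmtime(f) for f in io['depends_on'])
    stale = any(not os.path.exists(t) or os.path.getmtime(t) < newest_src
                for t in io['creates'])
    if stale:
        invokeFn(**invokeArgs)

# run_if_needed(computeToyStat, inFN='values.txt', outFN='sum.txt')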
Example #46
0
def RunTasks( options ):
    """Take tasks from the specified queue directory, and run them.

    Params:

       options - see command-line parameter definition in main() below.
       
    """

    if haveParamiko: Random.atfork()

    startClock = time.time()

    logging.info( 'Starting runner (process id %d on host %s) with options %s'
                  % ( os.getpid(), GetHostName(), options ) )

    stopSignal = [ False ]

    def SetStopSignal( sigNum, stkFrm ):
        logging.info( 'Setting stop signal to stop runners' )
        stopSignal[ 0 ] = True
        dbg( '"aftset" stopSignal' )

    signal.signal( signal.SIGUSR1, SetStopSignal )

    fs = RemoteFileSystem( remote = options.remote, pw = options.password, pkey = options.pkey ) \
        if options.remote else LocalFileSystem()

    # check that all queues exist
    assert all( fs.exists( queue ) and fs.isdir( queue )
                for queue in options.queues.split( fs.pathsep ) )

    # register a cleanup routine so that, if we claim a task and then
    # crash midway through it, our lock on the task is erased, so that
    # another runner can pick up the task.  Note that if the _task_ fails
    # with an error code, that's fine -- we just report the error code
    # to the mqsub.sh script instance that submitted the task.
    # The cleanup here happens only if the runner crashes before receiving
    # a proper exit code from the task.
    fileToErase = [ None ]
    @atexit.register
    def DoErase( eraseWhat = fileToErase ):
        if eraseWhat[0] and fs.exists(eraseWhat[0]):
            #print 'runner AtExit: removing ' + eraseWhat[0]
            fs.remove( eraseWhat[0] )

    # var: lastFinish - time when a task last finished.
    lastFinish = time.time()

    queues = options.queues.split( fs.pathsep )
    lastQueueModTime = [ None ] * len( queues )

    skipDirs = set([ 'newtask.dat' ] )

    numTasksRun = 0
    numProcsAvail = int( os.getenv( 'LSB_DJOB_NUMPROC', 1 ) )
    dbg( 'numProcsAvail' )

    for queue in queues:
        EnsureDirExists( os.path.join( queue, 'succ' ) )
        EnsureDirExists( os.path.join( queue, 'fail' ) )
    
    while not stopSignal[0]:

        ranCommand = False

        if options.maxRunHours > 0 and ( time.time() - startClock  ) / 3600.0 > options.maxRunHours:
            logging.info( 'Runner exiting after CPU time of %s hours' % ( ( time.time() - startClock ) / 3600.0 ) )
            return

        if stopSignal[ 0 ]:
            logging.info( 'Runner stopped by stop signal' )
            return
        else: dbg( '"chkstop" stopSignal' )

        dbg( 'queues' )
        for queueNum, queue in enumerate( queues ):
            dbg( 'queueNum queue' )

            # do a quick check to see if any tasks have been added to the queue since we last checked
            newTaskFN = os.path.join( queue, 'newtask.dat' )
            try:
                curQueueModTime = fs.stat( newTaskFN ).st_mtime
                if curQueueModTime == lastQueueModTime[ queueNum ]: continue
                lastQueueModTime[ queueNum ] = curQueueModTime
            except EnvironmentError as e:
                if os.path.exists( newTaskFN ):
                    logging.warning( 'ERROR CHECKING FOR NEW TASKS in queue %s: %s' % ( queue, e ) )
                pass
            
            # find an unclaimed task in this queue, and try to claim it
            taskDirs = sorted( fs.listdir( queue ) )
            dbg( 'len(taskDirs)' )
            #random.shuffle( taskDirs )
            dbg( 'os.environ.get("MQ_FIRST_DIR")' )
            if 'MQ_FIRST_DIR' in os.environ and os.environ[ 'MQ_FIRST_DIR' ] in taskDirs:
            
                taskDirs = [ os.environ[ 'MQ_FIRST_DIR' ] ] + taskDirs
                logging.info( 'putting specified dir first' )

            for taskDir in taskDirs:

                if taskDir in skipDirs: continue

                if options.maxRunHours > 0 and ( ( time.time() - startClock ) / 3600.0 ) > options.maxRunHours:
                    logging.info( 'Runner exiting after CPU time of %s hours' % ( ( time.time() - startClock ) / 3600.0 ) )
                    return

                if stopSignal[ 0 ]:
                    logging.info( 'Runner stopped by stop signal' )
                    return
                else: dbg( '"chkstop" stopSignal' )

                try:

                    while fs.path.exists( os.path.join( queue, 'noclaim.dat' ) ):
                        time.sleep( 60 + random.normalvariate( 10.0, 5.0 ) ) 
                
                    fullTaskDir = fs.path.join( queue, taskDir )
                    claimedFN = fs.path.join( fullTaskDir, options.claimedFN )

                    attrsFN = fs.path.join( fullTaskDir, 'attrs.tsv' )
                    cwdFN = fs.path.join( fullTaskDir, 'submitdir.txt' )

                    failedCond = []
                    
                    def saveVal( name, val, fc = failedCond ):
                        if not val: fc.append( name )
                        return val

                    if saveVal( 'ready', fs.path.exists( fs.path.join( fullTaskDir, options.readyFN ) ) ) \
                            and saveVal( 'not claimed', not fs.path.exists( claimedFN ) ) \
                            and saveVal( 'relocatable', ( not options.remote or \
                                                              all([ not f.startswith( '/' ) for which in ( 'sources', 'targets' )  \
                                          for f in fs.SlurpFile( fs.path.join( fullTaskDir, which + '.lst' ) ) ]) ) ) \
                            and saveVal( 'memOk', GetMemReq( fs, attrsFN ) <= options.maxMem ) \
                            and saveVal( 'minMemOk', options.minMem == 0 or GetMemReq( fs, attrsFN ) >= options.minMem ) \
                            and saveVal( 'minProc', GetProcReq( fs, attrsFN ) >= options.minProc ) \
                            and saveVal( 'maxProc', GetProcReq( fs, attrsFN ) <= numProcsAvail ) \
                            and saveVal( 'local', ( options.local_tasks or not GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                            and saveVal( 'onlyLocal',
                                         ( not options.only_local_tasks or GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                            and saveVal( 'short', ( not options.runOnlyShort or GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) )  \
                            and saveVal( 'long', ( not options.runOnlyLong or not GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) )  \
                            and saveVal( 'notRequeued', ( not options.noRequeuedTasks or not fs.path.exists( fs.path.join( fullTaskDir, 'requeued.dat' ) ) ) ) \
                            and saveVal( 'notFromHost', ( not options.onlyFromHost or socket.getfqdn() == options.onlyFromHost ) ) \
                            and saveVal( 'notFromPipeline', ( not options.onlyFromPipelineId or  \
                                                                  GetTaskAttr( fs, attrsFN, 'piperun_pipelineId' ) == options.onlyFromPipelineId ) ):

                        # try to claim the task
                        try:
                            fd = fs.open(claimedFN, os.O_CREAT|os.O_EXCL|os.O_WRONLY)
                        except EnvironmentError:
                            # another runner beat us to this task -- go and check other tasks
                            logging.info( 'another job beat us to claiming ' + fullTaskDir )
                            continue

                        try:
                            fs.write( fd, 'locked by process %d on host %s\n' % ( os.getpid(), GetHostName() ) )
                            for v in list(os.environ.keys()):
                                fs.write( fd, '%s=%s\n' % ( v, os.environ[v] ) )
                        finally:
                            fs.close( fd )
                        # Tell our cleanup code to release this task if we crash.
                        fileToErase[0] = claimedFN
                        # get the command to run the task
                        theCMD = fs.SlurpFile( os.path.join( fullTaskDir, 'command.dat' ) ).strip()
                        theCmdDir = fs.SlurpFile( os.path.join( fullTaskDir, 'submitdir.txt' ) ).strip()
                        theCmdEnvFN = os.path.join( fullTaskDir, 'submitenv.txt' )

                        if options.remote:
                            assert have_fcntl
                            SystemSucceed( 'mkdir -p ' + os.path.join( options.localDataDir, fs.root[1:] ) )
                            for needDir in 'Operations', 'Classes', 'System', 'Other':
                                needDirFull = os.path.join( options.localDataDir, fs.root[1:], '..', needDir )
                                if not os.path.exists( needDirFull ):
                                    os.symlink( os.path.realpath( os.path.join( '..', needDir ) ), needDirFull )

                            # copy source files

                            # get exclusive locks on the source files
                            srcFiles = sorted( set( fs.SlurpFile( os.path.join( fs.root, fullTaskDir, 'sources.lst' ) ).rstrip( '\n' ).split( '\n' ) ) )
                            srcLockIds = []
                            srcLockFiles = []
                            for srcFile in srcFiles:
                                lockFile = os.path.join( options.localDataDir, 'mqlocks', srcFile[1:] )
                                if lockFile.endswith('/'): lockFile = lockFile[:-1]
                                lockFile += '.lock'
                                SystemSucceed( 'mkdir -p ' + os.path.dirname( lockFile ) )
                                gotLock = False
                                while not gotLock:
                                    try:
                                        openMode = os.O_CREAT|os.O_EXCL|os.O_WRONLY
                                        logging.info( 'opening ' + lockFile + ' with mode ' + str( openMode ) )
                                        lockId = os.open( lockFile, openMode )
                                        gotLock = True
                                    except EnvironmentError:
                                        logging.info( 'Could not create ' + lockFile + ' , waiting...' )
                                        time.sleep( 10 + random.normalvariate( 3.0, 1.0 ) )
                                fcntl.lockf( lockId, fcntl.LOCK_EX )
                                srcLockIds.append( lockId )
                                srcLockFiles.append( lockFile )
                                logging.info( 'Got lock on ' + lockFile )

                            SystemSucceed( 'rsync -zprv --files-from=:' + os.path.join( fs.root, fullTaskDir, 'sources.lst' ) +
                                           ' ' + fs.username + '@' + fs.hostname + ':/ ' + options.localDataDir )

                            for srcLockId, srcLockFile in list( zip( srcLockIds, srcLockFiles ) )[::-1]:
                                fcntl.lockf( srcLockId, fcntl.LOCK_UN )
                                os.close( srcLockId )
                                SystemSucceed( 'rm -rf ' + srcLockFile )

                            targets = fs.SlurpFile( os.path.join( fs.root, fullTaskDir, 'targets.lst' ) ).rstrip( '\n' ).split('\n')
                            targetDirs = set( map( os.path.dirname, [_f for _f in map( str.strip, targets ) if _f] ) )
                            dbg( '"DDDDDD" targetDirs' )

                            for targetDir in targetDirs:
                                assert targetDir.startswith( '/' )
                                tdir = os.path.join( options.localDataDir, targetDir[1:] )
                                SystemSucceed( 'mkdir -p ' + tdir + ' ' + os.path.join( tdir, 'makeinfo' ) )

                            theCMD = 'cd ' + os.path.join( options.localDataDir, fs.root[1:] ) + ' && ' + theCMD

                        logging.info( 'Under ' + claimedFN + ' RUNNING: ' + theCMD )
                        # Actually run the task; get its exit code
                        save_cwd = os.getcwd()
                        try:
                            os.chdir( theCmdDir )
                            logging.info( 'CWD=' + os.getcwd() )

                            runScriptFN = os.path.join( fullTaskDir, 'run.sh' )

                            with open( runScriptFN, 'w' ) as out:
                                out.write( '#!/usr/bin/env bash\n' )
                                out.write( 'set -e -o pipefail\n' )
                                with open( theCmdEnvFN ) as envFile:
                                    for line in envFile:
                                        if '=' not in line or line.startswith('module='): break
                                        equalIdx = line.index( '=' )
                                        envVarName = line[ :equalIdx ]
                                        if not ( re.search( r'\W', envVarName ) or envVarName.startswith( 'LSB_' ) or \
                                           envVarName.startswith( 'LSF_' ) or \
                                           envVarName.startswith( 'LS_' ) or envVarName.startswith( 'SLURM' ) or \
                                           envVarName in \
                                           ( 'SYS_TYPE', 'MACHTYPE', 'VENDOR', 'OSTYPE',
                                             'DOMAINNAME', 'HOSTTYPE', 'SHORTHOST', 'SSH_TTY',
                                             'HOST', 'HOSTNAME', 'REMOTEHOST', 'STY' ) ):
                                            out.write( 'export ' + envVarName + "='" + line[ equalIdx+1: -1 ] + "'\n" )
                                out.write( theCMD )

                            os.chmod( runScriptFN, stat.S_IXUSR | stat.S_IRWXU )

                            try:
                                exitCode = os.system( runScriptFN )
                            except ( KeyboardInterrupt, SystemExit ):
                                interruptedFN = os.path.join( fullTaskDir, 'interrupted.dat' )
                                DumpFile( interruptedFN, 'interrupted' );
                                raise
                        finally:
                            os.chdir( save_cwd )
                        logging.info( 'Under ' + claimedFN + ' FINISHED RUNNING: ' + theCMD )
                        logging.info( 'Got exit code %d' % exitCode )

                        if options.remote:
                            # copy the target files and the output log back to the correct dirs on the remote system

                            # first, make sure the files all exist, and are no longer being written to.

                            time.sleep( options.aftTaskDelay )

                            os.system( 'rsync -zprv --files-from=:' + os.path.join( fs.root, fullTaskDir, 'targets.lst' ) +
                                       ' ' + options.localDataDir + ' ' + fs.username + '@' + fs.hostname + ':/' )

                        # If we succeeded in running the task (whether the task itself failed or not),
                        # tell the cleanup code to NOT release this task if we crash.
                        fileToErase[0] = None
                        # Tell the task submitter script that we are done, and what the task's
                        # exit code was.

                        if os.path.exists( os.path.join( fullTaskDir, 'nmq.dat' ) ):

                            time.sleep(3)
                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ), os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            fs.close(fd)
                            
                            try:
                                shutil.move( fullTaskDir, os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) )
                            except EnvironmentError as e:
                                logging.warning( 'Error moving ' + fullTaskDir + ' to ' + os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) + ' : ' + str( e ) )
                        else:
                            exitCodeFN = os.path.join( fullTaskDir, 'exitCode.dat' )
                            fd = fs.open( exitCodeFN, os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            bytesWritten = fs.write( fd, str( exitCode ) )
                            fs.close( fd )

                            time.sleep(3)
                            logging.info( 'Wrote exit code %s to file %s (%s bytes)' % ( exitCode, exitCodeFN, bytesWritten ) )

                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ), os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            fs.close(fd)

                        # Record that we actually ran a task here.
                        ranCommand = True
                        lastFinish = time.time()
                        numTasksRun += 1

                    else:
                        logging.info( 'did not take task ' + taskDir + ' ; reason: ' + str( failedCond ) );

                except:
                    excInfo = sys.exc_info()
                    logging.warning( 'Error trying to grab task from ' + taskDir + ' (%s), skipping...'
                                     % str( excInfo ) )
                    traceback.print_exc()

        dbg( 'ranCommand lastFinish time.time()-lastFinish' )
        if not ranCommand:
            waitTimeHere = time.time() - lastFinish
            if ( numTasksRun > 0 and options.maxWaitTime > 0 and waitTimeHere > options.maxWaitTime ) \
                    or ( numTasksRun == 0 and options.maxFirstWaitTime > 0 and waitTimeHere > options.maxFirstWaitTime ) :
                logging.info( 'Runner exiting after idle time of %s' % waitTimeHere )
                return
            time.sleep( options.taskCheckInterval + random.normalvariate( 3.0, 1.0 ) )
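Runners claim a task by creating claimed.dat with os.O_CREAT|os.O_EXCL, so exactly one process can win the claim, and the atexit hook removes that file if the runner dies before the task reports an exit code. A minimal sketch of that claim/release pattern on a local filesystem (helper names are illustrative; the real code goes through the fs abstraction):

import atexit, os

_claim_to_erase = [None]

@atexit.register
def _release_claim(holder=_claim_to_erase):
    # If we crash mid-task, drop our claim so another runner can take the task.
    if holder[0] and os.path.exists(holder[0]):
        os.remove(holder[0])

def try_claim(task_dir, claimed_name='claimed.dat'):
    """Atomically claim a task; return the claim file path, or None if we lost the race."""
    claim_fn = os.path.join(task_dir, claimed_name)
    try:
        fd = os.open(claim_fn, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except EnvironmentError:
        return None              # another runner beat us to it
    os.write(fd, ('locked by pid %d\n' % os.getpid()).encode())
    os.close(fd)
    _claim_to_erase[0] = claim_fn
    return claim_fn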
Example #47
0
def computeMeanStd_binned(inDatas, valCol, binCol, binMin, binMax, binStep):
    """Compute binned stats for a set of tables"""

    binCount = int((binMax - binMin) / binStep)
    dbg('binCount')
    sums = np.zeros(binCount)
    sumsSq = np.zeros_like(sums)
    counts = np.zeros_like(sums)
    bins = np.arange(binMin, binMax, binStep)
    for d_idx, d in enumerate(inDatas):
        dbg('d_idx d binStep')
        dbg('d[binCol]')

        binColValues = 1.0 - (1.0 - d[binCol].values)

        for i in range(binCount):
            #        binBot = bins[i]
            binTop = bins[i]
            theIdx = ((binTop - binColValues) < binStep) & (
                (binTop - binColValues) > 0)
            #        theIdx = ( binBot < d[ binCol ].values ) & ( d[ binCol ].values <= binTop )
            #       DotData( names = ('rows',), Columns = theIdx.nonzero() ).saveToSV( 'nz%02d.tsv' % i )
            #rowsStr = ','.join(map(str,list(theIdx.nonzero())))
            #print 'binnedRows=', rowsStr
            hereVals = d[theIdx][valCol]
            #        DotData( names = ( 'temp', ), Columns = ( hereVals, ) ).saveToSV( 'temp2%2d.tsv' % i )

            dbg('"BEF" theIdx.sum() i bins[i] len(hereVals)')
            counts[i] += len(hereVals)
            sums[i] += np.sum(hereVals)
            sumsSq[i] += np.sum(hereVals * hereVals)
#        dbg( '"AFT" i bins[i] bins[i+1] len(hereVals)' )

        if False:
            # fast version
            binsHere = np.digitize(d[binCol], bins) - 1
            dbg('len(binsHere) binsHere')
            np.clip(binsHere, 0, binCount - 1, out=binsHere)
            dbg('binsHere')

            counts += np.bincount(binsHere, minlength=binCount)
            sums += np.bincount(binsHere,
                                weights=d[valCol],
                                minlength=binCount)
            sumsSq += np.bincount(binsHere,
                                  weights=d[valCol] * d[valCol],
                                  minlength=binCount)

    countsOrig = counts.astype(int)
    counts[counts == 0] = np.nan
    means = sums / counts
    stds = np.sqrt( sumsSq / counts - means * means )   # per-bin std dev from E[x^2] - E[x]^2

    return pd.DataFrame(
        dict(binBeg=bins - binStep,
             binEnd=bins,
             counts=countsOrig,
             sums=sums,
             sumsSq=sumsSq,
             means=means,
             stds=stds))
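A quick usage check of computeMeanStd_binned with two small DataFrames; this is a sketch assuming the function above is in scope, and it stubs out the project's dbg() tracer:

import numpy as np
import pandas as pd

dbg = lambda names: None   # stub for the project's dbg() tracer

d1 = pd.DataFrame({'freq': [0.12, 0.23, 0.31, 0.74], 'iHHDiff': [1.0, 2.0, 0.5, 3.0]})
d2 = pd.DataFrame({'freq': [0.18, 0.81, 0.93], 'iHHDiff': [1.5, 2.5, 0.0]})

stats = computeMeanStd_binned([d1, d2], valCol='iHHDiff', binCol='freq',
                              binMin=0.0, binMax=1.0, binStep=0.1)
print(stats[['binBeg', 'binEnd', 'counts', 'means', 'stds']])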
Example #48
0
File: resultPusher.py  Project: quank/cms
def PushResults( options ):
    """Push finished results"""

    logging.info( 'Starting result pusher as process %d on host %s with options %s' %
                  ( os.getpid(), GetHostName(), options ) )

    # check that all queues exist
    assert all( os.path.exists( queue ) and os.path.isdir( queue )
                for queue in options.queues.split( os.pathsep ) )

    stopSignal = [ False ]

    def SetStopSignal( sigNum, stkFrm ):
        logging.info( 'Setting stop signal to stop pushers' )
        stopSignal[ 0 ] = True
        dbg( '"aftset" stopSignal' )

    signal.signal( signal.SIGUSR2, SetStopSignal )

    while not stopSignal[0]:
        
        for queue in options.queues.split( os.pathsep ):

            logging.info( 'Pushing results in ' + queue + '...' )

            # find an unclaimed task in this queue, and try to claim it
            try: taskDirs = filter( lambda f: f.startswith('mq'), os.listdir( queue ) )
            except EnvironmentError as e:
                logging.info( 'Error getting list of tasks in queue ' + queue + ': ' + str( e ) )
                # sleep a bit -- maybe it's some transient condition that will resolve itself
                time.sleep( 60 + random.normalvariate( 3.0, 1.0 ) )
                continue

            for taskDir in taskDirs:
                fullTaskDir = os.path.join( queue, taskDir )

                try:
                    pushingFN = os.path.join( fullTaskDir, 'pushing.dat' )
                    submithostFN = os.path.join( fullTaskDir, 'submithost.txt' )
                    if os.path.exists( os.path.join( fullTaskDir, 'completed.dat' ) ) \
                            and os.path.exists( submithostFN ) and GetSubmitHost( submithostFN ) == GetHostName() \
                            and not os.path.exists( pushingFN ):
                        try:
                            fd = os.open( pushingFN, os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                        except EnvironmentError as e:
                            if e.errno not in ( errno.EEXIST, errno.EACCES, errno.EAGAIN ): raise
                            # another resultPusher beat us to this task -- go and check other tasks
                            continue

                        
                        os.write( fd, 'result being pushed by process %d on host %s' % ( os.getpid(), GetHostName() ) )
                        os.close( fd )

                        taskDescr = ''

                        try:
                            attrsFN = os.path.join( fullTaskDir, 'attrs.tsv' )
                            if os.path.exists( attrsFN ):
                                taskDescr += ' output in ' + GetTaskAttr( attrsFN, 'piperun_outputSavedTo' )
                        except EnvironmentError as e:
                            logging.info( 'Could not read attrs for task in ' + fullTaskDir + ': ' + str( e ) )
                        
                        try:
                            infoFNs = [ os.path.join( fullTaskDir, f ) for f in ( 'command.dat', 'attrs.tsv', 'claimed.dat' ) ]
                            infoContents = '\n'.join([ SlurpFile( f ) if os.path.exists( f ) else 'missing file: ' + f
                                                       for f in infoFNs ])

                            thePipe = os.open( os.path.join( fullTaskDir, 'getresult' ), os.O_WRONLY | os.O_NONBLOCK )
                            exitCodeReadOk = False
                            writeOk = False
                            closeOk = False
                            exitCode = 'UNKNOWN'
                            try:
                                exitCode = SlurpFile( os.path.join( fullTaskDir, 'exitCode.dat' ) ).strip()
                                exitCodeReadOk = True
                                os.write( thePipe, exitCode )
                                writeOk = True
                            finally:
                                os.close( thePipe )
                                closeOk = True

                            logging.info( 'Pushed result in ' + fullTaskDir + ': ' + ( 'nonzero ' if exitCode != '0' else '' ) + 'exit code ' + exitCode + taskDescr )

                            if not writeOk or not closeOk or not exitCodeReadOk: dbg( 'exitCodeReadOk writeOk closeOk' )
                            if exitCodeReadOk and exitCode != '0': logging.info( infoContents )
                            
                        except EnvironmentError as e:
                            logging.info( 'The task at ' + fullTaskDir + ' seems to have been orphaned: ' + e.strerror )

                except EnvironmentError as e:
                    logging.info( 'Error processing queue ' + queue + ' task ' + taskDir + ': ' + str( e ) )
                    # sleep a bit -- maybe it's some transient condition that will resolve itself
                    time.sleep( 60 + random.normalvariate( 3.0, 1.0 ) )

        # if we pushed at least something, go back and try again.  if not, wait.
        time.sleep( options.sleepInterval + random.normalvariate( 3.0, 1.0 ) )
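PushResults writes the task's exit code into the named pipe getresult inside the task directory, so the submitting script can block until the result arrives. A sketch of the submitter's side of that handshake; the helper is hypothetical and assumes the task-directory layout used above, with the submitter creating the FIFO before waiting:

import os

def wait_for_exit_code(task_dir):
    """Block until a result pusher writes the task's exit code into the 'getresult' FIFO."""
    pipe_fn = os.path.join(task_dir, 'getresult')
    if not os.path.exists(pipe_fn):
        os.mkfifo(pipe_fn)            # submitter creates the pipe, then waits on it
    with open(pipe_fn, 'r') as pipe:  # open() blocks until the pusher opens it for writing
        return int(pipe.read().strip())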
Example #49
0
                    default=1000,
                    help='number of bootstrap iters')
parser.add_argument('--cosi-binary',
                    default='./coalescent',
                    help='cosi binary to run')

print('calling parser')
args = parser.parse_args()
print('parser done')
# do a reference run

print('generating reference')
SystemSucceed(' '.join(
    map(str, (args.cosi_binary, '-p', '1_simple.cosiParams', '-n', 100,
              '-m'))) + ' | sample_stats_extra > ref.tsv')
refData = DotData(SVPath='ref.tsv')
min_p = np.ones(len(refData.dtype.names))
max_D = np.repeat(-np.inf, len(refData.dtype.names))
for i in range(10):
    dbg('i')
    refFN = 'reftest%d.tsv' % i
    SystemSucceed(' '.join(
        map(str, (args.cosi_binary, '-p', '0_simple.cosiParams', '-n', 100,
                  '-m'))) + ' | sample_stats_extra > ' + refFN)
    z = DotData(SVPath=refFN)
    for colNum, col in enumerate(z.dtype.names):
        ks_D, ks_p = stats.ks_2samp(refData[col], z[col])
        min_p[colNum] = np.min((min_p[colNum], ks_p))
        max_D[colNum] = np.max((max_D[colNum], ks_D))
    dbg('i min_p max_D')
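The loop above uses scipy.stats.ks_2samp to compare each summary-statistic column of a test run against the reference run, tracking the smallest p-value and the largest KS distance D across columns and iterations. A tiny standalone illustration of what those two numbers mean, on synthetic samples:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
ref = rng.normal(0.0, 1.0, 1000)       # "reference" sample
same = rng.normal(0.0, 1.0, 1000)      # drawn from the same distribution
shifted = rng.normal(0.5, 1.0, 1000)   # drawn from a shifted distribution

for label, sample in (('same', same), ('shifted', shifted)):
    D, p = stats.ks_2samp(ref, sample)
    print(label, 'D=%.3f' % D, 'p=%.3g' % p)
# Small D / large p: consistent with the reference; large D / tiny p: a real difference.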
Example #50
0
File: resultPusher.py  Project: quank/cms
 def SetStopSignal( sigNum, stkFrm ):
     logging.info( 'Setting stop signal to stop pushers' )
     stopSignal[ 0 ] = True
     dbg( '"aftset" stopSignal' )