Example #1
    def __init__(self, model, observations, l, nPrc):

        self._model = model
        self._nPrc = nPrc

        # Implementation assumes only 2 emissions.
        assert model.nEmissions == 2

        # calculate a histogram of observed sequences in parallel
        res = runParallel(partial(_calcObservedDist, l=l), observations)

        # sum result histogram to one summary histogram H
        H = defaultdict(int)
        for d in res:
            for k, v in d.items():
                H[k] += v

        # create two lists:
        # - _obsSequences, containing all observed l-sequences
        # - _obsProbs, containing the matching observed probabilities
        self._obsSequences, self._obsProbs = [], []

        s = float(sum(H.values()))
        M, halfM = 2**l, 2**(l - 1)  # number of possible l-sequences; place value of the msb

        # for all observed l-sequences:
        for n in H:

            # normalize count to probability
            self._obsProbs.append(float(H[n]) / s)

            # map n to a binary sequence (where the first element is the msb)
            seq = []
            assert n < M
            t = n
            for _ in range(l):
                msb = t // halfM
                assert msb in [0, 1]
                seq.append(msb)
                t = (t * 2) % M  # drop the extracted msb and shift the next bit up

            # represent n as an ObservedSequence object
            self._obsSequences.append(ObservedSequence.fromEmissionsList(seq))

        writeOutput(
            'Input sequences contain %d distinct %d-sequences (used for GOF statistics)'
            % (len(self._obsProbs), l))
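
The msb-first decoding loop above can be exercised on its own. A minimal, self-contained sketch of the same bit-extraction technique (the helper name int_to_bits is hypothetical, not part of the original code):

def int_to_bits(n, l):
    """Map an integer 0 <= n < 2**l to its l-bit sequence, msb first."""
    assert 0 <= n < 2**l
    bits = []
    halfM = 2**(l - 1)
    for _ in range(l):
        bits.append(n // halfM)  # extract the current msb
        n = (n * 2) % (2**l)     # shift left, dropping the extracted bit
    return bits

assert int_to_bits(5, 4) == [0, 1, 0, 1]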
Example #2
    def _maximizeQ(self, hiddenState, initThetas):

        # TODO nStartPoints 290?
        # number of start points
        nStartPoints = 60

        # TODO remove
        # -Q is a positive measure we're trying to minimize...
        refs = [-self._Q(t, hiddenState) for t in initThetas]
        for r in refs:
            assert r > 0

        # TODO move 'initTheta' to class field and see if it screws up performance.
        # initial points for the optimizer; None is later converted to a random init point
        inputs = [self._thetaToVec(theta) for theta in initThetas]
        inputs += [None for _ in range(nStartPoints - len(initThetas))]
        inputs = [(x0, hiddenState) for x0 in inputs]

        # run self._maxQSingleStartPoint() on all items in inputs
        # Note: runMemberFunc is used to overcome Pool.map limitations on class methods.
        res = runParallel(runMemberFunc(self, '_maxQSingleStartPoint'), inputs)

        maxFound = -np.inf
        indices = []
        for i, (theta, val) in enumerate(res):
            if val > maxFound:
                maxFound = val
                maxTheta = theta

            assert val < 0.0
            indices.append('{0}:{1}'.format(i, -val / refs[-1]))

        # TODO remove
        writeOutput('reference vals: ' + ','.join(str(r / refs[-1]) for r in refs), filename='DBG')
        writeOutput('indices: ' + ','.join(str(v) for v in ss.rankdata(indices)), filename='DBG')

        return maxTheta, maxFound
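
runMemberFunc and runParallel are defined elsewhere in this codebase. Under Python 2's multiprocessing, Pool.map could not pickle bound methods, which is what the note above refers to; a minimal sketch of the kind of picklable wrapper runMemberFunc presumably returns (this shape is an assumption, not the actual implementation):

class MemberFuncCaller(object):
    # Picklable stand-in for a bound method: stores the instance and the
    # method name, and looks the method up again at call time.
    def __init__(self, obj, methodName):
        self._obj = obj
        self._methodName = methodName

    def __call__(self, arg):
        return getattr(self._obj, self._methodName)(arg)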
Example #3
# TODO make sure everything works with nPrc = 1

# read input flags
args = parser.parse_args()
assert args.iter > 0
assert args.par > 0
if args.gof is not None:
    assert min(args.gof) > 0
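
The flag checks above could also be enforced by argparse itself with a custom type. A standalone sketch (the flag names --iter and --par are inferred from args.iter and args.par; the real parser is defined elsewhere):

import argparse

def positiveInt(s):
    v = int(s)
    if v <= 0:
        raise argparse.ArgumentTypeError('expected a positive integer, got %r' % s)
    return v

p = argparse.ArgumentParser()
p.add_argument('--iter', type=positiveInt, required=True)
p.add_argument('--par', type=positiveInt, default=1)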

# Init output-writer process and processes pool
initParallel(args.par, args.o)

# log command line
# TODO perhaps printOutput() ?
writeOutput(" ".join(sys.argv))
writeOutput('BW steps will be spanned over %d processes' % args.par)

# read input dir & match all input files...
files = []
for inpPattern in args.input:
    pathName = os.path.dirname(inpPattern)
    if pathName == '':
        pathName = os.curdir
    for f in os.listdir(pathName):
        if fnmatch.fnmatch(f, os.path.basename(inpPattern)):
            files.append(os.path.join(pathName, f))

# TODO proper error message if (a) a file doesn't exist (b) a file doesn't match the format
# read all input files (executed in parallel)
assert len(files) > 0, 'no input files matched %s' % args.input
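
The matching loop above reimplements shell-style wildcard expansion; the standard glob module does the same job (with minor differences, e.g. glob skips dotfiles by default). An equivalent sketch:

import glob

files = []
for inpPattern in args.input:
    files.extend(glob.glob(inpPattern))  # expands patterns such as dir/*.txt
assert len(files) > 0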
Example #4
    def run(self, observations, nIterations, trueTheta=None, initTheta=None, gof=None):

        # avoid a mutable default argument for gof
        if gof is None:
            gof = []

        # initialize theta
        theta = initTheta
        if theta is None:
            theta = self._initTheta(observations)
        # TODO remove debug bookkeeping
        DBGME = [theta]
                
        # we expect the log likelihood at the next iteration to be higher than this        
        bound = -np.inf
                
        # print model specifications (see __str__ below for details):
        writeOutput('Model specifications:', 'loop')
        writeOutput(self, 'loop')
        writeOutput('\n', 'loop')
        
        # statistics to be collected
        self._statsNames = ['logL', 'Q-Init', 'Q-Max']
        for l in gof:
            self._statsNames.append('G%d'%l)
        
        # initialize GOF classes
        if len(gof) > 0:
            start = time.time()
            gof = [GOF(self._model, observations, l) for l in gof]
            writeOutput('initialized gof statistics within %f seconds'%(time.time()-start))

        # print the log-likelihood of the data under the true parameters (if given; simulated data only)
        if trueTheta is not None:
            
            # use the forward-backward algorithm to calculate the log-likelihood of the observed sequence under trueTheta
            trueL = self._parallelExp(trueTheta, observations).logL
            
            # log True theta vals and statistics
            self._logVals('True parameters:', trueTheta, [trueL, '.', '.'], gof, target='DBG')
                
        for i in range(nIterations):
                
            writeOutput('starting BW iteration number %d'%(i + 1))
            
            # BW expectation step
            start = time.time()
            inferredHiddenState = self._parallelExp(theta, observations)
            writeOutput('finished BW exp step within %f seconds'%(time.time()-start))
            
            # sanity check: log(O|theta) has increased as expected in the last iteration
            if inferredHiddenState.logL < bound:
                writeOutput('WARNING **** BW error 1 %f %f'%(inferredHiddenState.logL, bound), 'ErrorLog')
            
            # sanity check (Jensen's inequality): Q(theta | theta) = E[ log P(O,Z|theta) ] <= log P(O|theta)
            Qtheta = self._Q(theta, inferredHiddenState)
            if Qtheta > inferredHiddenState.logL:
                writeOutput('WARNING **** BW error 2 %f %f'%(Qtheta, inferredHiddenState.logL), 'ErrorLog')

            # maximization step
            start = time.time()
            newTheta, Qmax = self._maximizeQ(inferredHiddenState, DBGME)
            writeOutput('finished BW max step within %f seconds'%(time.time()-start))

            # sanity check: max_thetaStar Q(thetaStar | theta) >= Q(theta | theta)
            qDiff = Qmax - Qtheta
            if qDiff < 0:
                writeOutput('WARNING **** BW error 3 %f %f'%(Qmax, Qtheta), 'ErrorLog')
    
            # the log likelihood of newTheta should be higher by at least qDiff
            # (this is the inequality you get in the standard proof showing EM converges to a local maximum)
            bound = inferredHiddenState.logL + qDiff
            
            # sanity check for simulated data: verify that Qmax >= Q(trueTheta); this helps convince us that the maximizer converged.
            if trueTheta is not None:
                QTrue = self._Q(trueTheta, inferredHiddenState)
                if QTrue > Qmax:
                    writeOutput('WARNING **** BW error 4 %f %f'%(QTrue, Qmax), 'ErrorLog')
            
            # log iteration
            self._logVals('After %d iterations:'%i, theta, [inferredHiddenState.logL, Qtheta, Qmax], gof)

            # update theta
            theta = newTheta
            DBGME.append(theta)
                                
        # log final value of theta (for which some statistics are not calculated)
        self._logVals('After %d iterations:'%nIterations, theta, ['.', '.', '.'], gof)
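
The sanity checks in the loop are instances of the standard EM monotonicity argument mentioned in the comments. With Q(\theta' \mid \theta) = E_{Z \sim P(Z \mid O, \theta)}[\log P(O, Z \mid \theta')], the usual derivation gives

\log P(O \mid \theta') - \log P(O \mid \theta) \ge Q(\theta' \mid \theta) - Q(\theta \mid \theta),

so after the maximization step the log-likelihood of newTheta must exceed inferredHiddenState.logL + qDiff, which is exactly the bound variable checked at the top of the next iteration.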
Example #5
    def _logVals(self, header, theta, stats, gof, target='loop'):
        # print header
        writeOutput(header, target)
        writeOutput('\n', target)

        # print theta
        writeOutput(theta, target)
        writeOutput('\n', target)

        # calculate gof stats
        if len(gof) > 0:
            start = time.time()
            for c in gof:
                stats.append(c.G(theta))
            writeOutput('calculated gof statistics within %f seconds' % (time.time() - start))
        assert len(self._statsNames) == len(stats)

        # build a row template with one left-aligned, 24-character column per statistic
        temp = '\t'
        for i in range(len(self._statsNames)):
            temp += '{%d:<24}' % i
        writeOutput('Statistics:', target)
        writeOutput(temp.format(*self._statsNames), target)
        writeOutput(temp.format(*stats), target)
        writeOutput('\n', target)
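
The row template built in _logVals left-aligns each statistic in a 24-character column. A tiny standalone illustration with made-up values:

names = ['logL', 'Q-Init', 'Q-Max']
temp = '\t' + ''.join('{%d:<24}' % i for i in range(len(names)))
print(temp.format(*names))
print(temp.format(-1234.5, -1300.2, -1250.8))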