Exemplo n.º 1
0
 def testDefaultVsHmm(self):
     """Build an HMM and a CFG on top of one shared emission model.

     Smoke test only: nothing is asserted, so it passes as long as the
     constructors below do not raise.
     """
     emModel = IndependentMultinomialEmissionModel(
         10, [3], zeroAsMissingData=False)
     hmm = MultitrackHmm(emModel)
     # a list holding 1.0 once for every state the emission model reports
     onesPerState = [1.0] * emModel.getNumStates()
     pairModel = PairEmissionModel(emModel, onesPerState)
     cfg = MultitrackCfg(emModel, pairModel)
Exemplo n.º 2
0
 def testInit(self):
     """Construct MultitrackCfg twice and validate the second instance.

     The first instance receives a PairEmissionModel as its second
     argument, the second one the list [3, 8]; only the latter is
     validated.
     """
     emModel = IndependentMultinomialEmissionModel(
         10, [3], zeroAsMissingData=False)
     onesPerState = [1.0] * emModel.getNumStates()
     pairModel = PairEmissionModel(emModel, onesPerState)
     # built only to check that construction succeeds; replaced right away
     cfg = MultitrackCfg(emModel, pairModel)
     cfg = MultitrackCfg(emModel, [3, 8])
     cfg.validate()
Exemplo n.º 3
0
 def createSimpleModel1(self):
     """Return an emission model with two states, one track, two symbols."""
     model = IndependentMultinomialEmissionModel(numStates=2,
                                                 numSymbolsPerTrack=[2])
     # one probability row per state, wrapped in an outer list because
     # initParams takes one entry per track
     rowsForTrack = [[0.2, 0.8], [0.5, 0.5]]
     model.initParams([rowsForTrack])
     return model
Exemplo n.º 4
0
 def createSimpleModel1(self):
     """Return an emission model with two states, one track, two symbols."""
     model = IndependentMultinomialEmissionModel(numStates=2,
                                                 numSymbolsPerTrack=[2])
     # one probability row per state, wrapped in an outer list because
     # initParams takes one entry per track
     rowsForTrack = [[0.2, 0.8], [0.5, 0.5]]
     model.initParams([rowsForTrack])
     return model
Exemplo n.º 5
0
    def testPredict(self):
        """Compare MultitrackHmm predictions with scikit's MultinomialHMM.

        First checks the scikit model's predicted states and posteriors for
        the observations [0, 1, 2].  Then builds a three-track MultitrackHmm
        whose two extra tracks have a single symbol with probability 1, and
        checks that it predicts the same states, produces the same forward
        table and forward log probability, and that the total probability
        computed from the backward table agrees with the forward one.
        """
        observations = [0, 1, 2]
        h = MultinomialHMM(self.n_components,
                           startprob=self.startprob,
                           transmat=self.transmat)
        h.emissionprob_ = self.emissionprob
        state_sequence = h.predict(observations)
        posteriors = h.predict_proba(observations)
        assert_array_equal(state_sequence, [1, 0, 0])
        assert_array_almost_equal(posteriors, [
            [0.23170303, 0.76829697],
            [0.62406281, 0.37593719],
            [0.86397706, 0.13602294],
        ])

        # add a couple dummy tracks that shouldn't change anything
        trackObs3 = np.asarray([[0, 0, 0], [1, 0, 0], [2, 0, 0]])
        emissionprob3 = [
            [[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]],
            [[1.], [1.]],
            [[1.], [1.]],
        ]
        emissionModel3 = IndependentMultinomialEmissionModel(
            2, [3, 1, 1], emissionprob3, zeroAsMissingData=False)
        trackHmm3 = MultitrackHmm(emissionModel3,
                                  startprob=self.startprob,
                                  transmat=self.transmat)
        state_sequence = trackHmm3.predict(trackObs3)
        # Called only to make sure it runs.  The posteriors are no longer
        # compared with the scikit values above: that stopped holding once
        # backwardTable[N] was changed to be a distribution rather than all
        # 1s.  Instead we check below that the total probability from the
        # forward pass is the same as from the backward pass.
        trackHmm3.predict_proba(trackObs3)
        assert_array_equal(state_sequence, [1, 0, 0])

        emProbs = emissionModel3.allLogProbs(trackObs3)
        flp, ftable = trackHmm3._do_forward_pass(emProbs)
        emProbsOld = h._compute_log_likelihood(np.asarray(observations))
        flpOld, ftableOld = h._do_forward_pass(emProbsOld)
        assert_array_almost_equal(ftable, ftableOld)
        assert_array_almost_equal(flp, flpOld)

        btable = trackHmm3._do_backward_pass(emProbs)
        bneg1 = np.zeros((self.n_components))
        # NOTE(review): the inner sum does not depend on i, so every entry of
        # bneg1 ends up with the same value.  Kept as written because the
        # expected total depends on how _do_backward_pass fills its last
        # row -- confirm before simplifying.
        for i in range(self.n_components):
            for j in range(self.n_components):
                bneg1[i] += np.exp(trackHmm3._log_startprob[j] +
                                   emProbs[0, j] + btable[0, j])

        # compare with a tolerance: both sides went through exp/log, so
        # exact floating point equality is not a reliable check
        assert_array_almost_equal(np.log(np.sum(bneg1)), flp)
Exemplo n.º 6
0
    def testPredict(self):
        """Compare MultitrackHmm predictions with scikit's MultinomialHMM.

        First checks the scikit model's predicted states and posteriors for
        the observations [0, 1, 2].  Then builds a three-track MultitrackHmm
        whose two extra tracks have a single symbol with probability 1, and
        checks that it predicts the same states, produces the same forward
        table and forward log probability, and that the total probability
        computed from the backward table agrees with the forward one.
        """
        observations = [0, 1, 2]
        h = MultinomialHMM(
            self.n_components,
            startprob=self.startprob,
            transmat=self.transmat,
        )
        h.emissionprob_ = self.emissionprob
        state_sequence = h.predict(observations)
        posteriors = h.predict_proba(observations)
        assert_array_equal(state_sequence, [1, 0, 0])
        assert_array_almost_equal(posteriors, [
            [0.23170303, 0.76829697],
            [0.62406281, 0.37593719],
            [0.86397706, 0.13602294],
        ])

        # add a couple dummy tracks that shouldn't change anything
        trackObs3 = np.asarray([[0, 0, 0], [1, 0, 0], [2, 0, 0]])
        emissionprob3 = [[[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]], [[1.], [1.]],
                         [[1.], [1.]]]
        emissionModel3 = IndependentMultinomialEmissionModel(
            2, [3, 1, 1], emissionprob3, zeroAsMissingData=False)
        trackHmm3 = MultitrackHmm(emissionModel3,
                                  startprob=self.startprob,
                                  transmat=self.transmat)
        state_sequence = trackHmm3.predict(trackObs3)
        # Called only to make sure it runs.  The posteriors are no longer
        # compared with the scikit values above: that stopped holding once
        # backwardTable[N] was changed to be a distribution rather than all
        # 1s.  Instead we check below that the total probability from the
        # forward pass is the same as from the backward pass.
        trackHmm3.predict_proba(trackObs3)
        assert_array_equal(state_sequence, [1, 0, 0])

        emProbs = emissionModel3.allLogProbs(trackObs3)
        flp, ftable = trackHmm3._do_forward_pass(emProbs)
        emProbsOld = h._compute_log_likelihood(np.asarray(observations))
        flpOld, ftableOld = h._do_forward_pass(emProbsOld)
        assert_array_almost_equal(ftable, ftableOld)
        assert_array_almost_equal(flp, flpOld)

        btable = trackHmm3._do_backward_pass(emProbs)
        bneg1 = np.zeros((self.n_components))
        # NOTE(review): the inner sum does not depend on i, so every entry of
        # bneg1 ends up with the same value.  Kept as written because the
        # expected total depends on how _do_backward_pass fills its last
        # row -- confirm before simplifying.
        for i in range(self.n_components):
            for j in range(self.n_components):
                bneg1[i] += np.exp(trackHmm3._log_startprob[j] +
                                   emProbs[0, j] + btable[0, j])

        # compare with a tolerance: both sides went through exp/log, so
        # exact floating point equality is not a reliable check
        assert_array_almost_equal(np.log(np.sum(bneg1)), flp)
Exemplo n.º 7
0
    def testDefaultHmmViterbi(self):
        """Decode one sequence with an HMM and a CFG and compare the scores.

        Both models are built with default parameters on the same 5-state,
        single-track emission model; only the returned probabilities are
        compared, not the state paths.
        """
        emModel = IndependentMultinomialEmissionModel(
            5, [3], zeroAsMissingData=False)
        hmm = MultitrackHmm(emModel)
        onesPerState = [1.0] * emModel.getNumStates()
        pairModel = PairEmissionModel(emModel, onesPerState)
        cfg = MultitrackCfg(emModel, pairModel)

        # four observations on the single track
        obs = np.array([[0], [0], [1], [2]], dtype=np.uint8)
        hmmProb, _hmmPath = hmm.decode(obs)
        cfgProb, _cfgPath = cfg.decode(obs)
        assert_array_almost_equal(hmmProb, cfgProb)
Exemplo n.º 8
0
    def createSimpleModel2(self):
        """Return an emission model with two states and two tracks.

        The first track has 2 symbols and the second track has 3.
        """
        model = IndependentMultinomialEmissionModel(numStates=2,
                                                    numSymbolsPerTrack=[2, 3])
        # each track lists one probability row per state
        firstTrack = [[0.2, 0.8], [0.5, 0.5]]
        secondTrack = [[0.1, 0.3, 0.6], [0.7, 0.1, 0.2]]
        model.initParams([firstTrack, secondTrack])
        return model
Exemplo n.º 9
0
    def createSimpleModel2(self):
        """Return an emission model with two states and two tracks.

        The first track has 2 symbols and the second track has 3.
        """
        model = IndependentMultinomialEmissionModel(numStates=2,
                                                    numSymbolsPerTrack=[2, 3])
        # each track lists one probability row per state
        firstTrack = [[0.2, 0.8], [0.5, 0.5]]
        secondTrack = [[0.1, 0.3, 0.6], [0.7, 0.1, 0.2]]
        model.initParams([firstTrack, secondTrack])
        return model
Exemplo n.º 10
0
    def testFit(self):
        """Train MultinomialHMM and MultitrackHmm alike and compare them.

        Ten sequences of length 10 are sampled from a scikit MultinomialHMM.
        For every combination of trainable parameters, a fresh
        MultinomialHMM is fit on the sampled sequences and a MultitrackHmm
        on a three-track version of them (two constant extra tracks).  The
        two fitted models must agree on n_iter, thresh, transition matrix,
        emission probabilities, log likelihoods and Viterbi decoding.
        """
        h = MultinomialHMM(self.n_components,
                           startprob=self.startprob,
                           transmat=self.transmat)

        h.emissionprob_ = self.emissionprob
        train_obs = [h.sample(n=10)[0] for _ in range(10)]
        # widen every sequence to three tracks: the sampled symbols go into
        # track 0 and the two extra tracks are constant 0
        train_obs3 = []
        for o in train_obs:
            o3 = np.zeros((len(o), 3), dtype=float)
            o3[:, 0] = np.ravel(o)
            train_obs3.append(o3)

        for params in ["s", "t", "e", "st", "se", "te", "ste"]:
            # dont randomly initialize emission model for now since
            # our class doesnt support it yet
            init_params = params.replace("e", "")
            hTrain = MultinomialHMM(self.n_components,
                                    params=params,
                                    init_params=init_params)
            hTrain.transmat_ = [[0.5, 0.5], [0.5, 0.5]]
            hTrain._set_emissionprob([[1. / 3., 1. / 3., 1. / 3.],
                                      [1. / 3., 1. / 3., 1. / 3.]])
            hTrain.startprob_ = [0.5, 0.5]
            hTrain.fit(train_obs)

            emissionModel3 = IndependentMultinomialEmissionModel(
                2, [3, 1, 1], zeroAsMissingData=False)
            trackHmm3 = MultitrackHmm(emissionModel3,
                                      params=params,
                                      init_params=init_params)
            trackHmm3.transmat_ = [[0.5, 0.5], [0.5, 0.5]]
            trackHmm3.startprob_ = [0.5, 0.5]

            trackHmm3.fit(train_obs3)

            assert (hTrain.n_iter == trackHmm3.n_iter)
            assert (hTrain.thresh == trackHmm3.thresh)
            assert_array_equal(hTrain.transmat_, trackHmm3.transmat_)
            for state in range(2):
                for symbol in range(3):
                    ep = hTrain.emissionprob_[state][symbol]
                    # singleLogProb returns a log value; exponentiate it to
                    # compare with scikit's plain probability
                    ep3 = trackHmm3.emissionModel.singleLogProb(
                        state, np.asarray([symbol, 0, 0]))
                    assert_array_almost_equal(ep, np.exp(ep3))

            # test consistency of log likelihood function
            assert_array_equal(
                trackHmm3._compute_log_likelihood(train_obs3[0]),
                hTrain._compute_log_likelihood(train_obs[0]))

            # test consistency of viterbi
            logprob, state_sequence = hTrain.decode(train_obs[0])
            logprob3, state_sequence3 = trackHmm3.decode(train_obs3[0])
            self.assertAlmostEqual(logprob, logprob3)
            assert_array_equal(state_sequence, state_sequence3)
Exemplo n.º 11
0
    def testTraceBack(self):
        """Check that HMM and CFG Viterbi agree on score and state path.

        Uses a 2-state, single-track model with hand-set emission
        probabilities and expects the path [0, 0, 1, 0] for the
        observations 0, 0, 1, 0.
        """
        # a model with 2 states.  state 0 has a .75 chance of emitting 0
        # state 1 has a 0.95 chance of emitting 1
        emissionModel = IndependentMultinomialEmissionModel(
            2, [2], zeroAsMissingData=False)
        # builtin float replaces the np.float alias, which NumPy removed
        emProbs = np.zeros((1, 2, 2), dtype=float)
        emProbs[0, 0] = [0.75, 0.25]
        emProbs[0, 1] = [0.05, 0.95]
        emissionModel.logProbs = np.log(emProbs)

        hmm = MultitrackHmm(emissionModel)
        pairModel = PairEmissionModel(emissionModel,
                                      [1.0] * emissionModel.getNumStates())
        cfg = MultitrackCfg(emissionModel, pairModel)

        obs = np.array([[0], [0], [1], [0]], dtype=np.uint8)
        hmmProb, hmmStates = hmm.decode(obs)
        cfgProb, cfgStates = cfg.decode(obs)
        assert_array_almost_equal(hmmProb, cfgProb)
        assert_array_almost_equal(hmmStates, [0, 0, 1, 0])
        assert_array_almost_equal(hmmStates, cfgStates)
Exemplo n.º 12
0
    def testBasicNesting(self):
        """Check CFG decoding when state 1 is declared a nest state.

        Without an alignment track state 1 must never be chosen.  With an
        alignment track, the expected paths use state 1 at both ends when
        the alignment symbols there are equal (1 and 1) and fall back to
        state 2 when they differ (1 and 2).
        """
        # a model with 3 states.  state 0 has a .75 chance of emitting 0
        # state 1 has a 0.95 chance of emitting 1
        # state 2 has a 0.90 chance of emitting 1
        emissionModel = IndependentMultinomialEmissionModel(
            3, [2], zeroAsMissingData=False)
        # builtin float replaces the np.float alias, which NumPy removed
        emProbs = np.zeros((1, 3, 2), dtype=float)
        emProbs[0, 0] = [0.75, 0.25]
        emProbs[0, 1] = [0.05, 0.95]
        # NOTE(review): this row sums to 0.91 rather than 1; values are kept
        # exactly as written because the expected paths below depend on them
        emProbs[0, 2] = [0.01, 0.90]
        emissionModel.logProbs = np.log(emProbs)

        # state 1 is a nested pair state!
        pairModel = PairEmissionModel(emissionModel,
                                      [1.0] * emissionModel.getNumStates())
        cfg = MultitrackCfg(emissionModel, pairModel, nestStates=[1])

        obs = np.array([[0], [0], [1], [0]], dtype=np.uint8)
        cfgProb, cfgStates = cfg.decode(obs)
        # 1 is a pair only state.  no way it should be here
        assert 1 not in cfgStates
        assert_array_equal(cfgStates, [0, 0, 2, 0])

        obs = np.array([[1], [0], [0], [1]], dtype=np.uint8)
        cfgProb, cfgStates = cfg.decode(obs)
        assert_array_equal(cfgStates, [2, 0, 0, 2])

        # same alignment symbol at both ends: state 1 is expected there
        alignment = np.array([[1], [0], [0], [1]], dtype=np.uint16)
        cfgProb, cfgStates = cfg.decode(obs,
                                        alignmentTrack=alignment,
                                        defAlignmentSymbol=0)
        assert_array_equal(cfgStates, [1, 0, 0, 1])

        # different alignment symbols at the ends: back to state 2
        alignment = np.array([[1], [0], [0], [2]], dtype=np.uint16)
        cfgProb, cfgStates = cfg.decode(obs,
                                        alignmentTrack=alignment,
                                        defAlignmentSymbol=0)
        assert_array_equal(cfgStates, [2, 0, 0, 2])
Exemplo n.º 13
0
    def testSupervisedTrain(self):
        """Check supervised emission training against hand-counted data.

        Trains a 2-state emission model (fudge=1) on the bed intervals.
        Then, for each of the first three tracks, counts how often every
        symbol occurs per state, starting each count at 1, and compares the
        resulting fractions with the model's probabilities marginalized
        over the other tracks.
        """
        bedIntervals = getBedStates()
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(), bedIntervals)
        assert len(trackData.getTrackTableList()) == len(bedIntervals)
        # set the fudge to 1 since when the test was written this was
        # hardcoded default
        em = IndependentMultinomialEmissionModel(
            2, trackData.getNumSymbolsPerTrack(), fudge=1.)
        em.supervisedTrain(trackData, bedIntervals)

        # count frequency of symbols for a given track
        for track in range(3):
            counts = [dict(), dict()]
            totals = [0, 0]

            # init to ones like we do in emissionModel
            for i in em.getTrackSymbols(track):
                counts[0][i] = 1
                counts[1][i] = 1
                totals[0] += 1
                totals[1] += 1

            for tableIdx, table in enumerate(trackData.getTrackTableList()):
                # the fourth field of the matching bed interval is the state
                state = bedIntervals[tableIdx][3]
                count = counts[state]
                for i in range(len(table)):
                    val = table[i][track]
                    # values the model does not list for this track are
                    # left out of the tally
                    if val in count:
                        count[val] += 1
                        totals[state] += 1

            # compute track frequency from model by marginalizing and compare
            for state in range(2):
                for val in counts[state]:
                    frac = float(counts[state][val]) / float(totals[state])
                    prob = 0.0
                    for val3d in em.getSymbols():
                        if val3d[track] == val:
                            prob += np.exp(em.singleLogProb(state, val3d))
                    assert_array_almost_equal(prob, frac)
Exemplo n.º 14
0
    def testSupervisedTrain(self):
        """Check supervised emission training against hand-counted data.

        Trains a 2-state emission model (fudge=1) on the bed intervals.
        Then, for each of the first three tracks, counts how often every
        symbol occurs per state, starting each count at 1, and compares the
        resulting fractions with the model's probabilities marginalized
        over the other tracks.
        """
        bedIntervals = getBedStates()
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(), bedIntervals)
        assert len(trackData.getTrackTableList()) == len(bedIntervals)
        # set the fudge to 1 since when the test was written this was
        # hardcoded default
        em = IndependentMultinomialEmissionModel(
            2, trackData.getNumSymbolsPerTrack(), fudge=1.)
        em.supervisedTrain(trackData, bedIntervals)

        # count frequency of symbols for a given track
        for track in range(3):
            counts = [dict(), dict()]
            totals = [0, 0]

            # init to ones like we do in emissionModel
            for i in em.getTrackSymbols(track):
                counts[0][i] = 1
                counts[1][i] = 1
                totals[0] += 1
                totals[1] += 1

            for tableIdx, table in enumerate(trackData.getTrackTableList()):
                # the fourth field of the matching bed interval is the state
                state = bedIntervals[tableIdx][3]
                count = counts[state]
                for i in range(len(table)):
                    val = table[i][track]
                    # values the model does not list for this track are
                    # left out of the tally
                    if val in count:
                        count[val] += 1
                        totals[state] += 1

            # compute track frequency from model by marginalizing and compare
            for state in range(2):
                for val in counts[state]:
                    frac = float(counts[state][val]) / float(totals[state])
                    prob = 0.0
                    for val3d in em.getSymbols():
                        if val3d[track] == val:
                            prob += np.exp(em.singleLogProb(state, val3d))
                    assert_array_almost_equal(prob, frac)
Exemplo n.º 15
0
def main(argv=None):
    """Benchmark the HMM dynamic programming functions.

    Times one algorithm (viterbi, forward or backward) on the original
    scikit BaseHMM and/or on MultitrackHmm, prints the elapsed time of each
    run and, when both were run, checks that their results agree.

    argv: full argument vector with the program name in position 0;
          defaults to sys.argv.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Benchmark HMM Dynamic Programming Functions.")

    parser.add_argument("--N",
                        help="Number of observations.",
                        type=int,
                        default=200000)
    parser.add_argument("--S", help="Number of states.", type=int, default=10)
    parser.add_argument("--alg",
                        help="Algorithm. Valid options are"
                        " {viterbi, forward, backward}.",
                        default="viterbi")
    parser.add_argument("--new",
                        help="Only run new hmm",
                        action="store_true",
                        default=False)
    parser.add_argument("--old",
                        help="Only run old hmm",
                        action="store_true",
                        default=False)
    parser.add_argument("--fb",
                        help="Run a little forward/backward test",
                        action="store_true",
                        default=False)

    # honour the argv parameter instead of always re-reading sys.argv
    args = parser.parse_args(argv[1:])
    alg = args.alg.lower()
    assert alg in ("viterbi", "forward", "backward")

    # original, scikit hmm
    basehmm = BaseHMM(n_components=args.S)
    # new, hopefully faster hmm
    mthmm = MultitrackHmm(
        emissionModel=IndependentMultinomialEmissionModel(args.S, [2]))
    frame = makeFrame(args.S, args.N)
    baseret = None
    mtret = None

    # with neither or both of --new/--old given, both models are run
    runBoth = args.new == args.old

    if runBoth or args.old:
        startTime = time.time()
        baseret = runTest(basehmm, frame, alg)
        deltaTime = time.time() - startTime
        print("Elapsed time for OLD %d x %d %s: %s" % (
            args.N, args.S, args.alg, str(deltaTime)))
        if args.fb:
            fbTest(basehmm, frame)

    if runBoth or args.new:
        startTime = time.time()
        # stored in mtret (not a separate name) so the comparison below
        # actually sees the result
        mtret = runTest(mthmm, frame, alg)
        deltaTime = time.time() - startTime
        print("Elapsed time for NEW %d x %d %s: %s" % (
            args.N, args.S, args.alg, str(deltaTime)))
        if args.fb:
            fbTest(mthmm, frame)

    if baseret is not None and mtret is not None:
        # imported here so the check does not rely on module-level imports
        from numpy.testing import (assert_array_almost_equal,
                                   assert_array_equal)
        # note comparison doesnt mean much since data is so boring so
        # hopefully hmmTest will be more meaningful.  that said, this will
        # still detect many catastrophic bugs.
        if alg == "viterbi":
            assert_array_almost_equal(baseret[0], mtret[0])
            assert_array_equal(baseret[1], mtret[1])
        else:
            assert_array_almost_equal(baseret, mtret)
Exemplo n.º 16
0
    def testWikipediaExample(self):
        """Mostly taken from test_hmm.py in scikit-learn.

        Decodes the observations [0, 1, 2] with scikit's MultinomialHMM as
        a sanity check, then repeats the decoding with MultitrackHmm on one
        track, on three tracks (two single-symbol dummy tracks), on four
        tracks (an extra track of ten equiprobable symbols) and finally on
        an IntegerTrackTable instead of a numpy array.  Every variant must
        give the state path [1, 0, 0] and the expected probability.
        """

        # do scikit model as sanity check
        observations = [0, 1, 2]
        h = MultinomialHMM(
            self.n_components,
            startprob=self.startprob,
            transmat=self.transmat,
        )
        h.emissionprob_ = self.emissionprob
        logprob, state_sequence = h.decode(observations)
        self.assertAlmostEqual(np.exp(logprob), 0.01344)
        assert_array_equal(state_sequence, [1, 0, 0])

        # do multitrack model (making sure to wrap params in list to reflect
        # extra dimension for tracks)
        trackObs = np.asarray([[0], [1], [2]])
        emissionModel = IndependentMultinomialEmissionModel(
            2, [3], [self.emissionprob], zeroAsMissingData=False)
        trackHmm = MultitrackHmm(emissionModel,
                                 startprob=self.startprob,
                                 transmat=self.transmat)

        # test consistency of log likelihood function
        assert_array_equal(trackHmm._compute_log_likelihood(trackObs),
                           h._compute_log_likelihood(observations))

        # test consistency of viterbi
        logprob, state_sequence = trackHmm.decode(trackObs)
        self.assertAlmostEqual(np.exp(logprob), 0.01344)
        assert_array_equal(state_sequence, [1, 0, 0])

        # add a couple dummy tracks that shouldn't change anything
        trackObs3 = np.asarray([[0, 0, 0], [1, 0, 0], [2, 0, 0]])
        emissionprob3 = [[[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]], [[1.], [1.]],
                         [[1.], [1.]]]
        emissionModel3 = IndependentMultinomialEmissionModel(
            2, [3, 1, 1], emissionprob3, zeroAsMissingData=False)
        trackHmm3 = MultitrackHmm(emissionModel3,
                                  startprob=self.startprob,
                                  transmat=self.transmat)

        # test consistency of log likelihood function
        assert_array_equal(trackHmm3._compute_log_likelihood(trackObs3),
                           h._compute_log_likelihood(observations))

        # test consistency of viterbi
        logprob, state_sequence = trackHmm3.decode(trackObs3)
        self.assertAlmostEqual(np.exp(logprob), 0.01344)
        assert_array_equal(state_sequence, [1, 0, 0])

        # decode with the single-track model a second time; the result must
        # be the same as on the first call above
        logprob, state_sequence = trackHmm.decode(trackObs)
        self.assertAlmostEqual(np.exp(logprob), 0.01344)
        assert_array_equal(state_sequence, [1, 0, 0])

        # go through same exercise but with another track that has a bunch
        # of equiprobable symbols
        trackObs4 = np.asarray([[0, 0, 0, 0], [1, 0, 0, 5], [2, 0, 0, 7]])
        emissionprob4 = [
            [[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]],
            [[1.], [1.]],
            [[1.], [1.]],
            [[.1] * 10, [.1] * 10],
        ]
        emissionModel4 = IndependentMultinomialEmissionModel(
            2, [3, 1, 1, 10], emissionprob4, zeroAsMissingData=False)
        # note: trackHmm3 now refers to the four-track model
        trackHmm3 = MultitrackHmm(emissionModel4,
                                  startprob=self.startprob,
                                  transmat=self.transmat)

        # test consistency of viterbi; each of the three observations picks
        # up a factor of 0.1 from the extra track
        logprob, state_sequence = trackHmm3.decode(trackObs4)
        self.assertAlmostEqual(np.exp(logprob), 0.01344 * 0.1 * 0.1 * 0.1)
        assert_array_equal(state_sequence, [1, 0, 0])

        # make sure that it still works using a TrackTable structure instead
        # of a numpy array
        trackTable4 = IntegerTrackTable(4, "scaffold_1", 10, 13)
        for row in range(4):
            # row selects a column of trackObs4: its value at each of the
            # three observations
            trackTable4.writeRow(
                row, [trackObs4[0][row], trackObs4[1][row], trackObs4[2][row]])
        logprob, state_sequence = trackHmm3.decode(trackTable4)
        self.assertAlmostEqual(np.exp(logprob), 0.01344 * 0.1 * 0.1 * 0.1)
        assert_array_equal(state_sequence, [1, 0, 0])
Exemplo n.º 17
0
 def testInit(self):
     """Smoke test: a MultitrackHmm can be built from an emission model.

     Nothing is asserted; the test passes if neither constructor raises.
     """
     emModel = IndependentMultinomialEmissionModel(
         2, [3], zeroAsMissingData=False)
     hmm = MultitrackHmm(emModel)
Exemplo n.º 18
0
    def testSupervisedLearn(self):
        """Check supervised HMM training against hand-computed values.

        Loads the truth intervals from truth.bed, trains a 4-state
        MultitrackHmm on the single region they span, and verifies the
        learned emission probabilities of the "ltr" and "inside" tracks,
        the start probabilities, the transition matrix, and that Viterbi
        decoding reproduces the truth state at every position.
        """
        intervals = readBedIntervals(getTestDirPath("truth.bed"), ncol=4)
        truthIntervals = []
        for i in intervals:
            truthIntervals.append((i[0], i[1], i[2], int(i[3])))

        # one interval running from the start of the first truth interval
        # to the end of the last one
        allIntervals = [(truthIntervals[0][0],
                         truthIntervals[0][1],
                         truthIntervals[-1][2])]
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
        assert len(trackData.getTrackTableList()) == 1
        # set the fudge to 1 since when the test was written this was
        # hardcoded default
        em = IndependentMultinomialEmissionModel(
            4, trackData.getNumSymbolsPerTrack(), fudge=1.0)
        hmm = MultitrackHmm(em)
        hmm.supervisedTrain(trackData, truthIntervals)
        hmm.validate()

        # check emissions, they should basically be binary.
        trackList = hmm.getTrackList()
        emp = np.exp(em.getLogProbs())
        ltrTrack = trackList.getTrackByName("ltr")
        track = ltrTrack.getNumber()
        cmap = ltrTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap(0)
        # we add 1 to all frequencies like emission trainer
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 1. - 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 6. / 7.)

        insideTrack = trackList.getTrackByName("inside")
        track = insideTrack.getNumber()
        cmap = insideTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap("Inside")
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 1. - 6. / 7.)

        # crude check for start probs: each state's expected start
        # probability is its share of the total interval length
        freq = [0.0] * em.getNumStates()
        total = 0.0
        for interval in truthIntervals:
            state = interval[3]
            freq[state] += float(interval[2]) - float(interval[1])
            total += float(interval[2]) - float(interval[1])

        sprobs = hmm.getStartProbs()
        assert len(sprobs) == em.getNumStates()
        for state in range(em.getNumStates()):
            assert_array_almost_equal(freq[state] / total, sprobs[state])

        # transition probabilities
        # from eyeball:
        #c  0   5   0   0->0 +4   0->1 +1    0-> +5
        #c  5   10  1   1->1 +4   1->2 +1    1-> +5
        #c  10  35  2   2->2 +24  2->3 +1    2-> +25
        #c  35  40  3   3->3 +4   3->0 +1    3-> +5
        #c  40  70  0   0->0 +29             0-> +29
        # (state 0 totals: 4 + 29 = 33 self transitions out of 5 + 29 = 34)
        realTransProbs = np.array([
            [33. / 34., 1. / 34., 0., 0.],
            [0., 4. / 5., 1. / 5., 0.],
            [0., 0., 24. / 25., 1. / 25.],
            [1. / 5., 0., 0., 4. / 5.],
        ])

        tprobs = hmm.getTransitionProbs()
        assert tprobs.shape == (em.getNumStates(), em.getNumStates())
        assert_array_almost_equal(tprobs, realTransProbs)
        # first (and only) entry of the viterbi output is unpacked into the
        # probability and the state path
        prob, states = hmm.viterbi(trackData)[0]
        for truthInt in truthIntervals:
            for i in range(truthInt[1], truthInt[2]):
                assert states[i] == truthInt[3]
Exemplo n.º 19
0
    def testSupervisedLearn(self):
        """Check supervised HMM training against hand-computed values.

        Loads the truth intervals from truth.bed, trains a 4-state
        MultitrackHmm on the single region they span, and verifies the
        learned emission probabilities of the "ltr" and "inside" tracks,
        the start probabilities, the transition matrix, and that Viterbi
        decoding reproduces the truth state at every position.
        """
        intervals = readBedIntervals(getTestDirPath("truth.bed"), ncol=4)
        truthIntervals = []
        for i in intervals:
            truthIntervals.append((i[0], i[1], i[2], int(i[3])))

        # one interval running from the start of the first truth interval
        # to the end of the last one
        allIntervals = [(truthIntervals[0][0], truthIntervals[0][1],
                         truthIntervals[-1][2])]
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
        assert len(trackData.getTrackTableList()) == 1
        # set the fudge to 1 since when the test was written this was
        # hardcoded default
        em = IndependentMultinomialEmissionModel(
            4, trackData.getNumSymbolsPerTrack(), fudge=1.0)
        hmm = MultitrackHmm(em)
        hmm.supervisedTrain(trackData, truthIntervals)
        hmm.validate()

        # check emissions, they should basically be binary.
        trackList = hmm.getTrackList()
        emp = np.exp(em.getLogProbs())
        ltrTrack = trackList.getTrackByName("ltr")
        track = ltrTrack.getNumber()
        cmap = ltrTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap(0)
        # we add 1 to all frequencies like emission trainer
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 1. - 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 6. / 7.)

        insideTrack = trackList.getTrackByName("inside")
        track = insideTrack.getNumber()
        cmap = insideTrack.getValueMap()
        s0 = cmap.getMap(None)
        s1 = cmap.getMap("Inside")
        assert_array_almost_equal(emp[track][0][s0], 36. / 37.)
        assert_array_almost_equal(emp[track][0][s1], 1 - 36. / 37.)
        assert_array_almost_equal(emp[track][1][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][1][s1], 1 - 6. / 7.)
        assert_array_almost_equal(emp[track][2][s0], 1. - 26. / 27.)
        assert_array_almost_equal(emp[track][2][s1], 26. / 27.)
        assert_array_almost_equal(emp[track][3][s0], 6. / 7.)
        assert_array_almost_equal(emp[track][3][s1], 1. - 6. / 7.)

        # crude check for start probs: each state's expected start
        # probability is its share of the total interval length
        freq = [0.0] * em.getNumStates()
        total = 0.0
        for interval in truthIntervals:
            state = interval[3]
            freq[state] += float(interval[2]) - float(interval[1])
            total += float(interval[2]) - float(interval[1])

        sprobs = hmm.getStartProbs()
        assert len(sprobs) == em.getNumStates()
        for state in range(em.getNumStates()):
            assert_array_almost_equal(freq[state] / total, sprobs[state])

        # transition probabilities
        # from eyeball:
        #c  0   5   0   0->0 +4   0->1 +1    0-> +5
        #c  5   10  1   1->1 +4   1->2 +1    1-> +5
        #c  10  35  2   2->2 +24  2->3 +1    2-> +25
        #c  35  40  3   3->3 +4   3->0 +1    3-> +5
        #c  40  70  0   0->0 +29             0-> +29
        # (state 0 totals: 4 + 29 = 33 self transitions out of 5 + 29 = 34)
        realTransProbs = np.array([[33. / 34., 1. / 34., 0., 0.],
                                   [0., 4. / 5., 1. / 5., 0.],
                                   [0., 0., 24. / 25., 1. / 25.],
                                   [1. / 5., 0., 0., 4. / 5.]])

        tprobs = hmm.getTransitionProbs()
        assert tprobs.shape == (em.getNumStates(), em.getNumStates())
        assert_array_almost_equal(tprobs, realTransProbs)
        # first (and only) entry of the viterbi output is unpacked into the
        # probability and the state path
        prob, states = hmm.viterbi(trackData)[0]
        for truthInt in truthIntervals:
            for i in range(truthInt[1], truthInt[2]):
                assert states[i] == truthInt[3]
Exemplo n.º 20
0
    def testHmmSupervisedLearn(self):
        """Supervised-training test adapted from the HMM unit test.

        A CFG with no nest states should give the same results as an HMM,
        so the trained emission, start and transition probabilities and
        the decoded states are checked against the values expected for
        the HMM.
        """
        truthIntervals = [
            (bed[0], bed[1], bed[2], int(bed[3]))
            for bed in readBedIntervals(getTestDirPath("truth.bed"), ncol=4)]

        # a single interval running from the start of the first truth
        # interval to the end of the last one
        firstTruth, lastTruth = truthIntervals[0], truthIntervals[-1]
        allIntervals = [(firstTruth[0], firstTruth[1], lastTruth[2])]
        trackData = TrackData()
        trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
        assert len(trackData.getTrackTableList()) == 1

        # fudge is pinned to 1 because that was the hardcoded default when
        # the expected values below were worked out
        numSymbols = trackData.getNumSymbolsPerTrack()
        em = IndependentMultinomialEmissionModel(4, numSymbols, fudge=1.0)
        hmm = MultitrackHmm(em)
        hmm.supervisedTrain(trackData, truthIntervals)
        hmm.validate()
        pairModel = PairEmissionModel(em, [1.0] * em.getNumStates())

        # first pass uses a nest state just to see that it validates; the
        # second pass has none and is the hmm-equivalent model that every
        # check below is run against
        for nestStates in ([1], []):
            cfg = MultitrackCfg(em, pairModel, nestStates=nestStates)
            cfg.supervisedTrain(trackData, truthIntervals)
            cfg.validate()

        # emissions should be basically binary.  expected values have 1
        # added to every frequency, like the emission trainer does.
        trackList = cfg.getTrackList()
        emp = np.exp(em.getLogProbs())
        p36 = 36. / 37.
        p6 = 6. / 7.
        p26 = 26. / 27.

        def checkEmissions(trackName, symbol, expected):
            # expected[state] is the pair of probabilities for the values
            # that None and symbol map to on the named track
            track = trackList.getTrackByName(trackName)
            trackNo = track.getNumber()
            valueMap = track.getValueMap()
            noneIdx = valueMap.getMap(None)
            symbolIdx = valueMap.getMap(symbol)
            for state, (pNone, pSymbol) in enumerate(expected):
                assert_array_almost_equal(emp[trackNo][state][noneIdx],
                                          pNone)
                assert_array_almost_equal(emp[trackNo][state][symbolIdx],
                                          pSymbol)

        checkEmissions("ltr", 0, [(p36, 1 - p36),
                                  (1 - p6, p6),
                                  (p26, 1 - p26),
                                  (1 - p6, p6)])
        checkEmissions("inside", "Inside", [(p36, 1 - p36),
                                            (p6, 1 - p6),
                                            (1 - p26, p26),
                                            (p6, 1 - p6)])

        # start probs: compared with the fraction of truth bases that
        # carry each state
        numStates = em.getNumStates()
        coverage = [0.0] * numStates
        covered = 0.0
        for _, start, end, truthState in truthIntervals:
            length = float(end) - float(start)
            coverage[truthState] += length
            covered += length

        sprobs = cfg.getStartProbs()
        assert len(sprobs) == numStates
        for state, bases in enumerate(coverage):
            assert_array_almost_equal(bases / covered, sprobs[state])

        # transition probabilities, counted by eye from the truth file:
        #   chrom start end state   stay        leave       total out
        #   c     0     5   0       0->0 +4     0->1 +1     0-> +5
        #   c     5     10  1       1->1 +4     1->2 +1     1-> +5
        #   c     10    35  2       2->2 +24    2->3 +1     2-> +25
        #   c     35    40  3       3->3 +4     3->0 +1     3-> +5
        #   c     40    70  0       0->0 +29                0-> +29
        # so state 0 has 33 self transitions out of 34 in total
        realTransProbs = np.array([[33. / 34., 1. / 34., 0., 0.],
                                   [0., 4. / 5., 1. / 5., 0.],
                                   [0., 0., 24. / 25., 1. / 25.],
                                   [1. / 5., 0., 0., 4. / 5.]])

        tprobs = np.exp(cfg.getLogProbTables()[0])
        assert tprobs.shape == (numStates,) * 3
        for src, dst in np.ndindex(numStates, numStates):
            # the hmm transition src->dst is matched against the sum of the
            # table entries [src, src, dst] and [src, dst, src] (presumably
            # the two orientations of the same production -- confirm
            # against MultitrackCfg); the diagonal is only counted once
            combined = tprobs[src, src, dst]
            if src != dst:
                combined = combined + tprobs[src, dst, src]
            assert_array_almost_equal(combined, realTransProbs[src, dst])

        _, states = cfg.decode(trackData.getTrackTableList()[0])
        for _, start, end, truthState in truthIntervals:
            # the ltr track is binary, so states 1 and 3 cannot be told
            # apart; the comparison is relaxed for them until the test is
            # fixed properly
            ambiguous = truthState in (1, 3)
            for pos in xrange(start, end):
                if ambiguous:
                    assert states[pos] in (1, 3)
                else:
                    assert states[pos] == truthState