def testSupervisedLearn(self):
    """Train a MultitrackHmm from truth.bed and check what it learned.

    Checks, in order: the emission probabilities of the "ltr" and
    "inside" tracks, the start probabilities, the transition matrix,
    and that Viterbi decoding returns the truth state at every position
    covered by a truth interval.
    """
    truthIntervals = [
        (rec[0], rec[1], rec[2], int(rec[3]))
        for rec in readBedIntervals(getTestDirPath("truth.bed"), ncol=4)]

    # a single interval running from the first truth start to the last
    # truth end
    firstTruth, lastTruth = truthIntervals[0], truthIntervals[-1]
    allIntervals = [(firstTruth[0], firstTruth[1], lastTruth[2])]
    trackData = TrackData()
    trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
    assert len(trackData.getTrackTableList()) == 1

    # fudge is pinned to 1 because that was the hardcoded default when
    # the expected values below were worked out
    em = IndependentMultinomialEmissionModel(
        4, trackData.getNumSymbolsPerTrack(), fudge=1.0)
    hmm = MultitrackHmm(em)
    hmm.supervisedTrain(trackData, truthIntervals)
    hmm.validate()

    # emissions should be close to binary
    trackList = hmm.getTrackList()
    emProbs = np.exp(em.getLogProbs())

    ltrTrack = trackList.getTrackByName("ltr")
    ltrNo = ltrTrack.getNumber()
    ltrMap = ltrTrack.getValueMap()
    ltrNone = ltrMap.getMap(None)
    ltrZero = ltrMap.getMap(0)
    # one row per state: (P(None), P(0)); every count has 1 added to it,
    # like the emission trainer does
    expectedLtr = [
        (36. / 37., 1 - 36. / 37.),
        (1 - 6. / 7., 6. / 7.),
        (26. / 27., 1. - 26. / 27.),
        (1. - 6. / 7., 6. / 7.),
    ]
    for state, (pNone, pZero) in enumerate(expectedLtr):
        assert_array_almost_equal(emProbs[ltrNo][state][ltrNone], pNone)
        assert_array_almost_equal(emProbs[ltrNo][state][ltrZero], pZero)

    insideTrack = trackList.getTrackByName("inside")
    insideNo = insideTrack.getNumber()
    insideMap = insideTrack.getValueMap()
    insideNone = insideMap.getMap(None)
    insideIn = insideMap.getMap("Inside")
    # one row per state: (P(None), P("Inside"))
    expectedInside = [
        (36. / 37., 1 - 36. / 37.),
        (6. / 7., 1 - 6. / 7.),
        (1. - 26. / 27., 26. / 27.),
        (6. / 7., 1. - 6. / 7.),
    ]
    for state, (pNone, pIn) in enumerate(expectedInside):
        assert_array_almost_equal(
            emProbs[insideNo][state][insideNone], pNone)
        assert_array_almost_equal(
            emProbs[insideNo][state][insideIn], pIn)

    # rough start probability check: each state's start probability
    # should equal the fraction of truth bases labelled with that state
    numStates = em.getNumStates()
    basesPerState = [0.0] * numStates
    totalBases = 0.0
    for truthInt in truthIntervals:
        length = float(truthInt[2]) - float(truthInt[1])
        basesPerState[truthInt[3]] += length
        totalBases += length
    startProbs = hmm.getStartProbs()
    assert len(startProbs) == numStates
    for state in xrange(numStates):
        assert_array_almost_equal(
            basesPerState[state] / totalBases, startProbs[state])

    # transition probabilities, counted by hand from the truth file:
    # c  0  5 0   0->0 +4    0->1 +1   0-> +5
    # c  5 10 1   1->1 +4    1->2 +1   1-> +5
    # c 10 35 2   2->2 +24   2->3 +1   2-> +25
    # c 35 40 3   3->3 +4    3->0 +1   3-> +5
    # c 40 70 0   0->0 +29             0-> +29
    realTransProbs = np.array([
        [33. / 34., 1. / 34., 0., 0.],
        [0., 4. / 5., 1. / 5., 0.],
        [0., 0., 24. / 25., 1. / 25.],
        [1. / 5., 0., 0., 4. / 5.],
    ])
    transProbs = hmm.getTransitionProbs()
    assert transProbs.shape == (numStates, numStates)
    assert_array_almost_equal(transProbs, realTransProbs)

    # the Viterbi path has to match the truth state at every position
    _prob, states = hmm.viterbi(trackData)[0]
    for truthInt in truthIntervals:
        truthState = truthInt[3]
        for pos in xrange(truthInt[1], truthInt[2]):
            assert states[pos] == truthState
def testSupervisedLearn(self):
    """Train a MultitrackHmm from truth.bed and check what it learned.

    Checks, in order: the emission probabilities of the "ltr" and
    "inside" tracks, the start probabilities, the transition matrix,
    and that Viterbi decoding returns the truth state at every position
    covered by a truth interval.
    """
    truthIntervals = [
        (rec[0], rec[1], rec[2], int(rec[3]))
        for rec in readBedIntervals(getTestDirPath("truth.bed"), ncol=4)]

    # a single interval running from the first truth start to the last
    # truth end
    firstTruth, lastTruth = truthIntervals[0], truthIntervals[-1]
    allIntervals = [(firstTruth[0], firstTruth[1], lastTruth[2])]
    trackData = TrackData()
    trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
    assert len(trackData.getTrackTableList()) == 1

    # fudge is pinned to 1 because that was the hardcoded default when
    # the expected values below were worked out
    em = IndependentMultinomialEmissionModel(
        4, trackData.getNumSymbolsPerTrack(), fudge=1.0)
    hmm = MultitrackHmm(em)
    hmm.supervisedTrain(trackData, truthIntervals)
    hmm.validate()

    # emissions should be close to binary
    trackList = hmm.getTrackList()
    emProbs = np.exp(em.getLogProbs())

    ltrTrack = trackList.getTrackByName("ltr")
    ltrNo = ltrTrack.getNumber()
    ltrMap = ltrTrack.getValueMap()
    ltrNone = ltrMap.getMap(None)
    ltrZero = ltrMap.getMap(0)
    # one row per state: (P(None), P(0)); every count has 1 added to it,
    # like the emission trainer does
    expectedLtr = [
        (36. / 37., 1 - 36. / 37.),
        (1 - 6. / 7., 6. / 7.),
        (26. / 27., 1. - 26. / 27.),
        (1. - 6. / 7., 6. / 7.),
    ]
    for state, (pNone, pZero) in enumerate(expectedLtr):
        assert_array_almost_equal(emProbs[ltrNo][state][ltrNone], pNone)
        assert_array_almost_equal(emProbs[ltrNo][state][ltrZero], pZero)

    insideTrack = trackList.getTrackByName("inside")
    insideNo = insideTrack.getNumber()
    insideMap = insideTrack.getValueMap()
    insideNone = insideMap.getMap(None)
    insideIn = insideMap.getMap("Inside")
    # one row per state: (P(None), P("Inside"))
    expectedInside = [
        (36. / 37., 1 - 36. / 37.),
        (6. / 7., 1 - 6. / 7.),
        (1. - 26. / 27., 26. / 27.),
        (6. / 7., 1. - 6. / 7.),
    ]
    for state, (pNone, pIn) in enumerate(expectedInside):
        assert_array_almost_equal(
            emProbs[insideNo][state][insideNone], pNone)
        assert_array_almost_equal(
            emProbs[insideNo][state][insideIn], pIn)

    # rough start probability check: each state's start probability
    # should equal the fraction of truth bases labelled with that state
    numStates = em.getNumStates()
    basesPerState = [0.0] * numStates
    totalBases = 0.0
    for truthInt in truthIntervals:
        length = float(truthInt[2]) - float(truthInt[1])
        basesPerState[truthInt[3]] += length
        totalBases += length
    startProbs = hmm.getStartProbs()
    assert len(startProbs) == numStates
    for state in xrange(numStates):
        assert_array_almost_equal(
            basesPerState[state] / totalBases, startProbs[state])

    # transition probabilities, counted by hand from the truth file:
    # c  0  5 0   0->0 +4    0->1 +1   0-> +5
    # c  5 10 1   1->1 +4    1->2 +1   1-> +5
    # c 10 35 2   2->2 +24   2->3 +1   2-> +25
    # c 35 40 3   3->3 +4    3->0 +1   3-> +5
    # c 40 70 0   0->0 +29             0-> +29
    realTransProbs = np.array([
        [33. / 34., 1. / 34., 0., 0.],
        [0., 4. / 5., 1. / 5., 0.],
        [0., 0., 24. / 25., 1. / 25.],
        [1. / 5., 0., 0., 4. / 5.],
    ])
    transProbs = hmm.getTransitionProbs()
    assert transProbs.shape == (numStates, numStates)
    assert_array_almost_equal(transProbs, realTransProbs)

    # the Viterbi path has to match the truth state at every position
    _prob, states = hmm.viterbi(trackData)[0]
    for truthInt in truthIntervals:
        truthState = truthInt[3]
        for pos in xrange(truthInt[1], truthInt[2]):
            assert states[pos] == truthState
def testHmmSupervisedLearn(self):
    """Repeat the HMM supervised-learning checks with a MultitrackCfg.

    Largely mirrors the HMM unit test: a CFG built with no nest states
    is expected to recapitulate the HMM results for emissions, start
    probabilities, transitions and decoding.
    """
    truthIntervals = [
        (rec[0], rec[1], rec[2], int(rec[3]))
        for rec in readBedIntervals(getTestDirPath("truth.bed"), ncol=4)]

    # a single interval running from the first truth start to the last
    # truth end
    firstTruth, lastTruth = truthIntervals[0], truthIntervals[-1]
    allIntervals = [(firstTruth[0], firstTruth[1], lastTruth[2])]
    trackData = TrackData()
    trackData.loadTrackData(getTracksInfoPath(3), allIntervals)
    assert len(trackData.getTrackTableList()) == 1

    # fudge is pinned to 1 because that was the hardcoded default when
    # the expected values below were worked out
    em = IndependentMultinomialEmissionModel(
        4, trackData.getNumSymbolsPerTrack(), fudge=1.0)
    hmm = MultitrackHmm(em)
    hmm.supervisedTrain(trackData, truthIntervals)
    hmm.validate()
    pairModel = PairEmissionModel(em, [1.0] * em.getNumStates())

    # first make sure training validates with a nest state, just for fun
    cfg = MultitrackCfg(em, pairModel, nestStates=[1])
    cfg.supervisedTrain(trackData, truthIntervals)
    cfg.validate()

    # then rebuild it as the HMM-equivalent (no nest states)
    cfg = MultitrackCfg(em, pairModel, nestStates=[])
    cfg.supervisedTrain(trackData, truthIntervals)
    cfg.validate()

    # emissions should be close to binary
    trackList = cfg.getTrackList()
    emProbs = np.exp(em.getLogProbs())

    ltrTrack = trackList.getTrackByName("ltr")
    ltrNo = ltrTrack.getNumber()
    ltrMap = ltrTrack.getValueMap()
    ltrNone = ltrMap.getMap(None)
    ltrZero = ltrMap.getMap(0)
    # one row per state: (P(None), P(0)); every count has 1 added to it,
    # like the emission trainer does
    expectedLtr = [
        (36. / 37., 1 - 36. / 37.),
        (1 - 6. / 7., 6. / 7.),
        (26. / 27., 1. - 26. / 27.),
        (1. - 6. / 7., 6. / 7.),
    ]
    for state, (pNone, pZero) in enumerate(expectedLtr):
        assert_array_almost_equal(emProbs[ltrNo][state][ltrNone], pNone)
        assert_array_almost_equal(emProbs[ltrNo][state][ltrZero], pZero)

    insideTrack = trackList.getTrackByName("inside")
    insideNo = insideTrack.getNumber()
    insideMap = insideTrack.getValueMap()
    insideNone = insideMap.getMap(None)
    insideIn = insideMap.getMap("Inside")
    # one row per state: (P(None), P("Inside"))
    expectedInside = [
        (36. / 37., 1 - 36. / 37.),
        (6. / 7., 1 - 6. / 7.),
        (1. - 26. / 27., 26. / 27.),
        (6. / 7., 1. - 6. / 7.),
    ]
    for state, (pNone, pIn) in enumerate(expectedInside):
        assert_array_almost_equal(
            emProbs[insideNo][state][insideNone], pNone)
        assert_array_almost_equal(
            emProbs[insideNo][state][insideIn], pIn)

    # rough start probability check: each state's start probability
    # should equal the fraction of truth bases labelled with that state
    numStates = em.getNumStates()
    basesPerState = [0.0] * numStates
    totalBases = 0.0
    for truthInt in truthIntervals:
        length = float(truthInt[2]) - float(truthInt[1])
        basesPerState[truthInt[3]] += length
        totalBases += length
    startProbs = cfg.getStartProbs()
    assert len(startProbs) == numStates
    for state in xrange(numStates):
        assert_array_almost_equal(
            basesPerState[state] / totalBases, startProbs[state])

    # transition probabilities, counted by hand from the truth file:
    # c  0  5 0   0->0 +4    0->1 +1   0-> +5
    # c  5 10 1   1->1 +4    1->2 +1   1-> +5
    # c 10 35 2   2->2 +24   2->3 +1   2-> +25
    # c 35 40 3   3->3 +4    3->0 +1   3-> +5
    # c 40 70 0   0->0 +29             0-> +29
    realTransProbs = np.array([
        [33. / 34., 1. / 34., 0., 0.],
        [0., 4. / 5., 1. / 5., 0.],
        [0., 0., 24. / 25., 1. / 25.],
        [1. / 5., 0., 0., 4. / 5.],
    ])
    ruleProbs = np.exp(cfg.getLogProbTables()[0])
    assert ruleProbs.shape == (numStates, numStates, numStates)
    for src in xrange(numStates):
        for dst in xrange(numStates):
            # compare [src, src, dst] (plus [src, dst, src] when the
            # two states differ) against the HMM transition src->dst
            combined = ruleProbs[src, src, dst]
            if src != dst:
                combined += ruleProbs[src, dst, src]
            assert_array_almost_equal(combined, realTransProbs[src, dst])

    _prob, states = cfg.decode(trackData.getTrackTableList()[0])
    for truthInt in truthIntervals:
        truthState = truthInt[3]
        for pos in xrange(truthInt[1], truthInt[2]):
            # the ltr track is binary, so ltr states can come out as
            # either 1 or 3.  The test needs a proper fix; for now the
            # comparison is simply relaxed for those two states.
            if truthState in (1, 3):
                assert states[pos] == 1 or states[pos] == 3
            else:
                assert states[pos] == truthState
def getMaskPath():
    """Return the path getTestDirPath gives for "mask.bed"."""
    maskName = "mask.bed"
    return getTestDirPath(maskName)
def getSegmentsPath():
    """Return the path getTestDirPath gives for "segments.bed"."""
    segmentsName = "segments.bed"
    return getTestDirPath(segmentsName)
def getStatesPath():
    """Return the path getTestDirPath gives for "states.bed"."""
    statesName = "states.bed"
    return getTestDirPath(statesName)
def getTracksInfoPath(idx=1):
    """Return the path getTestDirPath gives for a tracks-info XML file.

    idx == 1 (the default) selects "tracksInfo.xml"; any other value
    selects "tracksInfo<idx>.xml", with idx formatted via %d.
    """
    if idx != 1:
        return getTestDirPath("tracksInfo%d.xml" % idx)
    return getTestDirPath("tracksInfo.xml")