Example #1
 def train(self):
     mmModel = np.zeros((noteRange+1, noteRange+1))
     mm3Model = np.zeros((noteRange+1, noteRange+1, noteRange+1))
     hmmModel = HMM(12, noteRange+1)
     obs = []
     ground = []
     actions = []
     for i in range(minNote, maxNote):
         actions.append(i)
     qModel = QLearner(actions, epsilon=0.1, alpha=0.2, gamma=0.9)
     for ls in self.clusterData:
         for quadidx, quad in enumerate(ls):
             tempquad = map(lambda x: x - minNote, quad) #take this out for prevnote stuff
             obs.append(tempquad[1:]) #this is for hmm: you can also do same thing for qlearning to change state that way
             tempquad = map(lambda x: (x - minNote) % 12, quad)
             ground.append(tempquad[:3])
             if (quad):
                 for idx, note in enumerate(quad):
                     if idx > 0:
                         currNote = note
                         prevNote = quad[idx - 1]
                         #Q learning
                         #q.learn(state1, action1, reward, state2)
                         qModel.learn(prevNote, note, 1, note)
                         #Markov model
                         mmModel[currNote - minNote, prevNote - minNote] += 1
                     if idx > 2:
                         #Markov model, more order
                         currNote = note - minNote
                         prevNote = quad[idx - 1] - minNote
                         prevNote2 = quad[idx - 2] - minNote
                         mm3Model[currNote, prevNote, prevNote2] += 1
     hmmModel.learn(obs, ground)
     return (mmModel, mm3Model, hmmModel, qModel)
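The `train` method above only accumulates raw transition counts. As a minimal sketch (not part of the original project; numpy assumed, and `sample_next_note` is a hypothetical helper), the first-order count matrix can be turned into a sampler by normalizing each column into P(curr | prev):

import numpy as np

def sample_next_note(mm_model, prev_note, rng=None):
    # mm_model[curr, prev] holds raw transition counts, so the column for
    # prev_note normalizes to the conditional distribution P(curr | prev_note)
    rng = rng or np.random.default_rng()
    col = mm_model[:, prev_note]
    total = col.sum()
    if total == 0:
        return int(rng.integers(mm_model.shape[0]))  # unseen context: uniform fallback
    return int(rng.choice(mm_model.shape[0], p=col / total))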
Example #2
class HMMPOSTagger(object):
    """
    Part-of-speech tagging with an HMM model, built on top of Chinese word-segmentation output.
    """

    def __init__(self):
        self.hmm = HMM()

        self.re_chinese = re.compile(ur"([\u4E00-\u9FA5]+)")  # matches runs of Chinese characters
        self.re_skip = re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")  # matches runs of ASCII letters and digits

    def load(self, model_dir):
        """
        Load the model files.
        """
        self.hmm.load(model_dir)

    def pos_tag(self, words):
        """
        HMM-based part-of-speech tagging.
        """
        log_prob, pos_list = self._viterbi(words)

        for i, w in enumerate(words):
            yield (w, pos_list[i])
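A hedged usage sketch for the tagger above; the model directory and the pre-segmented sentence are made-up stand-ins, and `pos_tag` is consumed as the generator it is:

tagger = HMMPOSTagger()
tagger.load("pos_model/")  # hypothetical directory holding the HMM parameter files
for word, tag in tagger.pos_tag([u"今天", u"天气", u"好"]):
    print("%s/%s" % (word, tag))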
Example #3
def main(job_id, params):
    num_runs = 20
    obs_length = 100
    num_states = 2
    num_obs = 2

    # readin hmm indx
    t = 0
    try:
        with open(os.path.join('.', 'hmm_index.txt')) as hmm_index_file:
            t = int(hmm_index_file.read())
        sys.stderr.write("!!!!!!!!!!!!!!!!!!HMM INDEX:  " + str(t) + "   !!!!!!!!!!!!!!!\n")
    except IOError:
        t = 0


    # generate HMM observations
    np.random.seed(0x6b6c26b2)
    seeds = np.random.randint(0x0fffffff, size=num_runs)
    np.random.seed(seeds[t])
    # random hmm
    z_mat, t_mat = random_hmm(num_states, num_obs)
    pi_vec = np.array([1.0 / num_states] * num_states)
    hmm_test = HMM(z_mat, t_mat, pi_vec)
    # random obs trajectory
    obs = hmm_test.generate(obs_length)[np.newaxis,:]

    # calculate log likelihood for input HMM parameters
    z_mat_p_input = np.array([[params['z_mat_p_0'][0], params['z_mat_p_1'][0]]])
    t_mat_p_input = np.array([[params['t_mat_p_0'][0], params['t_mat_p_1'][0]]])
    # pi_vec_input = np.array([params['pi_0'], 1 - params['pi_0']])
    hmm_estimate = make_parameterized_HMM(z_mat_p_input, t_mat_p_input, pi_vec)
    hmm_loglikelihood = hmm_estimate.loglikelihood(obs[0])

    return -hmm_loglikelihood
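The `main(job_id, params)` signature is the entry-point convention of Spearmint-style Bayesian optimization: the optimizer proposes `params` and minimizes the returned value, here the negative log-likelihood. A hedged sketch of the row parameterization that `make_parameterized_HMM` presumably applies in the 2-state case, where one free value per row expands to a stochastic row (the helper name is an assumption):

import numpy as np

def expand_two_state_rows(free_params):
    # each free value p in [0, 1] becomes the stochastic row [p, 1 - p]
    p = np.clip(np.asarray(free_params, dtype=float).ravel(), 0.0, 1.0)
    return np.stack([p, 1.0 - p], axis=1)

# expand_two_state_rows([0.3, 0.9]) -> [[0.3, 0.7], [0.9, 0.1]]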
Example #4
    def prepare_seqs_nl_dbg(self, decoding="viterbi"):
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))

        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False)
        h.dirname = self.path
        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict)

        # train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)

        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        # print("Decoding word representations on train.")
        # decoder(train_seq)
        print("Decoding word representations on dev.")
        decoder(dev_seq)
Example #5
def simple_weather_model():
    hmm = HMM(["s1", "s2"], ["R", "NR"])
    init = [0.7, 0.3]
    trans = [[0.8, 0.2], [0.1, 0.9]]
    observ = [[0.75, 0.25], [0.4, 0.6]]
    hmm.set_hidden_model(init, trans, observ)
    return hmm
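Not part of the original snippet: a minimal numpy sketch of the forward algorithm evaluated against the weather model above, assuming rows of `trans` and `observ` index the hidden state and observations are given as emission-column indices (0 = "R", 1 = "NR"):

import numpy as np

def forward_prob(init, trans, observ, obs):
    # standard alpha recursion of the forward algorithm
    init, trans, observ = map(np.asarray, (init, trans, observ))
    alpha = init * observ[:, obs[0]]
    for o in obs[1:]:
        alpha = (alpha @ trans) * observ[:, o]
    return alpha.sum()

# e.g. P(R, NR, R) under the model above
print(forward_prob([0.7, 0.3], [[0.8, 0.2], [0.1, 0.9]],
                   [[0.75, 0.25], [0.4, 0.6]], [0, 1, 0]))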
Example #6
File: train.py Project: 52nlp/iir
def main():
    parser = OptionParser()
    parser.add_option("-d", dest="training", help="training data directory")
    parser.add_option("-k", dest="K", type="int", help="number of latent states", default=6)
    parser.add_option("-a", dest="a", type="float", help="Dirichlet parameter", default=1.0)
    parser.add_option("-i", dest="I", type="int", help="iteration count", default=10)
    parser.add_option("-m", dest="model", help="model data filename to save")
    (options, args) = parser.parse_args()
    if not options.training: parser.error("need training data directory(-d)")

    features = load_data(options.training)

    hmm = HMM()
    hmm.set_corpus(features)
    hmm.init_inference(options.K, options.a)
    pre_L = -1e10
    for i in range(options.I):
        log_likelihood = hmm.inference()
        print i, ":", log_likelihood
        if pre_L > log_likelihood: break
        pre_L = log_likelihood
    if options.model:
        hmm.save(options.model)
    else:
        hmm.dump()
Example #7
def main():
    hmm = HMM(*train(sys.argv[1]))
    
    with open(sys.argv[2]) as f:
        correct = 0
        wrong = 0
        
        correct_sents = 0
        wrong_sents = 0
        
        correct_known = 0
        wrong_known = 0
        
        for i, sent in enumerate(Reader(f)):
            prob, path = hmm.decode([word for (word, pos) in sent])
            correct1 = 0
            wrong1 = 0
            for (gold, predicted) in zip(sent, path):
                if gold == predicted:
                    correct1 += 1
                else:
                    wrong1 += 1
            print('%e\t%.3f\t%s' % (prob, correct1 / (correct1 + wrong1), ' '.join('%s/%s' % pair for pair in path)))
            if prob > 0:
                correct_sents += 1
                correct_known += correct1
                wrong_known += wrong1
            else:
                wrong_sents += 1
            correct += correct1
            wrong += wrong1
    
    print("Correctly tagged words: %s" % (correct / (correct + wrong)))
    print("Sentences with non-zero probability: %s" % (correct_sents / (correct_sents + wrong_sents)))
    print("Correctly tagged words when only considering sentences with non-zero probability: %s" % (correct_known / (correct_known + wrong_known)))
 def toy_model(self):
     hmm = HMM(["s1", "s2"], ["R", "NR"])
     init = [0.5, 0.5]
     trans = [[0.2, 0.8], [0.8, 0.2]]
     observ = [[0.8, 0.2], [0.2, 0.8]]
     hmm.set_hidden_model(init, trans, observ)
     return hmm
Example #9
    def __init__(self, limb, name='Demo_26_Final'):
        rospy.init_node(name, anonymous=True)
        self._startGesture = '0'
        self._limb = limb
        self._knhObj = KinectNiteHelp()
        self._baeObj = BaxterArmEndpoint(self._limb)

        self._dmObj = Demo26Help()

        self._lhObj = LeapHelp()

        self._handCoordinates = []
        self._baxterCoordinates = []
        self._posCount = 0
        rtMatFile = open("RTMatFile.dat", "r")
        self._rotMat = cPickle.load(rtMatFile)
        self._transMat = cPickle.load(rtMatFile)
        self._gCount = 0
        self._gOn = 0
        self._gPointCount = 0
        self._gPoints = []
        self._hmmObjG1 = HMM('G1.hmm')
        self._hmmObjG2 = HMM('G2.hmm')
        self._hmmObjG3 = HMM('G3.hmm')
        self._hmmObjG4 = HMM('G4.hmm')
        self._flag = '0'
        self._pub = rospy.Publisher('/robot/xdisplay', Image, latch=True)
        #self._flagPub = rospy.Publisher('flag_topic', String)
        self._sub = rospy.Subscriber('/key_tap_topic', String, self._callback)
        rtMatFile.close()
        img = cv.LoadImage('Welcome.png')
        msg = cv_bridge.CvBridge().cv_to_imgmsg(img, encoding="bgr8")
        self._pub.publish(msg)
        # Sleep to allow for image to be published.
        rospy.sleep(3)
Example #10
 def __init__(self, **kwarg):    # lname, url, other prior knowledge
     super(HMMClassifier, self).__init__()
     self.HMMauthor = HMM('author', 2)
     self.HMMvenue = HMM('venue', 2)     # Not important
     self.HMMentire = HMM('entire', 6)   # Set empirically
     self.observations_raw = []
     self.observation_sequences = []
     self.labels = []
Example #11
class TestHMM():
	
	def __init__(self):
		self.Z = numpy.array([
			[0.8,  0.09, 0.01],
			[0.09, 0.8,  0.01],
			[0.1,  0,    0.8]
		])
		self.b = numpy.array([
			[0.1, 0.1, 0.8],
			[0.05, 0.9, 0.05],
			[0.8, 0.1, 0.1]
		])
		self.pi = numpy.array([0.9,0.05,0.05])
		self.T = 2000
		# we want the errors to be less than 20%
		self.error_threshold = 0.2
	
	def setup(self):	
		self.model = HMM(self.Z,self.b,self.pi)
	
	def gen_states_obs(self):
		states = []
		obsvns = []
		for (s,o) in self.model.gen(self.T):
			states.append(s)
			obsvns.append(o)
		return states, obsvns
	
	def test_init(self):	
		self.model = HMM(self.Z,self.b,self.pi)
		
	def test_gen(self):
		self.setup()
		states = []
		obsvns = []
		for (s,o) in self.model.gen(10):
			states.append(s)
			obsvns.append(o)
		assert len(states) == 10
		assert len(obsvns) == 10
	
	def test_forward_backward(self):
		self.setup()
		states, obsvns = self.gen_states_obs()
		alpha,beta = self.model.forward_backward(obsvns)	
		
		gamma = [a*b/sum(a*b) for a,b in zip(alpha,beta)]
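		# gamma_t(i) = alpha_t(i)*beta_t(i) / sum_j alpha_t(j)*beta_t(j) is the
		# posterior state marginal, so the argmax below picks the most likely
		# state at each time step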
		state_est = numpy.array([numpy.where(g==max(g))[0][0] for g in gamma])
		err = sum(state_est != numpy.array(states))/float(len(states))
		assert err < self.error_threshold
	
	def test_viterbi(self):
		self.setup()
		states, obsvns = self.gen_states_obs()
		state_est = self.model.viterbi(obsvns)		
		err = sum(state_est != numpy.array(states))/float(len(states))
		assert err < self.error_threshold
Example #12
 def test_simple_hmm_learning(self):
     state_seq = [[0, 1, 1, 0, 1, 0, 1, 1], [0, 0, 1, 0]]
     obs_seq = [[0, 0, 1, 1, 0, 0, 0, 1], [0, 1, 0, 0]]
     hmm = HMM(range(2), range(2))
     hmm.learn_from_labeled_data(state_seq, obs_seq)
     print hmm
     eps = 0.00001
     self.assertTrue(max_delta(hmm.initial, [0.750000, 0.250000]) < eps)
     self.assertTrue(max_delta(hmm.transition, [[0.285714, 0.714286], [0.571429, 0.428571]]) < eps)
     self.assertTrue(max_delta(hmm.observation, [[0.625000, 0.375000], [0.625000, 0.375000]]) < eps)
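The expected values in this test are add-one (Laplace) smoothed counts: both training sequences start in state 0, so the initial distribution is ((2+1)/(2+2), (0+1)/(2+2)) = (0.75, 0.25); state 0 is left 4 times in its 5 outgoing transitions, giving (4+1)/(5+2) = 0.714286 for the 0-to-1 entry; and state 0 emits symbol 0 in 4 of its 6 occurrences, giving (4+1)/(6+2) = 0.625.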
Example #13
def test_hmm():
    m = HMM(2, 2)
    observations = [[0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,1,1,1,1],[0,0,0,0,1,0,1,1,0,1,1,0,0,1,0,0,1,1,1,1,0,0,1,0,0]]
    ground = [[0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,1],[0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0]]
    m.learn(observations, ground, smooth=None)
    trueres = ([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0], -21.944)
    res = m.viterbi(observations[1])
    assert trueres[0] == res[0]
    print trueres[1]
    print res[1]
    assert abs(trueres[1] - res[1]) < 0.1
Example #14
 def setUp(self):
     self._model_filename = "hmm_m4n4.pkl"
     self._train_filename = "m4n4.train.data"
     self._num_hidden = 4
     self._num_observ = 4
     transition_matrix = np.random.rand(4, 4)
     observation_matrix = np.random.rand(4, 4)
     hmm = HMM(self._num_hidden, self._num_observ, transition_matrix=transition_matrix,
               observation_matrix=observation_matrix)
     sequences = hmm.generate_data(10000, 4, 51)
     io.save_sequences(self._train_filename, sequences)
     HMM.to_file(self._model_filename, hmm)
Example #15
def main(job_id, params):
    num_runs = 20
    obs_length = 100
    num_states = 2
    num_obs = 2

    # readin hmm indx
    t = 0
    try:
        with open(os.path.join('.', 'hmm_index.txt')) as hmm_index_file:
            t = int(hmm_index_file.read())
        sys.stderr.write("!!!!!!!!!!!!!!!!!!HMM INDEX:  " + str(t) + "   !!!!!!!!!!!!!!!\n")
    except IOError:
        t = 0


    # generate HMM observations
    np.random.seed(0x6b6c26b2)
    seeds = np.random.randint(0x0fffffff, size=num_runs)
    np.random.seed(seeds[t])
    # random hmm
    z_mat, t_mat = random_hmm(num_states, num_obs)
    pi_vec = np.array([1.0 / num_states] * num_states)
    hmm_test = HMM(z_mat, t_mat, pi_vec)
    # random obs trajectory
    obs = hmm_test.generate(obs_length)[np.newaxis, :]

    # calculate log likelihood for input HMM parameters
    z_mat_p_input = np.array([[params['z_mat_p_0'][0], params['z_mat_p_1'][0]]])
    t_mat_p_input = np.array([[params['t_mat_p_0'][0], params['t_mat_p_1'][0]]])
    # pi_vec_input = np.array([params['pi_0'], 1 - params['pi_0']])
    hmm_estimate = make_parameterized_HMM(z_mat_p_input, t_mat_p_input, pi_vec)
    hmm_loglikelihood = hmm_estimate.loglikelihood(obs[0])

    # use the current suggest point and run EM to get a new point
    hmm_em_est, _, _ = em.em(hmm_estimate, hmm_estimate.z_mat, hmm_estimate.t_mat, obs, 30, 0.1)
    em_est_z_mat, em_est_t_mat = retrieve_parameterized_HMM(hmm_em_est)
    em_est_ll = -hmm_em_est.loglikelihood(obs[0])
    em_est_z_mat.reshape((em_est_z_mat.size,))
    em_est_t_mat.reshape((em_est_t_mat.size,))
    print em_est_t_mat
    print em_est_z_mat
    historical_points = [{'params': {}}]
    # write z_mat
    for i, v in enumerate(em_est_z_mat[0]):
        historical_points[0]['params']['z_mat_p_' + str(i)] = {'values': np.array([v]), 'type': 'float'}
    # write t_mat
    for i, v in enumerate(em_est_t_mat[0]):
        historical_points[0]['params']['t_mat_p_' + str(i)] = {'values': np.array([v]), 'type': 'float'}
    historical_points[0]['value'] = em_est_ll
    dump_new_history('.', historical_points)
    return -hmm_loglikelihood
Example #16
File: test.py Project: 52nlp/iir
def main():
    parser = OptionParser()
    parser.add_option("-t", dest="test", help="test data directory")
    parser.add_option("-m", dest="model", help="model data filename to save")
    (options, args) = parser.parse_args()
    if not options.model: parser.error("need model data filename(-m)")

    hmm = HMM()
    hmm.load(options.model)

    if options.test:
        tests = load_data(options.test)
        for x in tests:
            print zip(x, hmm.Viterbi(hmm.words2id(x)))
Example #17
def train_hmm_from_data(data_filename, debug=False):
    if debug:
        print "\n\nReading dataset %s ..." % data_filename
    data_filename = normalize_filename(data_filename)
    d = DataSet(data_filename)
    #if options.verbose:
    #    print d
    if debug:
        print "Building an HMM from the full training data..."
    hmm = HMM(d.states, d.outputs)
    hmm.learn_from_labeled_data(d.train_state, d.train_output)
    if debug:
        print "The model:"
        print hmm
    return (hmm, d)
Example #18
def hmmBaumWelchRestarts(nRuns, stopping, obs, parallel=True):
    '''Performs the baum welch nRuns times stopping when the likelihood
    changes by less than stopping'''

    models = [HMM.randomStudent() for _ in xrange(nRuns)]

    return baumWelchRestarts(models, obs, stopping, parallel)
Example #19
def main():
    hmm = HMM(3, ('up', 'down', 'unchanged'),
              initial_probability=[0.5, 0.2, 0.3],
              transition_probability=[[0.6, 0.2, 0.2],
                                      [0.5, 0.3, 0.2],
                                      [0.4, 0.1, 0.5]],
              observation_probability=[[0.7, 0.1, 0.2],
                                       [0.1, 0.6, 0.3],
                                       [0.3, 0.3, 0.4]])

    observation = ("up", "up", "unchanged", "down", "unchanged", "down", "up")
    ob_length = len(observation)
    p, _ = hmm.forward(observation, ob_length)
    path = hmm.decode(observation, ob_length)
    print("P{} = {:.13f}".format(tuple(observation), p))
    print("Observation sequence =", tuple(i+1 for i in path))
Example #20
 def test_accumulative(self):
     hmm = HMM.from_file(self._model_filename)
     for i in xrange(self._num_hidden):
         self.assertAlmostEqual(1.0, hmm._accumulative_transition_matrix[-1, i], 
                          delta=1e-6)
         self.assertAlmostEqual(1.0, hmm._accumulative_observation_matrix[-1, i], 
                          delta=1e-6)
Example #21
def hmm_train(results):
	train_file = results.train
	freq_file = results.freq

	logger.debug(	'Started training HMM with options:'	+ "\n" +
					'training file:	' + str(train_file) 	+ "\n" +
					'frequency file:' + str(freq_file)		+ "\n")


	if not os.path.exists('model/hmm-model'):
		classifier = HMM()
		classifier.train(train_file,freq_file)
		logger.info("Done Training, model is written in model file")
		model = classifier.get_theta()
		write_obj(model, 'hmm-model')
	else:
		logger.info('model already exists, nothing to do!')
Example #22
 def test_slfit(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     learner = SLHMM(self._num_hidden, self._num_observ)
     learner.fit(sequences, verbose=True)
     for sequence in sequences:
         pprint("True probability: %f" % hmm.predict(sequence))
         pprint("Infered probability: %f" % learner.predict(sequence))
Example #23
    def __init__(self, hmm, rule_set=None):
        if isinstance(hmm, HMM):
            self.hmm = hmm
        else:
            self.hmm = HMM(hmm)
        segment_table = SegmentTable()
        self.segment_symbol_length = uniform_encoding.log2(len(segment_table) + 1)  # + 1 for the delimiter
        if rule_set:
            self.rule_set = rule_set
        else:
            self.rule_set = RuleSet(noise=False)

        noises = configurations.get("NOISE_RULE_SET", [])
        self.noise_rule_set = RuleSet.load_noise_rules_from_flat_list(noises)

        self._cached_hmm_transducer = None
        self._cached_rule_set_transducer = None
        self._cached_noise_rule_set_transducer = None
Example #24
def hmm_test(results):

	test_file = results.test
	freq_file = results.freq


	logger.debug(	'Started testing HMM with options:'		+ "\n" +
					'test file:	'		+ str(results.test) + "\n" +
					'frequency file:' 	+ str(freq_file)	+ "\n")

	logger.info("Loading model")
	model = read_obj('hmm-model')


	classifier = HMM()

	classifier.load_theta(model)
	classifier.test(test_file,freq_file)
Example #25
	def testViterbi(self):
		"""Test viterbi algorithm
	
		Come up with a good non-trivial way to test your training function	
		You can use the given icecream dataset or make up your own 
		
		Add your implementation
		"""
		seq1 = Instance(label = "he saw a dog".split(), data = ['NN', 'VB', 'D', 'NN'])
		seq2 = Instance(label = "the dog sees a saw".split(), data = ['D', 'NN', 'VB', 'D', 'NN'])
		seq3 = Instance(label = "the saw saw a dog".split(), data = ['D', 'NN', 'VB', 'D', 'NN'])
		seq4 = Instance(label = "he saw the saw".split(), data = ['NN', 'VB', 'D', 'NN'])
		instances = [seq1, seq2, seq3, seq4]
		hmm = HMM()
		hmm.train(instances)
		sawind = hmm.label_alphabet.get_index('saw')
		self.assertNotEqual(hmm.emission_matrix[sawind, 0], 0, 'WRONG')
		self.assertNotEqual(hmm.emission_matrix[sawind, 1], 0, 'WRONG')
Example #26
    def __init__(self, demos, n_state, gamma, n_offspring):
        '''
        Base RL class. Creates an HMM inside, explores and updates the HMM parameters, and generates
        motion for an episode. Stores rollout (episode) information.

        :param demos: List of 2D numpy arrays.
                      Each 2D numpy array is (n_keyframe, n_dim);
                      for the action model, n_dim = 7,
                      for the goal model, n_dim = 8.

        :param n_state: Integer, number of possible hidden states.
        :param gamma: Initial covariance matrix multiplier.
        :param n_offspring: Number of offspring (rollouts) in an episode.
        '''
        self.hmm = HMM(demos, n_state, gamma)

        self.n_offspring = n_offspring
        self.reset_rollout()
Example #27
	def testSupervisedTraining(self):
		"""Test parameter fitting
	
		Come up with a good non-trivial way to test your training function	
		You can use the given icecream dataset or make up your own 
		
		Add your implementation
		"""
		seq1 = Instance(label = ['odd', 'even', 'odd', 'even', 'odd'], data = [3, 2, 1, 4, 1])
		seq2 = Instance(label = ['even', 'even', 'odd', 'odd', 'even'], data = [2, 4, 1, 3, 2])
		seq3 = Instance(label = ['even', 'even', 'odd', 'odd', 'odd'], data = [1, 2, 3, 4, 3])
		seq4 = Instance(label = ['odd', 'odd', 'even', 'even', 'even'], data = [4, 3, 4, 1, 2])
		instances = [seq1, seq2, seq3, seq4]
		hmm = HMM()
		hmm.train(instances)
		mystery = Instance(data = [2, 1, 3, 4, 2, 2, 1, 3])
		labels = hmm.classify_instance(mystery)
		self.assertEqual(labels, ['even', 'odd', 'odd', 'even', 'even', 'even', 'odd', 'odd'], 'NOOO')
Example #28
 def test_accumulative(self):
     hmm = HMM.from_file(self._model_filename)
     for i in xrange(self._num_hidden):
         self.assertAlmostEqual(1.0,
                                hmm._accumulative_transition_matrix[-1, i],
                                delta=1e-6)
         self.assertAlmostEqual(1.0,
                                hmm._accumulative_observation_matrix[-1, i],
                                delta=1e-6)
Example #29
 def test_morpheme_boundary(self):
     configurations["MORPHEME_BOUNDARY_FLAG"] = True
     self.initialise_segment_table("plural_english_segment_table.txt")
     hmm = HMM({
         INITIAL_STATE: ['q1'],
         'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
         'q2': ([FINAL_STATE], ['z'])
     })
     grammar = Grammar(hmm, [])
Example #30
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='the path to testing data')
    args = parser.parse_args()

    test_data_path = args.path
    test_words = load_raw_data(test_data_path)

    hmm = HMM()
    hmm.load_model('./hmmmodel.txt')

    tag_result = []

    for sentence in test_words:
        re = hmm.decode(sentence)
        tag_result.append(re)

    save_result(test_words, tag_result)
Example #31
    def prepare_seqs_en(self, decoding="viterbi"):
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))

        h = HMM(self.n_states,
                self.n_obs,
                params=params_fixed,
                writeout=False,
                dirname=self.path)

        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)

        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        muc_seq = self.ner_corpus.read_sequence_list_conll(
            muc_test) if self.use_muc else None

        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined correctly, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        print(
            "Decoding word representations on train. This may take a while...")
        type_decoder(
            train_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(train_seq)
        print("Decoding word representations on dev.")
        type_decoder(
            dev_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(dev_seq)
        print("Decoding word representations on test.")
        type_decoder(
            test_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(test_seq)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            type_decoder(
                muc_seq, self.dataset,
                self.logger) if type_decoder is not None else decoder(muc_seq)

        return train_seq, dev_seq, test_seq, muc_seq
Example #32
def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of HMM class initialized with parameters(pi, A, B, obs_dict, state_dict) you calculated based on train_data
    """
    model = None
    
    ###################################################
    # Edit here
    N=len(train_data)
    S=len(tags)
    pi=np.zeros(S)
    A=np.zeros((S,S))
    state_dict={}
    obs=[]
    obs_dict={}
    o=0
    for t in range(S):
        state_dict[tags[t]]=t
    
    for line in train_data:
        pi[state_dict[line.tags[0]]]+=1
        for w in range(line.length-1):
            A[state_dict[line.tags[w]],state_dict[line.tags[w+1]]]+=1

    for line in train_data:
        for w in range(line.length):
            if line.words[w] not in obs_dict.keys():
                obs_dict[line.words[w]]=o
                o+=1  
    
    pi=pi/N
    A=(A.T/np.sum(A, axis=1)).T

               
    O=len(obs_dict)
    
    B=np.zeros((S,O))
    
    for line in train_data:
        for w in range(line.length):
            B[state_dict[line.tags[w]],obs_dict[line.words[w]]]+=1
    B=(B.T/np.sum(B,axis=1)).T
    a1=np.isnan(A)
    A[a1]=0
    b1=np.isnan(B)
    B[b1]=0
    model=HMM(pi,A,B,obs_dict,state_dict)
    
    return model
Example #33
 def setUp(self):
     """Initialize Eisner ice cream HMM (J & M, Figure 6.3)"""
     self.hmm = HMM()
     # These variables have many aliases. J & M call them π, A, B, Q, and V.
     # You don't need to use these names, but you do need to provide a way
     # of initializing them.
     self.hmm.train(
         [],
         initial_probabilities=[.8, .2],  # P(Hot, Cold)
         transition_probabilities=[
             [.7, .3],  # P(Hot|Hot, Cold)
             [.4, .6]
         ],  # P(Cold|Hot, Cold)
         emission_probabilities=[
             [.2, .4, .4],  # P(1, 2, 3|Hot)
             [.5, .4, .1]
         ],  # P(1, 2, 3|Cold)
         states=("Hot", "Cold"),
         vocabulary=(1, 2, 3))
Example #34
 def test_predict(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     for sequence in sequences:
         self.assertEqual(hmm.predict(sequence), hmm.predict(sequence),
                          "HMM.prediction Error")
     sequences = [[0, 1], [1, 2, 3, 0], [0, 0, 0, 1]]
     for sequence in sequences:
         self.assertEqual(hmm.predict(sequence), hmm.predict(sequence),
                          "HMM.prediction Error")
def speech_tagging(test_data, model, tags):
    """
	Inputs:
	- test_data: (1*num_sentence) a list of sentences, each sentence is an object of line class
	- model: an object of HMM class

	Returns:
	- tagging: (num_sentence*num_tagging) a 2D list of output tagging for each sentences on test_data
	"""
    tagging = []
    ###################################################
    # Edit here
    ###################################################
    N, M = model.B.shape
    new_model = model
    new_column = 1e-6 * np.ones([N, 1])
    new_feature_number = 0
    new_b = model.B
    new_obs_dict = model.obs_dict

    for sentence in test_data:
        for word in sentence.words:
            if word not in model.obs_dict:
                # add new features and set number of new features
                # add new column to b
                # sample : np.append(???,new_column,axis=1)
                new_b = np.append(new_b, new_column, axis=1)

                # add new features to obs_dict
                new_obs_dict[word] = len(new_b[0, :]) - 1

                # augment new features number
                new_feature_number += 1

    if new_feature_number != 0:
        new_model = HMM(model.pi, model.A, new_b, new_obs_dict,
                        model.state_dict)

    for sentence in test_data:
        tag_row = new_model.viterbi(sentence.words)
        tagging.append(tag_row)

    return tagging
Example #36
 def __get_best_pos_to_shoot(self):
     """Returns the position from which it is most likely to shoot the enemy."""
     # Gets the state of the Markov model at time t.
     transition_probabilities = self.__get_net_probs()
     emission_probabilities = self.__get_net_probs()
     hmm = HMM(transition_probabilities, emission_probabilities)
     emissions = [2, 1, 0]
     initial = self.__get_net_probs()
     return (self.net[self.viterbi(hmm, initial, emissions)[0]].id)
Example #37
def hmm_test():

    st_time = time.time()

    model_file = "hmm_model.json"

    # load data
    with open(model_file, 'r') as f:
        data = json.load(f)
    A = np.array(data['A'])
    B = np.array(data['B'])
    pi = np.array(data['pi'])
    # observation symbols
    obs_dict = data['observations']
    # state symbols
    states_symbols = dict()
    for idx, item in enumerate(data['states']):
        states_symbols[item] = idx
    Osequence = np.array(data['Osequence'])
    N = len(Osequence)
    model = HMM(pi, A, B, obs_dict, states_symbols)

    delta = model.forward(Osequence)
    m_delta = np.array([[3.5000e-01, 1.3600e-01, 0.0000e+00, 0.0000e+00, 1.1136e-05, 1.1136e-05, 0.0000e+00],
               [1.5000e-01, 3.2000e-02, 4.6400e-03, 2.7840e-04, 3.3408e-05, 1.1136e-05, 8.9088e-07]])

    print("Your forward function output:", delta)
    print("My forward function output:", m_delta)

    gamma = model.backward(Osequence)
    m_gamma = np.array([[1.6896e-06, 3.8400e-06, 6.4000e-05, 2.0000e-03, 1.4000e-02, 2.0000e-02, 1.0000e+00],
               [1.9968e-06, 1.1520e-05, 1.9200e-04, 3.2000e-03, 2.2000e-02, 6.0000e-02, 1.0000e+00]])

    print("Your backward function output:", gamma)
    print("My backward function output:", m_gamma)

    prob1 = model.sequence_prob(Osequence)
    m_prob1 = 8.908800000000002e-07

    print("Your sequence_prob function output:", prob1)
    print("My sequence_prob function output:", m_prob1)

    prob2 = model.posterior_prob(Osequence)
    m_prob2 = np.array([[0.6637931, 0.5862069, 0., 0., 0.175, 0.25, 0.],
               [0.3362069, 0.4137931, 1., 1., 0.825, 0.75, 1.]])

    print("Your posterior_prob function output:", prob2)
    print("My posterior_prob function output:", m_prob2)

    viterbi_path = model.viterbi(Osequence)
    m_viterbi_path = ['1', '1', '2', '2', '2', '2', '2']

    print('Your viterbi function output: ', viterbi_path)
    print('My viterbi function output: ', m_viterbi_path)

    en_time = time.time()
    print()
    print("hmm total time: ", en_time - st_time)
Example #38
    def test_hmm_connected_components(self):
        hmm = HMM({INITIAL_STATE: ['q1'],
                   'q1': (['q2', FINAL_STATE], ['dag', 'kot']),
                   'q2': (['q1', FINAL_STATE], ['z'])})
        log_hmm(hmm)
        component_states = hmm.get_connected_components(ignore_initial_and_final_states=False)
        self.write_to_dot_to_file(hmm, 'connected_hmm')
        print(component_states)

        assert component_states[0][0] == FINAL_STATE
        assert component_states[2][0] == INITIAL_STATE
        assert 'q1' in component_states[1]
        assert 'q2' in component_states[1]

        component_states = hmm.get_connected_components(ignore_initial_and_final_states=True)
        assert 'q1' in component_states[0]
        assert 'q2' in component_states[0]
        assert [INITIAL_STATE] not in component_states
        assert [FINAL_STATE] not in component_states
Example #39
 def test_predict(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     for sequence in sequences:
         self.assertEqual(hmm.predict(sequence), hmm.predict(sequence), 
                          "HMM.prediction Error")
     sequences = [[0,1], [1,2,3,0], [0,0,0,1]]
     for sequence in sequences:
         self.assertEqual(hmm.predict(sequence), hmm.predict(sequence), 
                          "HMM.prediction Error")
Example #40
def train_hmm(X,
              k,
              prev_hmm=None,
              window_size=5,
              num_clusters=5,
              num_states=4,
              max_iter=10):
    """Trains an HMM
    
    Args:
        X: the data
        k: the number of sequences to divide the data into
        prev_hmm: an HMM object to set the parameters
        window_size: the window size of exponential weighting
        num_clusters: the number of mixtures in each GMM
        num_states: the number of states in the HMM
        max_iter: the max number of iterations allowed in the HMM
    Returns:
        the trained HMM
    """
    temp_X = X.reshape((k, X.shape[0] // k, X.shape[1]))
    num_features = X.shape[1]

    if (prev_hmm is None):
        pi = random_splits(num_states, 1)
        A = np.array([random_splits(num_states, 1) for _ in range(num_states)])
        weights = np.random.rand(num_states, num_clusters) / num_clusters
        means = np.random.rand(num_states, num_clusters,
                               num_features) * .6 - .3
        cov = np.tile(np.eye(num_features), (num_states, num_clusters, 1, 1))
        for i in range(num_states):
            weights[i] = weights[i] / weights[i].sum()
    else:
        pi = prev_hmm.pi
        A = prev_hmm.A
        weights = prev_hmm.weights
        means = prev_hmm.means
        cov = prev_hmm.cov

    hmm = HMM(pi, A, weights, means, cov, num_states)
    hmm.em(X, window_size=window_size, max_iter=max_iter)

    return hmm
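The reshape at the top of `train_hmm` requires `X.shape[0]` to be divisible by `k`. A small hedged helper (the name is an assumption) that trims the remainder before calling it:

def trim_for_reshape(X, k):
    # drop trailing rows so the (k, len(X) // k, n_features) reshape is exact
    usable = (X.shape[0] // k) * k
    return X[:usable]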
Example #41
    def test_init(self):
        hmm = HMM(UDDataSet('data/en-ud-train.conllu'))

        self.assertEqual(17, hmm.num_state)
        self.assertEqual(17, hmm.bos_idx)
        self.assertEqual(18, hmm.eos_idx)

        for i in range(hmm.num_state):
            self.assertAlmostEqual(-12.228919653600784,
                                   hmm.emission_counter[(i, -1)])
Example #42
def train_N_state_hmms_from_data(filename, num_states, debug=False):
    """ reads all the data, then split it up into each category, and then
    builds a separate hmm for each category in data """
    dataset = DataSet(filename)
    category_seqs = split_into_categories(dataset)

    # Build a hmm for each category in data
    hmms = {}
    for cat, seqs in category_seqs.items():
        if debug:
            print "\n\nLearning %s-state HMM for category %s" % (
                num_states, cat)

        model = HMM(range(num_states), dataset.outputs)
        model.learn_from_observations(seqs, debug)
        hmms[cat] = model
        if debug:
            print "The learned model for %s:" % cat
            print model
    return (hmms, dataset)
Example #43
def model_training(train_data, tags):
    """
	Train HMM based on training data

	Inputs:
	- train_data: (1*num_sentence) a list of sentences, each sentence is an object of line class
	- tags: (1*num_tags) a list of POS tags

	Returns:
	- model: an object of HMM class initialized with parameters(pi, A, B, obs_dict, state_dict) you calculated based on train_data
	"""
    model = None
    ###################################################
    N = len(tags)
    A = np.ones((N, N)) / N
    pi = np.ones(N) / N

    state_dict, tag_dict, obs_dict = {}, {}, {}
    word_list = []

    for idx, tag in enumerate(tags):
        state_dict[tag] = idx

    for cur_line in train_data:
        pi[state_dict[cur_line.tags[0]]] += 1
        for idx in range(cur_line.length):
            tag = cur_line.tags[idx]
            word_list.append(cur_line.words[idx])
            if tag not in tag_dict:
                tag_dict[tag] = 1
            else:
                tag_dict[tag] += 1
            if idx < cur_line.length - 1:
                A[tags.index(cur_line.tags[idx]),
                  tags.index(cur_line.tags[idx + 1])] += 1

    word_list = list(set(word_list))
    for idx, word in enumerate(word_list):
        obs_dict[word] = idx

    total_tags = sum(tag_dict.values())
    for key in tag_dict.keys():
        tag_dict[key] /= total_tags

    B = np.zeros([N, len(word_list)])
    for line in train_data:
        for word, tag in zip(line.words, line.tags):
            B[state_dict[tag], obs_dict[word]] = tag_dict[tag]

    A /= np.sum(A, axis=1)[:, None]
    pi /= len(train_data)
    model = HMM(pi, A, B, obs_dict, state_dict)
    ###################################################
    return model
Example #44
 def test_morpheme_boundary(self):
     self.configurations["MORPHEME_BOUNDARY_FLAG"] = True
     self.initialise_segment_table("plural_english_segment_table.txt")
     hmm = HMM({
         INITIAL_STATE: ['q1'],
         'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
         'q2': ([FINAL_STATE], ['z'])
     })
     grammar = Grammar(hmm)
     self.assertCountEqual(['dog', 'kat', 'dogz', 'katz'],
                           grammar.get_all_outputs())
Example #45
    def test_morphology_only2(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        self.configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
        data = [u'tozata', u'tozaso', u'tozakt', u'tozzookata', u'tozzookaso', u'tozzookakt', u'tozzook', u'tozdodata', u'tozdodaso', u'tozdodakt', u'tozdod', u'tozgosata', u'tozgosaso', u'tozgosakt', u'tozgos', u'toz', u'dagata', u'dagaso', u'dagakt', u'dagzookata', u'dagzookaso', u'dagzookakt', u'dagzook', u'dagdodata', u'dagdodaso', u'dagdodakt', u'dagdod', u'daggosata', u'daggosaso', u'daggosakt', u'daggos', u'dag', u'gasata', u'gasaso', u'gasakt', u'gaszookata', u'gaszookaso', u'gaszookakt', u'gaszook', u'gasdodata', u'gasdodaso', u'gasdodakt', u'gasdod', u'gasgosata', u'gasgosaso', u'gasgosakt', u'gasgos', u'gas', u'kodata', u'kodaso', u'kodakt', u'kodzookata', u'kodzookaso', u'kodzookakt', u'kodzook', u'koddodata', u'koddodaso', u'koddodakt', u'koddod', u'kodgosata', u'kodgosaso', u'kodgosakt', u'kodgos', u'kod', u'katata', u'kataso', u'katakt', u'katzookata', u'katzookaso', u'katzookakt', u'katzook', u'katdodata', u'katdodaso', u'katdodakt', u'katdod', u'katgosata', u'katgosaso', u'katgosakt', u'katgos', u'kat', u'dotata', u'dotaso', u'dotakt', u'dotzookata', u'dotzookaso', u'dotzookakt', u'dotzook', u'dotdodata', u'dotdodaso', u'dotdodakt', u'dotdod', u'dotgosata', u'dotgosaso', u'dotgosakt', u'dotgos', u'dot']
        hmm = HMM({'q0': [u'q1'],
        'q1': ([u'q2', u'q3', u'qf'], ['toz', 'dag', 'kat', 'dot', 'kod', 'gas']),
        'q2': ([u'q3',u'qf'], ['zook', 'gos', 'dod']),
        'q3': ([u'qf'], ['aso', 'akt', 'ata'])})

        self.configurations.simulation_data = data
        hypothesis = Hypothesis(Grammar(hmm, []))
Example #46
def model_training(train_data, tags):
    #####lowercased######
    for data in train_data:
        for it in range(data.length):
            data.words[it] = data.words[it].lower()

    #####################
    S = len(tags)
    pi = np.zeros(S)
    A = np.zeros([S, S])
    B = []
    Bc = np.zeros([S, 1])
    Ac = np.zeros([S, S])
    obs_dict = {}
    states_symbols = {}
    for i in range(S):
        if not tags[i] in states_symbols.keys():
            states_symbols[tags[i]] = i
    numS = np.zeros(S)
    num1S = np.zeros(S)
    ####################################
    for data in train_data:
        firsttag = data.tags[0]
        num1S[states_symbols[firsttag]] += 1
        for i in range(data.length):
            word = data.words[i]
            tag = data.tags[i]
            if not word in obs_dict.keys():
                obs_dict[word] = len(obs_dict)
                Bc = np.append(Bc, np.zeros([S, 1]), axis=1)
            Bc[states_symbols[tag], obs_dict[word]] += 1

            numS[states_symbols[tag]] += 1
            if i != data.length - 1:
                Ac[states_symbols[tag], states_symbols[data.tags[i + 1]]] += 1
    B = np.zeros(np.shape(Bc))
    pi = normalize(num1S)
    for s in range(S):
        for sp in range(S):
            if numS[s] == 0:
                A[s, sp] = 0
            else:
                A[s, sp] = Ac[s, sp] / numS[s]
    for s in range(len(Bc)):
        for o in range(len(Bc[0])):
            if numS[s] == 0:
                B[s, o] = 0
            else:
                B[s, o] = Bc[s, o] / numS[s]
    ###################################

    model = HMM(pi, A, B, obs_dict, states_symbols)

    return model
Example #47
    def test_crossover_subgraph(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        hmm_1 = HMM({INITIAL_STATE: ['q1'],
                     'q1': (['q1', 'q2'], ['da']),
                     'q2': ([FINAL_STATE], ['s'])})

        hmm_2 = HMM({INITIAL_STATE: ['q1'],
                     'q1': (['q2'], ['ko']),
                     'q2': (['q3'], ['bo']),
                     'q3': (['q4'], ['go']),
                     'q4': ([FINAL_STATE], ['z'])})

        offspring_1, offspring_2 = HMM.crossover_subgraphs(hmm_1, hmm_2)

        self.write_to_dot_to_file(hmm_1, 'subgraph_parent_1')
        self.write_to_dot_to_file(hmm_2, 'subgraph_parent_2')
        self.write_to_dot_to_file(offspring_1, 'subgraph_offspring_1')
        self.write_to_dot_to_file(offspring_2, 'subgraph_offspring_2')
        offspring_1.get_transducer()
        offspring_2.get_transducer()
Example #48
    def predict_crimes(cls, crimes, limit=16):
        # Sort crimes in chronological order
        crimes.sort(key=lambda crime: crime.date)
        # Find the average duration between crimes
        deltas = [later.date - now.date for now, later in zip(crimes, crimes[1:])]
        delta = sum(deltas, timedelta()) / len(deltas)
        # Only allow a granularity of 1 hour
        delta = max(delta, timedelta(hours=1))
        # Create a timeline that marks each of the given events, and also
        # includes empty non-events at regular intervals (determined by `delta`)
        # between the first and last timestamp
        time, stop = crimes[0].date, crimes[-1].date
        timeline = {crime.date : crime for crime in crimes}

        while time <= stop:
            timeline.setdefault(time, None)
            time += delta

        # Convert the padded timeline back to an ordered time sequence of events
        events = [timeline[k] for k in sorted(timeline.keys())]
        # Create an HMM to predict the regions in which crimes will take place
        regions = [e.region if e is not None else None for e in events]
        regions = iter(HMM.from_events(regions))
        # Create a separate HMM for the crimes' descriptions
        descs = [e.description for e in events if e is not None]
        descs = iter(HMM.from_events(descs))

        future = []

        while len(future) < limit:
            time += delta
            region = next(regions)

            if region is not None:
                future.append({
                    'date': time.strftime(cls.DATE_FMT),
                    'region': region.to_dict(),
                    'description': next(descs)
                })

        return future
Example #49
    def prepare_seqs_en_dbg(self, decoding="viterbi"):
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))

        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False)
        h.dirname = self.path
        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)

        # train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        # test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        # muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test)

        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined correctly, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        #print("Decoding word representations on train.")
        #decoder(train_seq)
        print("Decoding word representations on dev.")
        decoder(dev_seq, self.dataset)

        #print("Decoding word representations on test.")
        #decoder(test_seq)
        #print("Decoding word representations on MUC.")
        #decoder(muc_seq)

        #return train_seq, dev_seq, test_seq, muc_seq
        return dev_seq
Example #50
 def init_target_hypothesis(self):
     target_tuple = self.simulation.target_tuple
     target_rule_set = RuleSet.load_form_flat_list(target_tuple[1])
     target_hypothesis = Hypothesis.create_hypothesis(
         HMM(deepcopy(target_tuple[0])), target_rule_set)
     target_energy = target_hypothesis.get_energy()
     self.logger.info('Target hypothesis:')
     log_hypothesis(target_hypothesis, self.logger.info)
     self.logger.info('Target energy: {}'.format(target_energy))
     self.logger.info('Target hypothesis energy signature: {}'.format(
         target_hypothesis.get_recent_energy_signature()))
     return target_hypothesis, target_energy
Example #51
def generate_samples():
    model = HMM.from_fixed_params()
    T = 10
    Z, X = model.sample(T)
    print(model)
    print(f"X: {X}, True Z: {Z}")

    smcmc = SMCMC(model, T)
    samples = smcmc.sample(N=100, x_sequence=X)
    print(f"Last L samples: \n{samples[:, -1]}")
    # print(np.unique(samples[:, 0], return_counts=True)[1] / len(samples))
    print(f"Mean sample: {samples.mean((0, 1)).round(1)}")
    def __init__(self,
                 string_input_words,
                 max_word_length_in_data,
                 initial_hmm=None,
                 alphabet_or_words="words"):
        if not isinstance(string_input_words, list):
            raise ValueError("should be list")
        self.feature_table = get_feature_table()
        self.max_word_length_in_data = max_word_length_in_data
        if not initial_hmm:
            feature_table = get_feature_table()
            if alphabet_or_words == "alphabet":
                alphabet = feature_table.get_alphabet()
                self.hmm = HMM.create_hmm_alphabet(alphabet)
            elif alphabet_or_words == "words":
                self.hmm = HMM.create_hmm_from_list(string_input_words)
        else:
            self.hmm = initial_hmm

        self.words = []
        self._update_words()
Example #53
    def __init__(self):
        # TODO: revisit the choice of smoothing on the test set
        self.minfreq = -3.14e+100
        # build a trie, used to scan the full-segmentation DAG
        self.trie = Trie()
        self.construct_trie()
        # build the bigram dictionary
        # self.construct_bigram_dic()
        # load the bigram dictionary
        with open('files/bigram_dic.json', 'r') as f:
            self.bigram_dic = json.load(f)

        # special-case handling
        self.SP = SpecialProcess()

        # create the HMM word-segmentation model
        self.hmm = HMM()

        # load common surnames and given names
        self.get_second_names()
        self.get_first_name()
Example #54
    def test_plural_english_grammar(self):
        self.initialise_segment_table("plural_english_segment_table.txt")
        rule_set = self.get_rule_set("plural_english_rule_set.json")

        hmm = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
            'q2': ([FINAL_STATE], ['z'])
        })

        grammar = Grammar(hmm, rule_set)
        grammar_transducer = grammar.get_transducer()
Example #55
 def __load_dictionary(self, dir_name):
     print('Loading dictionary from ' + dir_name + "...")
     for dir in os.walk(dir_name).next()[1]:
         self.dictionary[dir] = []
         for file in os.walk(dir_name + '/' + dir).next()[2]:
             if file.endswith('.wav'):
                 rate, data = wavfile.read(dir_name + '/' + dir + '/' + file)
                 if self.rate is not None and self.rate != rate:
                     print('Error: Dictionary sampling rate not constant.')
                 self.rate = rate
                 coefficients = self.__get_features(data)
                 self.dictionary[dir].append((coefficients, len(data)))
     print('Done.')
     print('Computing HMMs...')
     for key, value in self.dictionary.items():
         hmm = HMM(key)
         model_size = self.__get_model_size([len(x[0]) for x in value])
         hmm.train(model_size, [x[0] for x in value])
         self.hmms.append(hmm)
         print('Trained {0} with {1} states'.format(key, model_size))
     print('Done.')
Example #56
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)

    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10 # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}

    p_cluster_given_word = None # softmax
    p_word_given_cluster = None # joint probability formula

    p_transition_cluster = None # count
    p_initial_cluster = None # count

    # cluster-tag HMM
    p_cluster_given_tag = None # softmax
    p_transition_tag = None # count from tagged data
    p_initial_tag = None # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
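Most of the placeholders above are marked "count". A hedged sketch of those count-based pieces for the word-cluster layer, assuming `cluster_seqs` is a list of per-sentence cluster-id sequences (all names here are assumptions):

import numpy as np

def count_hmm_params(cluster_seqs, n_clusters):
    initial = np.zeros(n_clusters)
    trans = np.zeros((n_clusters, n_clusters))
    for seq in cluster_seqs:
        initial[seq[0]] += 1
        for a, b in zip(seq, seq[1:]):
            trans[a, b] += 1
    # normalize counts into distributions; smoothing for unseen clusters is
    # left out for brevity (a zero row would otherwise divide to NaN)
    initial /= initial.sum()
    trans /= trans.sum(axis=1, keepdims=True)
    return initial, trans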
Example #57
def main():
    # load data
    train_data = myutils.read_conll_file("data/da_ddt-ud-train.conllu")
    dev_data = myutils.read_conll_file("data/da_ddt-ud-dev.conllu")

    hmm = HMM()

    # fit model to training data
    hmm.fit(train_data)

    # get most likely tag predictions 
    most_likely_predictions = hmm.predict(dev_data, method='most_likely')
    viterbi_predictions = hmm.predict(dev_data, method='viterbi')

    # evaluate
    gold = [x[1] for x in dev_data]

    sent_level, word_level = myutils.evaluate(gold, most_likely_predictions)
    print('most likely scores:')
    print('sent level:  {:.4f}'.format(sent_level))
    print('word level:  {:.4f} \n'.format(word_level))

    sent_level, word_level = myutils.evaluate(gold, viterbi_predictions)
    print('viterbi scores:')
    print('sent level:  {:.4f}'.format(sent_level))
    print('word level:  {:.4f} \n'.format(word_level))
Example #58
    def test_crossover(self):
        from copy import deepcopy

        rule_1 = Rule.load([[{
            'cont': '+'
        }], [{
            'coronal': '-'
        }], [{
            'coronal': '-'
        }], [], True])
        rule_2 = Rule.load([[{
            'cons': '+',
            'low': '-'
        }], [{
            'voice': '-'
        }], [{
            'voice': '-'
        }], [], True])

        crossover_rule_1 = deepcopy(rule_1)
        crossover_rule_2 = deepcopy(rule_2)
        crossover_rule_1.left_context_feature_bundle_list = rule_2.left_context_feature_bundle_list
        crossover_rule_1.right_context_feature_bundle_list = rule_2.right_context_feature_bundle_list
        crossover_rule_1.change_feature_bundle_list = rule_2.change_feature_bundle_list

        crossover_rule_2.left_context_feature_bundle_list = rule_1.left_context_feature_bundle_list
        crossover_rule_2.right_context_feature_bundle_list = rule_1.right_context_feature_bundle_list
        crossover_rule_2.change_feature_bundle_list = rule_1.change_feature_bundle_list

        rule_set_1 = RuleSet([crossover_rule_1])
        rule_set_2 = RuleSet([crossover_rule_2])
        print(rule_set_1)
        print(rule_set_2)

        hmm = HMM({
            'q0': ['q1'],
            'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
            'q2': (['qf'], ['zo', 'go', 'do'])
        })
        grammar_1 = Grammar(hmm, rule_set_1)
        grammar_2 = Grammar(hmm, rule_set_2)

        data = ['kat', 'dot',     'dag', 'kod'] + \
               ['katso', 'dotso', 'dagzo', 'kodzo'] + \
               ['katko', 'dotko', 'daggo', 'kodgo'] + \
               ['katto', 'dotto', 'dagdo', 'koddo']

        hypothesis_1 = Hypothesis(grammar_1, data)
        hypothesis_2 = Hypothesis(grammar_2, data)

        print(hypothesis_1.get_energy())
        print(hypothesis_2.get_energy())
Example #59
def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of HMM class initialized with parameters(pi, A, B, obs_dict, state_dict) you calculated based on train_data
    """
    model = None
    obs_dict = {}
    state_dict = {}
    curr_index = 0
    for tag in tags:
        state_dict[tag] = curr_index
        curr_index += 1

    curr_index = 0
    for line in train_data:
        for word in line.words:
            if word not in obs_dict:
                obs_dict[word] = curr_index
                curr_index += 1

    S = len(state_dict.keys())
    L = len(obs_dict.keys())
    pi = np.zeros([S])
    A = np.zeros([S, S])
    B = np.zeros([S, L])

    for line in train_data:
        pi[state_dict[line.tags[0]]] += 1
    pi /= np.sum(pi)

    for line in train_data:
        for i in range(len(line.tags)-1):
            A[state_dict[line.tags[i]], state_dict[line.tags[i+1]]] += 1

    for i in range(S):
        A[i, :] /= np.sum(A[i, :])

    for line in train_data:
        for i in range(len(line.words)):
            B[state_dict[line.tags[i]], obs_dict[line.words[i]]] += 1

    for i in range(S):
        B[i, :] /= np.sum(B[i, :])

    model = HMM(pi, A, B, obs_dict, state_dict)
    return model
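A hedged sketch of how this trainer pairs with the `speech_tagging` helper defined earlier, assuming `train_data`, `test_data`, and `tags` follow the line-class convention described in the docstrings:

model = model_training(train_data, tags)
tagging = speech_tagging(test_data, model, tags)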