Python HMM.HMM示例，hmm.HMM.HMM Python示例

示例#1

0

显示文件

文件： test_gibbs.py 项目： harperjiang/TTIC31210

    def test_sample(self):
        """Sample tags for one dev sentence with Gibbs sampling and print them.

        An HMM is built from the training data set and wrapped in a ``Gibbs``
        sampler.  The sampler is run on the dev sentence at index 50; the
        returned indices are mapped through ``idx2pos`` of the training set
        and printed.  Nothing is asserted -- this is a smoke test.
        """
        training_set = UDDataSet("data/en-ud-train.conllu")
        # The dev set is constructed with the training set as second argument.
        dev_set = UDDataSet("data/en-ud-dev.conllu", training_set)

        sampler = Gibbs(HMM(training_set))

        # NOTE(review): the meaning of the literal 10 is defined by
        # Gibbs.sample -- presumably an iteration count; confirm.
        tag_indices = sampler.sample(dev_set.sentences()[50], 10)
        pos_labels = [training_set.idx2pos(tag_index) for tag_index in tag_indices]
        print(pos_labels)

示例#2

0

显示文件

文件： test_parser.py 项目： anna-hope/morphophonology_spe

 def test_parser2(self):
     """Build a grammar from a small HMM plus the plural rule set and request its NFA.

     The HMM description maps the initial state to ``q1``; ``q1`` and ``q2``
     are each given as a (next states, emissions) pair.  The NFA is only
     constructed here, not inspected.
     """
     hmm_description = {
         INITIAL_STATE: ['q1'],
         'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
         'q2': ([FINAL_STATE], ['z']),
     }
     plural_hmm = HMM(hmm_description)
     plural_grammar = Grammar(plural_hmm, self.plural_english_rule_set)
     # Smoke check: the call must succeed; the result is not examined.
     nfa = plural_grammar.get_nfa()

示例#3

0

显示文件

文件： final_devoicing.py 项目： taucompling/morphophonology_spe

 def test_from_simulation1(self):
     """Return ``self.get_energy`` for a fixed HMM and three rules.

     The HMM has a single emitting state ``q1`` (reached from ``q0``,
     leading to ``qf``).  The three ``Rule`` objects are passed in the order
     they are created, together with the label "from_simulation1".
     """
     q1_emissions = ['ba', 'baFi', 'babF', 'badF', 'bbFa', 'bbFidbF', 'bdFibF',
                     'bi', 'bibF', 'bidBF', 'dFabFF', 'dFaddF']
     hmm = HMM({'q0': ['q1'], 'q1': (['qf'], q1_emissions)})
     rules = [
         Rule([{'voiceless': '-', 'labial': '+'}], [], [], [{'voiceless': '-', 'labial': '-'}], False),
         Rule([{'voiceless': '-', 'high': '-'}], [{'labial': '-'}], [], [{'voiceless': '-'}], False),
         Rule([{}], [], [{'cons': '+'}], [{'voiceless': '+', 'bound': '-'}], True),
     ]
     return self.get_energy(hmm, rules, "from_simulation1")

示例#4

0

显示文件

 def test_np_chunk_pos(self):
     """Train an HMM on the ``POS`` split of the NP-chunk corpus; accuracy must exceed 0.55."""
     train_docs, test_docs = self.split_np_chunk_corpus(POS)
     tagger = HMM()
     tagger.train(train_docs)
     matrix = ConfusionMatrix(tagger, test_docs)
     # The four-way unpacking is kept: it requires print_out() to yield
     # exactly four items even though none of them is used here.
     _, _, _, _ = matrix.print_out()
     achieved = accuracy(tagger, test_docs)
     self.assertGreater(achieved, 0.55)

示例#5

0

显示文件

    def get_random_hypothesis_randomized(cls, simulation, data, initial_hmm=None, initial_rules=None):
        """Build a ``Hypothesis`` from an initial, target or random rule set and HMM.

        For each of the two components the choice is made in this order:

        1. the caller-supplied initial value, if it is truthy;
        2. otherwise, when the matching ``EVOLVE_*`` configuration entry is
           falsy, a deep copy of the corresponding ``simulation.target_tuple``
           element (index 1 for rules, index 0 for the HMM);
        3. otherwise a random one (``RuleSet.get_random_rule_set()`` /
           ``HMM.get_random_hmm(data)``).

        NOTE(review): an empty ``initial_rules`` / ``initial_hmm`` is treated
        exactly like ``None`` because the checks use truthiness -- confirm this
        is intended.
        """
        rules_given = bool(initial_rules)
        if not rules_given and configurations['EVOLVE_RULES']:
            rule_set = RuleSet.get_random_rule_set()
        else:
            # Caller-supplied rules are used as-is; target rules are deep-copied.
            flat_rules = initial_rules if rules_given else deepcopy(simulation.target_tuple[1])
            rule_set = RuleSet.load_from_flat_list(flat_rules)

        hmm_given = bool(initial_hmm)
        if not hmm_given and configurations['EVOLVE_HMM']:
            hmm = HMM.get_random_hmm(data)
        else:
            # Both the supplied and the target HMM description are deep-copied.
            hmm_source = initial_hmm if hmm_given else simulation.target_tuple[0]
            hmm = HMM(deepcopy(hmm_source))

        return Hypothesis(Grammar(hmm, rule_set))

示例#6

0

显示文件

 def test_np_chunk_baseline(self):
     """Train an HMM on the ``Document`` split of the NP-chunk corpus; accuracy must exceed 0.55."""
     train_docs, test_docs = self.split_np_chunk_corpus(Document)
     tagger = HMM()
     tagger.train(train_docs)
     confusion = compute_cm(tagger, test_docs)
     # print_out() is unpacked into four values; only the last one is checked.
     _, _, _f1, measured_accuracy = confusion.print_out()
     self.assertGreater(measured_accuracy, 0.55)

示例#7

0

显示文件

def hmm_test():
    """Run the HMM routines on the model in ``hmm_model.json`` and print them next to reference values.

    The JSON file supplies ``A``, ``B``, ``pi``, ``observations``, ``states``
    and ``Osequence``.  For each of ``forward``, ``backward``,
    ``sequence_prob``, ``posterior_prob`` and ``viterbi`` the computed result
    and a hard-coded reference value are printed; nothing is asserted.  The
    elapsed wall-clock time is printed at the end.
    """
    st_time = time.time()

    model_file = "hmm_model.json"

    # JSON is UTF-8 by specification, so do not rely on the platform's
    # default text encoding when reading it.
    with open(model_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    A = np.array(data['A'])
    B = np.array(data['B'])
    pi = np.array(data['pi'])
    # observation symbols
    obs_dict = data['observations']
    # state symbols: map each state name to its position in the list
    states_symbols = {item: idx for idx, item in enumerate(data['states'])}
    Osequence = np.array(data['Osequence'])
    model = HMM(pi, A, B, obs_dict, states_symbols)

    alpha = model.forward(Osequence)
    m_alpha = np.array([[3.5000e-01, 1.3600e-01, 0.0000e+00, 0.0000e+00, 1.1136e-05, 1.1136e-05, 0.0000e+00],
                        [1.5000e-01, 3.2000e-02, 4.6400e-03, 2.7840e-04, 3.3408e-05, 1.1136e-05, 8.9088e-07]])

    print("Your forward function output:", alpha)
    print("My forward function output:", m_alpha)

    beta = model.backward(Osequence)
    m_beta = np.array([[1.6896e-06, 3.8400e-06, 6.4000e-05, 2.0000e-03, 1.4000e-02, 2.0000e-02, 1.0000e+00],
                       [1.9968e-06, 1.1520e-05, 1.9200e-04, 3.2000e-03, 2.2000e-02, 6.0000e-02, 1.0000e+00]])

    print("Your backward function output:", beta)
    print("My backward function output:", m_beta)

    prob1 = model.sequence_prob(Osequence)
    m_prob1 = 8.908800000000002e-07

    print("Your sequence_prob function output:", prob1)
    print("My sequence_prob function output:", m_prob1)

    prob2 = model.posterior_prob(Osequence)
    m_prob2 = np.array([[0.6637931, 0.5862069, 0., 0., 0.175, 0.25, 0.],
                        [0.3362069, 0.4137931, 1., 1., 0.825, 0.75, 1.]])

    print("Your posterior_prob function output:", prob2)
    print("My posterior_prob function output:", m_prob2)

    viterbi_path = model.viterbi(Osequence)
    m_viterbi_path = ['1', '1', '2', '2', '2', '2', '2']

    print('Your viterbi function output: ', viterbi_path)
    print('My viterbi function output: ', m_viterbi_path)

    en_time = time.time()
    print()
    print("hmm total time: ", en_time - st_time)

示例#8

0

显示文件

def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of line class
      (this function reads its ``tags``, ``words`` and ``length`` attributes)
    - tags: (1*num_tags) a list of POS tags; the position of a tag in this list is its state index

    Returns:
    - model: an object of HMM class initialized with parameters(pi, A, B, obs_dict, state_dict)
      computed from train_data:
        * pi[s]   = (# sentences starting with tag s) / (# sentences)
        * A[s,s'] = (# times s is followed by s') / (# occurrences of s)
        * B[s,w]  = (# times s emits w, or 1e-06 if never) / (# occurrences of s)
      Rows of A and B belonging to a tag that never occurs in train_data are
      all zeros (previously they were NaN/inf from a division by zero), and pi
      is all zeros when train_data is empty.
    """
    # Pseudo-count used for (tag, word) pairs that were never observed together.
    default = 1e-06
    words = []
    word_index_map = {}

    S = len(tags)
    L = len(train_data)
    state_dict = {tag: i for i, tag in enumerate(tags)}
    first_tag_counts = dict.fromkeys(tags, 0)
    tag_counts = dict.fromkeys(tags, 0)
    tag_tag_counts = {tag: dict.fromkeys(tags, 0) for tag in tags}
    tag_word_counts = {tag: {} for tag in tags}

    for line in train_data:
        first_tag_counts[line.tags[0]] += 1

        for index in range(line.length):
            tag, word = line.tags[index], line.words[index]

            # Observation indices are assigned in first-seen order.
            if word not in word_index_map:
                word_index_map[word] = len(words)
                words.append(word)

            tag_counts[tag] += 1
            tag_word_counts[tag][word] = tag_word_counts[tag].get(word, 0) + 1

            if index < line.length - 1:
                tag_tag_counts[tag][line.tags[index + 1]] += 1

    first_counts = np.array([first_tag_counts[t] for t in tags], dtype=float)
    pi = first_counts / L if L else first_counts

    totals = np.array([tag_counts[t] for t in tags], dtype=float).reshape(S, 1)
    seen = totals > 0  # tags with at least one occurrence; others keep zero rows

    # Every (tag, tag) pair is pre-seeded with 0, so unseen transitions stay
    # exactly 0: the smoothing default has never applied to A.
    transition_counts = np.array(
        [[tag_tag_counts[s][ss] for ss in tags] for s in tags],
        dtype=float).reshape(S, S)
    emission_counts = np.array(
        [[tag_word_counts[s].get(w, default) for w in words] for s in tags],
        dtype=float).reshape(S, len(words))

    A = np.divide(transition_counts, totals,
                  out=np.zeros_like(transition_counts), where=seen)
    B = np.divide(emission_counts, totals,
                  out=np.zeros_like(emission_counts), where=seen)

    return HMM(pi, A, B, word_index_map, state_dict)

示例#9

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

    def test_morpheme_boundary(self):
        """Write the transducer of a small plural HMM to a dot file with the morpheme-boundary flag on.

        Sets ``configurations["MORPHEME_BOUNDARY_FLAG"]`` to True (the setting
        is not restored afterwards), loads the plural-English segment table,
        builds the HMM and dumps the result of ``get_transducer()``.
        """
        configurations["MORPHEME_BOUNDARY_FLAG"] = True
        self.initialise_segment_table("plural_english_segment_table.txt")
        hmm_description = {
            INITIAL_STATE: ['q1'],
            'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
            'q2': ([FINAL_STATE], ['z']),
        }
        transducer = HMM(hmm_description).get_transducer()
        self.write_to_dot_to_file(transducer, "morpheme_boundary_hmm_transducer")

示例#10

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

    def test_get_log_lines(self):
        """Print a three-state HMM and then each line returned by ``get_log_lines()``."""
        hmm_description = {
            'q0': ['q1'],
            'q1': (['q2', 'q3', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
            'q2': (['q3', 'qf'], ['zo', 'go', 'do']),
            'q3': (['qf'], ['as', 'ak', 'at']),
        }
        model = HMM(hmm_description)
        print(model)

        # Output only; nothing is asserted about the log lines.
        for log_line in model.get_log_lines():
            print(log_line)

示例#11

0

显示文件

文件： minitest_hmm.py 项目： hprovenza/hidden-markov-model

 def test_np_chunk_baseline(self):
     """Train an HMM on the ``Document`` split of the NP-chunk corpus and check its scores.

     Accuracy must exceed 0.55 and every entry of the F1 result must be at
     least 0.90.
     """
     train_docs, test_docs = self.split_np_chunk_corpus(Document)
     tagger = HMM()
     tagger.train(train_docs)
     confusion = compute_cm(tagger, test_docs)
     _, _, f1_scores, measured_accuracy = confusion.print_out()
     self.assertGreater(measured_accuracy, 0.55)
     every_f1_ok = all(score >= .90 for score in f1_scores)
     self.assertTrue(every_f1_ok, 'not all greater than 90.0%')

示例#12

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

 def test_change_segment_in_emission(self):
     """Apply ``change_segment_in_emission`` to a small plural HMM and print its emissions.

     The HMM is dumped to a dot file before the change; the segment symbols
     passed in come from ``SegmentTable().get_segments_symbols()``.  Nothing
     is asserted.
     """
     self.initialise_segment_table("plural_english_segment_table.txt")
     hmm_description = {
         INITIAL_STATE: ['q1'],
         'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
         'q2': ([FINAL_STATE], ['z']),
     }
     model = HMM(hmm_description)
     self.write_to_dot_to_file(model, "hmm")
     segment_symbols = SegmentTable().get_segments_symbols()
     model.change_segment_in_emission(segment_symbols)
     print(model.get_all_emissions())

示例#13

0

显示文件

文件： main.py 项目： pmarkovic/comp_ling

def main(args):
    """Construct an HMM from the parsed options in ``args`` and run ``test_model`` on it.

    The six constructor arguments are passed positionally in the order listed
    below; ``test_model`` then receives ``args.data_path`` and
    ``args.save_test``.
    """
    constructor_args = (
        args.full_emissions,
        args.add_one,
        args.end_token,
        args.config_path,
        args.data_path,
        args.save_model,
    )
    model = HMM(*constructor_args)

    model.test_model(args.data_path, args.save_test)

示例#14

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

    def test_crossover_connected_components(self):
        """Cross two HMMs with ``HMM.crossover`` and dump parents and offspring to dot files.

        After the dumps, ``get_transducer()`` is called on both offspring;
        the results are discarded, so this only checks that the calls succeed.
        """
        self.initialise_segment_table("plural_english_segment_table.txt")
        parent_1 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2'], ['dogo', 'koko']),
            'q2': (['q1', FINAL_STATE], ['z']),
        })
        parent_2 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2'], ['dag', 'kat']),
            'q2': (['q3', FINAL_STATE], ['k']),
            'q3': (['q1'], ['z']),
        })

        child_1, child_2 = HMM.crossover(parent_1, parent_2)

        dumps = (
            (parent_1, 'component_parent_1'),
            (parent_2, 'component_parent_2'),
            (child_1, 'component_offspring_1'),
            (child_2, 'component_offspring_2'),
        )
        for machine, file_label in dumps:
            self.write_to_dot_to_file(machine, file_label)

        for child in (child_1, child_2):
            child.get_transducer()

示例#15

0

显示文件

def fit_sin(length=100, n_hidden_states=10, max_iter=10):
    """Fit an HMM to the series returned by ``sin(length)`` and plot it next to a generated one.

    The ``y`` series is reshaped to a single row before fitting.  The model
    then generates ``length`` values, and both the original and the generated
    series are shown against ``x``.

    NOTE(review): the second HMM constructor argument is the literal 101; its
    meaning is defined by the HMM class -- confirm.
    """
    x, y = sin(length)

    model = HMM(n_hidden_states, 101)
    single_row = np.reshape(y, (1, -1))
    model.fit(single_row, max_iter=max_iter)
    generated = model.generate(length)

    for series in (y, generated):
        show_data(x, series)

示例#16

0

显示文件

 def test_morpheme_boundary(self):
     """Build a grammar with an empty rule list from a small plural HMM, morpheme-boundary flag on.

     Sets ``configurations["MORPHEME_BOUNDARY_FLAG"]`` to True (not restored
     afterwards) and loads the plural-English segment table first.  The
     grammar is only constructed, not inspected.
     """
     configurations["MORPHEME_BOUNDARY_FLAG"] = True
     self.initialise_segment_table("plural_english_segment_table.txt")
     hmm_description = {
         INITIAL_STATE: ['q1'],
         'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
         'q2': ([FINAL_STATE], ['z']),
     }
     plural_hmm = HMM(hmm_description)
     grammar = Grammar(plural_hmm, [])

示例#17

0

显示文件

文件： speech_tagging.py 项目： agjay96/HMM

def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of line class
      (this function reads its ``tags``, ``words`` and ``length`` attributes)
    - tags: (1*num_tags) a list of POS tags; the position of a tag in this list is its state index

    Returns:
    - model: an object of HMM class initialized with parameters(pi, A, B, obs_dict, state_dict)
      computed from train_data:
        * pi[s]   = (# sentences starting with tag s) / (# sentences)
        * A[s,s'] = (# times s is followed by s') / (# transitions out of s)
        * B[s,w]  = (# times s emits w) / (# emissions of s)
      Rows of A and B with no counts are left as zeros, and pi is all zeros
      when train_data is empty.
    """

    def normalize_rows(counts):
        # Divide each row by its sum; rows that sum to zero stay zero instead
        # of going through a 0/0 division and a NaN clean-up afterwards.
        row_totals = counts.sum(axis=1, keepdims=True)
        return np.divide(counts, row_totals,
                         out=np.zeros_like(counts), where=row_totals > 0)

    N = len(train_data)
    S = len(tags)
    state_dict = {tag: idx for idx, tag in enumerate(tags)}
    obs_dict = {}
    pi = np.zeros(S)
    A = np.zeros((S, S))

    # First pass: initial-tag counts, transition counts and the vocabulary.
    # Observation indices are assigned in first-seen order.
    for line in train_data:
        pi[state_dict[line.tags[0]]] += 1
        for w in range(line.length):
            if w < line.length - 1:
                A[state_dict[line.tags[w]], state_dict[line.tags[w + 1]]] += 1
            obs_dict.setdefault(line.words[w], len(obs_dict))

    # Second pass: emission counts (the vocabulary size is known only now).
    B = np.zeros((S, len(obs_dict)))
    for line in train_data:
        for w in range(line.length):
            B[state_dict[line.tags[w]], obs_dict[line.words[w]]] += 1

    if N:
        pi = pi / N
    A = normalize_rows(A)
    B = normalize_rows(B)

    model = HMM(pi, A, B, obs_dict, state_dict)

    return model

示例#18

0

显示文件

文件： tests.py 项目： koenboeckx/prob_inference

    def test_simple_hmm(self):
        """Check filtered and smoothed posteriors at index 2 of a three-area HMM.

        Model taken from https://people.csail.mit.edu/rameshvs/content/hmms.pdf

        With observations hot, cold, hot the posterior at index 2 must be
        exactly [0.0, 0.0, 1.0] in the order the posterior's values are
        returned.  The whole value list is compared, so a posterior with
        missing or extra entries fails instead of being silently truncated
        by ``zip``.
        """
        hidden_values = ['Area 1', 'Area 2', 'Area 3']

        prior = {'Area 1': 1 / 3, 'Area 2': 1 / 3, 'Area 3': 1 / 3}

        transition_table = {
            'Area 1': {'Area 1': 0.25, 'Area 2': 0.75, 'Area 3': 0.},
            'Area 2': {'Area 1': 0., 'Area 2': 0.25, 'Area 3': 0.75},
            'Area 3': {'Area 1': 0., 'Area 2': 0., 'Area 3': 1.},
        }

        obs_values = ['hot', 'cold']
        obs_table = {
            'Area 1': {'hot': 1.0, 'cold': 0.0},
            'Area 2': {'hot': 0.0, 'cold': 1.0},
            'Area 3': {'hot': 1.0, 'cold': 0.0},
        }

        hmm = HMM(hidden_values,
                  prior,
                  transition_table,
                  obs_values,
                  obs_table,
                  n_steps=3)
        hmm.set_observed_values(['hot', 'cold', 'hot'])

        expected = [0.0, 0.0, 1.0]

        hmm.compute_alpha()
        filtered = hmm.filtered_posterior(index=2)
        self.assertEqual(list(filtered.values()), expected)

        hmm.compute_beta()
        smoothed = hmm.smoothed_posterior(index=2)
        self.assertEqual(list(smoothed.values()), expected)

示例#19

0

显示文件

    def prepare_seqs_en(self, decoding="viterbi"):
        """Read the English CoNLL sequence lists and run the chosen HMM decoder over each.

        The HMM is built from four ``.npy`` parameter files under
        ``self.path``.  ``decoding`` selects the decoder method on that HMM:
        "viterbi", "max_emission", "posterior", "posterior_cont" or
        "posterior_cont_type".  Any other value prints a notice and falls back
        to Viterbi.  The "posterior_cont_type" decoder is additionally given
        ``self.dataset`` and ``self.logger``; the others receive only the
        sequence list.

        Returns ``(train_seq, dev_seq, test_seq, muc_seq)``; ``muc_seq`` is
        ``None`` unless ``self.use_muc`` is truthy.  Also sets
        ``self.ner_corpus``.
        """
        param_files = ("{}/ip.npy", "{}/tp.npy", "{}/fp.npy", "{}/ep.npy")
        params_fixed = tuple(
            [np.load(template.format(self.path)) for template in param_files])

        h = HMM(self.n_states,
                self.n_obs,
                params=params_fixed,
                writeout=False,
                dirname=self.path)

        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)

        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        if self.use_muc:
            muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test)
        else:
            muc_seq = None

        # (decoding label, HMM method name, whether dataset/logger are passed)
        known_decoders = (
            ("viterbi", "viterbi_decode_corpus", False),
            ("max_emission", "max_emission_decode_corpus", False),
            ("posterior", "posterior_decode_corpus", False),
            ("posterior_cont", "posterior_cont_decode_corpus", False),
            ("posterior_cont_type", "posterior_cont_type_decode_corpus", True),
        )
        decoder = None
        type_decoder = None
        for label, method_name, wants_dataset in known_decoders:
            if decoding == label:
                # Only the selected method is looked up on the HMM.
                selected = getattr(h, method_name)
                if wants_dataset:
                    type_decoder = selected
                else:
                    decoder = selected
                break
        else:
            print("Decoder not defined correctly, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        def run_decoder(seq_list):
            # The type decoder, when selected, takes precedence.
            if type_decoder is not None:
                type_decoder(seq_list, self.dataset, self.logger)
            else:
                decoder(seq_list)

        print(
            "Decoding word representations on train. This may take a while...")
        run_decoder(train_seq)
        print("Decoding word representations on dev.")
        run_decoder(dev_seq)
        print("Decoding word representations on test.")
        run_decoder(test_seq)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            run_decoder(muc_seq)

        return train_seq, dev_seq, test_seq, muc_seq

示例#20

0

显示文件

 def __get_best_pos_to_shoot(self):
     """Return the ``id`` of the net entry selected by the first element of the Viterbi result.

     Transition, emission and initial probabilities are each obtained from a
     separate ``__get_net_probs()`` call, and the emission sequence is fixed
     to ``[2, 1, 0]``.  The original description states that the returned
     position is the one where a shot is more likely to hit the enemy.
     """
     # Original note: gets the state of the markov model at time t.
     transitions = self.__get_net_probs()
     emission_probs = self.__get_net_probs()
     model = HMM(transitions, emission_probs)
     observed = [2, 1, 0]
     start_probs = self.__get_net_probs()
     net = self.net
     decoded = self.viterbi(model, start_probs, observed)
     return (net[decoded[0]].id)

示例#21

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

    def test_advance_emission(self):
        """Call ``advance_emission()`` on a one-state looping HMM and dump it before and after.

        The single emitting state ``q1`` can loop to itself or go to ``qf``.
        The log lines are printed after the operation; nothing is asserted.
        """
        q1_emissions = (['dag', 'kat', 'dot', 'kod', 'gas', 'toz']
                        + ['zo', 'go', 'do']
                        + ['at'])
        model = HMM({'q0': ['q1'], 'q1': (['q1', 'qf'], q1_emissions)})

        self.write_to_dot_to_file(model, "pre_advance_emission_hmm")
        model.advance_emission()
        for log_line in model.get_log_lines():
            print(log_line)
        self.write_to_dot_to_file(model, "advance_emission_hmm")

示例#22

0

显示文件

    def test_init(self):
        """Check state count, BOS/EOS indices and the ``(state, -1)`` emission entries after construction."""
        model = HMM(UDDataSet('data/en-ud-train.conllu'))

        expected_attributes = (
            (17, 'num_state'),
            (17, 'bos_idx'),
            (18, 'eos_idx'),
        )
        for expected, attribute in expected_attributes:
            self.assertEqual(expected, getattr(model, attribute))

        # Every state must carry the same value under the key (state, -1).
        expected_entry = -12.228919653600784
        for state in range(model.num_state):
            self.assertAlmostEqual(expected_entry,
                                   model.emission_counter[(state, -1)])

示例#23

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

    def test_crossover_subgraph(self):
        """Cross two HMMs with ``HMM.crossover_subgraphs`` and dump parents and offspring to dot files.

        After the dumps, ``get_transducer()`` is called on both offspring;
        the results are discarded, so this only checks that the calls succeed.
        """
        self.initialise_segment_table("plural_english_segment_table.txt")
        parent_1 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q1', 'q2'], ['da']),
            'q2': ([FINAL_STATE], ['s']),
        })

        parent_2 = HMM({
            INITIAL_STATE: ['q1'],
            'q1': (['q2'], ['ko']),
            'q2': (['q3'], ['bo']),
            'q3': (['q4'], ['go']),
            'q4': ([FINAL_STATE], ['z']),
        })

        child_1, child_2 = HMM.crossover_subgraphs(parent_1, parent_2)

        dumps = (
            (parent_1, 'subgraph_parent_1'),
            (parent_2, 'subgraph_parent_2'),
            (child_1, 'subgraph_offspring_1'),
            (child_2, 'subgraph_offspring_2'),
        )
        for machine, file_label in dumps:
            self.write_to_dot_to_file(machine, file_label)

        for child in (child_1, child_2):
            child.get_transducer()

示例#24

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

    def test_split_then_merge_state(self):
        """Call ``split_state()`` then ``merge_states()`` on a one-state HMM and dump it before and after.

        The log lines are printed after both operations; nothing is asserted.
        """
        hmm_description = {
            'q0': ['q1'],
            'q1': (['qf'], ['koko', 'gogo']),
        }
        model = HMM(hmm_description)

        self.write_to_dot_to_file(model, "split_states_before")
        model.split_state()
        model.merge_states()
        for log_line in model.get_log_lines():
            print(log_line)
        self.write_to_dot_to_file(model, "split_states_after")

示例#25

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

    def test_merge_emissions(self):
        """Call ``merge_emissions()`` on a two-state HMM and dump it before and after.

        The log lines are printed after the operation; nothing is asserted.
        """
        hmm_description = {
            'q0': ['q1', 'q5'],
            'q1': (['qf'], ['koko']),
            'q5': (['qf'], ['dag', 'kat']),
        }
        model = HMM(hmm_description)

        self.write_to_dot_to_file(model, "merge_states_before")
        model.merge_emissions()
        for log_line in model.get_log_lines():
            print(log_line)
        self.write_to_dot_to_file(model, "merge_states_after")

示例#26

0

显示文件

 def test_morpheme_boundary(self):
     """With the morpheme-boundary flag on, a rule-less grammar must output dog, kat, dogz and katz.

     The comparison ignores order (``assertCountEqual``).
     """
     self.configurations["MORPHEME_BOUNDARY_FLAG"] = True
     self.initialise_segment_table("plural_english_segment_table.txt")
     hmm_description = {
         INITIAL_STATE: ['q1'],
         'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
         'q2': ([FINAL_STATE], ['z']),
     }
     grammar = Grammar(HMM(hmm_description))
     expected_outputs = ['dog', 'kat', 'dogz', 'katz']
     self.assertCountEqual(expected_outputs, grammar.get_all_outputs())

示例#27

0

显示文件

文件： test_hmm.py 项目： taucompling/morphophonology_spe

 def test_remove_states(self):
     """Remove state ``q1`` from a five-state HMM, then print, dump and log the result.

     Nothing is asserted; the remaining inner states are printed, the HMM is
     written to the dot file 'after_remove' and passed to ``log_hmm``.
     """
     hmm_description = {
         INITIAL_STATE: ['q1'],
         'q1': (['q4', FINAL_STATE], ['a']),
         # 'q8' is deliberately kept twice in this transition list, as given.
         'q4': (['q8', 'q8', FINAL_STATE], ['d']),
         'q7': (['q8'], ['d']),
         'q8': (['q2', FINAL_STATE], ['g']),
         'q2': (['q1', FINAL_STATE], ['z']),
     }
     model = HMM(hmm_description)
     states_to_drop = ['q1']
     model.remove_states(states_to_drop)
     print(model.inner_states)
     self.write_to_dot_to_file(model, 'after_remove')
     log_hmm(model)

示例#28

0

显示文件

文件： tagger.py 项目： 773780238/hidden-markov-model

def model_training(train_data, tags):
    """
    Train an HMM from tagged sentences by relative-frequency counting.

    Inputs:
    - train_data: list of sentences; this function reads each sentence's
      ``tags``, ``words`` and ``length``.  NOTE: the words are lowercased IN
      PLACE, so the caller's data is modified.
    - tags: list of POS tags; the first position of a tag in this list is its
      state index

    Returns:
    - model: HMM(pi, A, B, obs_dict, states_symbols) where
        * pi      = normalize(# sentences starting with each tag)
        * A[s,s'] = (# times s is followed by s') / (# occurrences of s)
        * B[s,w]  = (# times s emits w) / (# occurrences of s)
      Rows for tags that never occur are all zeros.  B has exactly one column
      per distinct (lowercased) word.
    """
    # Lowercase every word in place and build the vocabulary in the same pass.
    # Observation indices are assigned in first-seen order.
    obs_dict = {}
    for data in train_data:
        for it in range(data.length):
            lowered = data.words[it].lower()
            data.words[it] = lowered
            obs_dict.setdefault(lowered, len(obs_dict))

    S = len(tags)
    states_symbols = {}
    for i, tag in enumerate(tags):
        # First occurrence wins if a tag is listed more than once.
        states_symbols.setdefault(tag, i)

    # The vocabulary size is known up front, so the count matrix is allocated
    # once rather than grown by one column per new word.
    first_tag_counts = np.zeros(S)
    tag_counts = np.zeros(S)
    transition_counts = np.zeros([S, S])
    emission_counts = np.zeros([S, len(obs_dict)])

    for data in train_data:
        first_tag_counts[states_symbols[data.tags[0]]] += 1
        last = data.length - 1
        for i in range(data.length):
            state = states_symbols[data.tags[i]]
            emission_counts[state, obs_dict[data.words[i]]] += 1
            tag_counts[state] += 1
            if i != last:
                transition_counts[state, states_symbols[data.tags[i + 1]]] += 1

    pi = normalize(first_tag_counts)

    # Divide each row by the tag's occurrence count; tags that never occur
    # keep an all-zero row.
    denominators = tag_counts[:, None]
    seen = denominators > 0
    A = np.divide(transition_counts, denominators,
                  out=np.zeros_like(transition_counts), where=seen)
    B = np.divide(emission_counts, denominators,
                  out=np.zeros_like(emission_counts), where=seen)

    # Built exactly once, after all parameters are complete.
    model = HMM(pi, A, B, obs_dict, states_symbols)

    return model

示例#29

0

显示文件

文件： test_hypothesis.py 项目： taucompling/morphophonology_spe

    def test_morphology_only2(self):
        """Construct a ``Hypothesis`` from a rule-less grammar over a three-layer morphology HMM.

        Sets ``DATA_ENCODING_LENGTH_MULTIPLIER`` to 25 and installs the word
        list below as ``configurations.simulation_data`` before the hypothesis
        is built.  Nothing is asserted about the hypothesis.
        """
        self.initialise_segment_table("plural_english_segment_table.txt")
        self.configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
        # One stem per group of 16 words, in the original order.
        simulation_words = [
            u'tozata', u'tozaso', u'tozakt', u'tozzookata', u'tozzookaso', u'tozzookakt', u'tozzook', u'tozdodata',
            u'tozdodaso', u'tozdodakt', u'tozdod', u'tozgosata', u'tozgosaso', u'tozgosakt', u'tozgos', u'toz',
            u'dagata', u'dagaso', u'dagakt', u'dagzookata', u'dagzookaso', u'dagzookakt', u'dagzook', u'dagdodata',
            u'dagdodaso', u'dagdodakt', u'dagdod', u'daggosata', u'daggosaso', u'daggosakt', u'daggos', u'dag',
            u'gasata', u'gasaso', u'gasakt', u'gaszookata', u'gaszookaso', u'gaszookakt', u'gaszook', u'gasdodata',
            u'gasdodaso', u'gasdodakt', u'gasdod', u'gasgosata', u'gasgosaso', u'gasgosakt', u'gasgos', u'gas',
            u'kodata', u'kodaso', u'kodakt', u'kodzookata', u'kodzookaso', u'kodzookakt', u'kodzook', u'koddodata',
            u'koddodaso', u'koddodakt', u'koddod', u'kodgosata', u'kodgosaso', u'kodgosakt', u'kodgos', u'kod',
            u'katata', u'kataso', u'katakt', u'katzookata', u'katzookaso', u'katzookakt', u'katzook', u'katdodata',
            u'katdodaso', u'katdodakt', u'katdod', u'katgosata', u'katgosaso', u'katgosakt', u'katgos', u'kat',
            u'dotata', u'dotaso', u'dotakt', u'dotzookata', u'dotzookaso', u'dotzookakt', u'dotzook', u'dotdodata',
            u'dotdodaso', u'dotdodakt', u'dotdod', u'dotgosata', u'dotgosaso', u'dotgosakt', u'dotgos', u'dot',
        ]
        hmm_description = {
            'q0': [u'q1'],
            'q1': ([u'q2', u'q3', u'qf'], ['toz', 'dag', 'kat', 'dot', 'kod', 'gas']),
            'q2': ([u'q3', u'qf'], ['zook', 'gos', 'dod']),
            'q3': ([u'qf'], ['aso', 'akt', 'ata']),
        }
        morphology_hmm = HMM(hmm_description)

        self.configurations.simulation_data = simulation_words
        rule_free_grammar = Grammar(morphology_hmm, [])
        hypothesis = Hypothesis(rule_free_grammar)

示例#30

0

显示文件

文件： tagger.py 项目： superyideng/cs567_machine_learning_PAs

def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of line class
      (this function reads its ``tags``, ``words`` and ``length`` attributes)
    - tags: (1*num_tags) a list of POS tags; the position of a tag in this list is its state index

    Returns:
    - model: an object of HMM class initialized with parameters(pi, A, B, obs_dict, state_dict)
      computed from train_data:
        * pi: initial-tag counts plus a pseudo-count of 1/num_tags per tag,
          normalised to sum to 1
        * A:  transition counts plus a pseudo-count of 1/num_tags per pair,
          each row normalised to sum to 1
        * B[s,w] = (# times tag s emits word w) / (# occurrences of tag s);
          pairs never observed stay 0
      Observation indices in obs_dict follow first-seen order, so they are
      reproducible across runs.
    """
    N = len(tags)
    # Uniform pseudo-counts keep unseen starts and transitions non-zero.
    A = np.ones((N, N)) / N
    pi = np.ones(N) / N

    state_dict = {tag: idx for idx, tag in enumerate(tags)}
    obs_dict = {}
    tag_totals = np.zeros(N)
    pair_counts = {}  # (state index, observation index) -> count

    for cur_line in train_data:
        pi[state_dict[cur_line.tags[0]]] += 1
        for idx in range(cur_line.length):
            state = state_dict[cur_line.tags[idx]]
            obs = obs_dict.setdefault(cur_line.words[idx], len(obs_dict))
            tag_totals[state] += 1
            pair_counts[(state, obs)] = pair_counts.get((state, obs), 0) + 1
            if idx < cur_line.length - 1:
                A[state, state_dict[cur_line.tags[idx + 1]]] += 1

    # Emission probability P(word | tag).  Any (state, obs) key present here
    # has tag_totals[state] >= 1, so the division is always defined.
    B = np.zeros([N, len(obs_dict)])
    for (state, obs), count in pair_counts.items():
        B[state, obs] = count / tag_totals[state]

    A /= np.sum(A, axis=1)[:, None]
    # Normalise by the actual total (pseudo-counts included) so pi sums to 1;
    # dividing by the sentence count alone left it summing to more than 1.
    pi /= np.sum(pi)
    model = HMM(pi, A, B, obs_dict, state_dict)
    return model