Example #1
def generate_text():
    vocab_path = '../../data/people_char_vocab.pkl'
    model_dir = '../../models/hmm'
    states = ['B', 'M', 'E', 'S']

    vocabs = load_vocab(vocab_path)
    query_vocabs = {idx: char for char, idx in vocabs.items()}
    hmm = HMM(vocabs=vocabs, states=states)
    hmm.load_model(model_dir=model_dir)

    pi = hmm.pi_cnt
    tran_p = hmm.trans_cnt  # [S, S]
    emit_p = hmm.emit_cnt  # [S, V]

    # [S, S]
    trans_cdfs = [compute_cdf(tran_p[s, :]) for s in range(tran_p.shape[0])]

    # [S, V]
    emit_cdfs = [compute_cdf(emit_p[s, :]) for s in range(emit_p.shape[0])]

    state_idx = sample_start(pi)
    out_idx = sample_output(state_idx, emit_cdfs)
    out_char = query_vocabs[out_idx]

    num_text = 1000
    print(out_char, end='')

    for i in range(num_text - 1):
        state_idx = sample_output(state=state_idx, cdfs=trans_cdfs)
        out_idx = sample_output(state=state_idx, cdfs=emit_cdfs)
        out_char = query_vocabs[out_idx]
        print(out_char, end='')
        if (i + 1) % 50 == 0:
            print()
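
The helpers compute_cdf, sample_start and sample_output are not shown in this example. Below is a minimal sketch of what they could look like, assuming plain inverse-CDF sampling over the per-state rows; only the names and call signatures are taken from the code above, everything else is an assumption.

import numpy as np

def compute_cdf(counts):
    # Normalize one row of counts into probabilities and return its CDF. (Assumed helper.)
    probs = counts / counts.sum()
    return np.cumsum(probs)

def sample_start(pi):
    # Draw an initial state index from the initial-state distribution pi. (Assumed helper.)
    cdf = np.cumsum(pi / np.sum(pi))
    return int(np.searchsorted(cdf, np.random.rand()))

def sample_output(state, cdfs):
    # Draw an index from the CDF belonging to `state`; with trans_cdfs this samples
    # the next state, with emit_cdfs it samples an output character. (Assumed helper.)
    return int(np.searchsorted(cdfs[state], np.random.rand()))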
Example #2
def test():
    vocab_path = '../../data/people_char_vocab.pkl'
    vocabs = load_vocab(vocab_path)
    train_data_path = '../../data/people.txt'

    gen = train_generator(train_data_path, vocabs=vocabs)
    states = ['B', 'M', 'E', 'S']
    hmm = HMM(vocabs=vocabs, states=states)
    #hmm.train(train_generator=gen)
    model_dir = '../../models/hmm'
    #hmm.save_model(model_dir=model_dir)
    hmm.load_model(model_dir=model_dir)

    sentence = "我是中国人,我爱我的祖国"
    decode_states = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)

    words = hmm.format_hiddens(hiddens, sentence)

    print(hiddens)
    print('/ '.join(words))

    sentence = '4月29日,雄浑悠长的钟声响起,关闭了近百日的武汉黄鹤楼重新开门迎客。这钟声,传递出中华民族从磨难中奋起的昂扬斗志,彰显出伟大民族精神在新时代焕发出的熠熠光辉。'
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print('/ '.join(words))
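
The decoder emits one of the tags B (begin), M (middle), E (end) or S (single) per character, and format_hiddens groups the characters back into words. A hypothetical stand-in with the expected behaviour (tags_to_words is an illustrative name, not the repo's API):

def tags_to_words(tags, sentence):
    # Start a new word on 'B' or 'S', extend the current word on 'M' or 'E'.
    words = []
    for char, tag in zip(sentence, tags):
        if tag in ('B', 'S') or not words:
            words.append(char)
        else:
            words[-1] += char
    return words

# tags_to_words(list('SSBES'), '我是中国人') -> ['我', '是', '中国', '人']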
Example #3
def lihang_example():
    T = np.array([[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]])
    E = np.array([[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]])
    pi = np.array([0.2, 0.4, 0.4])

    #states = [0, 1, 2]
    states = {'a': 0, 'b': 1, 'c': 2}
    vocabs = {'red': 0, 'white': 1}

    hmm = HMM(states=states, vocabs=vocabs, pi=pi, trans_p=T, emit_p=E)

    O = ['red', 'white', 'red']

    f_prob = hmm.forward_evaluate(O)
    print('forward prob', f_prob)

    b_prob = hmm.backward_evaluate(O)
    print('backward prob', b_prob)

    decode_states = {0: 'a', 1: 'b', 2: 'c'}
    hiddens = hmm.decode(O, decode_states=decode_states)
    print('optimal hiddens', hiddens)
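
These parameters are the textbook example from Li Hang's 《统计学习方法》 (Statistical Learning Methods, Examples 10.2/10.3): the forward and backward probabilities should both come out as 0.130218, and the optimal hidden path should be ['c', 'c', 'c']. The forward value is easy to re-check with plain numpy:

import numpy as np

def forward_check():
    T = np.array([[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]])
    E = np.array([[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]])
    pi = np.array([0.2, 0.4, 0.4])
    obs = [0, 1, 0]  # 'red', 'white', 'red'

    alpha = pi * E[:, obs[0]]          # alpha_1(i) = pi_i * b_i(o_1)
    for o in obs[1:]:
        alpha = (alpha @ T) * E[:, o]  # alpha_t(j) = sum_i alpha_{t-1}(i) * a_ij * b_j(o_t)
    return alpha.sum()                 # approx. 0.130218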
Example #4
def train():
    vocab_path = '../../data/people_char_vocab.pkl'
    vocabs = load_vocab(vocab_path)
    train_data_path = '../../data/people.txt'

    gen = train_generator(train_data_path, vocabs=vocabs)
    states = ['B', 'M', 'E', 'S']
    hmm = HMM(vocabs=vocabs, states=states)
    hmm.train(train_generator=gen)
    model_dir = '../../models/hmm'
    hmm.save_model(model_dir=model_dir)
Example #5
def mini_train():
    states = ['B', 'M', 'E', 'S']
    vocabs = {'我': 0, '是': 1, '中': 2, '国': 3, '人': 4, '家': 5}
    corpus = ['我 是 中国 人', '中国 是 我 家']

    hmm = HMM(vocabs=vocabs, states=states)
    #hmm.train(train_generator=mini_generator(corpus), max_seq_len=2)
    #hmm.save_model(model_dir='../../models/hmm')
    hmm.load_model(model_dir='../../models/hmm')

    hmm.cut(sentence='我是中国人')
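
mini_generator is referenced only in the commented-out training call and is not shown; presumably it turns each space-segmented sentence of the toy corpus into a (characters, BMES tags) pair. A rough sketch under that assumption (the exact interface expected by hmm.train is not known from this snippet):

def mini_generator(corpus):
    # Hypothetical generator: yield (characters, tags) with one B/M/E/S tag per character.
    for sentence in corpus:
        chars, tags = [], []
        for word in sentence.split():
            chars.extend(word)
            if len(word) == 1:
                tags.append('S')
            else:
                tags.extend(['B'] + ['M'] * (len(word) - 2) + ['E'])
        yield chars, tags

# next(mini_generator(['我 是 中国 人']))
# -> (['我', '是', '中', '国', '人'], ['S', 'S', 'B', 'E', 'S'])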
Example #6
def test_hmm():
    vocab_path = '../../data/people_char_vocab.pkl'
    model_dir = '../../models/hmm'
    states = ['B', 'M', 'E', 'S']
    decode_states = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}

    vocabs = load_vocab(vocab_path)
    hmm = HMM(vocabs=vocabs, states=states)
    hmm.load_model(model_dir=model_dir)
    sentence = "我是中国人,我爱我的祖国"

    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)

    print(hiddens)
    print('/ '.join(words))

    sentence = '4月29日,雄浑悠长的钟声响起,关闭了近百日的武汉黄鹤楼重新开门迎客。' \
               '这钟声,传递出中华民族从磨难中奋起的昂扬斗志,彰显出伟大民族精神在新时代焕发出的熠熠光辉。'
    sentence = 'I love you china'  # overrides the Chinese sentence above
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print('/ '.join(words))
Example #7
    fw.close()


if __name__ == "__main__":
    data_dir = get_data_dir()
    model_dir = get_model_dir()

    model_path = os.path.join(model_dir, "hmm", "hmm.pkl")
    test_path = os.path.join(data_dir, "msr_test.utf8")
    test_result_path = os.path.join(data_dir, "msr_test_hmm.utf8")
    dict_path = os.path.join(data_dir, "msr.dict")

    word_dict = load_dictionary(dict_path=dict_path)
    print("Total number of words is: %d\n" % (len(word_dict)))

    hmm = HMM()
    hmm.load_model(model_path=model_path, is_training=False)

    seg_res = seg_on_sentence(hmm, sentence='黑夜给了我黑色的眼睛,我却用它寻找光明。')
    print("/".join(seg_res))
    seg_on_file(model=hmm,
                test_path=test_path,
                test_result_path=test_result_path,
                is_use_matching=True,
                matching_method="bimm",
                max_num_char=6,
                word_dict=word_dict)

    print("Segmentation done!", test_result_path)
Example #8
from cangjie.hmm.hmm import HMM
from cangjie.utils.config import get_data_dir, get_model_dir
import os

if __name__ == '__main__':
    data_dir = get_data_dir()
    model_dir = get_model_dir()

    model_path = os.path.join(model_dir, "hmm", "hmm.pkl")

    hmm = HMM()

    # train_data_path = os.path.join(data_dir, "msr_training.utf8")
    #hmm.train(train_path=train_data_path, model_path=model_path, is_incre_train=False)

    train_data_path = os.path.join(data_dir, "people.txt")
    hmm.train(train_path=train_data_path,
              model_path=model_path,
              is_incre_train=True)