Example #1
class HMMTagger(object):
    def __init__(self, hmm_model=None):
        self.hmm_model = HMMModel() if hmm_model is None else hmm_model

    def train_one_line(self, line):
        list_of_word_tag_pair = []
        for word_pinyin_tag in line.split():
            # split the "word/tag" token into its word and tag parts
            word, tag = word_pinyin_tag.split('/')
            list_of_word_tag_pair.append((word, tag))

        self.hmm_model.train_one_line(list_of_word_tag_pair)

    def predict(self, line, output_graphml_file=None):
        word_list = [i.strip() for i in line.split()]

        word_tag_pair = self.hmm_model.predict(word_list, output_graphml_file)
        word_tag = ["{}/{}".format(i[0], i[1]) for i in word_tag_pair]

        word_tag_str = "  ".join(word_tag)  # join with two spaces for better human readability, matching the original corpus format

        return word_tag_str

    @classmethod
    def load(cls):
        hmm_model = HMMModel.load_model(model_data_dir)

        return cls(hmm_model)
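
A minimal usage sketch for the tagger above, assuming the corpus format of one word/tag pair per token (as train_one_line parses) and that do_train on the underlying HMMModel finalizes training, as in the later examples; the training line and tags are made up:

tagger = HMMTagger()
tagger.train_one_line("我/wo 爱/ai 你/ni")
tagger.hmm_model.do_train()  # finalize counts, as in the other examples

print(tagger.predict("我 爱 你"))  # expected something like "我/wo  爱/ai  你/ni"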
Example #2
def test_save_model(tmpdir):
    model_dir = tmpdir.mkdir("some_dir")
    model_dir_str = str(model_dir)

    A = {}
    B = {}

    vocabulary = set()

    hmm_model = HMMModel(A, B, vocabulary)
    hmm_model.save_model(model_dir_str)

    assert len(model_dir.listdir()) == 3
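
The three files checked above are presumably the serialized A matrix, B matrix, and vocabulary passed to the constructor. A round-trip sketch (not part of the original test suite), assuming load_model reads back what save_model wrote to the same directory:

def test_save_and_load_model(tmpdir):
    # Hypothetical round-trip check built only from the API shown in these examples.
    model_dir_str = str(tmpdir.mkdir("some_dir"))

    hmm_model = HMMModel({}, {}, set())
    hmm_model.save_model(model_dir_str)

    loaded = HMMModel.load_model(model_dir=model_dir_str)
    assert loaded is not None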
Example #3
File: t.py  Project: newszeng/ai_writer
def load():
    model_dir_str = "data/hmm"

    # hmm_model = HMMModel()
    hmm_model = HMMModel.load_model(model_dir=model_dir_str)
    print(hmm_model)
    text = "它们"
    s = jieba_seg_list(text)
    result = hmm_model.predict(s)
    print(result)
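
jieba_seg_list is not defined in these snippets; it presumably wraps jieba's segmentation so that predict receives a list of tokens. A plausible stand-in:

import jieba


def jieba_seg_list(text):
    # Cut the text into a list of words with jieba; the project's real helper
    # may differ (cut mode, filtering, etc.).
    return jieba.lcut(text)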
Example #4
def test_hmm_train(train_data):
    first_train_data = train_data[0]

    hmm_model = HMMModel()

    for single_train_data in train_data:
        hmm_model.train_one_line(single_train_data)

    hmm_model.do_train()

    result = hmm_model.predict([i[0] for i in first_train_data])

    assert first_train_data == result
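
The train_data fixture is not shown here; based on the (observation, tag) pair format of Example #5, a minimal fixture might look like this:

import pytest


@pytest.fixture
def train_data():
    # Hypothetical data: each element is one training line,
    # i.e. a list of (word, tag) pairs for HMMModel.train_one_line.
    return [
        [('我', 'r'), ('爱', 'v'), ('你', 'r')],
        [('我', 'r'), ('看', 'v'), ('书', 'n')],
    ]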
Example #5
def test_hmm_math_error():
    """add test case for integer division easily happened in python 2"""

    hmm_model = HMMModel()
    train_data = [
        [('A', 'a'), ('B', 'b')],
        [('A', 'c'), ('B', 'd')]
    ]

    for single_train_data in train_data:
        hmm_model.train_one_line(single_train_data)

    hmm_model.do_train()
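
The pitfall this test guards against: under Python 2, the / operator between integers truncates, so probabilities derived from raw counts can silently become 0 unless true division is enabled, for example:

from __future__ import division  # make '/' behave as true division on Python 2

# Hypothetical counts, not taken from HMMModel internals.
transition_count, total_count = 1, 2
probability = transition_count / total_count
assert probability == 0.5  # with integer division this would be 0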
Example #6
File: t.py  Project: newszeng/ai_writer
def train():
    model_dir_str = "data/hmm"
    hmm_model = HMMModel()
    output = '/home/terry/pan/github/Bert-Sentence-streamlining/Bert-Sentence-streamlining/data/train_old.json'
    with open(output, 'r') as f:

        items = []
        for line in tqdm(f):
            j_content = json.loads(line)
            if j_content['label'] == "Yes":
                items.append(j_content)
                one_line = bulid_mark(j_content['sentence'])
                # print(j_content['sentence'])
                # print(one_line)
                hmm_model.train_one_line(one_line)
        hmm_model.save_model(model_dir_str)
    text = "它们的岗位,一只边牧可以管理上千头羊群呢,它们为主人忠心耿耿的守护着家畜,守护着家园"
    s = jieba_seg_list(text)
    result = hmm_model.predict(s)
    print(result)
    print(hmm_model)
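
bulid_mark is project-specific and not shown; judging from the other examples, it must return a list of (observation, tag) pairs for train_one_line. A purely illustrative example of that shape (the real tag set and granularity may differ):

one_line = [('它', 'B'), ('们', 'E'), ('的', 'S')]  # hypothetical BMES-style pairs
hmm_model.train_one_line(one_line)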
Example #7
    def load(cls):
        hmm_model = HMMModel.load_model(model_data_dir)

        return cls(hmm_model)
Example #8
    def __init__(self, hmm_model=None):
        self.hmm_model = HMMModel() if hmm_model is None else hmm_model
Example #9
    def from_disk(self, model_path, tokenizer_list, *args, **kwargs):
        hmm_model = HMMModel.load_model(model_path)

        for tokenizer in tokenizer_list:
            tokenizer.assign_from_loader(hmm_model=hmm_model)
Example #10
    def __init__(self, *args, **kwargs):
        super(HMMTokenizer, self).__init__(*args, **kwargs)

        self.hmm_model = HMMModel()  # type: HMMModel
Example #11
class HMMTokenizer(BaseTokenizer):
    def __init__(self, *args, **kwargs):
        super(HMMTokenizer, self).__init__(*args, **kwargs)

        self.hmm_model = HMMModel()  # type: HMMModel

    def train_one_line(self, token_list):
        list_of_word_tag_pair = []
        for word in token_list:
            word = word.strip()

            tag = self._generate_char_tag_for_word(word)

            list_of_word_tag_pair.extend(list(zip(word, tag)))

        self.hmm_model.train_one_line(list_of_word_tag_pair)

    def do_train(self):
        self.hmm_model.do_train()

    @staticmethod
    def _generate_char_tag_for_word(word):
        # TODO: tag set related function should go to a standalone package
        len_of_word = len(word)

        if len_of_word == 1:
            return 'S'

        if len_of_word >= 2:
            number_of_middle = len_of_word - 2
            return 'B' + 'M' * number_of_middle + 'E'

    def predict(self, line, output_graphml_file=None):
        char_list = line

        char_tag_pair = self.hmm_model.predict(char_list, output_graphml_file)

        # TODO: current BMES decoding is not good, can't raise decoding exception

        token_list = []
        word_char = []
        for char, tag in char_tag_pair:
            # always record the character, regardless of its tag
            word_char.append(char)

            if tag == "S" or tag == "E":
                # emit the completed token
                word = "".join(word_char)
                token_list.append(word)

                # reset word_char cache
                word_char = []

        # leftover characters must not be dropped, even if decoding ends mid-word
        if word_char:
            word = "".join(word_char)
            token_list.append(word)

        return token_list

    def segment(self, message):
        # type: (str) -> List[str]

        return self.predict(message)

    def load_model(self):
        self.hmm_model = HMMModel.load_model(self.model_dir)

    def persist_to_dir(self, output_dir):
        # type: (str) -> None
        self.hmm_model.save_model(output_dir)

    def assign_from_loader(self, *args, **kwargs):
        self.hmm_model = kwargs['hmm_model']

    def get_loader(self):
        return HMMLoader
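
A short usage sketch for the tokenizer above, assuming BaseTokenizer requires no constructor arguments in this setting (its signature is not shown); the training data and expected output are made up:

tokenizer = HMMTokenizer()
tokenizer.train_one_line(["我", "爱", "北京"])
tokenizer.train_one_line(["我", "爱", "编程"])
tokenizer.do_train()

print(tokenizer.segment("我爱北京"))  # expected something like ['我', '爱', '北京']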
Example #12
    def load_model(self):
        self.hmm_model = HMMModel.load_model(self.model_dir)