Example No. 1
def jiayan_cut_nostop(content, load_lm_dir):
    # Load the jiayan n-gram language model and build a character HMM tokenizer on it.
    lm = load_lm(load_lm_dir)
    tokenizer = CharHMMTokenizer(lm)
    word_list = []
    if content:  # skip None and empty strings
        for word in tokenizer.tokenize(content):
            word_list.append(word)

    return " ".join(word_list)
Example No. 2
def train_punctuator(lm_path, data_file, cut_model, out_model):
    lm = load_lm(lm_path)
    punctuator = CRFPunctuator(lm, cut_model)
    print('Building data...')
    X, Y = punctuator.build_data(data_file)
    train_x, train_y, test_x, test_y = punctuator.split_data(X, Y)
    # Clear the full feature/label lists in place to free memory before training.
    X[:] = []
    Y[:] = []
    print('Training...')
    punctuator.train(train_x, train_y, out_model)
    punctuator.eval(test_x, test_y, out_model)
Example No. 3
def train_sentencizer(lm_path, data_file, out_model):
    lm = load_lm(lm_path)
    sentencizer = CRFSentencizer(lm)
    print('Building data...')
    X, Y = sentencizer.build_data(data_file)
    train_x, train_y, test_x, test_y = sentencizer.split_data(X, Y)
    # Clear the full feature/label lists in place to free memory before training.
    X[:] = []
    Y[:] = []
    print('Training...')
    sentencizer.train(train_x, train_y, out_model)
    sentencizer.eval(test_x, test_y, out_model)
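Both training wrappers follow the same recipe: build features from a labeled corpus, split them, free the full lists, then train and evaluate on the held-out part. A hypothetical invocation (every path below is an assumption):

# Placeholder paths for illustration only.
train_sentencizer('jiayan.klm', 'sentencize_data.txt', 'sent_model')
train_punctuator('jiayan.klm', 'punctuate_data.txt', 'cut_model', 'punc_model')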
Example No. 4
def jiayan_cut_sample(content, load_lm_dir):
    # stop_word = get_stop_words(stop_word_dir)
    lm = load_lm(load_lm_dir)
    tokenizer = CharHMMTokenizer(lm)
    word_list = []

    if content:  # skip None and empty strings
        for word in tokenizer.tokenize(content):
            # if word not in stop_word and '\u4e00' <= word <= '\u9fa5':
            word_list.append(word)

    return word_list
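If the commented-out filter is enabled, note that `'\u4e00' <= word <= '\u9fa5'` compares strings lexicographically, so for multi-character words it only approximates "all CJK characters". A stricter per-character sketch, where get_stop_words and stop_word_dir are the hypothetical helper and path the comments reference, and jiayan_cut_filtered is an illustrative name:

def jiayan_cut_filtered(content, load_lm_dir, stop_word_dir):
    # get_stop_words is the hypothetical helper referenced in the comments above.
    stop_words = get_stop_words(stop_word_dir)
    lm = load_lm(load_lm_dir)
    tokenizer = CharHMMTokenizer(lm)
    word_list = []
    if content:
        for word in tokenizer.tokenize(content):
            # Keep a word only if it is not a stop word and every character
            # falls in the CJK Unified Ideographs range.
            if word not in stop_words and all('\u4e00' <= ch <= '\u9fa5' for ch in word):
                word_list.append(word)
    return word_list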
Example No. 5
def tag_text(root, target_root):
    # jiayan ancient-text punctuation: restore punctuation in unmarked files
    lm = load_lm('jiayan.klm')
    punctuator = CRFPunctuator(lm, 'cut_model')
    punctuator.load('punc_model')

    make_dir(target_root)
    file_ob_list = get_all_files(root)
    for f_name in tqdm.tqdm(file_ob_list, desc="Process unmarked file"):
        with open(os.path.join(root, f_name), "r", encoding='utf-8-sig', errors='ignore') as file, \
             open(os.path.join(target_root, f_name), "w", encoding='utf-8-sig', errors='ignore') as output:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                output.write(punctuator.punctuate(line))
                output.write('\n')
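A hypothetical call (directory names are assumptions; make_dir and get_all_files are project helpers not shown in this excerpt):

# Punctuate every file under raw_texts/ and write the results into punctuated_texts/.
tag_text('raw_texts/', 'punctuated_texts/')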
Example No. 6
def crf_punctuate(lm_path, cut_model, punc_model, text):
    lm = load_lm(lm_path)
    punctuator = CRFPunctuator(lm, cut_model)
    punctuator.load(punc_model)
    print(punctuator.punctuate(text))
Example No. 7
def crf_sentencize(lm_path: str, sent_model, text):
    # The loaded model is the sentencizer model, so name the parameter accordingly.
    lm = load_lm(lm_path)
    sentencizer = CRFSentencizer(lm)
    sentencizer.load(sent_model)
    print(sentencizer.sentencize(text))
Example No. 8
def hmm_tokenize(lm_path: str, text: str):
    lm = load_lm(lm_path)
    tokenizer = CharHMMTokenizer(lm)
    print(list(tokenizer.tokenize(text)))
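Examples 6 through 8 are thin wrappers around the three jiayan models; a hedged end-to-end sketch (all model paths are assumptions):

# Placeholder model paths; text is any unpunctuated classical Chinese string.
text = '天下大勢分久必合合久必分'
hmm_tokenize('jiayan.klm', text)
crf_sentencize('jiayan.klm', 'sent_model', text)
crf_punctuate('jiayan.klm', 'cut_model', 'punc_model', text)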
Example No. 9
def load_tokenizer(self):
    if self.tokenizer is None:
        # Build the tokenizer lazily, on first use.
        lm = jiayan.load_lm("source/jiayan.klm")
        self.tokenizer = jiayan.CharHMMTokenizer(lm)
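The method above assumes a class that initializes self.tokenizer to None; a minimal hypothetical host class for context:

import jiayan

class LazySegmenter:
    # Hypothetical wrapper; only load_tokenizer is taken from the example above.
    def __init__(self):
        self.tokenizer = None

    def load_tokenizer(self):
        if self.tokenizer is None:
            lm = jiayan.load_lm("source/jiayan.klm")
            self.tokenizer = jiayan.CharHMMTokenizer(lm)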
Example No. 10
from jiayan import PMIEntropyLexiconConstructor
from jiayan import CharHMMTokenizer
from jiayan import WordNgramTokenizer
from jiayan import CRFSentencizer
from jiayan import CRFPunctuator
from jiayan import CRFPOSTagger
from jiayan import load_lm
import os
import shutil
from tqdm import tqdm
import re
from collections import Counter
from pprint import pprint

lm = load_lm('/home/zy/mnt/nlp_test/Jiayan/jiayan_models/jiayan.klm')
tokenizer = CharHMMTokenizer(lm)
out_ls = []
words = []
# Keep only digits, Latin letters, and CJK characters.
reg = "[^0-9A-Za-z\u4e00-\u9fa5]"
fp = open("origin_record.txt", "r", encoding='utf-8')
fg = open("tmp.out", "w", encoding='utf-8')
for i in fp.readlines():
    i = re.sub(reg, "", i)
    gg = list(tokenizer.tokenize(i))
    words.extend(gg)
    out_ls.append(" ".join(gg) + "\n")
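The excerpt imports Counter and pprint but ends before they are used; a plausible continuation, flagged as an assumption, would flush the output and tally word frequencies:

# Assumed continuation, not part of the quoted snippet.
fg.writelines(out_ls)                    # write the space-separated lines
pprint(Counter(words).most_common(20))   # 20 most frequent words
fp.close()
fg.close()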
Example No. 11
    f.write("\n")
    f.close()


def list_to_text(p_list):
    return " ".join(p_list)


if __name__ == '__main__':
    lm_path = 'C:/TJlab/Tang/chinese_poetry/jiayan.klm'
    print('\nTokenizing test text with HMM...')
    # init_file()
    lm = load_lm(lm_path)
    hmm_tokenizer = CharHMMTokenizer(lm)
    tang_tokenizer = TangCharHMMTokenizer(lm)

    # f = open("resource/qujiang_raw.txt", encoding='utf-8')
    # line = f.readline()
    # while line:
    #     # list_to_file("resource/qujiang_hmm.txt", list(tang_tokenizer.tokenize(line)))
    #     list_to_file("resource/qujiang_tang.txt", tang_tokenizer.intervene_tokenize(line))
    #     # list_to_file("resource/qujiang_tang_trans.txt", tang_tokenizer.intervene(line))
    #     line = f.readline()
    # f.close()

    text0 = "送春归,三月尽日日暮时。去年杏园花飞御沟绿,何处送春曲江曲。今年杜鹃花落子规啼,送春何处西江西。帝城送春犹怏怏" \
            ",天涯送春能不加惆怅。莫惆怅,送春人。冗员无替五年罢,应须准拟再送浔阳春。五年炎凉凡十变,又知此身健不健。" \
            "好去今年江上春,明年未死还相见。"