Пример #1
0
    def train(self, filename):
        """Load the saved spacing model and print a demo re-spacing.

        Despite the name, nothing is trained here: the training and saving
        calls are commented out, so a previously saved model is loaded and
        applied to one hard-coded sentence whose spaces were removed.

        Args:
            filename: Not used by the current body.
        """
        # Options forwarded to ``CountSpace.correct`` for the tuned run.
        tuned_options = dict(
            verbose=False,
            force_abs_threshold=0.3,
            nonspace_threshold=-0.3,
            space_threshold=0.3,
            min_count=10,
        )

        spacer = CountSpace()

        sep = os.path.sep
        root = self.util.getRootPath("SmiToText.SmiToText")
        # Only needed by the commented-out training step below.
        corpus_path = sep.join(
            [root, "data", "koDetokenizerData", "ko_law_common_space.txt"])
        model_path = sep.join([root, "kosoy-models", "soyspacing.model"])

        # Training (disabled)
        # spacer.train(corpus_path)
        # spacer.save_model(model_path, json_format=False)

        # Load the saved model
        spacer.load_model(model_path, json_format=False)

        original = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'

        # Drop every space so the model has to restore them.
        unspaced = original.replace(" ", "")

        # Run once with the explicit options and once with the defaults.
        tuned_result, _ = spacer.correct(doc=unspaced, **tuned_options)
        default_result, _ = spacer.correct(unspaced)

        for line in (original, tuned_result, default_result):
            print(line)
Пример #2
0
class Pixir:
    """Text-to-image pipeline: spacing fix, tokenizing, embedding, stage-1 image.

    The intermediate results of each step are kept on the instance
    (``input_text``, ``input_tokens``, ``input_embedding``) and consumed by
    the next step.
    """

    def __init__(self, max_seq_len):
        """Create the pipeline; only the spacing model object is built here.

        Args:
            max_seq_len: Sequence length passed to the tokenizer helper and
                to the BERT loader.
        """
        self.max_seq_len = max_seq_len

        # Intermediate results, filled in step by step.
        self.input_text = self.input_tokens = self.input_embedding = None

        # Models; everything except the spacing model is loaded on demand.
        self.spacing_model = CountSpace()
        self.stage1_generator = None
        self.bert_model = None

    def load_spacing_model(self, model_path):
        """Load spacing-model weights from ``model_path`` (non-JSON format)."""
        self.spacing_model.load_model(model_path, json_format=False)

    def load_bert_model(self, model_path):
        """Load the BERT model from the checkpoint found under ``model_path``."""
        ckpt = get_checkpoint_paths(model_path)
        self.bert_model = load_trained_model_from_checkpoint(
            ckpt.config,
            ckpt.checkpoint,
            training=False,
            seq_len=self.max_seq_len,
        )

    def load_stage1_generator(self, model_path):
        """Build the stage-1 generator network.

        ``model_path`` is currently unused because the weight loading is
        commented out.
        """
        gan = Stage1WGANGP(768, 64, 0.1, 0.1, 1, 1, 1)
        self.stage1_generator = gan.generator
        # self.stage1_generator.load_weights(model_path)

    def spacing(self, text):
        """Correct the spacing of ``text``, store it and print it."""
        corrected, _ = self.spacing_model.correct(text)
        self.input_text = corrected
        print(corrected)

    def tokenize(self):
        """Tokenize the stored text with the fixed vocabulary file."""
        vocab_tokenizer = FullTokenizer('vocab.korean.rawtext.list')
        self.input_tokens = tokenize(self.input_text, vocab_tokenizer,
                                     self.max_seq_len)

    def embedding(self):
        """Run the BERT model on the stored tokens and keep its output."""
        # The segment input is all ones, with the same shape as the tokens.
        all_ones = np.ones_like(self.input_tokens)
        self.input_embedding = self.bert_model.predict(
            [self.input_tokens, all_ones])

    def generate_stage1(self):
        """Generate an image from the stored embedding plus random noise.

        Returns:
            The result of ``Image.fromarray`` on the rescaled generator output.
        """
        batch = self.input_embedding.shape[0]
        noise = np.random.normal(0, 1, (batch, 100))

        generated, _ = self.stage1_generator.predict(
            [self.input_embedding, noise])
        # Presumably maps a [-1, 1] generator output to [0, 1] -- TODO confirm.
        # NOTE(review): Image.fromarray is handed a float array here; confirm
        # the dtype/shape is one PIL accepts.
        rescaled = (generated + 1) / 2
        return Image.fromarray(rescaled)

    def text2img(self, input_text):
        """Run the whole pipeline on ``input_text`` and return the image."""
        self.spacing(input_text)
        self.tokenize()
        self.embedding()
        return self.generate_stage1()
Пример #3
0
def run_preprocess(inputPath: str, outputPath: str, modelPath: str,
                   module: str):
    """Run one preprocessing module over a text file.

    Only ``"countSpace"`` does real work: every non-blank line of
    ``inputPath`` is spacing-corrected and written to ``outputPath``.
    ``"normalizer"`` and ``"noun"`` are placeholders that just print a
    message; any other value does nothing.

    Args:
        inputPath: UTF-8 text file with one sentence per line.
        outputPath: File the corrected sentences are written to.
        modelPath: Saved spacing model, used by ``"countSpace"`` only.
        module: Name of the module to run.
    """
    if module in ("normalizer", "noun"):
        # Placeholder branches.
        print("do something")
        return
    if module != "countSpace":
        return

    spacer = CountSpace()
    spacer.load_model(modelPath, json_format=False)
    with open(inputPath, 'r', encoding='utf-8') as src, \
            open(outputPath, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            stripped = raw_line.strip()
            if stripped:
                fixed, _ = spacer.correct(stripped)
                dst.write(fixed + "\n")
Пример #4
0
def apply_space_correct(corpus_fname, model_fname, output_corpus_fname, with_label=False):
    """Spacing-correct every sentence of a corpus file.

    Args:
        corpus_fname: UTF-8 input file, one sentence per line. With
            ``with_label`` each line is ``sentence<U+241E>label``.
        model_fname: Saved spacing model to load.
        output_corpus_fname: UTF-8 output file; receives one corrected
            sentence per line (followed by the separator and the label when
            ``with_label`` is set).
        with_label: Whether lines carry a label after the U+241E separator.

    Raises:
        ValueError: If ``with_label`` is set and a non-blank line does not
            contain exactly one separator.
    """
    separator = "\u241E"
    model = CountSpace()
    model.load_model(model_fname, json_format=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_corpus_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            line = line.strip()
            # Skip blank lines before splitting: unpacking the split of an
            # empty string would otherwise raise in labelled mode.
            if not line:
                continue
            if with_label:
                sentence, label = line.split(separator)
                # A line holding only a label has nothing to correct.
                if not sentence:
                    continue
            else:
                sentence = line
                label = None
            sent_corrected, _ = model.correct(sentence)
            if with_label:
                f2.write(sent_corrected + separator + label + "\n")
            else:
                f2.write(sent_corrected + "\n")
Пример #5
0
# Demo fragment: runs the spacing model on one short sample and opens a rules
# file for writing. ``model`` is not defined in this fragment.

# Disabled: load a saved model, train it further and save the result.
# model.load_model('model_spacing_3.h5', json_format=False)
# model.train('./korquad_3.txt')
# model.save_model('model_spacing_4.h5', json_format=False)

# Options passed to ``model.correct`` in the first call below.
verbose = False
mc = 10  # min_count
ft = 0.4  # force_abs_threshold
nt = -0.3  # nonspace_threshold
st = 0.4  # space_threshold

sentence = '지않고'

# with parameters
sentence_corrected, tags = model.correct(doc=sentence,
                                         verbose=verbose,
                                         force_abs_threshold=ft,
                                         nonspace_threshold=nt,
                                         space_threshold=st,
                                         min_count=mc)
# without parameters (this result overwrites the one from the call above)
sentence_corrected, tags = model.correct(sentence)

# Open the rules file for writing; every write below is disabled.
# NOTE(review): the handle is not closed within this fragment -- confirm it is
# closed later, or switch to a ``with`` block.
f = open('rules.txt', mode='wt', encoding='utf-8')
# f.write('진짜 101\n')
# f.write('방울 101\n')
# f.write('나는 101\n')
# f.write('너를 101\n')
# f.write('영화 101\n')
# f.write('마리의 1001\n')
# f.write('강아지가 10001\n')
# f.write('저글링을 10001\n')
# f.write('한다 101\n')
Пример #6
0
# Load the saved spacing model and the rule dictionary used for correction.
model = CountSpace()
model.load_model(model_fname, json_format=False)

rule_dict = RuleDict('rules.txt')


# Sample texts to run through the spacing model.
text1 = '감사합니다 앞으로도 잘부탁드려요 풍성한토핑 맛난피자로 보답하겠습니다'
text2 = '맛있게 잘 먹었습니다~'
text3 = '마시써효!!!떡볶이도좋아요'
text4 = '불고기는 처음 시켜봤는데 상상 그이상....'
text5 = '냠냠~너무 맛있어용^^ 또 시켜먹어요넘나맛있네여피짜로덤왜인기가잇는지알겟둠원픽예약임툐쿄'
text6 = '영등포피자중 이찌방'
text7 = 'ㅋㅋㅋㅋ 파인애플 당연 추가한줄알고 실수했네요죄송염~~오늘도 맛나게 잘 먹겠습니다^^샐러드가 생각보다 푸짐하게 왔네요'

# Correct the samples in order; ``tags`` ends up holding the tags of the
# last sample, as only the corrected sentences are kept.
corrected = []
for sample in (text1, text2, text3, text4, text5, text6, text7):
    fixed, tags = model.correct(sample, rules=rule_dict)
    corrected.append(fixed)

(sent_corrected, sent_corrected2, sent_corrected3, sent_corrected4,
 sent_corrected5, sent_corrected6, sent_corrected7) = corrected

print('======soynlp====')
for fixed in corrected:
    print(fixed)
Пример #7
0
class Tag_dict:
    """Count part-of-speech groups and raw word frequencies over texts.

    ``content`` is iterated text by text. Tag counts are collected with
    Komoran into one ``<group>_dict`` attribute per tag group; raw word counts
    are collected from the spacing-corrected text into ``wordDict``.

    NOTE(review): ``judge_tag`` adds to the existing counts on every call, and
    most print/save methods call it, so calling several of them on the same
    instance inflates the tag counts -- confirm whether that is intended.
    """

    # Tag group names compared against the result of ``tag_switch``, paired
    # with the Korean label printed for them. Counts for a group live in the
    # attribute ``<name>_dict``.
    _TAG_GROUPS = (
        ("adjective", "형용사"),
        ("adverb", "부사"),
        ("conjunction", "접속사"),
        ("determiner", "관형사"),
        ("eomi", "어미"),
        ("josa", "조사"),
        ("noun", "명사"),
        ("preEomi", "선어말어미"),
        ("suffix", "접사"),
        ("verb", "동사"),
    )

    def __init__(self, content):
        """Store ``content`` and create the analyzers and empty counters.

        Args:
            content: Iterable of texts; ``print_len`` also calls ``len`` on it.
        """
        self.content = content
        self.komoran = Komoran(userdic=os.getcwd() + '/user_dic.txt')
        self.model = CountSpace()
        self.adjective_dict = dict()  # adjectives: VA, VCN, VCP
        self.adverb_dict = dict()  # adverbs: MAG
        self.conjunction_dict = dict()  # conjunctions: MAJ
        self.determiner_dict = dict()  # determiners: MM
        self.eomi_dict = dict()  # endings: EC, EF, ETM, ETN
        # particles: JC, JKC, JKG, JKV, JKB, JKO, JKQ, JKS, JX
        self.josa_dict = dict()
        self.noun_dict = dict()  # nouns: NNG, NNB, NNP, NP, NR
        self.preEomi_dict = dict()  # pre-final endings: EP
        self.suffix_dict = dict()  # affixes: XPN, XSA, XSN, XSV
        self.verb_dict = dict()  # verbs: VV, VX
        self.wordDict = dict()

    @classmethod
    def _tag_attr(cls, tagName):
        """Return the attribute name holding ``tagName`` counts, or None."""
        for name, _ in cls._TAG_GROUPS:
            if tagName == name:
                return name + "_dict"
        return None

    @staticmethod
    def _sorted_by_count(counts):
        """Return ``(key, count)`` pairs ordered by descending count.

        Accepts either a dict or an already-built list of pairs.
        """
        pairs = counts.items() if isinstance(counts, dict) else counts
        return sorted(pairs, key=lambda x: x[1], reverse=True)

    @staticmethod
    def _past_data_to_dict(pastData):
        """Turn the loaded spreadsheet into ``{first column: [string cells]}``.

        For every row, cells after the first column are collected until the
        first cell that is not a string.
        """
        pastDataDict = dict()
        for i in range(pastData.shape[0]):
            row = pastData.loc[i]
            valList = list()
            for j in range(1, pastData.shape[1]):
                cell = row[j]
                if not isinstance(cell, str):
                    break
                valList.append(cell)
            pastDataDict[row[0]] = valList
        return pastDataDict

    def judge_tag(self):
        """Tag every text with Komoran and add the words to the group counters.

        Counts are added on top of whatever the counters already hold.
        """
        for text in self.content:
            for pos in self.komoran.pos(text):
                # preprocessing: strip the characters matched by the class
                # below (note that '|' inside [...] is a literal character).
                word = re.sub("[ㄱ-ㅎ|ㅏ-ㅣ|.,?!]", repl="", string=str(pos[0]))
                if word == "":
                    continue

                # separate tag & count; results of tag_switch that are not a
                # known group name (such as -1) are ignored
                attr = self._tag_attr(tag_switch(pos[1]))
                if attr is not None:
                    setattr(self, attr, tag_cnt(word, getattr(self, attr)))

    def cnt_origin_word(self):
        """Count the space-separated words of the spacing-corrected texts.

        Does nothing once ``wordDict`` has been replaced by its sorted list
        form, which keeps the words from being counted again.
        """
        if isinstance(self.wordDict, list):
            return
        for text in self.content:
            sent_corrected, _ = self.model.correct(text)
            for word in del_special_char(sent_corrected).split(" "):
                self.wordDict[word] = self.wordDict.get(word, 0) + 1

    def print_len(self):
        """Print the number of texts in ``content``."""
        print("text line:", len(self.content))

    def print_noun_list(self):
        """Run the tagging and print the noun counter."""
        self.judge_tag()
        print(self.noun_dict)

    def print_tag_frequency(self, cnt=30):
        """Print the most frequent entries of every tag group.

        The counters themselves stay dictionaries, so the other methods keep
        working after this call.

        Args:
            cnt: Maximum number of ``(word, count)`` pairs printed per group.
        """
        self.judge_tag()

        for index, (name, label) in enumerate(self._TAG_GROUPS):
            header = label + "(" + name + "):"
            # Every header but the first is preceded by an empty line.
            print(header if index == 0 else "\n" + header)
            ranked = self._sorted_by_count(getattr(self, name + "_dict"))
            print(ranked[:cnt])

    def print_origin_frequency(self, cnt=30):
        """Print the most frequent raw words.

        ``wordDict`` is replaced by its sorted list of ``(word, count)``
        pairs; calling this again simply prints from that list.

        Args:
            cnt: Maximum number of pairs printed.
        """
        self.cnt_origin_word()
        self.wordDict = self._sorted_by_count(self.wordDict)
        print(self.wordDict[:cnt])

    def print_dict(self, tagName):
        """Run the tagging and print every word of one tag group.

        Args:
            tagName: Group name such as ``"noun"``; unknown names print
                nothing.
        """
        self.judge_tag()

        attr = self._tag_attr(tagName)
        if attr is not None:
            for tag in getattr(self, attr).keys():
                print(tag)

    def print_morph(self):
        """Print the Komoran morphemes of every text."""
        for text in self.content:
            print(self.komoran.morphs(text))

    def print_pos(self):
        """Print the Komoran part-of-speech pairs of every text."""
        for text in self.content:
            print(self.komoran.pos(text))

    def save_compare(self, form):
        """Save every text followed by its analysis.

        Args:
            form: ``"morph"`` or ``"pos"``; any other value saves an empty
                result.
        """
        # Compare by value: identity checks on string literals are unreliable.
        if form == "morph":
            analyse = self.komoran.morphs
        elif form == "pos":
            analyse = self.komoran.pos
        else:
            analyse = None

        result = ""
        if analyse is not None:
            result = "".join(text + str(analyse(text)) + "\n\n"
                             for text in self.content)

        # NOTE(review): ``filename`` is not defined in this class; it is
        # presumably a module-level name -- verify.
        save_text_file(filename, result, form)

    def save_origin_frequency(self):
        """Save the raw word frequencies and merge the words into the xlsx."""
        self.judge_tag()
        self.cnt_origin_word()
        if isinstance(self.wordDict, dict):
            self.wordDict = self._sorted_by_count(self.wordDict)

        # save result as .txt
        result = "".join(str(key_value) + "\n" for key_value in self.wordDict)
        # NOTE(review): ``filename`` is not defined in this class; it is
        # presumably a module-level name -- verify.
        save_text_file(filename, result, "origin")

        # save new word dict to misspell_origin.xlsx
        # load existing values & make them a dictionary
        pastDataDict = self._past_data_to_dict(read_xlsx_file())
        pastData_keyList = list(pastDataDict.keys())  # for removing duplicates

        # make current values a list
        current_data_list = list(dict(self.wordDict).keys())

        # make new dict list (duplicates removed)
        newDictList = list(set(pastData_keyList + current_data_list))
        if "" in newDictList:
            newDictList.remove("")  # delete empty element

        # re-write contents (data/misspell_origin.xlsx)
        rewrite_xlxs_file(pastDataDict, newDictList, "misspell_origin.xlsx")
        print(
            "===== Finish: save new word list to data/misspell_origin.xlsx ====="
        )

    def save_noun_standard(self):
        """Merge nouns and noun-stripped word remainders into the xlsx."""
        # count origin word frequency
        self.judge_tag()
        self.cnt_origin_word()
        if isinstance(self.wordDict, dict):
            self.wordDict = self._sorted_by_count(self.wordDict)

        # load existing values & make them a dictionary
        pastDataDict = self._past_data_to_dict(read_xlsx_file("noun_standard"))
        pastData_keyList = list(pastDataDict.keys())  # for removing duplicates

        # make current values a list: start from the nouns, then add what is
        # left of each word after removing every entry collected so far
        current_data_list = list(self.noun_dict.keys())
        for key_value in self.wordDict:
            tmp = key_value[0]
            for noun in current_data_list:
                if noun in key_value[0]:
                    tmp = tmp.replace(noun, "")
            if tmp != "":
                current_data_list.append(tmp)

        # make new dict list (duplicates removed)
        newDictList = list(set(pastData_keyList + current_data_list))
        if "" in newDictList:
            newDictList.remove("")  # delete empty element

        # re-write contents (data/misspell_noun_standard.xlsx)
        rewrite_xlxs_file(pastDataDict, newDictList,
                          "misspell_noun_standard.xlsx")
        print(
            "===== Finish: save new word list to data/misspell_noun_standard.xlsx ====="
        )