Пример #1
0
    def __init__(self, max_seq_len):
        """Set up the instance with every slot empty except the spacing model.

        Args:
            max_seq_len: value stored unchanged as ``self.max_seq_len``.
        """
        self.max_seq_len = max_seq_len

        # Placeholders, all starting out as None.
        self.input_text = self.input_tokens = self.input_embedding = None

        # Only the CountSpace object is created here; the remaining model
        # slots start out as None.
        self.spacing_model = CountSpace()
        self.stage1_generator = self.bert_model = None
Пример #2
0
class Pixir:
    """Text-to-image pipeline.

    The steps, as chained by ``text2img``, are: spacing correction with a
    ``CountSpace`` model, tokenisation, embedding through a BERT model and
    image synthesis with the stage-1 generator.  The three models must be
    loaded with the ``load_*`` methods before ``text2img`` is called.
    """

    def __init__(self, max_seq_len):
        """Create an empty pipeline.

        Args:
            max_seq_len: sequence length passed to the tokeniser helper and
                to the BERT checkpoint loader.
        """
        self.max_seq_len = max_seq_len

        # Intermediate results of the most recent run.
        self.input_text = None
        self.input_tokens = None
        self.input_embedding = None

        # Models; only the spacing model object exists before loading.
        self.spacing_model = CountSpace()
        self.stage1_generator = None
        self.bert_model = None

    def load_spacing_model(self, model_path):
        """Load the spacing model from ``model_path`` (``json_format=False``)."""
        self.spacing_model.load_model(model_path, json_format=False)

    def load_bert_model(self, model_path):
        """Load the BERT checkpoint under ``model_path`` with ``training=False``."""
        paths = get_checkpoint_paths(model_path)
        self.bert_model = load_trained_model_from_checkpoint(
            paths.config, paths.checkpoint,
            training=False, seq_len=self.max_seq_len)

    def load_stage1_generator(self, model_path):
        """Build the stage-1 generator.

        NOTE(review): ``model_path`` is currently unused because weight
        loading is disabled below, so the generator keeps whatever weights
        ``Stage1WGANGP`` constructs it with.
        """
        self.stage1_generator = Stage1WGANGP(768, 64, 0.1, 0.1, 1, 1, 1).generator
        # self.stage1_generator.load_weights(model_path)

    def spacing(self, text):
        """Correct the spacing of ``text``, store it in ``input_text`` and print it."""
        sentence_corrected, _ = self.spacing_model.correct(text)
        self.input_text = sentence_corrected
        print(self.input_text)

    def tokenize(self):
        """Tokenise ``input_text`` into ``input_tokens``.

        The inner ``tokenize(...)`` call resolves to the module-level helper
        of the same name, not to this method.
        """
        tokenizer = FullTokenizer('vocab.korean.rawtext.list')
        tokens = tokenize(self.input_text, tokenizer, self.max_seq_len)
        self.input_tokens = tokens

    def embedding(self):
        """Run the BERT model on ``input_tokens`` and store ``input_embedding``."""
        # NOTE(review): every segment id is 1 here; confirm this matches how
        # the embeddings were produced when the generator was trained.
        segments = np.ones_like(self.input_tokens)

        self.input_embedding = self.bert_model.predict([self.input_tokens, segments])

    @staticmethod
    def _to_uint8_pixels(img):
        """Map generator output from [-1, 1] to a uint8 pixel array.

        A 4-D input is treated as a batch and only its first sample is kept;
        a trailing channel axis of length 1 is dropped.  This yields an array
        that ``Image.fromarray`` accepts.
        """
        pixels = (np.asarray(img, dtype=np.float64) + 1) / 2
        pixels = np.rint(np.clip(pixels, 0.0, 1.0) * 255).astype(np.uint8)
        if pixels.ndim == 4:
            pixels = pixels[0]
        if pixels.ndim == 3 and pixels.shape[-1] == 1:
            pixels = pixels[..., 0]
        return pixels

    def generate_stage1(self):
        """Generate an image from ``input_embedding``.

        Returns:
            A PIL image built from the first sample of the generator output.
        """
        # One noise vector of length 100 per embedded input.
        z_noise = np.random.normal(0, 1, (self.input_embedding.shape[0], 100))

        img, _ = self.stage1_generator.predict([self.input_embedding, z_noise])
        # Image.fromarray cannot take a float multi-channel array, so the
        # output is rescaled and converted to uint8 first.
        return Image.fromarray(self._to_uint8_pixels(img))

    def text2img(self, input_text):
        """Run the whole pipeline on ``input_text`` and return the image."""
        self.spacing(input_text)
        self.tokenize()
        self.embedding()
        return self.generate_stage1()
Пример #3
0
 def __init__(self, content):
     """Store ``content`` and create the analysers and empty counters.

     Args:
         content: stored unchanged as ``self.content``.
     """
     self.content = content
     # Komoran reads its user dictionary from the current working directory.
     self.komoran = Komoran(userdic=f"{os.getcwd()}/user_dic.txt")
     self.model = CountSpace()
     # One counter per part-of-speech group; the tags are listed per line.
     self.adjective_dict = {}  # adjectives: VA, VCN, VCP
     self.adverb_dict = {}  # adverbs: MAG
     self.conjunction_dict = {}  # conjunctions: MAJ
     self.determiner_dict = {}  # determiners: MM
     self.eomi_dict = {}  # endings: EC, EF, ETM, ETN
     # particles: JC, JKC, JKG, JKV, JKB, JKO, JKQ, JKS, JX
     self.josa_dict = {}
     self.noun_dict = {}  # nouns: NNG, NNB, NNP, NP, NR
     self.preEomi_dict = {}  # pre-final endings: EP
     self.suffix_dict = {}  # affixes: XPN, XSA, XSN, XSV
     self.verb_dict = {}  # verbs: VV, VX
     self.wordDict = {}
Пример #4
0
def run_preprocess(inputPath: str, outputPath: str, modelPath: str,
                   module: str):
    """Run one preprocessing module over a text file.

    Args:
        inputPath: UTF-8 text file read line by line.
        outputPath: UTF-8 text file that receives the processed lines.
        modelPath: model file loaded by the ``"countSpace"`` module.
        module: ``"countSpace"`` corrects spacing; ``"normalizer"`` and
            ``"noun"`` are placeholders that only print a message; any other
            value does nothing.
    """
    if module in ("normalizer", "noun"):
        # Placeholder modules: nothing is read or written yet.
        print("do something")
        return
    if module != "countSpace":
        return

    spacer = CountSpace()
    spacer.load_model(modelPath, json_format=False)
    with open(inputPath, 'r', encoding='utf-8') as source, \
            open(outputPath, 'w', encoding='utf-8') as target:
        for raw_line in source:
            stripped = raw_line.strip()
            if stripped:
                # Blank lines are dropped rather than copied through.
                corrected, _ = spacer.correct(stripped)
                target.write(corrected + "\n")
Пример #5
0
def apply_space_correct(corpus_fname, model_fname, output_corpus_fname, with_label=False):
    """Write a spacing-corrected copy of a corpus file.

    Args:
        corpus_fname: UTF-8 input file, one sentence per line.
        model_fname: CountSpace model file, loaded with ``json_format=False``.
        output_corpus_fname: UTF-8 output file.
        with_label: when true, each line is ``sentence`` + U+241E + ``label``;
            only the sentence is corrected and the label is written back
            unchanged after the same separator.

    Raises:
        ValueError: in labelled mode, when a non-blank line does not contain
            exactly one U+241E separator.
    """
    separator = "\u241E"
    model = CountSpace()
    model.load_model(model_fname, json_format=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_corpus_fname, 'w', encoding='utf-8') as f2:
        for line_no, line in enumerate(f1, start=1):
            line = line.strip()
            if not line:
                # Skip blank lines before splitting: in labelled mode they
                # used to fail on tuple unpacking instead of being ignored.
                continue
            label = None
            if with_label:
                fields = line.split(separator)
                if len(fields) != 2:
                    raise ValueError(
                        "line %d of %s: expected exactly one %r separator"
                        % (line_no, corpus_fname, separator))
                sentence, label = fields
                if not sentence:
                    continue
            else:
                sentence = line
            sent_corrected, _ = model.correct(sentence)
            if with_label:
                f2.write(sent_corrected + separator + label + "\n")
            else:
                f2.write(sent_corrected + "\n")
Пример #6
0
def train_module(corpus, moduleName: str, saveModulePath: str):
    """Train the requested module and save it.

    Args:
        corpus: passed straight to ``CountSpace.train``.
        moduleName: ``"countSpace"`` trains and saves a spacing model;
            ``"normalizer"`` and ``"noun"`` are placeholders that only print;
            any other value does nothing.
        saveModulePath: where the trained spacing model is saved.
    """
    if moduleName in ("normalizer", "noun"):
        # Placeholder modules: no training is implemented yet.
        print("s")
        return
    if moduleName != "countSpace":
        return

    spacer = CountSpace()
    spacer.train(corpus)
    spacer.save_model(saveModulePath, json_format=False)
Пример #7
0
    def train(self, filename):
        """Load the saved spacing model and print corrections of a sample.

        Despite the name, no training happens here: the train/save calls are
        disabled and the model is only loaded from disk.  ``filename`` is not
        used.  The sample sentence has its spaces removed and is corrected
        twice, once with explicit thresholds and once with the defaults; the
        original and both results are printed.
        """
        # Options for the parameterised correction call.
        correct_options = dict(
            verbose=False,
            force_abs_threshold=0.3,
            nonspace_threshold=-0.3,
            space_threshold=0.3,
            min_count=10,
        )

        spacer = CountSpace()

        root = self.util.getRootPath("SmiToText.SmiToText")
        sep = os.path.sep
        corpus_path = sep.join(
            [root, "data", "koDetokenizerData", "ko_law_common_space.txt"])
        model_path = sep.join([root, "kosoy-models", "soyspacing.model"])

        # Training is disabled; it would be:
        # spacer.train(corpus_path)
        # spacer.save_model(model_path, json_format=False)

        # Load the previously saved model.
        spacer.load_model(model_path, json_format=False)

        sample = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'
        unspaced = sample.replace(" ", "")

        # Correction with explicit thresholds.
        with_options, _ = spacer.correct(doc=unspaced, **correct_options)

        # Correction with the library defaults.
        with_defaults, _ = spacer.correct(unspaced)

        for line in (sample, with_options, with_defaults):
            print(line)
Пример #8
0
def train_space_model(corpus_fname, model_fname):
    """Train a new CountSpace model and save it.

    Args:
        corpus_fname: corpus passed to ``CountSpace.train``.
        model_fname: destination passed to ``save_model`` together with
            ``json_format=False``.
    """
    spacer = CountSpace()
    spacer.train(corpus_fname)
    spacer.save_model(model_fname, json_format=False)
Пример #9
0
# Script: continue training a saved CountSpace spacing model and save the
# result under a new name. The commented-out blocks below are earlier
# variants of the same load/train/save sequence with other file names.

# model = CountSpace()
# model.load_model('model_spacing.h5', json_format=False)
# model.train('./korquad_1.txt')
# model.save_model('model_spacing_2.h5', json_format=False)

# model.train(corpus_file_name)
# model.save_model('model_spacing.h5', json_format=False)
# model = CountSpace.load_model('model_spacing.h5', json_format=False)
# model.train()

# model_2_file_name = '../KorQuAD_2.1_train_00/korquad2.1_train_0.json'
# model_2 = CountSpace()
# model.train(model_2_file_name)
# model.save_model('model_2_spacing', json_format=False)

# Active run: load 'model_spacing', train on 'korquad.txt', then save to
# 'korean_spacing_model.h5'.
model = CountSpace()
model.load_model('model_spacing', json_format=False)
model.train('korquad.txt')
model.save_model('korean_spacing_model.h5', json_format=False)

# model = CountSpace()
# model.load_model('model_spacing_3.h5', json_format=False)
# model.train('./korquad_3.txt')
# model.save_model('model_spacing_4.h5', json_format=False)

# Correction settings; they are not used within the visible part of the
# script (presumably passed to model.correct further down).
verbose = False
mc = 10  # min_count
ft = 0.4  # force_abs_threshold
nt = -0.3  # nonspace_threshold
st = 0.4  # space_threshold
Пример #10
0
import pickle
from krwordrank.word import summarize_with_keywords
from wordcloud import WordCloud
from soykeyword.lasso import LassoKeywordExtractor
from soynlp.noun import LRNounExtractor_v2
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soyspacing.countbase import RuleDict, CountSpace

import matplotlib.pyplot as plt

# Spacing corrector restored from a saved model file.
space = CountSpace()
space.load_model('soyspacing.model', json_format=False)

# Precomputed objects restored from pickle files.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that come from a trusted source.
with open('grouped.pickle', 'rb') as f:
    grouped = pickle.load(f)
with open('nouns.pickle', 'rb') as f:
    nouns = pickle.load(f)
with open('words.pickle', 'rb') as f:
    words = pickle.load(f)

# Word -> score table taken from the `.score` attribute of each noun entry;
# it drives the tokenizer built below.
scores = {w: s.score for w, s in nouns.items()}
#scores.update(
#    {w:s.cohesion_forward+scores.get(w, 0) for w,s in words.items()})
#print(scores["가"])
tokenizer = MaxScoreTokenizer(scores)


#tokenizer = LTokenizer(scores)
def keywords(doc):
    """Tokenize ``doc`` with the module-level ``tokenizer``.

    NOTE(review): as visible here the function returns nothing and ``tokens``
    is never used, so the body may be truncated in this excerpt.
    """
    # NOTE(review): the corrected text returned by space.correct is
    # discarded, so tokenization below runs on the uncorrected ``doc``.
    space.correct(doc)
    tokens = tokenizer.tokenize(doc, flatten=False)
# NOTE(review): the bare `except:` retries the very same imports, so a
# missing package still fails; the commented pip line suggests an install
# step was meant to run here (notebook-style).
try:
    from soyspacing.countbase import CountSpace
    from soyspacing.countbase import RuleDict
except:
    #!pip install soyspacing
    from soyspacing.countbase import CountSpace
    from soyspacing.countbase import RuleDict

import re
import json

# Train the soyspacing model
corpus_fname = './134963_norm.txt'
rule_dict = RuleDict('./space_rules.txt')
model = CountSpace()
model.train(corpus_fname)

# Choose the soyspacing parameters
verbose = False
mc = 10  # min_count
ft = 0.3  # force_abs_threshold
nt = -0.3  # nonspace_threshold
st = 0.3  # space_threshold

# Walk over odd numbers 1, 3, ..., 29 and build two-digit, zero-padded labels
# for the pair (i, i + 1).
# NOTE(review): only the i < 9 and i == 9 cases are visible in this excerpt;
# the rest of the loop body appears to be cut off.
for i in range(1, 30, 2):
    if i < 9:
        i_start = "0{}".format(i)
        i_end = "0{}".format(i + 1)
    elif i == 9:
        i_start = "09"
        i_end = "10"
Пример #12
0
from soyspacing.countbase import CountSpace
from soyspacing.countbase import RuleDict


# Save the model ===== soynlp =====
# NOTE(review): `model` is used on the next statement before the assignment
# further down; it presumably comes from a training step that is not part of
# this excerpt.
model_fname = 'ver1spacing.h5'
model.save_model(model_fname, json_format=False)

# Reload the model that was just saved.
model = CountSpace()
model.load_model(model_fname, json_format=False)

# Spacing rules passed to every correct() call below.
rule_dict = RuleDict('rules.txt')


# Sample review texts used as correction inputs.
text1 = '감사합니다 앞으로도 잘부탁드려요 풍성한토핑 맛난피자로 보답하겠습니다'
text2 = '맛있게 잘 먹었습니다~'
text3 = '마시써효!!!떡볶이도좋아요'
text4 = '불고기는 처음 시켜봤는데 상상 그이상....'
text5 = '냠냠~너무 맛있어용^^ 또 시켜먹어요넘나맛있네여피짜로덤왜인기가잇는지알겟둠원픽예약임툐쿄'
text6 = '영등포피자중 이찌방'
text7 = 'ㅋㅋㅋㅋ 파인애플 당연 추가한줄알고 실수했네요죄송염~~오늘도 맛나게 잘 먹겠습니다^^샐러드가 생각보다 푸짐하게 왔네요'

# Correct each sample; `tags` is overwritten on every call, so only the
# value from the last call remains.
sent_corrected, tags = model.correct(text1, rules=rule_dict)
sent_corrected2, tags = model.correct(text2, rules=rule_dict)
sent_corrected3, tags = model.correct(text3, rules=rule_dict)
sent_corrected4, tags = model.correct(text4, rules=rule_dict)
sent_corrected5, tags = model.correct(text5, rules=rule_dict)
sent_corrected6, tags = model.correct(text6, rules=rule_dict)
sent_corrected7, tags = model.correct(text7, rules=rule_dict)

print('======soynlp====')
Пример #13
0
def hangul_to_kana_converter(x, mapping=None):
    """Replace every key of the mapping found in ``x`` with its value.

    Args:
        x: text to convert.
        mapping: dict of ``{source: replacement}`` substrings; defaults to
            the module-level ``hanbon_dict``.

    Returns:
        The converted text.  Substitutions are applied in the mapping's
        iteration order, and whole passes are repeated (at most ``len(x)``
        times, as before) so that chained substitutions resolve the same way
        they always did.
    """
    if mapping is None:
        mapping = hanbon_dict
    output = x
    for _ in range(len(x)):
        before_pass = output
        for key, value in mapping.items():
            output = output.replace(key, value)
        if output == before_pass:
            # A pass that changes nothing means later passes cannot either.
            break
    return output


def kana_to_hangul_converter(x, mapping=None):
    """Replace every value of the mapping found in ``x`` with its key.

    This applies the mapping in the reverse direction.

    Args:
        x: text to convert.
        mapping: dict whose values are searched for and replaced by the
            corresponding keys; defaults to the module-level ``hanbon_dict``.

    Returns:
        The converted text.  Substitutions are applied in the mapping's
        iteration order, and whole passes are repeated (at most ``len(x)``
        times, as before) so that chained substitutions resolve the same way
        they always did.
    """
    if mapping is None:
        mapping = hanbon_dict
    output = x
    for _ in range(len(x)):
        before_pass = output
        for key, value in mapping.items():
            output = output.replace(value, key)
        if output == before_pass:
            # A pass that changes nothing means later passes cannot either.
            break
    return output

# Module-level models used by the retriever functions below.
# Spacing corrector, loaded with json_format=False.
spacing_model = CountSpace()
spacing_model.load_model('./embedding/healthnews_spacing_model', json_format=False)
# SentencePiece model used to split text into pieces.
sp = spm.SentencePieceProcessor()
sp.Load('healthcare_hanbon.model')
# FastText model whose `.wv` lookup supplies the vectors.
ftmodel = FastText.load('./embedding/fasttext_healthqna.model')
def ft_dimension_retriever(x):
    """Look up ``x`` in the FastText vectors, falling back to zeros.

    Args:
        x: key (or list of keys) handed to ``ftmodel.wv[...]``.

    Returns:
        Whatever ``ftmodel.wv[x]`` returns, or an array of 200 zeros when
        the lookup raises.
    """
    try:
        return ftmodel.wv[x]
    except Exception:
        # Best-effort fallback is kept, but KeyboardInterrupt and SystemExit
        # are no longer swallowed as they were by the bare ``except``.
        return np.repeat(0, 200)
    
def final_meanvector_retriever(x):
    """Turn ``x`` into the mean (axis 0) of the vectors of its pieces.

    Steps: spacing correction of ``str(x)``, cleaning with
    ``split_text_cleaner``, conversion with ``hangul_to_kana_converter``,
    SentencePiece splitting, conversion of each piece back with
    ``kana_to_hangul_converter``, vector lookup for the whole piece list,
    and finally the mean over axis 0.
    """
    corrected = spacing_model.correct(str(x))[0]
    cleaned = split_text_cleaner(corrected)
    kana_text = hangul_to_kana_converter(cleaned)
    pieces = []
    for piece in sp.EncodeAsPieces(kana_text):
        pieces.append(kana_to_hangul_converter(piece))
    vectors = ft_dimension_retriever(pieces)
    return np.mean(vectors, axis=0)

# Classifier object restored from disk.
# NOTE(review): joblib.load unpickles the file; only load trusted files.
classifier = joblib.load('ensembled_classifier.pkl')
Пример #14
0
class Tag_dict:
    """Count part-of-speech and surface-word frequencies over texts.

    ``judge_tag`` tags every text in ``content`` with Komoran and keeps one
    frequency dict per part-of-speech group (the ``*_dict`` attributes).
    ``cnt_origin_word`` counts the space-separated words of the
    spacing-corrected texts in ``wordDict``.  The ``print_*`` and ``save_*``
    methods report or persist those counts.

    NOTE(review): ``save_compare`` and ``save_origin_frequency`` read a name
    ``filename`` that is not defined in this class; it is expected to exist
    at module level.
    """

    # (category returned by tag_switch, attribute with the counts, heading)
    _TAG_TABLE = (
        ("adjective", "adjective_dict", "형용사(adjective):"),
        ("adverb", "adverb_dict", "부사(adverb):"),
        ("conjunction", "conjunction_dict", "접속사(conjunction):"),
        ("determiner", "determiner_dict", "관형사(determiner):"),
        ("eomi", "eomi_dict", "어미(eomi):"),
        ("josa", "josa_dict", "조사(josa):"),
        ("noun", "noun_dict", "명사(noun):"),
        ("preEomi", "preEomi_dict", "선어말어미(preEomi):"),
        ("suffix", "suffix_dict", "접사(suffix):"),
        ("verb", "verb_dict", "동사(verb):"),
    )

    def __init__(self, content):
        """Store ``content`` and create the analysers and empty counters.

        Args:
            content: iterable of texts; also passed to ``len`` by
                ``print_len``.
        """
        self.content = content
        # Komoran reads its user dictionary from the current working directory.
        self.komoran = Komoran(userdic=os.getcwd() + '/user_dic.txt')
        # NOTE(review): no model file is loaded into this CountSpace object
        # anywhere in the class.
        self.model = CountSpace()
        self.adjective_dict = dict()  # adjectives: VA, VCN, VCP
        self.adverb_dict = dict()  # adverbs: MAG
        self.conjunction_dict = dict()  # conjunctions: MAJ
        self.determiner_dict = dict()  # determiners: MM
        self.eomi_dict = dict()  # endings: EC, EF, ETM, ETN
        # particles: JC, JKC, JKG, JKV, JKB, JKO, JKQ, JKS, JX
        self.josa_dict = dict()
        self.noun_dict = dict()  # nouns: NNG, NNB, NNP, NP, NR
        self.preEomi_dict = dict()  # pre-final endings: EP
        self.suffix_dict = dict()  # affixes: XPN, XSA, XSN, XSV
        self.verb_dict = dict()  # verbs: VV, VX
        # Word -> count dict; replaced by a sorted list of (word, count)
        # pairs once a report or save method has run.
        self.wordDict = dict()

    def judge_tag(self):
        """Rebuild the per-category frequency dicts from ``content``.

        The counters are reset first, so calling this repeatedly (every
        print/save method does) no longer double-counts.
        """
        attr_by_tag = {tag: attr for tag, attr, _ in self._TAG_TABLE}
        for attr in attr_by_tag.values():
            setattr(self, attr, dict())

        for text in self.content:
            for pos in self.komoran.pos(text):
                # Strip isolated jamo, '|' and basic punctuation; skip
                # morphemes that become empty.
                word = re.sub("[ㄱ-ㅎ|ㅏ-ㅣ|.,?!]", repl="", string=str(pos[0]))
                if word == "":
                    continue

                # tag_switch returns -1 (or an unlisted name) for tags that
                # are not counted; those find no attribute here.
                attr = attr_by_tag.get(tag_switch(pos[1]))
                if attr is not None:
                    setattr(self, attr, tag_cnt(word, getattr(self, attr)))

    def cnt_origin_word(self):
        """Count the words of the spacing-corrected texts into ``wordDict``.

        Does nothing when ``wordDict`` has already been turned into a sorted
        list by one of the report or save methods.
        """
        if isinstance(self.wordDict, list):
            return
        for text in self.content:
            sent_corrected, _ = self.model.correct(text)
            for word in del_special_char(sent_corrected).split(" "):
                self.wordDict[word] = self.wordDict.get(word, 0) + 1

    def print_len(self):
        """Print the number of texts in ``content``."""
        print("text line:", len(self.content))

    def print_noun_list(self):
        """Recount the tags and print the noun frequency dict."""
        self.judge_tag()
        print(self.noun_dict)

    def print_tag_frequency(self, cnt=30):
        """Print the ``cnt`` most frequent entries of every category.

        The frequency dicts themselves are left untouched.  They used to be
        overwritten with sorted lists, which broke every later call that
        needs them as dicts.

        Args:
            cnt: number of entries printed per category.
        """
        self.judge_tag()

        for index, (_, attr, heading) in enumerate(self._TAG_TABLE):
            ranked = sorted(getattr(self, attr).items(),
                            key=lambda x: x[1],
                            reverse=True)
            # Every heading after the first is preceded by a blank line.
            print(heading if index == 0 else "\n" + heading)
            print(ranked[:cnt])

    def print_origin_frequency(self, cnt=30):
        """Print the ``cnt`` most frequent surface words.

        ``wordDict`` is replaced by the sorted list of (word, count) pairs.

        Args:
            cnt: number of entries printed.
        """
        self.cnt_origin_word()
        self.wordDict = sorted(self.wordDict.items(),
                               key=lambda x: x[1],
                               reverse=True)
        print(self.wordDict[:cnt])

    def print_dict(self, tagName):
        """Recount the tags and print every word of one category.

        Args:
            tagName: category name such as ``"noun"``; unknown names print
                nothing.
        """
        self.judge_tag()

        for tag, attr, _ in self._TAG_TABLE:
            if tag == tagName:
                for word in getattr(self, attr):
                    print(word)
                break

    def print_morph(self):
        """Print the Komoran morphemes of every text."""
        for text in self.content:
            print(self.komoran.morphs(text))

    def print_pos(self):
        """Print the Komoran part-of-speech pairs of every text."""
        for text in self.content:
            print(self.komoran.pos(text))

    def save_compare(self, form):
        """Save each text followed by its analysis.

        Args:
            form: ``"morph"`` or ``"pos"``; any other value saves an empty
                result.
        """
        # Compare by value: ``is`` on string literals tests identity and is
        # not reliable.
        if form == "morph":
            analyse = self.komoran.morphs
        elif form == "pos":
            analyse = self.komoran.pos
        else:
            analyse = None

        result = ""
        if analyse is not None:
            result = "".join(text + str(analyse(text)) + "\n\n"
                             for text in self.content)

        save_text_file(filename, result, form)

    def _ensure_sorted_word_counts(self):
        """Recount tags and words, then make ``wordDict`` a sorted list."""
        self.judge_tag()
        self.cnt_origin_word()
        if isinstance(self.wordDict, dict):
            self.wordDict = sorted(self.wordDict.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

    @staticmethod
    def _past_data_to_dict(pastData):
        """Turn the spreadsheet table into ``{first cell: [string cells]}``.

        For each row, cells from column 1 onwards are collected until the
        first cell that is not a string.
        """
        pastDataDict = dict()
        for i in range(pastData.shape[0]):
            valList = list()
            for j in range(1, pastData.shape[1]):
                cell = pastData.loc[i][j]
                if not isinstance(cell, str):
                    break
                valList.append(cell)
            pastDataDict[pastData.loc[i][0]] = valList
        return pastDataDict

    def save_origin_frequency(self):
        """Save the word frequencies and merge the words into the spreadsheet."""
        self._ensure_sorted_word_counts()

        # Save the result as a text file.
        result = "".join(str(key_value) + "\n" for key_value in self.wordDict)
        save_text_file(filename, result, "origin")

        # Load the existing spreadsheet values as a dictionary.
        pastDataDict = self._past_data_to_dict(read_xlsx_file())
        pastData_keyList = list(pastDataDict.keys())

        # Words of the current run.
        current_data_list = list(dict(self.wordDict).keys())

        # Union of old and new words without duplicates or the empty word.
        newDictList = list(set(pastData_keyList + current_data_list))
        if "" in newDictList:
            # Guarded like save_noun_standard: remove() raises when absent.
            newDictList.remove("")

        # Rewrite data/misspell_origin.xlsx.
        rewrite_xlxs_file(pastDataDict, newDictList, "misspell_origin.xlsx")
        print(
            "===== Finish: save new word list to data/misspell_origin.xlsx ====="
        )

    def save_noun_standard(self):
        """Merge nouns and noun-stripped word remainders into the spreadsheet."""
        # Count the surface-word frequencies.
        self._ensure_sorted_word_counts()

        # Load the existing spreadsheet values as a dictionary.
        pastDataDict = self._past_data_to_dict(read_xlsx_file("noun_standard"))
        pastData_keyList = list(pastDataDict.keys())

        # Start from the nouns, then add what is left of every word after
        # removing each entry collected so far that occurs in it.
        current_data_list = list(self.noun_dict.keys())
        for key_value in self.wordDict:
            tmp = key_value[0]
            for noun in current_data_list:
                if noun in key_value[0]:
                    tmp = tmp.replace(noun, "")
            if tmp != "":
                current_data_list.append(tmp)

        # Union of old and new entries without duplicates or the empty word.
        newDictList = list(set(pastData_keyList + current_data_list))
        if "" in newDictList:
            newDictList.remove("")

        # Rewrite data/misspell_noun_standard.xlsx.
        rewrite_xlxs_file(pastDataDict, newDictList,
                          "misspell_noun_standard.xlsx")
        print(
            "===== Finish: save new word list to data/misspell_noun_standard.xlsx ====="
        )
Пример #15
0
from soyspacing.countbase import RuleDict, CountSpace

# Train a CountSpace spacing model on the corpus file and save it.
corpus_fname = 'sentences.txt'
model = CountSpace()
model.train(corpus_fname)
# NOTE(review): unlike the other snippets, save_model is called here without
# json_format, so the library default applies.
model.save_model("soispace.model")