Exemplo n.º 1
0
def train_module(corpus, moduleName: str, saveModulePath: str):
    """Train the preprocessing module selected by ``moduleName``.

    Args:
        corpus: Training input handed to ``CountSpace.train``.
        moduleName: ``"countSpace"`` trains a spacing model and saves it;
            ``"normalizer"`` and ``"noun"`` are placeholders that only
            print ``"s"``. Any other value does nothing.
        saveModulePath: Destination handed to ``CountSpace.save_model``.
    """
    if moduleName in ("normalizer", "noun"):
        # Placeholder branches: nothing is trained for these modules yet.
        print("s")
        return
    if moduleName != "countSpace":
        return

    spacing_model = CountSpace()
    spacing_model.train(corpus)
    spacing_model.save_model(saveModulePath, json_format=False)
Exemplo n.º 2
0
    def __init__(self, max_seq_len):
        """Create the object with empty input slots and a new spacing model.

        Args:
            max_seq_len: Stored unchanged on ``self.max_seq_len``.
        """
        self.max_seq_len = max_seq_len

        # Per-input slots; all of them start out unset.
        self.input_text = self.input_tokens = self.input_embedding = None

        # Only the spacing model is constructed here; the other two
        # components start as None.
        self.spacing_model = CountSpace()
        self.stage1_generator = self.bert_model = None
Exemplo n.º 3
0
 def __init__(self, content):
     """Store ``content`` and create the analyzers and empty lookup tables.

     Args:
         content: Kept unchanged on ``self.content``.
     """
     self.content = content
     # The user dictionary is looked up in the current working directory.
     user_dic_path = os.getcwd() + '/user_dic.txt'
     self.komoran = Komoran(userdic=user_dic_path)
     self.model = CountSpace()

     # One empty table per part-of-speech group; the trailing notes list
     # the POS tags the author associated with each group.
     self.adjective_dict = {}  # adjectives: VA, VCN, VCP
     self.adverb_dict = {}  # adverbs: MAG
     self.conjunction_dict = {}  # conjunctions: MAJ
     self.determiner_dict = {}  # determiners: MM
     self.eomi_dict = {}  # endings (eomi): EC, EF, ETM, ETN
     self.josa_dict = {}  # particles (josa): JC, JKC, JKG, JKV, JKB, JKO, JKQ, JKS, JX
     self.noun_dict = {}  # nouns: NNG, NNB, NNP, NP, NR
     self.preEomi_dict = {}  # pre-final endings: EP
     self.suffix_dict = {}  # affixes: XPN, XSA, XSN, XSV
     self.verb_dict = {}  # verbs: VV, VX
     self.wordDict = {}
Exemplo n.º 4
0
def run_preprocess(inputPath: str, outputPath: str, modelPath: str,
                   module: str):
    """Run the preprocessing module selected by ``module`` over a text file.

    Args:
        inputPath: UTF-8 text file read line by line.
        outputPath: UTF-8 text file that receives one corrected sentence
            per line.
        modelPath: Model file handed to ``CountSpace.load_model``.
        module: ``"countSpace"`` applies spacing correction;
            ``"normalizer"`` and ``"noun"`` are placeholders that only
            print ``"do something"``. Any other value does nothing.
    """
    if module in ("normalizer", "noun"):
        # Placeholder branches: no processing is implemented yet.
        print("do something")
        return
    if module != "countSpace":
        return

    corrector = CountSpace()
    corrector.load_model(modelPath, json_format=False)
    with open(inputPath, 'r', encoding='utf-8') as source:
        with open(outputPath, 'w', encoding='utf-8') as sink:
            for raw_line in source:
                text = raw_line.strip()
                if text:
                    # Lines that are blank after stripping are skipped.
                    fixed, _ = corrector.correct(text)
                    sink.write(fixed + "\n")
Exemplo n.º 5
0
def apply_space_correct(corpus_fname, model_fname, output_corpus_fname, with_label=False):
    """Apply spacing correction to every non-blank line of a corpus file.

    Args:
        corpus_fname: UTF-8 input file, one sentence per line.
        model_fname: Model file handed to ``CountSpace.load_model``.
        output_corpus_fname: UTF-8 output file; receives one corrected
            sentence per line.
        with_label: When true, each line is ``sentence + U+241E + label``;
            the label is copied to the output unchanged after the
            corrected sentence.

    Raises:
        ValueError: If ``with_label`` is true and a non-blank line does not
            contain exactly one U+241E separator.
    """
    separator = "\u241E"
    model = CountSpace()
    model.load_model(model_fname, json_format=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_corpus_fname, 'w', encoding='utf-8') as f2:
        for line_no, line in enumerate(f1, start=1):
            line = line.strip()
            # Skip blank lines BEFORE splitting: previously a blank line in
            # labelled mode crashed on tuple unpacking.
            if not line:
                continue
            if with_label:
                fields = line.split(separator)
                if len(fields) != 2:
                    raise ValueError(
                        "{}:{}: expected exactly one U+241E separator, "
                        "found {}".format(corpus_fname, line_no,
                                          len(fields) - 1))
                sentence, label = fields
            else:
                sentence, label = line, None
            # A labelled line may still carry an empty sentence part.
            if not sentence:
                continue
            sent_corrected, _ = model.correct(sentence)
            if with_label:
                f2.write(sent_corrected + separator + label + "\n")
            else:
                f2.write(sent_corrected + "\n")
Exemplo n.º 6
0
    def train(self, filename):
        """Load the saved spacing model and print a before/after demo.

        The training step itself is commented out: the method only loads an
        existing model, strips the spaces from a sample sentence, and prints
        the original next to two corrected versions (with and without
        explicit thresholds). ``filename`` is not used.
        """
        # Keyword arguments for the tuned correction call.
        tuned_params = {
            "verbose": False,
            "force_abs_threshold": 0.3,
            "nonspace_threshold": -0.3,
            "space_threshold": 0.3,
            "min_count": 10,
        }

        spacer = CountSpace()

        root_dir = self.util.getRootPath("SmiToText.SmiToText")
        sep = os.path.sep
        # The corpus path is only needed by the disabled training step below.
        corpus_path = sep.join(
            [root_dir, "data", "koDetokenizerData", "ko_law_common_space.txt"])
        model_path = sep.join([root_dir, "kosoy-models", "soyspacing.model"])

        # Training (disabled):
        # spacer.train(corpus_path)
        # spacer.save_model(model_path, json_format=False)

        # Load the previously saved model.
        spacer.load_model(model_path, json_format=False)

        # Alternative sample inputs, kept for reference:
        # sent = '이건진짜좋은영화 라라랜드진짜좋은영화'
        # sent = '그일단그구성원인사람들과,,'
        sent = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'

        # Remove every space so the model has to restore them.
        unspaced = sent.replace(" ", "")

        # Correction with explicit parameters.
        corrected_tuned, _ = spacer.correct(doc=unspaced, **tuned_params)

        # Correction with the model's default parameters.
        corrected_default, _ = spacer.correct(unspaced)

        for text in (sent, corrected_tuned, corrected_default):
            print(text)
Exemplo n.º 7
0
def train_space_model(corpus_fname, model_fname):
    """Train a new ``CountSpace`` model on a corpus and save it.

    Args:
        corpus_fname: Corpus handed to ``CountSpace.train``.
        model_fname: Destination handed to ``CountSpace.save_model``
            (saved with ``json_format=False``).
    """
    spacer = CountSpace()
    spacer.train(corpus_fname)
    spacer.save_model(model_fname, json_format=False)
Exemplo n.º 8
0
# Scratch script: train a CountSpace spacing model and save it to disk.
# The commented-out sections are earlier experiments kept for reference;
# only the block marked "Active run" executes.

# model = CountSpace()
# model.load_model('model_spacing.h5', json_format=False)
# model.train('./korquad_1.txt')
# model.save_model('model_spacing_2.h5', json_format=False)

# model.train(corpus_file_name)
# model.save_model('model_spacing.h5', json_format=False)
# model = CountSpace.load_model('model_spacing.h5', json_format=False)
# model.train()

# model_2_file_name = '../KorQuAD_2.1_train_00/korquad2.1_train_0.json'
# model_2 = CountSpace()
# model.train(model_2_file_name)
# model.save_model('model_2_spacing', json_format=False)

# Active run: load the saved 'model_spacing' file, train on 'korquad.txt',
# and save the result under a new file name.
# NOTE(review): whether train() after load_model() adds to or replaces the
# loaded model is not visible here; confirm against soyspacing.
model = CountSpace()
model.load_model('model_spacing', json_format=False)
model.train('korquad.txt')
model.save_model('korean_spacing_model.h5', json_format=False)

# model = CountSpace()
# model.load_model('model_spacing_3.h5', json_format=False)
# model.train('./korquad_3.txt')
# model.save_model('model_spacing_4.h5', json_format=False)

# Tuning values; presumably meant as keyword arguments for
# CountSpace.correct(). They are not used within the lines shown here.
verbose = False
mc = 10  # min_count
ft = 0.4  # force_abs_threshold
nt = -0.3  # nonspace_threshold
st = 0.4  # space_threshold
Exemplo n.º 9
0
import pickle
from krwordrank.word import summarize_with_keywords
from wordcloud import WordCloud
from soykeyword.lasso import LassoKeywordExtractor
from soynlp.noun import LRNounExtractor_v2
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soyspacing.countbase import RuleDict, CountSpace

import matplotlib.pyplot as plt

# Spacing model, created and loaded once at import time from a file in the
# current working directory.
space = CountSpace()
space.load_model('soyspacing.model', json_format=False)

# Precomputed artifacts, also read from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files from a trusted source.
with open('grouped.pickle', 'rb') as f:
    grouped = pickle.load(f)
with open('nouns.pickle', 'rb') as f:
    nouns = pickle.load(f)
with open('words.pickle', 'rb') as f:
    words = pickle.load(f)

# Map every key of `nouns` to the `.score` attribute of its value; these
# scores are what the tokenizer is built from.
scores = {w: s.score for w, s in nouns.items()}
#scores.update(
#    {w:s.cohesion_forward+scores.get(w, 0) for w,s in words.items()})
#print(scores["가"])
tokenizer = MaxScoreTokenizer(scores)


#tokenizer = LTokenizer(scores)
def keywords(doc):
    """Tokenize ``doc`` with the module-level ``tokenizer``.

    NOTE(review): only the beginning of this function is visible here. The
    visible part builds ``tokens`` but does not return or use it.
    """
    # NOTE(review): the result of correct() is discarded, so unless
    # correct() mutates its argument, `doc` is tokenized exactly as passed
    # in. Confirm whether the corrected text was meant to be used instead.
    space.correct(doc)
    tokens = tokenizer.tokenize(doc, flatten=False)
Exemplo n.º 10
0
def hangul_to_kana_converter(x):
    """Replace every ``hanbon_dict`` key found in ``x`` with its value.

    Args:
        x: Input string.

    Returns:
        The string after the replacements; ``x`` itself when it is empty.
    """
    converted = x
    # One full pass over the mapping per character of the input. The
    # repetition is kept on purpose: dropping it could change the result
    # if one replacement produces text that another key matches.
    for _ in x:
        for source_text, target_text in hanbon_dict.items():
            converted = converted.replace(source_text, target_text)
    return converted


def kana_to_hangul_converter(x):
    """Replace every ``hanbon_dict`` value found in ``x`` with its key.

    This applies the mapping in the opposite direction to
    ``hangul_to_kana_converter``.

    Args:
        x: Input string.

    Returns:
        The string after the replacements; ``x`` itself when it is empty.
    """
    restored = x
    # One full pass over the mapping per character of the input. The
    # repetition is kept on purpose: dropping it could change the result
    # if one replacement produces text that another value matches.
    for _ in x:
        for original_text, mapped_text in hanbon_dict.items():
            restored = restored.replace(mapped_text, original_text)
    return restored

# Models loaded once at import time; every path is relative to the current
# working directory.
# Spacing-correction model.
spacing_model = CountSpace()
spacing_model.load_model('./embedding/healthnews_spacing_model', json_format=False)
# SentencePiece tokenizer model.
sp = spm.SentencePieceProcessor()
sp.Load('healthcare_hanbon.model')
# FastText embedding model; its `.wv` is indexed by ft_dimension_retriever.
ftmodel = FastText.load('./embedding/fasttext_healthqna.model')
def ft_dimension_retriever(x):
    """Look up ``x`` in the FastText vectors, falling back to zeros.

    Args:
        x: Key (or keys) used to index ``ftmodel.wv``.

    Returns:
        ``ftmodel.wv[x]`` when the lookup succeeds; otherwise a length-200
        array of integer zeros.
    """
    try:
        return ftmodel.wv[x]
    except Exception:
        # Best-effort fallback kept from the original. Unlike the previous
        # bare ``except``, this no longer swallows KeyboardInterrupt or
        # SystemExit.
        return np.repeat(0, 200)
    
def final_meanvector_retriever(x):
    """Turn ``x`` into a single vector by averaging its piece vectors.

    Steps, in order: spacing correction on ``str(x)``, ``split_text_cleaner``,
    ``hangul_to_kana_converter``, SentencePiece encoding, conversion of each
    piece with ``kana_to_hangul_converter``, vector lookup through
    ``ft_dimension_retriever``, then ``np.mean`` over axis 0.
    """
    spaced_text = spacing_model.correct(str(x))[0]
    cleaned_text = split_text_cleaner(spaced_text)
    kana_text = hangul_to_kana_converter(cleaned_text)
    hangul_pieces = [
        kana_to_hangul_converter(piece)
        for piece in sp.EncodeAsPieces(kana_text)
    ]
    # All pieces are looked up in a single call, as in the original.
    piece_vectors = ft_dimension_retriever(hangul_pieces)
    return np.mean(piece_vectors, axis=0)

# Load the serialized classifier once at import time, from the current
# working directory.
# NOTE(review): joblib files are presumably pickle-based; load only
# artifacts from a trusted source.
classifier = joblib.load('ensembled_classifier.pkl')