def check_morphs(lst, corpus_fname, output_fname, log_fname):
    """Detect suspicious MeCab morpheme splits and emit MeCab NNP dictionary lines.

    For each (count, word) pair in ``lst``, compares MeCab's morpheme split
    against a soynlp LTokenizer split; words whose split form is rare in the
    corpus are written to ``output_fname`` as user-dictionary entries, with a
    TSV audit trail in ``log_fname``.

    Args:
        lst: iterable of (count, word) tuples.
        corpus_fname: raw corpus read wholesale for frequency checks.
        output_fname: MeCab user-dictionary CSV lines are written here.
        log_fname: tab-separated diagnostic log is written here.
    """
    mcab = mecab.MeCab()
    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # Combined score: cohesion weighted by exp(right branching entropy).
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2, \
            open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()
        for item in lst:
            cnt, word = item
            # Skip rare words and single-character words.
            if cnt < 10 or len(word) == 1:
                continue
            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue
            soy_tokens = soy_tokenizer.tokenize(word)
            # Both tokenizers agree -> nothing suspicious.
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue
            if is_all_nng(mcab.pos(word)):
                # Compound of common nouns only; treat as a valid split.
                continue
            if len(soy_tokens) > 1:
                continue
            # BUGFIX: escape the joined tokens — they are plain text, not a
            # regex, and may contain metacharacters ('?', '*', '(', ...).
            words = re.findall(re.escape(' '.join(tokens)), sentences)
            # If the split form appears less than 5% as often as the unsplit
            # word, assume the morpheme split is an analysis error.
            if len(words) < (cnt * 0.05):
                # Jongseong (final consonant) of the last syllable decides the
                # T/F flag in the MeCab dictionary line.
                # NOTE(review): hgtk raises if word[-1] is not Hangul — confirm inputs.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.write(dic_line + '\n')
                f3.write("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
def soy_tokenize(model_fname, input_sentence):
    """Tokenize one sentence with a pre-trained soynlp LTokenizer.

    Loads a saved WordExtractor model, derives per-word scores, and returns
    the space-joined token sequence for ``input_sentence``.
    """
    extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0,
    )
    extractor.load(model_fname)
    raw_scores = extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A good word boundary has (1) high internal cohesion — the characters
    # frequently co-occur — and (2) high right branching entropy — many
    # different particles/endings/words follow it on the right.
    combined = {}
    for word, score in raw_scores.items():
        combined[word] = score.cohesion_forward * math.exp(score.right_branching_entropy)
    tokenizer = LTokenizer(scores=combined)
    return ' '.join(tokenizer.tokenize(input_sentence))
def soy_tokenize(corpus_fname, model_fname, output_fname):
    """Tokenize every line of a corpus file with a pre-trained soynlp LTokenizer.

    Each input line is stripped, emoticon-normalized, tokenized, and written
    to ``output_fname`` as a space-joined token line.
    """
    extractor = WordExtractor(min_frequency=100,
                              min_cohesion_forward=0.05,
                              min_right_branching_entropy=0.0)
    extractor.load(model_fname)
    raw_scores = extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A good word boundary has (1) high internal cohesion — the characters
    # frequently co-occur — and (2) high right branching entropy — many
    # different particles/endings/words follow it on the right.
    word_scores = {w: s.cohesion_forward * math.exp(s.right_branching_entropy)
                   for w, s in raw_scores.items()}
    tokenizer = LTokenizer(scores=word_scores)
    with open(corpus_fname, 'r', encoding='utf-8') as src, \
            open(output_fname, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            sentence = raw_line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            dst.write(' '.join(tokenizer.tokenize(normalized_sent)) + '\n')
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    """Emit MeCab NNP user-dictionary lines for frequent multi-morpheme words.

    For each (count, word) in ``lst`` with count >= 100 that MeCab splits into
    more than one morpheme, writes a dictionary CSV line to ``output_fname``
    and a TSV diagnostic line to ``log_fname``.

    NOTE(review): the original body also loaded a soynlp model, built an
    LTokenizer, and read all of ``corpus_fname`` into memory, but never used
    any of them — that dead work is removed here. ``corpus_fname`` is kept in
    the signature for backward compatibility with existing callers.
    """
    mcab = mecab.MeCab()
    with open(output_fname, 'w', encoding='utf-8') as f2, \
            open(log_fname, 'w', encoding='utf-8') as f3:
        for cnt, word in lst:
            # Skip rare words and single-character words.
            if cnt < 100 or len(word) == 1:
                continue
            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue
            # Jongseong (final consonant) of the last syllable decides the
            # T/F flag in the MeCab dictionary line.
            # NOTE(review): hgtk raises if word[-1] is not Hangul — confirm inputs.
            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.write(dic_line + '\n')
            f3.write("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n')
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

# Train a soynlp WordExtractor over a news corpus and print the scored vocabulary.
corpus_path = "text/news/articles.txt"
#corpus_path = "text/news/input5-1.txt"
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

# Debug helper: dump each sentence of the corpus.
#for n_sent, sent in enumerate(corpus):
#    print('sent %d: %s %s\n' % (n_sent, sent, ''))

extractor = WordExtractor()
extractor.train(corpus)
word_scores = extractor.word_scores()
print(word_scores.keys())

# Alternative pipeline kept for reference: extract nouns with LRNounExtractor
# and join them into one space-separated string.
#sentences = DoublespaceLineCorpus(corpus_path, iter_sent=False)
#noun_extractor = LRNounExtractor()
#nouns = noun_extractor.train_extract(sentences)
#lists = ""
#for a in nouns.keys():
#    lists += a
#    lists += " "
#print(lists)

#top = sorted(nouns.items(), key=lambda x:-x[1].frequency)[:1]
#print(top)
# Train a soynlp word-extractor model on scraped DC titles, then dump the
# titles to a plain-text file for SentencePiece BPE training.
data = pd.read_pickle('./backend/textengines/data/dc_data.pkl')
soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'
sentences = data["title"].values

word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)
word_extractor.train(sentences)
word_extractor.save(soynlp_model_fname)
scores = word_extractor.word_scores()
# Combined score: cohesion weighted by exp(right branching entropy).
scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
          for key in scores.keys()}

# soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(data["title"].values[0])

#############################################################################
# BUGFIX: use a context manager so the title dump file is closed even when a
# write raises (the original used bare open()/close()).
with open("./backend/textengines/data/dc_title.txt", "w", encoding="utf-8") as f:
    for title in data["title"].values:
        f.write(title)
        f.write("\n")

# Argument string for SentencePiece training (consumed elsewhere).
spm_train = """--input=./backend/textengines/data/dc_title.txt \
--model_prefix=sentencepice \
--vocab_size=32000 \
--model_type=bpe \
--character_coverage=0.9995"""