Example #1
import math
import re

import hgtk
import mecab
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer


def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    # Load a pre-trained soynlp WordExtractor model and build an LTokenizer whose
    # word scores combine cohesion with right branching entropy.
    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                # is_all_nng() is defined elsewhere in the project; by its name it
                # checks that every morpheme of the word is tagged NNG (common noun).
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            # Count how often the morpheme-split form occurs verbatim in the raw corpus.
            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # If the split form occurs less than 5% as often as the unsplit word,
                # treat the split as a morpheme-analysis error and register the word
                # in the MeCab user dictionary.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.write(dic_line + '\n')
                f3.write("{}\t{}\t{}\t{}\t{}\n".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)))
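
A minimal call sketch, assuming lst is a list of (frequency, word) pairs gathered elsewhere and that soyword.model already exists in the working directory; every path and word below is a placeholder:

# hypothetical frequency list and file names
freq_list = [(1523, '블록체인'), (842, '갑분싸'), (3, '한')]   # low-count and single-syllable items are skipped
check_morphs(freq_list, 'corpus.txt', 'mecab_user_dic.csv', 'check_morphs.log')
# mecab_user_dic.csv collects user-dictionary lines for words whose MeCab split
# looks wrong; check_morphs.log records the same words with their splits and counts.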
Example #2
import math

from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer


def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A word scores highly when (1) its characters are tightly connected and frequently
    # appear together (high forward cohesion), and (2) a variety of particles, endings,
    # or other words appear to its right (high right branching entropy).
    scores = {
        key: (scores[key].cohesion_forward *
              math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)

    return tokenized_sent
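
A minimal usage sketch, assuming a WordExtractor model has already been trained and saved as soyword.model; the path and sentence are placeholders:

# hypothetical call; soyword.model must be a previously saved WordExtractor model
print(soy_tokenize('soyword.model', '어릴때 보고 지금 다시 봐도 재밌어요'))
# prints the sentence re-joined from L-tokens; the actual split depends on the trained scores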
Example #3
import math

from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer
from soynlp.normalizer import emoticon_normalize


def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0
                                   )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A word scores highly when (1) its characters are tightly connected and frequently
    # appear together (high forward cohesion), and (2) a variety of particles, endings,
    # or other words appear to its right (high right branching entropy).
    scores = {
        key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.write(tokenized_sent + '\n')
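
The only difference from example #2 is that each line is passed through soynlp's emoticon_normalize first, which caps long runs of repeated emoticon jamo at num_repeats characters. A quick illustration (the output comment is approximate):

from soynlp.normalizer import emoticon_normalize

print(emoticon_normalize('이 영화 진짜 재밌어요ㅋㅋㅋㅋㅋㅋㅋ', num_repeats=3))
# long jamo runs are shortened, roughly: '이 영화 진짜 재밌어요ㅋㅋㅋ'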
Example #4
import math

import hgtk
import mecab
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer


def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)
    # NOTE: soy_tokenizer is built here but never used in this variant.

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 100 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.write(dic_line + '\n')
            f3.write("{}\t{}\t{}\n".format(word, ' '.join(tokens), cnt))
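
For reference, the T/F field in the user-dictionary line records whether the word's last syllable ends in a final consonant (jongsung). A hedged sketch for one hypothetical word:

import hgtk

word = '갑분싸'                                       # hypothetical word to register
cho, jung, jong = hgtk.letter.decompose(word[-1])    # '싸' -> ('ㅆ', 'ㅏ', ''): no final consonant
flag = 'T' if 'ㄱ' <= jong <= 'ㅎ' else 'F'
print("{},,,,NNP,*,{},{},*,*,*,*,*".format(word, flag, word))
# -> 갑분싸,,,,NNP,*,F,갑분싸,*,*,*,*,*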
Example #5
File: soy.py  Project: jjayd/project
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

corpus_path = "text/news/articles.txt"
#corpus_path = "text/news/input5-1.txt"

corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

#for n_sent, sent in enumerate(corpus):
#    print('sent %d: %s %s\n'%(n_sent, sent, '' ))

# Train a WordExtractor on the corpus and compute per-word statistics
# (cohesion, branching entropy, ...).
we = WordExtractor()
we.train(corpus)
scores = we.word_scores()
print(scores.keys())
'''
sentences = DoublespaceLineCorpus(corpus_path, iter_sent=False)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(sentences)
n = nouns.keys()
lists=""
for a in n:
	lists+=a
	lists+=" "
print(lists)
'''
#top = sorted(nouns.items(), key=lambda x:-x[1].frequency)[:1]
#print(top)
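
LTokenizer is imported above but never used; the usual next step, mirroring the scoring used in the earlier examples, would look roughly like this sketch:

import math

# collapse each word's statistics into a single score and build an L-part tokenizer
ltoken_scores = {word: score.cohesion_forward * math.exp(score.right_branching_entropy)
                 for word, score in scores.items()}
tokenizer = LTokenizer(scores=ltoken_scores)
print(tokenizer.tokenize('예시 문장입니다'))   # hypothetical sentence; the split depends on the corpus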
Example #6
import math

import pandas as pd
from soynlp.word import WordExtractor

# Load the source data: a pickled pandas DataFrame with a "title" column.
data = pd.read_pickle('./backend/textengines/data/dc_data.pkl')

soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'

sentences = data["title"].values

word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)

word_extractor.train(sentences)
word_extractor.save(soynlp_model_fname)

scores = word_extractor.word_scores()
scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
          for key in scores.keys()}
# soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(data["title"].values[0])
#############################################################################
# Export the titles to a plain text file, one per line, for SentencePiece training.
with open("./backend/textengines/data/dc_title.txt", "w", encoding="utf-8") as f:
    for title in data["title"].values:
        f.write(title)
        f.write("\n")

spm_train = """--input=./backend/textengines/data/dc_title.txt \
               --model_prefix=sentencepice \
               --vocab_size=32000 \
               --model_type=bpe \
               --character_coverage=0.9995"""
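
The snippet ends with the flag string; presumably it is then passed to SentencePiece's Python trainer, roughly as in this sketch (the call is not part of the original code):

import sentencepiece as spm

# Train a 32k-vocabulary BPE model from the exported titles; the output files are
# named after --model_prefix (<prefix>.model and <prefix>.vocab).
spm.SentencePieceTrainer.Train(spm_train)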