示例#1
0
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenize text using MaxScoreTokenizer of SoyNLP
    """
    def __init__(self):
        self.tokenizer = None
        self.scores = list()
        self.word_extractor = WordExtractor(min_count=100,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)

    def fit(self, sentences):
        self.word_extractor.train(sentences)
        scores = self.word_extractor.extract()
        scores = [(word, (score.cohesion_forward + score.cohesion_backward) * \
                   (score.left_branching_entropy + score.right_branching_entropy)) for word, score in scores.items()]
        self.scores = scores
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def state_dict(self):
        return {'scores': self.scores}

    def load_state_dict(self, state_dict):
        self.scores = state_dict['scores']
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def tokenize(self, sentence):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        return tokenized_sentence
def build_tokenizer():
    """
    Train soynlp tokenizer which will be used to tokenize Korean input sentence
    Returns:

    """
    print(f'Now building soynlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_txt = os.path.join(data_dir, 'train.txt')

    with open(train_txt, encoding='utf-8') as f:
        lines = f.readlines()

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(lines)

    word_scores = word_extractor.extract()
    cohesion_scores = {
        word: score.cohesion_forward
        for word, score in word_scores.items()
    }

    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
示例#3
0
def data_tokenize(news_title, tdm_vocab):

    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)

    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    tokenizer = LTokenizer(scores=cohesion_score)

    cluster_data = []
    bert_null_list = []

    for title in news_title:
        title = test(title)
        sent = tokenizer.tokenize(title, flatten=False)
        sentence = []
        for i in sent:
            if i[0] in tdm_vocab:
                sentence.append(i[0])

        cluster_data.append(sentence)

    return cluster_data
示例#4
0
 def getTokenizer(self, contents):
     corpus = SentiCorpus(contents, iter_sent=True)
     word_extractor = WordExtractor(corpus)
     word_extractor.train(corpus)
     words_scores = word_extractor.extract()
     scores = {w: s.cohesion_forward for w, s in words_scores.items()}
     return LTokenizer(scores=scores)
示例#5
0
def build_tokenizer():
    """
    Train soynlp tokenizer which will be used to tokenize Korean input sentence using whole corpus
    Returns:

    """
    print(f'Now building soy-nlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train_soynlp.csv')

    df = pd.read_csv(train_file, encoding='utf-8')

    # if encounters non-text row, we should skip it
    kor_lines = [
        row.korean for _, row in df.iterrows() if type(row.korean) == str
    ]

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)

    word_scores = word_extractor.extract()
    cohesion_scores = {
        word: score.cohesion_forward
        for word, score in word_scores.items()
    }

    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
示例#6
0
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    for pair, pmi in sorted(pmi_dok.items(), key=lambda x: -x[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi))
    print('computed PMI')
def soynlp_tokenizer(corpus):
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # word extractor
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(corpus)
    words = word_extractor.extract()

    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(corpus)  # list of str like

    noun_scores = {noun: score.score for noun, score in nouns.items()}
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if not (subword in combined_scores)
    })

    tokenizer = LTokenizer(scores=combined_scores)
    return tokenizer
示例#8
0
 def _extracte(self) -> None:
     self.extractor = WordExtractor()
     self.extractor.train(self.corpus)
     self.words = self.extractor.extract()
     self.cohesion_score = {
         word: score.cohesion_forward
         for word, score in self.words.items()
     }
     self.tokenizer = LTokenizer(scores=self.cohesion_score)
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # 형태소 분리된 단어의 빈도수가 분리안된 단어의 빈수도의 5% 미만이면 형태소 분리오류
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.writelines(dic_line + '\n')
                f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
示例#10
0
def data_tokenize(news_title):

    word_extractor = WordExtractor(
        min_frequency=100, # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )

    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word:score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    return tokenizer
示例#11
0
def word_extractor_test(corpus_path):
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    word_scores = word_extractor.extract()

    print('top 20 left frequency * forward cohesion words')
    topwords = sorted(word_scores, key=lambda x: -word_scores[x].cohesion_forward * word_scores[x].leftside_frequency)[:20]
    for word in topwords:
        print('word = {}, cohesion = {}'.format(word, word_scores[word].cohesion_forward))
    print('word extractor test has been done\n\n')
示例#12
0
    def __init__(self, model_path: str = None):
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()
示例#13
0
def word_extractor_test(corpus_path):
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    word_scores = word_extractor.extract()

    print('top 20 left frequency * forward cohesion words')
    topwords = sorted(word_scores,
                      key=lambda x: -word_scores[x].cohesion_forward *
                      word_scores[x].leftside_frequency)[:20]
    for word in topwords:
        print('word = {}, cohesion = {}'.format(
            word, word_scores[word].cohesion_forward))
    print('word extractor test has been done\n\n')
示例#14
0
def word_extract(datas):
    we = WordExtractor(
    min_frequency=10,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
    )
    we.train(datas)
    words = we.extract()
    print('단어   (빈도수, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(), key=lambda x:word_score(x[1]), reverse=True)[:10]:
        print('%s     (%d, %.3f, %.3f)' % (
            word, 
            score.leftside_frequency, 
            score.cohesion_forward,
            score.right_branching_entropy
            )
         )
    return 
示例#15
0
 def _get_tokenizer(self, df):
     """
     Generate a torkenizer by extracting words
     Args:
         dataframe: data corpus of one language
     Returns:
         tokenizer
     """
     word_extractor = WordExtractor()
     word_extractor.train(df)
     words = word_extractor.extract()
     print(f'length of words is {len(words)}')
     cohesion_scores = {
         word: score.cohesion_forward
         for word, score in words.items()
     }
     tokenizer = LTokenizer(scores=cohesion_scores)
     return tokenizer
def main(args):
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    print('Word   (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(),
                              key=lambda x: word_score(x[1]),
                              reverse=True)[:30]:
        print('%s     (%d, %.3f, %.3f)' %
              (word, score.leftside_frequency, score.cohesion_forward,
               score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(args.corpus_fname)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if not (subword in combined_scores)
    })

    # maxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
示例#17
0
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    x_pmi, x, y = pmi(x, min_pmi=0, alpha=0.0001)

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data

    print('row  shape = {}'.format(rows.shape))
    print('col  shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))
    print('computed pmi')
示例#18
0
    def soynlp_tokenizer(self):
        def word_score(score): return (score.cohesion_forward * math.exp(score.right_branching_entropy))

        if self.mode == 'serve':
            with open(self.data_path, 'r') as file:
                word_score_dict = json.load(file)
        elif self.mode == 'train':
            word_extractor = WordExtractor()
            word_extractor.train(self.train_corpus)
            words = word_extractor.extract()
            word_score_dict = { word:word_score(score) for word, score, in words.items()}

            with open('./models/word_dict.json', 'w') as file:
                json.dump(word_score_dict, file)
        else:
            pass
        
        tokenizer = MaxScoreTokenizer(scores=word_score_dict)
        return tokenizer
示例#19
0
def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # (1) 주어진 글자가 유기적으로 연결되어 함께 자주 나타나고,
    # (2) 그 단어의 우측에 다양한 조사, 어미, 혹은 다른 단어가 등장하여 단어의 우측의 branching entropy가 높다
    scores = {
        key: (scores[key].cohesion_forward *
              math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)

    return tokenized_sent
示例#20
0
def compute_soy_word_score(corpus_fname, model_fname):
    sentences = [sent.strip() for sent in open(corpus_fname, 'r').readlines()]
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    word_extractor.save(model_fname)
示例#21
0
    def __init__(self, pre_trained=True, analyzer='Hannanum'):
        self.pre_trained = pre_trained

        if analyzer == 'Hannanum':
            self.analyzer = tag.Hannanum()
        elif analyzer == 'Kkma':
            self.analyzer = tag.Kkma()
        elif analyzer == 'Komoran':
            self.analyzer = tag.Komoran()
        elif analyzer == 'Mecab':
            self.analyzer = tag.Mecab()
        elif analyzer == 'Okt':
            self.analyzer = tag.Okt()
        else:
            if pre_trained == False:
                pass
            else:
                print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

        self.WordExtractor = WordExtractor(min_frequency=0)
        self.noun_extractor = LRNounExtractor(verbose=False)
        self.word_score = {}
示例#22
0
def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0
                                   )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # (1) 주어진 글자가 유기적으로 연결되어 함께 자주 나타나고,
    # (2) 그 단어의 우측에 다양한 조사, 어미, 혹은 다른 단어가 등장하여 단어의 우측의 branching entropy가 높다
    scores = {
        key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
示例#23
0
def build_tokenizer():
    """
    입력되는 한국어 문장을 tokenize 할 soynlp tokenizer를 학습한다
    """
    print(f'Now building soy-nlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'corpus.csv')
    """
    학습되는 데이터가 있는 경로 지정 후 파일을 불러온다
    """
    df = pd.read_csv(train_file, encoding='utf-8')
    """
    text인 행만 분석한다
    """
    kor_lines = [
        row.korean for _, row in df.iterrows() if type(row.korean) == str
    ]
    """
    soynlp 모듈에서 가져온 WordExtractor 함수로 branching entropy, accessor variety, cohesion score의 단어 score 도출한다
    이 단어 score들은 각각 다른 방법으로 token의 경계를 찾는 값이다
    그 중 cohesion score(단어를 구성하는 글자들이 얼마나 같이 나오는지에 대한 값)만 추출한다.
    자세한 단어 score의 식과 코드는 https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb 에 자세히 나와있다.
    """
    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)

    word_scores = word_extractor.extract()
    cohesion_scores = {
        word: score.cohesion_forward
        for word, score in word_scores.items()
    }
    """
    pickle로 저장한다
    """
    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
示例#24
0
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 100 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.writelines(dic_line + '\n')
            f3.writelines("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n')
示例#25
0
def Makegraph_Wordcloud_Soynlp(target):
    try:
        if flag_login == 0 or flag_login == None or flag_login == '':
            Login()
        #elif flag_prepro == 0:
        #messagebox.showwarning('주의', '데이터 전처리 후 실행해주세요.')
        #return
        else:
            data_wordcloud_soynlp = pd.DataFrame(data_origin[target],
                                                 columns=['contents'])
            data_wordcloud_soynlp['contents'] = data_origin[target].apply(
                lambda x: re.sub('[^가-힣]', ' ', x))

            word_extractor = WordExtractor(
                min_frequency=10,  # 가변화하기 (ex. data_origin.len() 비례)
                min_cohesion_forward=0.05,
                min_right_branching_entropy=0.0)
            word_extractor.train(data_wordcloud_soynlp['contents'].values)
            words = word_extractor.extract()

            cohesion_score = {
                word: score.cohesion_forward
                for word, score in words.items()
            }  # force : 여기인가?
            # force join words
            cohesion_score['숙소제공'] = 1
            cohesion_score['교통비지급'] = 1
            cohesion_score['인센티브'] = 1
            cohesion_score['초과근무시간확대'] = 1
            cohesion_score['복지포인트'] = 1
            cohesion_score['인사우대'] = 1
            cohesion_score['근평가점'] = 1
            cohesion_score['주거이전수당'] = 1

            tokenizer = LTokenizer(scores=cohesion_score)
            data_wordcloud_soynlp['tokenizer'] = data_wordcloud_soynlp[
                'contents'].apply(
                    lambda x: tokenizer.tokenize(x, remove_r=True))

            words = list()
            for i in data_wordcloud_soynlp['tokenizer'].values:
                for j in i:
                    words.append(j)

            count_soynlp = Counter(words)
            words_dict_soynlp = dict(count_soynlp.most_common(100))  # 빈도 상위 n개

            csv_stopwords = pd.read_csv('stopwords.csv',
                                        encoding='cp949',
                                        skiprows=0)  # with open 변경
            stopwords = list()
            for i in csv_stopwords.values:
                for j in i:
                    stopwords.append(j)

            for word in stopwords:
                words_dict_soynlp.pop(word, None)

            wordcloud = WordCloud(
                font_path='NanumGothic.ttf',
                width=500,
                height=500,
                background_color='white').generate_from_frequencies(
                    words_dict_soynlp)

            plt.clf()
            plt.figure(figsize=(20, 20))
            plt.imshow(wordcloud)
            plt.axis('off')
            #plt.show()
            plt.savefig(resultdir + filename_dateflag + target +
                        ' - wordcloud_soynlp.png',
                        dpi=100)
            '''
            # 빈도그래프(temp)
            plt.clf()
            plt.style.use('ggplot')
            plt.figure(figsize = (len(list(words_dict_soynlp.keys())[:20])*0.6, 10)) # grid size 가변화
            plt.title('상위 10개 빈출단어')
            plt.bar(list(words_dict_soynlp.keys())[:20], list(words_dict_soynlp.values())[:20])
            plt.xticks(rotation = 45, ha = 'right') # x축 라벨 회전
            plt.savefig(resultdir + filename_dateflag + target + ' - wordfrequency.png', dpi = 200)
            '''

        messagebox.showinfo(
            '작업', '워드클라우드(Soynlp) 생성이 완료되었습니다.\n\nresult폴더에 결과물이 저장되었습니다.')
    except Exception as e:
        Log(desc=e)
        messagebox.showerror('경고', str(e) + ' 열을 찾을 수 없습니다.')
示例#26
0
raw_data = []
for sent in news_title:
    preprocess = test(sent)
    data.append(preprocess)
    raw_data.append(sent)

# --------------------------토크나이저 로드--------------------

import numpy as np

from soynlp.word import WordExtractor
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer

word_extractor = WordExtractor(
    min_frequency=100,  # example
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0)

word_extractor.train(news_title)
words = word_extractor.extract()

cohesion_score = {
    word: score.cohesion_forward
    for word, score in words.items()
}
tokenizer = LTokenizer(scores=cohesion_score)

# # --------------------------word2vec 데이터 전처리--------------------

cluster_data = []
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

from gensim.test.utils import common_texts, get_tmpfile

if __name__ == '__main__':
    # Load Data
    file_name = 'wikiPlotText_m.txt'

    # 한국어 Cohesion score 사용 형태소 분석기
    corpus = DoublespaceLineCorpus(file_name, iter_sent=True)
    word_extractor = WordExtractor(corpus)
    word_extractor.train(corpus)
    words_scores = word_extractor.extract()
    scores = {w:s.cohesion_forword for w, s in words_scores.items()}
    tokenizer = LTokenizer(scores=scores)

    # {'games':games, 'corpus':corpus, 'titles':titles}
    games_data = fh.getGamesData(fh.getStoragePath()+file_name, tokenizer=tokenizer)
    tokenized_contents = games_data['corpus_words']

    # Vectorizing
    # sg=1(skip-gram), 0(CBOW)
    model_path = 'models/word2vec_ko.model'
    if os.path.isfile(model_path): word2vec_model = Word2Vec.load(model_path)
    else:
        path = get_tmpfile(model_path)
    f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 최빈값(상위3개): " + str(Counter(np_single).most_common()[:3])+'\n') #상위 3개 확인
    n, bins, patches = plt.hist(np_single, bins=sentence_cnt)  # ㅋ이 들어간 문장 중 ㅋ의 평균 출현 횟수에 대한 히스토그램
    plt.savefig(result_path+"/"+file_num+".png")
    f.close()

raw_time, raw_chat = read_data(file_name)
laugh_check(raw_chat)

'''
통계에 기반하여 단어를 찾아내는 비지도 학습법
1. Accessor Variety
2. Branching Entropy
3. Cohesion score
'''

word_extractor = WordExtractor(
    min_frequency=20,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
) #여기서는 Cohesion Score 사용

word_extractor.train(raw_chat)
words = word_extractor.extract()

'''
print("word extraction 길이: ",len(words), " \n결과: ")
print(words)
#words_score = {word : score.cohesion_forward for word, score in words.items()}
#tokenizer = LTokenizer(scores=words_score)
'''
示例#29
0
class SoyTokenizer:
    def __init__(self, model_path: str = None):
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def tokenize(self, sent: str):
        return self.tokenizer.tokenize(sent)

    def text_to_id(self, sent: str):
        toks = self.tokenize(sent)
        outp = []
        for s in toks:
            try:
                outp.append(self.tok_to_id[s])
            except KeyError:
                outp.append(self.unk)
        return outp

    def id_to_text(self, idxs: list):
        return [self.id_to_tok[i] for i in idxs]

    def train(self, sentences, add_whitespace: bool = False):
        sentences = self.preprocess(sentences)
        self.word_extractor.train(sentences)
        words = self.word_extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in words.items()
        }

        # add whitespace tokens
        if add_whitespace:
            whitetokens = []
            for s in sentences:
                whitetokens += s.split(' ')
            whitetokens = list(set(whitetokens))

            for t in whitetokens:
                self.cohesion_score.update({t: 1.0})

        self.tok_to_id, self.id_to_tok = self._build_dict()

    def save_model(self, model_path: str, model_prefix: str):
        with open(os.path.join(model_path, model_prefix + '.model'),
                  'wb') as saveFile:
            dill.dump(self.cohesion_score, saveFile)

    def _build_dict(self):
        tok_to_id = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
        id_to_tok = {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>'}
        for i, key in enumerate(self.cohesion_score.keys()):
            tok_to_id[key] = i + 4
            id_to_tok[i + 4] = key
        return tok_to_id, id_to_tok

    def preprocess(self, sents: list):
        n_str_pattern = re.compile(pattern='[\\d\\-?/_!\\.,]')
        doublespacing = re.compile(pattern='\\s\\s+')

        sents = [n_str_pattern.sub(repl=' ', string=w) for w in sents]
        sents = [doublespacing.sub(repl=' ', string=w).strip() for w in sents]
        sents = [u.lower() for u in sents]
        return sents

    def __len__(self):
        return len(self.cohesion_score)
示例#30
0
    temp = dict()
    temp['noun'] = word.lower()
    temp['score'] = score
    temp['freq'] = freq

    nouns_list.append(temp)

df_nouns = pd.DataFrame(nouns_list)
df_nouns = df_nouns.sort_values(by=['score'], ascending=False)
nouns_candidates_list = df_nouns.loc[df.score > NOUNS_THRESHOLD].noun.tolist()
print('nouns_candidates_list : {}\n'.format(len(nouns_candidates_list)))

print(''' words extractor ''')
word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)

word_extractor.train(corpus)
words = word_extractor.extract()
words = {k: v for k, v in words.items() if len(k) > 1}
words_list = list()
for k, v in words.items():
    temp = dict()
    cohesion = v.cohesion_forward
    branching_entropy = v.left_branching_entropy
    left_freq = v.leftside_frequency
    right_freq = v.rightside_frequency
    score = cohesion * branching_entropy

    temp['word'] = k.lower()
示例#31
0
文件: soy.py 项目: jjayd/project
from soynlp.noun import NewsNounExtractor
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

corpus_path = "text/news/articles.txt"
#corpus_path = "text/news/input5-1.txt"

corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

#for n_sent, sent in enumerate(corpus):
#    print('sent %d: %s %s\n'%(n_sent, sent, '' ))

we = WordExtractor()
we.train(corpus)
scores = we.word_scores()
print(scores.keys())
'''
sentences = DoublespaceLineCorpus(corpus_path, iter_sent=False)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(sentences)
n = nouns.keys()
lists=""
for a in n:
	lists+=a
	lists+=" "
print(lists)
'''
#top = sorted(nouns.items(), key=lambda x:-x[1].frequency)[:1]
#print(top)