Example #1
def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import NewsNounExtractor
    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)

    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-' * 40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)

    print('{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores)))
    topwords = sorted(
        noun_scores,
        key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-' * 40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(newsnoun_scores)))
    topwords = sorted(newsnoun_scores,
                      key=lambda x: -newsnoun_scores[x].score *
                      newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word,
                                             newsnoun_scores[word].score))
    print('noun extractor test has been done\n\n')
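A minimal way to run this test, assuming a double-space delimited corpus file (one document per line, sentences separated by two spaces); the path below is hypothetical.

# Hypothetical corpus path; replace with a real double-space delimited text file.
noun_extractor_test('./data/news_corpus.txt')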
Example #2
def train():
    # pickle, json, DoublespaceLineCorpus, and LRNounExtractor are used below;
    # `path`, `update_user_dict`, and `update` are module-level names defined
    # elsewhere in the source project.
    import pickle
    import json
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor

    normed_path = path['norm']

    noun_src_path = path['noun']['src']
    noun_lrgraph_path = path['noun']['lrgraph']
    noun_trained_path = path['noun']['train']['pkl']
    noun_readable_path = path['noun']['train']['readable']
    noun_result_path = path['noun']['result']

    corpus = DoublespaceLineCorpus(normed_path, iter_sent=True)

    noun_extractor = LRNounExtractor(verbose=False, min_num_of_features=1)
    nouns = noun_extractor.train_extract(corpus, minimum_noun_score=0.5)

    word_freq = noun_extractor._wordset_l_counter
    lrgraph = noun_extractor.lrgraph
    words = noun_extractor.words

    trained_data = {}
    trained_data['lrgraph'] = lrgraph
    trained_data['words'] = words
    trained_data['word_freq'] = word_freq

    with open(noun_src_path, 'wb') as f:
        pickle.dump(trained_data, f)

    with open(noun_lrgraph_path, 'w', encoding='utf8') as f:
        json.dump(lrgraph, f, ensure_ascii=False, indent=4)

    params = {}
    for noun, noun_score in nouns.items():
        params[noun] = {
            'frequency': noun_score.frequency,
            'score': noun_score.score,
            'known_r_ratio': noun_score.known_r_ratio
        }

    with open(noun_trained_path, 'wb') as f:
        pickle.dump(params, f)

    with open(noun_readable_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params.items()), f, ensure_ascii=False, indent=4)

    with open(noun_result_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params), f, ensure_ascii=False, indent=4)

    update_user_dict()
    update(forced=True)
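`train` depends on a module-level `path` dictionary that is not shown in this snippet. A minimal sketch of the structure it assumes, with hypothetical file names:

# Hypothetical layout of the module-level `path` configuration used by train().
path = {
    'norm': './data/corpus_normed.txt',            # normalized, double-space delimited corpus
    'noun': {
        'src': './models/noun_src.pkl',            # pickled lrgraph / words / word_freq
        'lrgraph': './models/noun_lrgraph.json',
        'train': {
            'pkl': './models/noun_params.pkl',     # pickled noun parameters
            'readable': './models/noun_params.json',
        },
        'result': './models/noun_result.json',
    },
}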
Example #3
def train_extractor(begin_d=None,
                    end_d=None,
                    sections: list = None,
                    base_dir='./out',
                    tokenizer=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d,
                                             end_d=end_d,
                                             sections=sections,
                                             base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')

    noun_extractor = LRNounExtractor()
    # train_extract consumes an iterable of sentences (str) and returns a
    # {noun: NounScore} mapping.
    nouns = noun_extractor.train_extract(sentences)
    noun_score = {key: val.score for key, val in nouns.items()}
    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)
    return keyword_extractor, nouns, corpus_class
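A hedged example of calling `train_extractor`: the date range comes from the commented-out `get_noun_words` call above, 'ltokenizer' is one of the two supported tokenizer names, and `make_corpus`, the soynlp tokenizers, and `CorpusbasedKeywordExtractor` are assumed to be imported in the surrounding module.

# Hypothetical call: keywords from one month of articles, tokenized with an
# LTokenizer weighted by the extracted noun scores.
keyword_extractor, nouns, corpus_class = train_extractor(
    begin_d='20201101',
    end_d='20201130',
    tokenizer='ltokenizer')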
Example #4
def get_keyword(characters):
    # pandas (pd) and soynlp.noun.LRNounExtractor are assumed to be imported at module level.
    df = pd.read_csv("./MbtiApp/keyword/roles.csv")
    stopwords = pd.read_csv("./MbtiApp/keyword/stopwords.csv")["stopwords"]
    sentences = df.iloc[:, 2]
    sentences = list(sentences) + list(characters["feature_total"])
    # Extract nouns
    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)
    nouns = sorted(nouns, key=lambda x: len(x), reverse=True)

    # Remove stopwords
    for sw in stopwords:
        if sw in nouns:
            nouns.remove(sw)

    personal = []
    for i, row in characters.iterrows():
        noun_sen = ""
        for noun in nouns:
            if noun in row["feature_total"]:
                noun_sen = noun_sen + " #" + noun
        personal.append(noun_sen)
    characters["personal"] = personal
    return characters
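A sketch of how `get_keyword` might be called, assuming the two CSV files referenced above exist; the DataFrame below is made-up illustrative data.

import pandas as pd

# Made-up character descriptions; get_keyword() adds a 'personal' hashtag column.
characters = pd.DataFrame({
    'name': ['캐릭터A', '캐릭터B'],
    'feature_total': ['용감하고 정의로운 성격의 주인공', '차분하고 분석적인 전략가'],
})
characters = get_keyword(characters)
print(characters['personal'])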
Example #5
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from wordcloud import WordCloud
from soynlp.noun import LRNounExtractor

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

stopwords_kr = [
    '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은', '많이', '정말',
    '너무'
]


def displayWordCloud(data=None,
                     backgroundcolor='white',
                     width=800,
                     height=600):
    wordcloud = WordCloud(font_path=fontpath,
                          stopwords=stopwords_kr,
                          background_color=backgroundcolor,
                          width=width,
                          height=height).generate(data)
    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


# `content` is the corpus (an iterable of sentences) prepared elsewhere in the source script.
# noun_extractor = LRNounExtractor(verbose=True)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(content)
# nouns = noun_extractor.extract()

print(nouns)
# displayWordCloud(' '.join(nouns))
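`content` is not defined in this snippet; it is the corpus passed to `train_extract`. A minimal, assumed way to build it from a plain-text file (the file name is hypothetical):

# Hypothetical corpus loading: one sentence per line.
with open('./data/news_sentences.txt', encoding='utf-8') as f:
    content = [line.strip() for line in f if line.strip()]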