def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import NewsNounExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)

    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-' * 40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)

    print('{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores)))
    topwords = sorted(
        noun_scores,
        key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-' * 40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(newsnoun_scores)))
    topwords = sorted(
        newsnoun_scores,
        key=lambda x: -newsnoun_scores[x].score * newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, newsnoun_scores[word].score))

    print('noun extractor test has been done\n\n')
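# A minimal usage sketch for the test above. 'news_corpus.txt' is a
# hypothetical path to a double-space-delimited corpus file; substitute
# a real corpus path before running.
if __name__ == '__main__':
    noun_extractor_test('news_corpus.txt')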
import json
import pickle

from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor

# `path`, `update_user_dict`, and `update` are assumed to be defined
# elsewhere in the original module.


def train():
    normed_path = path['norm']
    noun_src_path = path['noun']['src']
    noun_lrgraph_path = path['noun']['lrgraph']
    noun_trained_path = path['noun']['train']['pkl']
    noun_readable_path = path['noun']['train']['readable']
    noun_result_path = path['noun']['result']

    corpus = DoublespaceLineCorpus(normed_path, iter_sent=True)
    noun_extractor = LRNounExtractor(verbose=False, min_num_of_features=1)
    nouns = noun_extractor.train_extract(corpus, minimum_noun_score=0.5)

    word_freq = noun_extractor._wordset_l_counter
    lrgraph = noun_extractor.lrgraph
    words = noun_extractor.words

    trained_data = {}
    trained_data['lrgraph'] = lrgraph
    trained_data['words'] = words
    trained_data['word_freq'] = word_freq

    with open(noun_src_path, 'wb') as f:
        pickle.dump(trained_data, f)
    with open(noun_lrgraph_path, 'w', encoding='utf8') as f:
        json.dump(lrgraph, f, ensure_ascii=False, indent=4)

    params = {}
    for noun, noun_score in nouns.items():
        params[noun] = {
            'frequency': noun_score.frequency,
            'score': noun_score.score,
            'known_r_ratio': noun_score.known_r_ratio
        }

    with open(noun_trained_path, 'wb') as f:
        pickle.dump(params, f)
    with open(noun_readable_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params.items()), f, ensure_ascii=False, indent=4)
    with open(noun_result_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params), f, ensure_ascii=False, indent=4)

    update_user_dict()
    update(forced=True)
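# A hedged sketch of reading back what train() writes. 'noun_train.pkl' is a
# hypothetical stand-in for path['noun']['train']['pkl'] used above; the
# stored structure (noun -> frequency / score / known_r_ratio) is the params
# dict that train() pickles.
import pickle

with open('noun_train.pkl', 'rb') as f:
    params = pickle.load(f)

for noun, info in sorted(params.items(), key=lambda x: -x[1]['score'])[:10]:
    print(noun, info['score'], info['frequency'])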
from soynlp.noun import LRNounExtractor
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
# CorpusbasedKeywordExtractor comes from the soykeyword package;
# make_corpus() is assumed to be defined elsewhere in the original module.
from soykeyword.proportion import CorpusbasedKeywordExtractor


def train_extractor(begin_d=None, end_d=None, sections: list = None,
                    base_dir='./out', tokenizer=None):
    _, sentences, corpus_class = make_corpus(
        begin_d=begin_d, end_d=end_d, sections=sections, base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')

    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    noun_score = dict([(key, val.score) for key, val in nouns.items()])

    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)

    return keyword_extractor, nouns, corpus_class
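# A hedged usage sketch for train_extractor(). The date range mirrors the
# commented-out call above; sections=['economy'] is an illustrative value
# and depends on what make_corpus() accepts.
keyword_extractor, nouns, corpus_class = train_extractor(
    begin_d='20201101', end_d='20201130',
    sections=['economy'], tokenizer='ltokenizer')

# nouns maps each extracted noun to its score object; rank by score.
top_nouns = sorted(nouns.items(), key=lambda x: -x[1].score)[:20]
print(top_nouns)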
import pandas as pd

from soynlp.noun import LRNounExtractor


def get_keyword(characters):
    df = pd.read_csv("./MbtiApp/keyword/roles.csv")
    stopwords = pd.read_csv("./MbtiApp/keyword/stopwords.csv")["stopwords"]
    sentences = df.iloc[:, 2]
    sentences = list(sentences) + list(characters["feature_total"])

    # Extract nouns
    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)
    nouns = sorted(nouns, key=lambda x: len(x), reverse=True)

    # Remove stopwords
    for sw in stopwords:
        if sw in nouns:
            nouns.remove(sw)

    personal = []
    for i, row in characters.iterrows():
        noun_sen = ""
        for noun in nouns:
            if noun in row["feature_total"]:
                noun_sen = noun_sen + " #" + noun
        personal.append(noun_sen)

    characters["personal"] = personal
    return characters
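# A hedged usage sketch for get_keyword(). The one-row DataFrame below is
# illustrative; the roles.csv and stopwords.csv files referenced inside
# get_keyword() must exist at the hard-coded paths for this to run.
import pandas as pd

characters = pd.DataFrame({
    'name': ['character_1'],
    'feature_total': ['밝고 긍정적인 성격의 주인공'],  # sample feature text
})
characters = get_keyword(characters)
print(characters['personal'])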
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
from soynlp.noun import LRNounExtractor
from wordcloud import WordCloud

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

stopwords_kr = [
    '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런',
    '합니다', '많은', '많이', '정말', '너무'
]


def displayWordCloud(data=None, backgroundcolor='white', width=800, height=600):
    wordcloud = WordCloud(font_path=fontpath,
                          stopwords=stopwords_kr,
                          background_color=backgroundcolor,
                          width=width,
                          height=height).generate(data)
    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


# `content` is assumed to be defined earlier as the sentences to analyze.
# noun_extractor = LRNounExtractor(verbose=True)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(content)
# nouns = noun_extractor.extract()
print(nouns)
# displayWordCloud(' '.join(nouns))
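# A hedged follow-up sketch: keep only nouns whose score clears a threshold
# before drawing the cloud. The 0.3 cutoff is an illustrative value; each
# value returned by train_extract() exposes a .score attribute.
strong_nouns = [noun for noun, s in nouns.items() if s.score >= 0.3]
displayWordCloud(' '.join(strong_nouns))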