def _train_noun_extractor(self, sents,
        min_num_of_features=1, max_frequency_when_noun_is_eojeol=30,  # noun init
        min_noun_score=0.3, min_noun_frequency=1, min_eojeol_frequency=1):  # noun extraction

    self.noun_extractor = LRNounExtractor_v2(
        extract_pos_feature=False,
        extract_determiner=False,
        extract_compound=False,
        ensure_normalized=self._ensure_normalized,
        verbose=self._verbose,
        min_num_of_features=min_num_of_features,
        max_frequency_when_noun_is_eojeol=max_frequency_when_noun_is_eojeol
    )

    self.noun_extractor.train(sents, min_eojeol_frequency)
    nouns = self.noun_extractor.extract(min_noun_score,
        min_noun_frequency, reset_lrgraph=False)

    return nouns
def soynlp_tokenizer(corpus):
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # word extractor
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(corpus)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(corpus)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if subword not in combined_scores
    })

    tokenizer = LTokenizer(scores=combined_scores)
    return tokenizer
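# Usage sketch for the builder above (hedged: 'corpus.txt' is a hypothetical path;
# any iterable of Korean sentence strings works as the corpus argument).
from soynlp import DoublespaceLineCorpus

corpus = DoublespaceLineCorpus('corpus.txt', iter_sent=True)
tokenizer = soynlp_tokenizer(corpus)
print(tokenizer.tokenize('하루의 뉴스를 학습했습니다'))  # each eojeol split at the best-scoring L/R boundary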
def get_noun_words(begin_d=None, end_d=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d, end_d=end_d)
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    # noun_words = [(-stat.score, word, stat.frequency) for word, stat in nouns.items()]
    return nouns
def noun_corpus(sents):
    noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=True)
    noun_extractor.train(sents)
    nouns = noun_extractor.extract()
    noun_scores = {noun: score[0] for noun, score in nouns.items() if len(noun) > 1}
    tokenizer = NounLMatchTokenizer(noun_scores)
    corpus = [tokenizer.tokenize(sent) for sent in sents]
    return corpus
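# Hedged note on the indexing above: in recent soynlp releases extract() returns
# {noun: NounScore(frequency, score)} namedtuples, so score[0] indexes the frequency
# while score[1] / score.score is the noun score; verify against the installed
# version before relying on positional access.
# Quick inspection, assuming `nouns` comes from the extractor above:
# for noun, stat in list(nouns.items())[:5]:
#     print(noun, stat.frequency, stat.score)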
def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import LRNounExtractor_v2
    from soynlp.noun import NewsNounExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)

    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-' * 40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)

    print('{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores)))

    topwords = sorted(
        noun_scores,
        key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-' * 40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(newsnoun_scores)))

    topwords = sorted(
        newsnoun_scores,
        key=lambda x: -newsnoun_scores[x].score * newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, newsnoun_scores[word].score))

    print('noun extractor test has been done\n\n')

    # LRNounExtractor_v2
    print('\nNounExtractor_v2 test\n{}'.format('-' * 40))
    noun_extractor_v2 = LRNounExtractor_v2()
    noun_scores_v2 = noun_extractor_v2.train_extract(corpus)
    noun_scores_v2 = {
        noun: score for noun, score in noun_scores_v2.items()
        if len(noun) > 1
    }

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores_v2)))

    topwords = sorted(
        noun_scores_v2,
        key=lambda x: -noun_scores_v2[x].score * noun_scores_v2[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores_v2[word].score))

    print('noun extractor test has been done\n\n')
def train_lexicon(self, document_path):
    sentence_list = DoublespaceLineCorpus(document_path, iter_sent=True)
    compound_extractor = LRNounExtractor_v2(verbose=True)
    compounds = compound_extractor.train_extract(sentence_list)

    p = re.compile("[^a-zA-Z0-9가-힣_]+")
    compound_list = [
        n for n, score in compounds.items()
        if len(p.findall(n)) == 0 and score[0] + score[1] > 5 and len(n) > 2
    ]

    train_ner_lexicon = []
    for compound in compound_list:
        train_ner_lexicon.append((compound, "UNK"))

    for word, ner_tag in train_ner_lexicon:
        if word not in self.ner_lexicon:
            self.ner_lexicon[word] = [ner_tag]
def main(args):
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)

    # word extractor
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }

    print('Word (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(),
                              key=lambda x: word_score(x[1]),
                              reverse=True)[:30]:
        print('%s (%d, %.3f, %.3f)' % (word,
                                       score.leftside_frequency,
                                       score.cohesion_forward,
                                       score.right_branching_entropy))

    # noun extractor
    # reuse the sentence iterable; passing the raw path string would iterate over its characters
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if subword not in combined_scores
    })

    # MaxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
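# Reloading the saved tokenizer later is symmetric (hedged sketch; 'tokenizer.pkl'
# stands in for whatever args.tokenizer_path was when main() ran):
import pickle

with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
print(tokenizer.tokenize('하루의 뉴스를 학습했습니다'))  # MaxScoreTokenizer splits each eojeol by the best score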
def get_data(text):
    # words to drop from the extracted nouns
    delli = [
        '등', '것', '위', '대', '뒤', '오', '통', '또', '수', '말', '더', '못',
        '새', '인', '있', '점', '올', '많', '때', '측', '기자', '종목', '수익률', 'https'
    ]
    noun_extractor = LRNounExtractor_v2(verbose=False)
    nouns = noun_extractor.train_extract(text.split(' '))

    nouns_data = Counter()
    for word in delli:
        if word in nouns:
            del nouns[word]

    for word, data in nouns.items():
        nouns_data += Counter({word: int(data[0])})

    return nouns_data
def build_vocab(config, data=None):
    if data is not None:
        sents = MyIterator(data)
    else:
        sents = MyIterator(config.data_path)

    noun_extractor = LRNounExtractor_v2(verbose=False)
    nouns = noun_extractor.train_extract(sents)

    noun_dict = {}
    for noun, score in nouns.items():
        if (score.frequency >= config.min_frequency
                and score.score >= config.min_score
                and len(noun) > config.min_length):
            noun_dict[noun] = score.score

    vocab_path = os.path.join(config.save_path, 'vocab.pkl')
    config.vocab_path = vocab_path
    # save_pickle(vocab_path, noun_dict)

    tokenizer = MaxScoreTokenizer(noun_dict)
    if data is not None:
        word2vec_corpus = Word2VecCorpus(data, tokenizer)
    else:
        word2vec_corpus = Word2VecCorpus(config.data_path, tokenizer)

    word2vec_model = Word2Vec(
        word2vec_corpus,
        size=config.word_hidden_size,
        alpha=0.025,
        window=5,
        min_count=config.min_frequency,
        sg=0,
        negative=5)

    word2vec_path = os.path.join(config.save_path,
                                 'word2vec{}.model'.format(config.word_hidden_size))
    config.word2vec_path = word2vec_path
    # word2vec_model.save(word2vec_path)

    return noun_dict, word2vec_model
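# After build_vocab() returns, the gensim model can be queried as usual (hedged
# example; assumes a populated `config`, and '데이터' is an arbitrary word that may
# not be in this particular vocabulary):
noun_dict, word2vec_model = build_vocab(config)
if '데이터' in word2vec_model.wv:
    print(word2vec_model.wv.most_similar('데이터', topn=5))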
def _extract_nouns(self, sentences):
    noun_extractor = LRNounExtractor_v2(
        l_max_length=self.l_max_length,
        r_max_length=self.r_max_length,
        min_eojeol_count=2,
        min_num_of_features=2,
        max_count_when_noun_is_eojeol=15,
        extract_compound=False,
        logpath=self.logpath,
        extract_pos_feature=True,
        verbose=self.verbose)

    noun_extractor.train(sentences)
    nouns = noun_extractor.extract(
        reset_lrgraph=False,
        min_count=10,
        minimum_noun_score=0.4,
    )

    self._lrgraph = LRGraph({
        l: {r: v for r, v in rdict.items()}
        for l, rdict in noun_extractor.lrgraph._lr.items()
    })
    self._num_of_eojeols = noun_extractor._num_of_eojeols
    self._num_of_covered_eojeols = noun_extractor._num_of_covered_eojeols
    self.noun_extractor = noun_extractor

    if self.verbose:
        message = 'noun extraction was done. {} % eojeols are covered'.format(
            '%.2f' % (100 * self._num_of_covered_eojeols / self._num_of_eojeols))
        self._print(message, replace=True, newline=True)

    return nouns
def noun_extract(datas):
    ne = LRNounExtractor_v2(verbose=True)
    nouns = ne.train_extract(datas)
    print(list(ne._compounds_components.items())[:5])
    return nouns
data_path = company_name + '_labeled_data.csv'  # load from the csv file

# contents: each article converted to a string and put in a list; points: class 0 or 1
contents, points = tool.loading_rdata(data_path)

# build the dictionary file
if not os.path.isfile('preprocessed_' + company_name + '.csv'):
    print("\n")
    print('"preprocessed_' + company_name + '.csv" does not EXIST!')
    print('MAKE "preprocessed_' + company_name + '.csv" FILE... Here we go~!!')
    print("\n")

    doc = pd.read_csv(data_path, index_col='datetime')
    contents = []
    for i in range(len(doc['text'])):
        if len(doc.iloc[i]['text']) > 100:
            contents.append(doc.iloc[i]['text'])

    noun_extractor = LRNounExtractor_v2(verbose=True)
    nouns = noun_extractor.train_extract(contents, min_noun_frequency=20)
    match_tokenizer = NounLMatchTokenizer(nouns)

    f = open('preprocessed_' + company_name + '.csv', 'w', newline='', encoding='utf-8')
    fieldnames = ['text', 'num']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

    test = []
    for j in range(len(contents)):
        temp_list = match_tokenizer.tokenize(contents[j])
        del_list2 = []
        for i in range(len(temp_list)):
# %%
# noun extraction using soynlp
language_list = ["python", "java", "c", "etc"]
# language_list = ["java", "etc"]
from soynlp.noun import LRNounExtractor_v2

for language_name in language_list:
    title_list = []
    df = pd.read_csv(
        f"../../analyze_data/{language_name}/{language_name}_team.csv")
    titles = df["title"]
    for title in titles:
        title_list.append(title)

    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(title_list)
    if language_name == "python":
        del nouns["Python"]
    if language_name == "java":
        del nouns["Java"]

    # for noun, lank in nouns.items():
    #     print(noun, lank)

    displayWordCloud(language_name, " ".join(nouns))
    # noun_extractor._compounds_components.get(title_list, None)

# %%
# language_list = ["java", "etc"]
from soynlp.noun import LRNounExtractor_v2
def analyzeSentence(sentences):
    noun_extractor = LRNounExtractor_v2(verbose=True)
    nouns = noun_extractor.train_extract(sentences)
    return nouns
def __init__(self, sents):
    self.inst = LRNounExtractor_v2(verbose=False, extract_compound=True)
    self.inst.train(sents)
    self.inst.extract()
import pandas as pd
import re
from collections import Counter
from itertools import chain
from soynlp.noun import LRNounExtractor_v2

WORDS_THRESHOLD = 2
NOUNS_THRESHOLD = 0.9
TOKENS_THRESHOLD = 0.15
USERS_THRESHOLD = 0.15

df = pd.read_pickle('./data/df_raw.pkl')
corpus = df.text.tolist()

print('''
nouns extractor
''')
noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=True)
noun_extractor.train(corpus)
nouns = noun_extractor.extract(min_noun_frequency=100, min_noun_score=0.3)

nouns_list = list()
for k, v in nouns.items():
    word = k
    score = v.score
    freq = v.frequency
    temp = dict()
    temp['noun'] = word.lower()
    temp['score'] = score
    temp['freq'] = freq
    nouns_list.append(temp)
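# The list of dicts above converts naturally to a DataFrame for filtering; a hedged
# sketch, assuming NOUNS_THRESHOLD (0.9) is intended as the minimum noun score:
nouns_df = pd.DataFrame(nouns_list)
nouns_df = nouns_df[nouns_df['score'] >= NOUNS_THRESHOLD].sort_values('freq', ascending=False)
print(nouns_df.head(10))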
    'noun_scores': [],
    'text': text
}

with open('scores_dictionary.pickle', 'wb') as fw:
    pickle.dump(scores_dictionary, fw)
    print("dumping complete")

with open('scores_dictionary.pickle', 'rb') as fr:
    scores_dictionary = pickle.load(fr)
    print("loading complete")

## noun score
# nouns only
noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=False)  # no compound extraction
nouns = noun_extractor.train_extract(text)  # list of str like
noun_scores = {noun: score.score for noun, score in nouns.items()}
print("extracting noun")
# print(list(noun_extractor._compounds_components.items())[:5])

scores_dictionary['noun_scores'] = noun_scores

with open('scores_dictionary.pickle', 'wb') as fw:
    pickle.dump(scores_dictionary, fw)
    print("dumping complete")

"""
Noun = []
for noun, score in nouns.items():