def build_tokenizer():
    """
    Train a soynlp tokenizer on the whole corpus; it will be used to tokenize Korean input sentences.
    Returns:
    """
    print('Now building soy-nlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train_soynlp.csv')

    df = pd.read_csv(train_file, encoding='utf-8')

    # skip rows whose 'korean' column is not text
    kor_lines = [row.korean for _, row in df.iterrows() if type(row.korean) == str]

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)

    word_scores = word_extractor.extract()
    cohesion_scores = {word: score.cohesion_forward for word, score in word_scores.items()}

    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
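# A minimal usage sketch (not part of the original): it assumes the pickle written above
# exists at 'pickles/tokenizer.pickle' and builds an LTokenizer from the saved cohesion
# scores. `load_tokenizer` is a hypothetical helper name, not a function from the source.
import pickle

from soynlp.tokenizer import LTokenizer

def load_tokenizer(path='pickles/tokenizer.pickle'):
    # load the cohesion scores saved by build_tokenizer()
    with open(path, 'rb') as pickle_in:
        cohesion_scores = pickle.load(pickle_in)
    return LTokenizer(scores=cohesion_scores)

# example: tokenizer = load_tokenizer(); tokenizer.tokenize('한국어 문장을 토크나이즈한다')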
def compute_soy_word_score(corpus_fname, model_fname):
    sentences = [sent.strip() for sent in open(corpus_fname, 'r').readlines()]
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    word_extractor.save(model_fname)
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    # loop variable renamed so the imported pmi() function is not shadowed
    for pair, pmi_value in sorted(pmi_dok.items(), key=lambda x: -x[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi_value))

    print('computed PMI')
def data_tokenize(news_title, tdm_vocab):
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    cluster_data = []
    bert_null_list = []

    for title in news_title:
        title = test(title)  # test(): preprocessing helper defined elsewhere in the original project
        sent = tokenizer.tokenize(title, flatten=False)
        sentence = []
        for i in sent:
            if i[0] in tdm_vocab:
                sentence.append(i[0])
        cluster_data.append(sentence)

    return cluster_data
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenize text using MaxScoreTokenizer of SoyNLP
    """

    def __init__(self):
        self.tokenizer = None
        self.scores = dict()
        self.word_extractor = WordExtractor(min_count=100,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)

    def fit(self, sentences):
        self.word_extractor.train(sentences)
        extracted = self.word_extractor.extract()
        # MaxScoreTokenizer expects a {word: score} mapping, so build a dict
        # (the original built a list of tuples)
        self.scores = {
            word: (score.cohesion_forward + score.cohesion_backward) *
                  (score.left_branching_entropy + score.right_branching_entropy)
            for word, score in extracted.items()
        }
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def state_dict(self):
        return {'scores': self.scores}

    def load_state_dict(self, state_dict):
        self.scores = state_dict['scores']
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def tokenize(self, sentence):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        return tokenized_sentence
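# Usage sketch (not from the original; assumes BaseTokenizer is importable and that
# `sentences` is a list of Korean strings large enough for min_count=100 to yield scores):
#
# tokenizer = SoyNLPTokenizer()
# tokenizer.fit(sentences)
# tokens = tokenizer.tokenize('아이오아이는 아이돌 그룹이다')
#
# # the fitted scores can be serialized and restored without re-training
# state = tokenizer.state_dict()
# restored = SoyNLPTokenizer()
# restored.load_state_dict(state)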
def getTokenizer(self, contents):
    corpus = SentiCorpus(contents, iter_sent=True)
    word_extractor = WordExtractor(corpus)
    word_extractor.train(corpus)
    words_scores = word_extractor.extract()
    scores = {w: s.cohesion_forward for w, s in words_scores.items()}
    return LTokenizer(scores=scores)
def soynlp_tokenizer(corpus):
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # word extractor
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(corpus)
    words = word_extractor.extract()
    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(corpus)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    combined_scores = {noun: score + cohesion_score.get(noun, 0)
                       for noun, score in noun_scores.items()}
    combined_scores.update({subword: cohesion
                            for subword, cohesion in cohesion_score.items()
                            if subword not in combined_scores})

    tokenizer = LTokenizer(scores=combined_scores)

    return tokenizer
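# Usage sketch (not from the original): `corpus` is assumed to be an iterable of Korean
# sentences large enough for min_frequency=100 and noun extraction to be meaningful.
#
# tokenizer = soynlp_tokenizer(corpus)
# print(tokenizer.tokenize('뉴스 기사 제목을 토큰으로 나눈다'))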
def build_tokenizer():
    """
    Train a soynlp tokenizer which will be used to tokenize Korean input sentences.
    Returns:
    """
    print('Now building soynlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_txt = os.path.join(data_dir, 'train.txt')

    with open(train_txt, encoding='utf-8') as f:
        lines = f.readlines()

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(lines)

    word_scores = word_extractor.extract()
    cohesion_scores = {word: score.cohesion_forward for word, score in word_scores.items()}

    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
def data_tokenize(news_title):
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    return tokenizer
def word_extractor_test(corpus_path):
    print('WordExtractor test')

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    word_scores = word_extractor.extract()

    print('top 20 left frequency * forward cohesion words')
    topwords = sorted(
        word_scores,
        key=lambda x: -word_scores[x].cohesion_forward * word_scores[x].leftside_frequency
    )[:20]
    for word in topwords:
        print('word = {}, cohesion = {}'.format(word, word_scores[word].cohesion_forward))

    print('word extractor test has been done\n\n')
def word_extract(datas):
    we = WordExtractor(
        min_frequency=10,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    we.train(datas)
    words = we.extract()

    print('Word (frequency, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(), key=lambda x: word_score(x[1]), reverse=True)[:10]:
        print('%s (%d, %.3f, %.3f)' % (
            word,
            score.leftside_frequency,
            score.cohesion_forward,
            score.right_branching_entropy
        ))

    return
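# `word_score` is not defined in this snippet. A plausible definition, consistent with the
# MaxScoreTokenizer example later in this collection (forward cohesion weighted by the
# exponential of the right branching entropy), would be:
import math

def word_score(score):
    # `score` is the Scores namedtuple returned per word by WordExtractor.extract()
    return score.cohesion_forward * math.exp(score.right_branching_entropy)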
def _get_tokenizer(self, df):
    """ Generate a tokenizer by extracting words

    Args:
        df: data corpus of one language

    Returns:
        tokenizer
    """
    word_extractor = WordExtractor()
    word_extractor.train(df)
    words = word_extractor.extract()
    print(f'length of words is {len(words)}')

    cohesion_scores = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_scores)

    return tokenizer
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)

    x_pmi, x, y = pmi(x, min_pmi=0, alpha=0.0001)

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data

    print('row shape = {}'.format(rows.shape))
    print('col shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))

    print('computed pmi')
def main(args):
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

    print('Word (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(), key=lambda x: word_score(x[1]), reverse=True)[:30]:
        print('%s (%d, %.3f, %.3f)' % (word,
                                       score.leftside_frequency,
                                       score.cohesion_forward,
                                       score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(args.corpus_fname)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {noun: score + cohesion_score.get(noun, 0)
                       for noun, score in noun_scores.items()}
    combined_scores.update({subword: cohesion
                            for subword, cohesion in cohesion_score.items()
                            if subword not in combined_scores})

    # MaxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
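# Sketch (not from the original) of restoring the pickled MaxScoreTokenizer for inference.
# `load_saved_tokenizer` is a hypothetical helper; the path is assumed to be the same file
# written by main() above, and soynlp must be importable where the pickle is loaded.
import pickle

def load_saved_tokenizer(tokenizer_path):
    # restore the MaxScoreTokenizer saved by main()
    with open(tokenizer_path, 'rb') as f:
        return pickle.load(f)

# tokenizer = load_saved_tokenizer('tokenizer.pickle')  # hypothetical path
# tokenizer.tokenize('서울시는 한강공원을 정비한다')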
def soynlp_tokenizer(self):
    def word_score(score):
        return score.cohesion_forward * math.exp(score.right_branching_entropy)

    if self.mode == 'serve':
        with open(self.data_path, 'r') as file:
            word_score_dict = json.load(file)
    elif self.mode == 'train':
        word_extractor = WordExtractor()
        word_extractor.train(self.train_corpus)
        words = word_extractor.extract()
        word_score_dict = {word: word_score(score) for word, score in words.items()}
        with open('./models/word_dict.json', 'w') as file:
            json.dump(word_score_dict, file)
    else:
        pass  # note: if mode is neither 'serve' nor 'train', word_score_dict is undefined below

    tokenizer = MaxScoreTokenizer(scores=word_score_dict)
    return tokenizer
def build_tokenizer():
    """
    Train the soynlp tokenizer that will tokenize incoming Korean sentences.
    """
    print('Now building soy-nlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'corpus.csv')

    # point to the training data and load the file
    df = pd.read_csv(train_file, encoding='utf-8')

    # analyze only the rows that are text
    kor_lines = [row.korean for _, row in df.iterrows() if type(row.korean) == str]

    # soynlp's WordExtractor computes word scores based on branching entropy,
    # accessor variety, and cohesion score. Each of these scores locates token
    # boundaries in a different way; here only the cohesion score (how strongly the
    # characters of a word co-occur) is kept.
    # The exact formulas and code are described in detail at
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)
    word_scores = word_extractor.extract()

    cohesion_scores = {word: score.cohesion_forward for word, score in word_scores.items()}

    # save with pickle
    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
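# For reference, the forward cohesion used above is, per the tutorial linked in the comment,
# the geometric mean of the conditional probabilities of each next character given the prefix:
# cohesion_forward(c_1..c_n) = (count(c_1..c_n) / count(c_1)) ** (1 / (n - 1)).
# A rough illustrative re-implementation from raw substring counts (an assumption-laden
# sketch, not soynlp's actual code):
from collections import Counter

def cohesion_forward(word, subword_counts: Counter):
    # subword_counts maps every observed prefix substring (including single characters)
    # to its corpus frequency
    n = len(word)
    if n < 2 or subword_counts[word[0]] == 0:
        return 0.0
    return (subword_counts[word] / subword_counts[word[0]]) ** (1 / (n - 1))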
from soynlp.tokenizer import LTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

data = pd.read_pickle('./backend/textengines/data/dc_data.pkl')
soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'

sentences = data["title"].values

word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)
word_extractor.train(sentences)
word_extractor.save(soynlp_model_fname)

scores = word_extractor.word_scores()
scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
          for key in scores.keys()}

# soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(data["title"].values[0])

#############################################################################

file = open("./backend/textengines/data/dc_title.txt", "w", encoding="utf-8")
for title in data["title"].values:
    file.write(title)
    file.write("\n")
file.close()

spm_train = """--input=./backend/textengines/data/dc_title.txt \
--model_prefix=sentencepice \
lines = f.read().splitlines()

# NOTE: re.sub returns a new string, so the original's calls on lines[0] without assigning
# the result had no effect; the per-line cleaning in the loop below is what matters.
text = []
for line in lines:
    line = re.sub(r"[\[\]<>~]", ' ', line)
    line = re.sub(r"['~]", ' ', line)
    line = re.sub(r'"', ' ', line)
    line = re.sub('\\W', ' ', line)
    text.append(line)

# word_score
word_extractor = WordExtractor(min_frequency=5)
word_extractor.train(text)
print("train word_extractor complete")

words_scores = word_extractor.extract()
print('complete to extract words_scores')

scores_dictionary = {
    'words_scores': words_scores,
    'noun_scores': [],
    'text': text
}

with open('scores_dictionary.pickle', 'wb') as fw:
    pickle.dump(scores_dictionary, fw)
print("dumping complete")

with open('scores_dictionary.pickle', 'rb') as fr:
class SoyTokenizer:
    def __init__(self, model_path: str = None):
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}

        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def tokenize(self, sent: str):
        return self.tokenizer.tokenize(sent)

    def text_to_id(self, sent: str):
        toks = self.tokenize(sent)
        outp = []
        for s in toks:
            try:
                outp.append(self.tok_to_id[s])
            except KeyError:
                outp.append(self.unk)
        return outp

    def id_to_text(self, idxs: list):
        return [self.id_to_tok[i] for i in idxs]

    def train(self, sentences, add_whitespace: bool = False):
        sentences = self.preprocess(sentences)
        self.word_extractor.train(sentences)
        words = self.word_extractor.extract()
        self.cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

        # add whitespace tokens
        if add_whitespace:
            whitetokens = []
            for s in sentences:
                whitetokens += s.split(' ')
            whitetokens = list(set(whitetokens))

            for t in whitetokens:
                self.cohesion_score.update({t: 1.0})

        # rebuild the tokenizer so it uses the newly trained scores
        # (the original kept the LTokenizer built in __init__ from the old score dict)
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def save_model(self, model_path: str, model_prefix: str):
        with open(os.path.join(model_path, model_prefix + '.model'), 'wb') as saveFile:
            dill.dump(self.cohesion_score, saveFile)

    def _build_dict(self):
        tok_to_id = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
        id_to_tok = {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>'}

        for i, key in enumerate(self.cohesion_score.keys()):
            tok_to_id[key] = i + 4
            id_to_tok[i + 4] = key

        return tok_to_id, id_to_tok

    def preprocess(self, sents: list):
        n_str_pattern = re.compile(pattern='[\\d\\-?/_!\\.,]')
        doublespacing = re.compile(pattern='\\s\\s+')

        sents = [n_str_pattern.sub(repl=' ', string=w) for w in sents]
        sents = [doublespacing.sub(repl=' ', string=w).strip() for w in sents]
        sents = [u.lower() for u in sents]

        return sents

    def __len__(self):
        return len(self.cohesion_score)
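# Usage sketch (not from the original; assumes dill, os, re and the soynlp imports used by
# the class are available, plus a reasonably large list of Korean sentences `sentences`):
#
# tok = SoyTokenizer()
# tok.train(sentences, add_whitespace=True)
# tok.save_model('models', 'soy')            # writes models/soy.model via dill
# ids = tok.text_to_id('안녕하세요 반갑습니다')
# print(tok.id_to_text(ids))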
from soynlp.noun import NewsNounExtractor
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

corpus_path = "text/news/articles.txt"
#corpus_path = "text/news/input5-1.txt"
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

#for n_sent, sent in enumerate(corpus):
#    print('sent %d: %s %s\n' % (n_sent, sent, ''))

we = WordExtractor()
we.train(corpus)
scores = we.word_scores()
print(scores.keys())

'''
sentences = DoublespaceLineCorpus(corpus_path, iter_sent=False)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(sentences)
n = nouns.keys()
lists = ""
for a in n:
    lists += a
    lists += " "
print(lists)
'''

#top = sorted(nouns.items(), key=lambda x: -x[1].frequency)[:1]
#print(top)
class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    pre_trained : bool
        | If True, one of the pre-trained Korean analyzers provided by KoNLPy will be used (default: True)
        | If False, an unsupervised KoreanTokenizer is initialized, based on the soynlp L-Tokenizer.
          The 'analyzer' argument is ignored.
    analyzer : str
        | Type of KoNLPy analyzer (default: Hannanum)
        | Available analyzers are: Hannanum, Kkma, Komoran, Mecab, Okt
        | Note: Mecab needs to be installed separately before being used.

    Methods
    -------
    train
        | Trains KoreanTokenizer on a corpus, only when the 'pre_trained' argument is False.
    tokenize
        | Tokenizes the input sentence and returns its tokens.
    extract_noun
        | Extracts nouns from the input sentence.
    '''

    def __init__(self, pre_trained=True, analyzer='Hannanum'):
        self.pre_trained = pre_trained

        if analyzer == 'Hannanum':
            self.analyzer = tag.Hannanum()
        elif analyzer == 'Kkma':
            self.analyzer = tag.Kkma()
        elif analyzer == 'Komoran':
            self.analyzer = tag.Komoran()
        elif analyzer == 'Mecab':
            self.analyzer = tag.Mecab()
        elif analyzer == 'Okt':
            self.analyzer = tag.Okt()
        else:
            if pre_trained == False:
                pass
            else:
                print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

        self.WordExtractor = WordExtractor(min_frequency=0)
        self.noun_extractor = LRNounExtractor(verbose=False)
        self.word_score = {}

    def train(self, text):
        '''
        A method to train the KoreanTokenizer on a corpus.
        If KoreanTokenizer.pre_trained == True, this method does nothing.

        Attributes
        ----------
        text : str
            | An input text in str type
        '''
        if self.pre_trained == True:
            print('A pre-trained KoreanTokenizer is being used. No need to train it.')
            return
        else:
            self.WordExtractor.train(text)
            self.words = self.WordExtractor.extract()

            def calculate_word_score(word, score):
                cohesion = score.cohesion_forward
                branching_entropy = score.right_branching_entropy
                # exp: assumed to come from a module-level `from math import exp` in the original project
                word_score = cohesion * exp(branching_entropy)
                return word_score

            self.word_score = {word: calculate_word_score(word, score)
                               for word, score in self.words.items()}

    def tokenize(self, text):
        '''
        A method to tokenize input text.

        Attributes
        ----------
        text : str
            | An input text to be tokenized

        Output
        ------
        tokens : list
            | List of tokens (in str) that consist of the input text
        '''
        if self.pre_trained == True:
            return self.analyzer.morphs(text)
        else:
            if not self.word_score:
                print('An unsupervised KoreanTokenizer should be trained first, before tokenizing.')
                return
            self.tokenizer = LTokenizer(scores=self.word_score)
            result = self.tokenizer.tokenize(text)
            return result

    def extract_noun(self, text):
        '''
        A method to extract nouns from input text.

        Attributes
        ----------
        text : str
            | An input text from which nouns will be extracted

        Output
        ------
        nouns : list
            | List of noun tokens (in str) in the input text
        '''
        if self.pre_trained == True:
            return self.analyzer.nouns(text)
class Embedding:
    MODEL_SAVED_DIR = "saved_model/fasttext.model"
    TOKENIZER_SAVED_DIR = "saved_model/tokenizer.pkl"

    def __init__(self, dataset: pd.DataFrame, word_train: bool):
        self.dataset = dataset
        self.corpus = dataset["TITLE"] + dataset["TEXTCONTENT"]

        if word_train == False:
            self.fasttext = FastText.load(self.MODEL_SAVED_DIR)
            self._load_tokenizer()
            self._tokenize()
        else:
            self._extracte()
            self._tokenize()
            self._save_tokenizer()
            self._train()

        self.idx_word_dict = dict(
            zip(np.arange(4, len(self.fasttext.wv.vectors) + 4),
                self.fasttext.wv.index2word))
        self.idx_word_dict[0] = '<PAD>'
        self.idx_word_dict[1] = '<STA>'
        self.idx_word_dict[2] = '<EOS>'
        self.idx_word_dict[3] = '<UNK>'

    def _extracte(self) -> None:
        self.extractor = WordExtractor()
        self.extractor.train(self.corpus)
        self.words = self.extractor.extract()
        self.cohesion_score = {word: score.cohesion_forward for word, score in self.words.items()}
        self.tokenizer = LTokenizer(scores=self.cohesion_score)

    def _tokenize(self) -> pd.DataFrame:
        self.corpus = self.corpus.apply(lambda text: self.tokenizer.tokenize(text))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(lambda text: self.tokenizer.tokenize(text))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(lambda text: self.tokenizer.tokenize(text))

    def _save_tokenizer(self) -> None:
        with open(self.TOKENIZER_SAVED_DIR, "wb") as f:
            pickle.dump(self.tokenizer, f, pickle.HIGHEST_PROTOCOL)

    def _load_tokenizer(self) -> None:
        with open(self.TOKENIZER_SAVED_DIR, "rb") as f:
            self.tokenizer = pickle.load(f)

    def _train(self) -> None:
        self.fasttext = FastText(sentences=self.corpus, size=100, window=5, min_count=1, iter=100)
        self.fasttext.save(self.MODEL_SAVED_DIR)

    def dataset_to_embedding(self) -> pd.DataFrame:
        self.dataset["TITLE_IDX"] = self.dataset["TITLE"].apply(self._sentence_length_fix, args=[10])
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(self._sentence_length_fix, args=[10])
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(self._sentence_length_fix, args=[32])

        for index, value in self.dataset["TITLE_IDX"].iteritems():
            assert len(value) == 10
        for index, value in self.dataset["TITLE"].iteritems():
            assert len(value) == 10
        for index, value in self.dataset["TEXTCONTENT"].iteritems():
            assert len(value) == 32

        self.dataset["TITLE_IDX"] = self.dataset["TITLE_IDX"].apply(
            lambda tokenized: np.array([self._word_to_idx(token) for token in tokenized]))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            lambda tokenized: np.array([self._word_to_vec(token) for token in tokenized]))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            lambda tokenized: np.array([self._word_to_vec(token) for token in tokenized]))

        return self.dataset

    def embedding_to_sentence(self, target: list or np.array) -> list:
        return [self._vec_to_word(vector) for vector in target]

    def _sentence_length_fix(self, sentence: list or np.array, length: int) -> list or np.array:
        sentence_length = len(sentence)
        if sentence_length < length:
            while len(sentence) < length:
                sentence.append('<PAD>')
        elif sentence_length > length:
            sentence = sentence[:length]
        return sentence

    def _vec_to_word(self, vector) -> str:
        if np.array_equal(vector, np.eye(100, dtype=np.float32)[0]):
            return '<PAD>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[1]):
            return '<STA>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[2]):
            return '<EOS>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[3]):
            return '<UNK>'
        return self.fasttext.wv.most_similar(positive=[vector], topn=1)[0][0]

    def _word_to_vec(self, word) -> np.array:
        try:
            if word == '<PAD>':
                return np.eye(100, dtype=np.float32)[0]
            elif word == '<STA>':
                return np.eye(100, dtype=np.float32)[1]
            elif word == '<EOS>':
                return np.eye(100, dtype=np.float32)[2]
            elif word == '<UNK>':
                return np.eye(100, dtype=np.float32)[3]
            return self.fasttext.wv.word_vec(word)
        except:
            return np.eye(100, dtype=np.float32)[3]

    def _word_to_idx(self, word) -> int:
        try:
            return list(self.idx_word_dict.keys())[list(self.idx_word_dict.values()).index(word)]
        except:
            return 3

    def _idx_to_word(self, idx) -> str:
        return self.idx_word_dict[idx]
#print("ㅋ이 들어간 문장 중 ㅋ의 길이의 표준편차: ", np.std(np_single))
f.write("ㅋ이 들어간 chat 중 ㅋ의 평균 길이: " + str(round(np.mean(np_single), 3)) + '\n')
# the mean is not reliable here: the histogram, variance and standard deviation show the values are clustered -> also look at the median
f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 중앙값: " + str(np.median(np_single)) + '\n')
# the median is not reliable either, since the distribution is not centered -> also consider the mode
f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 최빈값(상위3개): " + str(Counter(np_single).most_common()[:3]) + '\n')  # check the top 3

n, bins, patches = plt.hist(np_single, bins=sentence_cnt)  # histogram of the average number of 'ㅋ' occurrences in sentences containing it
plt.savefig(result_path + "/" + file_num + ".png")
f.close()

raw_time, raw_chat = read_data(file_name)
laugh_check(raw_chat)

'''
Unsupervised methods that discover words based on statistics:
1. Accessor Variety
2. Branching Entropy
3. Cohesion score
'''
word_extractor = WordExtractor(
    min_frequency=20,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)  # the cohesion score is used here

'''
word_extractor.train(raw_chat)
words = word_extractor.extract()
print("word extraction 길이: ", len(words), " \n결과: ")
print(words)

#words_score = {word: score.cohesion_forward for word, score in words.items()}
#tokenizer = LTokenizer(scores=words_score)
'''
f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 최빈값(상위3개): " + str(Counter(np_single).most_common()[:3]) + '\n')  # check the top 3

n, bins, patches = plt.hist(np_single, bins=sentence_cnt)  # histogram of the average number of 'ㅋ' occurrences in sentences containing it
plt.savefig(result_path + "/" + file_num + ".png")
f.close()

raw_time, raw_chat = read_data(file_name)
laugh_check(raw_chat)

'''
Unsupervised methods that discover words based on statistics:
1. Accessor Variety
2. Branching Entropy
3. Cohesion score
'''
word_extractor = WordExtractor(
    min_frequency=20,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)  # the cohesion score is used here
word_extractor.train(raw_chat)
words = word_extractor.extract()

'''
print("word extraction 길이: ", len(words), " \n결과: ")
print(words)

#words_score = {word: score.cohesion_forward for word, score in words.items()}
#tokenizer = LTokenizer(scores=words_score)
'''
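# The commented-out lines above hint at the next step; a minimal sketch (assuming the
# `words` dict produced by word_extractor.extract() above) that builds an L-Tokenizer
# from the forward cohesion scores:
from soynlp.tokenizer import LTokenizer

words_score = {word: score.cohesion_forward for word, score in words.items()}
tokenizer = LTokenizer(scores=words_score)
# tokens = tokenizer.tokenize(raw_chat[0])  # raw_chat comes from read_data() above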
raw_data.append(sent)

# -------------------------- load the tokenizer --------------------
import numpy as np
from soynlp.word import WordExtractor
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer

word_extractor = WordExtractor(
    min_frequency=100,  # example
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0)
word_extractor.train(news_title)
words = word_extractor.extract()

cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
tokenizer = LTokenizer(scores=cohesion_score)

# -------------------------- word2vec data preprocessing --------------------
cluster_data = []
for k, title in enumerate(news_title):
    title = test(title)  # preprocessing helper defined elsewhere in the original project
    sent = tokenizer.tokenize(title, flatten=False)
y_train.append(i[2])

#x_data = []
#y_data = []
#for i in raw_data:
#    x_data.append(i[1])
#    y_data.append(i[2])
#x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

word_extractor = WordExtractor(min_frequency=150,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)
word_extractor.train(x_train)
train_words = word_extractor.extract()
train_score = {word: score.cohesion_forward for word, score in train_words.items()}
tokenizer = LTokenizer(scores=train_score)

train_list = []
cnt = 0
for sent in x_train:
    train_list.append([tokenizer.tokenize(sent), y_train[cnt]])
    cnt += 1

word_extractor.train(x_test)
test_words = word_extractor.extract()
test_score = {
def Makegraph_Wordcloud_Soynlp(target):
    try:
        if flag_login == 0 or flag_login == None or flag_login == '':
            Login()
        #elif flag_prepro == 0:
        #    messagebox.showwarning('주의', '데이터 전처리 후 실행해주세요.')
        #    return
        else:
            data_wordcloud_soynlp = pd.DataFrame(data_origin[target], columns=['contents'])
            data_wordcloud_soynlp['contents'] = data_origin[target].apply(
                lambda x: re.sub('[^가-힣]', ' ', x))  # keep Hangul characters only

            word_extractor = WordExtractor(
                min_frequency=10,  # TODO: make this adaptive (e.g., proportional to the size of data_origin)
                min_cohesion_forward=0.05,
                min_right_branching_entropy=0.0)
            word_extractor.train(data_wordcloud_soynlp['contents'].values)
            words = word_extractor.extract()

            cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

            # force-join domain-specific compound words
            cohesion_score['숙소제공'] = 1
            cohesion_score['교통비지급'] = 1
            cohesion_score['인센티브'] = 1
            cohesion_score['초과근무시간확대'] = 1
            cohesion_score['복지포인트'] = 1
            cohesion_score['인사우대'] = 1
            cohesion_score['근평가점'] = 1
            cohesion_score['주거이전수당'] = 1

            tokenizer = LTokenizer(scores=cohesion_score)
            data_wordcloud_soynlp['tokenizer'] = data_wordcloud_soynlp['contents'].apply(
                lambda x: tokenizer.tokenize(x, remove_r=True))

            words = list()
            for i in data_wordcloud_soynlp['tokenizer'].values:
                for j in i:
                    words.append(j)

            count_soynlp = Counter(words)
            words_dict_soynlp = dict(count_soynlp.most_common(100))  # top 100 by frequency

            csv_stopwords = pd.read_csv('stopwords.csv', encoding='cp949', skiprows=0)  # TODO: switch to `with open`
            stopwords = list()
            for i in csv_stopwords.values:
                for j in i:
                    stopwords.append(j)

            for word in stopwords:
                words_dict_soynlp.pop(word, None)

            wordcloud = WordCloud(
                font_path='NanumGothic.ttf',
                width=500,
                height=500,
                background_color='white').generate_from_frequencies(words_dict_soynlp)

            plt.clf()
            plt.figure(figsize=(20, 20))
            plt.imshow(wordcloud)
            plt.axis('off')
            #plt.show()
            plt.savefig(resultdir + filename_dateflag + target + ' - wordcloud_soynlp.png', dpi=100)

            '''
            # frequency bar chart (temporary)
            plt.clf()
            plt.style.use('ggplot')
            plt.figure(figsize=(len(list(words_dict_soynlp.keys())[:20]) * 0.6, 10))  # make the grid size adaptive
            plt.title('상위 10개 빈출단어')
            plt.bar(list(words_dict_soynlp.keys())[:20], list(words_dict_soynlp.values())[:20])
            plt.xticks(rotation=45, ha='right')  # rotate x-axis labels
            plt.savefig(resultdir + filename_dateflag + target + ' - wordfrequency.png', dpi=200)
            '''

            messagebox.showinfo(
                '작업', '워드클라우드(Soynlp) 생성이 완료되었습니다.\n\nresult폴더에 결과물이 저장되었습니다.')
    except Exception as e:
        Log(desc=e)
        messagebox.showerror('경고', str(e) + ' 열을 찾을 수 없습니다.')
from math import exp

from soynlp.word import WordExtractor
from soynlp.utils import check_corpus
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer


class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    **kwargs
        | Keyword arguments for the WordExtractor object (see soynlp.word.WordExtractor)

    Methods
    -------
    train
        | Trains KoreanTokenizer on a corpus
    tokenize
        | Tokenizes the input sentence and returns its tokens
    '''

    def __init__(self, **kwargs):
        if 'sents' in kwargs.keys():
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored.")

        self.WordExtractor = WordExtractor(**kwargs)
        self.word_score = {}  # populated by train()

    def train(self, text, **kwargs):
        '''
        A method to train the KoreanTokenizer object.

        Attributes
        ----------
        text : iterable or DoublespaceLineCorpus
            | An input text in any iterable type (e.g. list)
            | or a DoublespaceLineCorpus object (see soynlp.utils.DoublespaceLineCorpus)
        **kwargs
            | Keyword arguments for the WordExtractor.train() method (see soynlp.word.WordExtractor.train)
        '''
        if 'sents' in kwargs.keys():
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored; WordExtractor is trained on 'text' argument only.")

        self.WordExtractor.train(text, **kwargs)
        self.words = self.WordExtractor.extract()

        def calculate_word_score(word, score):
            cohesion = score.cohesion_forward
            branching_entropy = score.right_branching_entropy
            word_score = cohesion * exp(branching_entropy)
            return word_score

        # note: the original referenced an undefined name 'words' here; it should be self.words
        self.word_score = {word: calculate_word_score(word, score)
                           for word, score in self.words.items()}

    def tokenize(self, text, **kwargs):
        '''
        A method to tokenize the input text.

        Attributes
        ----------
        text : str
            | An input text in str type
        **kwargs
            | Keyword arguments for the LTokenizer.tokenize() method (see soynlp.tokenizer.LTokenizer.tokenize)
        '''
        if 'sentence' in kwargs.keys():
            del kwargs['sentence']
            print("WARNING: 'sentence' argument is ignored; word_tokenizer tokenizes 'text' argument only.")

        if not self.word_score:
            print('KoreanTokenizer should be trained first, before tokenizing.')
            return

        self.tokenizer = LTokenizer(scores=self.word_score)
        result = self.tokenizer.tokenize(text, **kwargs)

        return result
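# Usage sketch (not from the original; assumes soynlp is installed and `corpus` is a list of
# Korean sentences large enough for word extraction to be meaningful):
#
# tokenizer = KoreanTokenizer(min_frequency=10)   # kwargs are forwarded to WordExtractor
# tokenizer.train(corpus)
# print(tokenizer.tokenize('자연어 처리는 재미있다'))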
    temp['score'] = score
    temp['freq'] = freq
    nouns_list.append(temp)

df_nouns = pd.DataFrame(nouns_list)
df_nouns = df_nouns.sort_values(by=['score'], ascending=False)
# note: the original filtered on df.score; df_nouns is the frame being filtered here
nouns_candidates_list = df_nouns.loc[df_nouns.score > NOUNS_THRESHOLD].noun.tolist()
print('nouns_candidates_list : {}\n'.format(len(nouns_candidates_list)))

print('''
words extractor
''')

word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)
word_extractor.train(corpus)
words = word_extractor.extract()
words = {k: v for k, v in words.items() if len(k) > 1}

words_list = list()
for k, v in words.items():
    temp = dict()
    cohesion = v.cohesion_forward
    branching_entropy = v.left_branching_entropy
    left_freq = v.leftside_frequency
    right_freq = v.rightside_frequency
    score = cohesion * branching_entropy

    temp['word'] = k.lower()
    temp['cohesion'] = cohesion
    temp['branching_entropy'] = branching_entropy
    temp['left_freq'] = left_freq
isSave = False

if isSave:
    txtreader = txt_reader("../Data/Text/Joins/Sasul.txt", False)
    list_words = []
    list_sents = []  # this selection will have its own limitations

    for i, doc in enumerate(txtreader):
        doc_text = doc.split("\t")[4]
        # split into sentences
        sents = doc_text.split('.')
        for sent in sents:
            list_sents.append(sent)

    print("length of list_sents = {}", len(list_sents))

    word_extractor = WordExtractor(min_count=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(list_sents)  # list of str or alike
    words = word_extractor.extract()

    with open("words.pkl", "wb") as f:
        pickle.dump(words, f)
else:
    # statistical methods need a large number of occurrences to work well
    print("Load")
    with open("words.pkl", "rb") as f:
        words_dic = pickle.load(f)
    print(type(words_dic))

    nlphelper = nlp_helper()
    nlphelper.cvtWordDicToExcel(words_dic, "output")