def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError("regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
            regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests have succeeded\n\n')
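# The test above only checks the flat output. Several snippets below also rely on
# tokenize(..., flatten=False), which returns, per eojeol, a list of
# (word, begin, end, score, length) tuples. A minimal sketch using the same toy scores:
from soynlp.tokenizer import MaxScoreTokenizer

toy_tokenizer = MaxScoreTokenizer(scores={'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})

# flat list of subwords (default)
print(toy_tokenizer.tokenize('데이터는 데이터센터의 데이데이'))
# -> ['데이터', '는', '데이터', '센터의', '데이', '데이'] (as asserted in the test above)

# per-eojeol tuples with positions and scores
for eojeol_tokens in toy_tokenizer.tokenize('데이터는 데이터센터의', flatten=False):
    for word, begin, end, score, length in eojeol_tokens:
        print(word, begin, end, score, length)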
def __init__(self):
    # Pretrained Word2Vec model (ko.bin)
    self.load("ko.bin")

    # label    -> tokenized labels
    # label_nt -> non-tokenized labels
    label = list()
    label_nt = list()

    # Priority scores for the MaxScoreTokenizer
    scores = {'메일': 1, '이메일': 1, '교수님': 1, '교수': 0.8, '학식': 1, '기식': 1,
              '오늘': 0.9, '넘버': 0.7, '소웨': 0.8,
              '연락처': 1, '전화번호': 1, '번호': 0.8, '핸드폰': 1, '휴대폰': 1,
              '전화': 0.8, '전번': 0.5,
              '사무실': 1, '연구실': 1, '랩실': 1, '렙실': 1, '어디': 1,
              '학생식당': 1, '기숙사식당': 1, '학과사무실': 1, '과사': 0.8,
              '과사무실': 1.0, '위치': 0.8, '소중사': 1.0, '소프트웨어중심사업단': 1.0}

    # soynlp tokenizer
    tokenizer = MaxScoreTokenizer(scores=scores)

    # Read data: one labeled utterance per line
    f = open("intend_label.txt", 'r')
    while True:
        line = f.readline()
        if not line:
            break
        line = line.replace("\n", "")
        label_nt.append(line)
        b = tokenizer.tokenize(line)
        label.append(b)
    f.close()

    self.tokenizer = tokenizer
    self.label = label
    self.label_nt = label_nt
    self.files = Files()
    self.prep = Preprocess()
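# Hedged sketch of what the priority-score tokenizer above does to a single line of
# intend_label.txt. The example utterances and the reduced score dictionary below are
# assumptions for illustration, not the project's actual data.
from soynlp.tokenizer import MaxScoreTokenizer

intent_scores = {'교수님': 1, '이메일': 1, '연구실': 1, '위치': 0.8}
intent_tokenizer = MaxScoreTokenizer(scores=intent_scores)

print(intent_tokenizer.tokenize('교수님 이메일 알려줘'))   # expected: ['교수님', '이메일', '알려줘']
print(intent_tokenizer.tokenize('연구실위치가 어디야'))     # expected: ['연구실', '위치', '가', '어디야']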
def _extract_compound_nouns(self, eojeols, nouns, suffix):

    def parse_compound(tokens):
        for token in tokens[:-1]:
            if token[3] <= 0:
                return None
        # Noun* + Josa
        if len(tokens) >= 3 and (tokens[-1][0] in suffix):
            return ''.join(t[0] for t in tokens[:-1])
        # all tokens are noun
        if tokens[-1][3] > 0:
            return ''.join(t[0] for t in tokens)
        # else, not compound
        return None

    tokenizer = MaxScoreTokenizer(
        scores={noun: 1 for noun in nouns if len(noun) > 1})

    compounds, removals = {}, set()
    for word, count in eojeols.items():
        # format: [(word, begin, end, score, length)]
        tokens = tokenizer.tokenize(word, flatten=False)[0]
        noun = parse_compound(tokens)
        if noun is not None:
            compounds[noun] = compounds.get(noun, 0) + count
            removals.add(word)

    return compounds, removals
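# A self-contained sketch of the same Noun* / Noun*+Josa check outside the class.
# The noun set and suffix (Josa) set below are illustrative assumptions only.
from soynlp.tokenizer import MaxScoreTokenizer

nouns = {'데이터', '센터', '우리', '나라'}
suffix = {'의', '은', '는', '를'}
noun_tokenizer = MaxScoreTokenizer(scores={noun: 1 for noun in nouns if len(noun) > 1})

def parse_compound(tokens):
    # tokens: [(word, begin, end, score, length)] for a single eojeol
    for token in tokens[:-1]:
        if token[3] <= 0:        # an unscored fragment before the last token
            return None
    if len(tokens) >= 3 and tokens[-1][0] in suffix:
        return ''.join(t[0] for t in tokens[:-1])    # Noun* + Josa
    if tokens[-1][3] > 0:
        return ''.join(t[0] for t in tokens)         # all tokens are nouns
    return None

for eojeol in ['데이터센터의', '우리나라', '데이터는요']:
    print(eojeol, '->', parse_compound(noun_tokenizer.tokenize(eojeol, flatten=False)[0]))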
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenize text using MaxScoreTokenizer of SoyNLP
    """

    def __init__(self):
        self.tokenizer = None
        self.scores = list()
        self.word_extractor = WordExtractor(min_count=100,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)

    def fit(self, sentences):
        self.word_extractor.train(sentences)
        scores = self.word_extractor.extract()
        scores = [(word, (score.cohesion_forward + score.cohesion_backward) *
                         (score.left_branching_entropy + score.right_branching_entropy))
                  for word, score in scores.items()]
        self.scores = scores
        # MaxScoreTokenizer expects a {word: score} mapping
        self.tokenizer = MaxScoreTokenizer(scores=dict(self.scores))

    def state_dict(self):
        return {'scores': self.scores}

    def load_state_dict(self, state_dict):
        self.scores = state_dict['scores']
        self.tokenizer = MaxScoreTokenizer(scores=dict(self.scores))

    def tokenize(self, sentence):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        return tokenized_sentence
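# Hedged usage sketch for the class above: fit learns word scores from raw sentences
# with soynlp's WordExtractor, then tokenize uses them. The toy corpus below is an
# assumption; a real corpus needs enough volume for min_count=100 to keep any words.
toy_corpus = ['데이터마이닝을 공부한다', '데이터마이닝은 재밌다', '데이터분석을 시작한다'] * 200

soy_tokenizer = SoyNLPTokenizer()
soy_tokenizer.fit(toy_corpus)
print(soy_tokenizer.tokenize('데이터마이닝을 시작한다'))

# the learned scores can be saved and restored without retraining
state = soy_tokenizer.state_dict()
restored = SoyNLPTokenizer()
restored.load_state_dict(state)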
class MaxScoreTokenizerKorean(SpecialTokenizer):
    def __init__(self, scores=None):
        from soynlp.tokenizer import MaxScoreTokenizer
        self.inst = MaxScoreTokenizer(scores=scores)
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
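# Hedged usage sketch for the callable wrapper above (the scores are illustrative):
wrapper = MaxScoreTokenizerKorean(scores={'파스타': 0.7, '좋아': 0.5})
print(wrapper('파스타가좋아요'))   # expected: ['파스타', '가', '좋아', '요']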
def tokenize(self, sentence, score_dic):
    scores = score_dic
    tokenizer = MaxScoreTokenizer(scores=scores)
    token = tokenizer.tokenize(sentence)

    token_list = []
    for num, input in enumerate(token):
        if token[num] in scores:
            # keep dictionary words as they are
            token_list.append(token[num])
        else:
            # otherwise fall back to KKMA morphological analysis
            kkma_token = self.t.morphs(token[num])
            token_list = token_list + kkma_token
    return token_list
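# A standalone sketch of the same hybrid idea: dictionary words are kept whole,
# everything else goes through a KoNLPy morphological analyzer. self.t above is an
# external analyzer of that project; Kkma is used here only as a stand-in assumption.
from konlpy.tag import Kkma
from soynlp.tokenizer import MaxScoreTokenizer

score_dic = {'데이터센터': 1.0}
kkma = Kkma()
max_tokenizer = MaxScoreTokenizer(scores=score_dic)

tokens = []
for token in max_tokenizer.tokenize('데이터센터에 갔다'):
    if token in score_dic:
        tokens.append(token)                 # dictionary word: keep as-is
    else:
        tokens.extend(kkma.morphs(token))    # fallback: morpheme-level split
print(tokens)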
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenize text using MaxScoreTokenizer of SoyNLP
    """

    def __init__(self, config):
        with open(config.soynlp_scores, "r") as f:
            scores = [line.strip().split("\t") for line in f]
            scores = {word: float(score) for word, score in scores}
        self.tokenizer = MaxScoreTokenizer(scores=scores)

    def tokenize(self, sentence):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        return tokenized_sentence
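# The config loader above expects one "word<TAB>score" pair per line. A minimal sketch
# with a hand-written scores file; the file name and the SimpleNamespace stand-in for
# the project's config object are assumptions.
from types import SimpleNamespace

with open('soynlp_scores.tsv', 'w') as f:
    f.write('데이터\t0.4\n')
    f.write('데이터센터\t0.38\n')
    f.write('데이\t0.35\n')

config = SimpleNamespace(soynlp_scores='soynlp_scores.tsv')
tsv_tokenizer = SoyNLPTokenizer(config)
print(tsv_tokenizer.tokenize('데이터는 데이터센터의 데이데이'))
# expected, as in the test snippet above: ['데이터', '는', '데이터', '센터의', '데이', '데이']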
def find_pro_name(self, inp):
    name_dic = {'Yenewondim Sinshaw', '강경란', '고영배', '고정길', '김도형',
                '김동윤', '김민구', '김성수', '김승운', '노병희', '류기열', '변광준',
                '손경아', '안정섭', '오상윤', '위규범', '윤대균', '이석원', '이정태',
                '이택균', '이환용', '임재성', '정크리스틴', '정태선', '조영종',
                '최경희', '최영준', '최재영', '한경식', '황원준', 'Paul rajib', '학과사무실', '과사', '과사무실',
                '경란', '경란이', '영배', '정길', '정길이', '도형', '도형이', '동윤',
                '민구', '성수', '승운', '승운이', '병희', '기열', '기열이', '광준', '광준이',
                '경아', '정섭', '정섭이', '상윤', '상윤이', '규범', '규범이', '대균',
                '대균이', '석원', '석원이', '정태', '택균', '택균이', '환용', '환용이',
                '정크', '정크리', '크리스틴', '태선', '태선이', '영종', '영종이',
                '경희', '영준', '영준이', '재영', '재영이', '경식', '경식이', '원준',
                '원준이', '예나원딤', '에나원딤', 'yenewondim', 'Yenewondim',
                'yenawondim', 'Yenawondim', '라집', '폴라집', 'Paul', '폴',
                'Paulrajib', 'paulrajib', '제작자', '소중사', '소프트웨어중심사업단',
                'Ibrahim Mohd Ali Alsofyani', 'Ran Rong', '감동근', '구형일', '권익진',
                '김도희', '김상배', '김상완', '김상인', '김영길', '김영진',
                '김재현', '나상신', '박성진', '박용배', '박익모', '선우명훈', '양상식',
                '양회석', '오성근', '윤원식', '이교범', '이기근', '이상용', '이재진',
                '이정원', '이종욱', '이채우', '이해영', '정기현', '조성준', '조위덕',
                '조중열', '좌동경', '지동우', '허용석', '허준석', '홍송남', '홍영대',
                '곽진', '김강석', '김기형', '김상곤', '김재훈', '손태식', '예홍진',
                '유재희', '홍만표', '경민호', '고욱', '김지은', '김현희', '김효동',
                '석혜정', '신현준', '오규환', '이경원', '이윤진', '이주엽', '임유상',
                '장우진', '정태영', '최정주', '구자열', '박승규', '백호기', '이병묵',
                '이태공', '홍성표', '란롱'}

    name_dic_list = list(name_dic)
    scores_name = {name: 1.0 for name in name_dic_list}
    tokenizer_name = MaxScoreTokenizer(scores=scores_name)

    c = tokenizer_name.tokenize(inp)
    c = self.prep.replace(c)

    # Check professor name
    professor_name = "0"   # initial value
    check = 0
    for step, inputs in enumerate(name_dic):
        for i in range(len(c)):
            if c[i] == inputs:
                professor_name = inputs
                check = check + 1

    c = self.tokenizer.tokenize(inp)
    if professor_name == "0":
        for i in range(len(c)):
            if self.find_extra_name(c[i]) == True:
                professor_name = "1"   # wrong name
                break

    if check > 1:
        professor_name = "2"   # two or more professor names matched

    return professor_name
def tokenize(self, sentence, score_dic):
    scores = score_dic
    tokenizer = MaxScoreTokenizer(scores=scores)
    token = tokenizer.tokenize(sentence)

    token_list = []
    for num, input in enumerate(token):
        twit_token = self.t.pos(token[num], norm=True, stem=True)
        for i in range(0, len(twit_token), 1):
            if twit_token[i][1] != "Josa" and twit_token[i][1] != "Punctuation" and \
                    twit_token[i][1] != "KoreanParticle":
                token_list.append(twit_token[i][0])
    return token_list
def rankFunction(texts):
    scores = {
        '선박명': 0.5,
        '총톤수는': 0.7,
        '년': 0.5,
        '월': 0.5,
        '일': 0.5,
        '시': 0.5,
        '분': 0.5,
        '울산': 0.5,
        '예정': 0.5
    }
    tokenizer = MaxScoreTokenizer(scores=scores)
    keywords = tokenizer.tokenize(texts)
    return keywords
def noun_extract_dup(self, sentence, score_dic):
    scores = score_dic
    tokenizer = MaxScoreTokenizer(scores=scores)
    token = tokenizer.tokenize(sentence)

    noun_list = []
    compared_noun_list = self.t.nouns(sentence)
    for num, input in enumerate(token):
        if token[num] in scores:
            noun_list.append(token[num])
        else:
            twit_token = self.t.nouns(token[num])
            noun_list = noun_list + twit_token

    diff_noun_list = list(set(noun_list) - set(compared_noun_list))
    diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))
    noun_list = list(set(noun_list) - set(diff_noun_list))
    return noun_list
class LRMaxScoreTagger: def __init__(self, domain_dictionary_folders=None, use_base_dictionary=True, dictionary_word_mincount=3, evaluator=None, sents=None, lrgraph=None, lrgraph_lmax=12, lrgraph_rmax=8, base_tokenizer=None, preference=None, verbose=False ): self.dictionary = Dictionary(domain_dictionary_folders, use_base_dictionary, dictionary_word_mincount, verbose=verbose) self.evaluator = evaluator if evaluator else LREvaluator() self.preference = preference if preference else {} self.lrgraph = lrgraph if lrgraph else {} if (not self.lrgraph) and (sents): self.lrgraph = _build_lrgraph(sents, lrgraph_lmax, lrgraph_rmax) self.lrgraph_norm, self.lcount, self.cohesion_l, self.droprate_l\ = self._initialize_scores(self.lrgraph) self.base_tokenizer = base_tokenizer if base_tokenizer else lambda x:x.split() if not base_tokenizer: try: self.base_tokenizer = MaxScoreTokenizer(scores=self.cohesion_l) except Exception as e: print('MaxScoreTokenizer(cohesion) exception: {}'.format(e)) def _build_lrgraph(self, sents, lmax=12, rmax=8): from collections import Counter from collections import defaultdict eojeols = Counter((eojeol for sent in sents for eojeol in sent.split() if eojeol)) lrgraph = defaultdict(lambda: defaultdict(int)) for eojeol, count in eojeols.items(): n = len(eojeol) for i in range(1, min(n, lmax)+1): (l, r) = (eojeol[:i], eojeol[i:]) if len(r) > rmax: continue lrgraph[l][r] += count return lrgraph def _initialize_scores(self, lrgraph): def to_counter(dd): return {k:sum(d.values()) for k,d in dd.items()} def to_normalized_graph(dd): normed = {} for k,d in dd.items(): sum_ = sum(d.values()) normed[k] = {k1:c/sum_ for k1,c in d.items()} return normed lrgraph_norm = to_normalized_graph(lrgraph) lcount = to_counter(lrgraph) cohesion_l = {w:pow(c/lcount[w[0]], 1/(len(w)-1)) for w, c in lcount.items() if len(w) > 1} droprate_l = {w:c/lcount[w[:-1]] for w, c in lcount.items() if len(w) > 1 and w[:-1] in lcount} return lrgraph_norm, lcount, cohesion_l, droprate_l def pos(self, sent, flatten=True, debug=False): sent_ = [self._pos(eojeol, debug) for eojeol in sent.split() if eojeol] if flatten: sent_ = [word for words in sent_ for word in words] return sent_ def _pos(self, eojeol, debug=False): candidates = self._initialize(eojeol) scores = self._scoring(candidates) best = self._find_best(scores) if best: post = self._postprocessing(eojeol, best) else: post = self._base_tokenizing_subword(eojeol, 0) if not debug: post = [w for lr in post for w in lr[:2] if w[0]] return post def _initialize(self, t): candidates = self._initialize_L(t) candidates = self._initialize_LR(t, candidates) return candidates def _initialize_L(self, t): n = len(t) candidates = [] for b in range(n): for e in range(b+2, min(n, b+self.dictionary._lmax)+1): l = t[b:e] l_pos = self.dictionary.pos_L(l) if not l_pos: continue candidates.append([l, # 0 l_pos, # 1 b, # 2 e, # 3 e-b, # 4 ]) candidates = self._remove_l_subsets(candidates) return sorted(candidates, key=lambda x:x[2]) def _remove_l_subsets(self, candidates): candidates_ = [] for pos in ['Noun', 'Verb', 'Adjective', 'Adverb', 'Exclamation']: # Sort by len_L sorted_ = sorted(filter(lambda x:x[1] == pos, candidates), key=lambda x:-x[4]) while sorted_: candidates_.append(sorted_.pop(0)) (b, e) = (candidates_[-1][2], candidates_[-1][3]) # removals = [i for i, c in enumerate(sorted_) if b < c[3] and e > c[2]] # Overlap removals = [i for i, c in enumerate(sorted_) if b <= c[2] and e >= c[3]] # Subset (Contain) for idx in reversed(removals): del sorted_[idx] return 
candidates_ def _initialize_LR(self, t, candidates, threshold_prop=0.001, threshold_count=2): n = len(t) expanded = [] for (l, pos, b, e, len_l) in candidates: for len_r in range(min(self.dictionary._rmax, n-e)+1): r = t[e:e+len_r] lr_prop = self.lrgraph_norm.get(l, {}).get(r, 0) lr_count = self.lrgraph.get(l, {}).get(r, 0) if (r) and ((lr_prop <= threshold_prop) or (lr_count <= threshold_count)): continue expanded.append([(l, pos), (r, None if not r else self.dictionary.pos_R(r)), b, e, e + len_r, len_r, len_l + len_r, lr_prop, lr_count ]) expanded = self._remove_r_subsets(expanded) return sorted(expanded, key=lambda x:x[2]) def _remove_r_subsets(self, expanded): expanded_ = [] for pos in ['Josa', 'Verb', 'Adjective', None]: # Sory by len_R sorted_ = sorted(filter(lambda x:x[1][1] == pos, expanded), key=lambda x:-x[5]) while sorted_: expanded_.append(sorted_.pop(0)) (b, e) = (expanded_[-1][3], expanded_[-1][4]) # removals = [i for i, c in enumerate(sorted_) if b < c[3] and e > c[2]] # Overlap removals = [i for i, c in enumerate(sorted_) if b <= c[3] and e >= c[4]] # Subset (Contain) for idx in reversed(removals): del sorted_[idx] expanded_ = [[L, R, p0, p2, len_LR, prop, count] for L, R, p0, p1, p2, len_R, len_LR, prop, count in expanded_] return expanded_ def _scoring(self, candidates): candidates = [self._to_table(c) for c in candidates] scores = self.evaluator.evaluate(candidates, self.preference if self.preference else None) return scores def _to_table(self, c): return Table(c[0], c[1], c[2], c[3], c[4], c[5], c[6], self.cohesion_l.get(c[0][0], 0), self.droprate_l.get(c[0][0], 0), self.lcount.get(c[0][0], 0) ) def _find_best(self, scores): best = [] sorted_ = sorted(scores, key=lambda x:-x[-1]) while sorted_: best.append(sorted_.pop(0)[0]) (b, e) = (best[-1][2], best[-1][3]) removals = [i for i, (c, _) in enumerate(sorted_) if b < c[3] and e > c[2]] # Overlap for idx in reversed(removals): del sorted_[idx] return sorted(best, key=lambda x:x[2]) def _postprocessing(self, t, words): n = len(t) adds = [] if words and words[0][2] > 0: adds += self._add_first_subword(t, words) if words and words[-1][3] < n: adds += self._add_last_subword(t, words, n) adds += self._add_inter_subwords(t, words) post = [w for w in words] + [self._to_table(a) for a in adds] return sorted(post, key=lambda x:x[2]) def _infer_subword_information(self, subword): pos = self.dictionary.pos_L(subword) prop = self.lrgraph_norm.get(subword, {}).get('', 0.0) count = self.lrgraph.get(subword, {}).get('', 0) if not pos: pos = self.dictionary.pos_R(subword) return (pos, prop, count) def _add_inter_subwords(self, t, words): adds = [] for i, base in enumerate(words[:-1]): if base[3] == words[i+1][2]: continue b = base[3] e = words[i+1][2] subword = t[b:e] #(pos, prop, count) = self._infer_subword_information(subword) #adds.append([(subword, pos), ('', None), b, e, e-b, prop, count, 0.0]) adds += self._base_tokenizing_subword(subword, b) return adds def _add_last_subword(self, t, words, n): b = words[-1][3] subword = t[b:] #(pos, prop, count) = self._infer_subword_information(subword) #return [[(subword, pos), ('', None), b, n, n-b, prop, count, 0.0]] return self._base_tokenizing_subword(subword, b) def _add_first_subword(self, t, words): e = words[0][2] subword = t[0:e] #(pos, prop, count) = self._infer_subword_information(subword) #return [[(subword, pos), ('', None), 0, e, e, prop, count, 0.0]] return self._base_tokenizing_subword(subword, 0) def _base_tokenizing_subword(self, t, b): subwords = [] _subwords = 
self.base_tokenizer.tokenize(t, flatten=False) if not _subwords: return [] for w in _subwords[0]: (pos, prop, count) = self._infer_subword_information(w[0]) subwords.append([(w[0], pos), ('', None), b+w[1], b+w[2], w[2]-w[1], prop, count, 0.0]) return subwords def add_words_into_dictionary(self, words, tag): if not (tag in self.dictionary._pos): raise ValueError('{} does not exist base dictionary'.format(tag)) self.dictionary.add_words(words, tag) def remove_words_from_dictionary(self, words, tag): if not (tag in self.dictionary._pos): raise ValueError('{} does not exist base dictionary'.format(tag)) self.dictionary.remove_words(words, tag) def save_domain_dictionary(self, folder, head=None): self.dictionary.save_domain_dictionary(folder, head) def set_word_preferance(self, words, tag, preference=10): if type(words) == str: words = {words} preference_table = self.preference.get(tag, {}) preference_table.update({word:preference for word in words}) self.preference[tag] = preference_table def save_tagger(self, fname): raise NotImplemented
class LRNounExtractor_v2: def __init__(self, max_left_length=10, max_right_length=9, predictor_headers=None, verbose=True, min_num_of_features=1, max_frequency_when_noun_is_eojeol=30, eojeol_counter_filtering_checkpoint=200000, min_eojeol_frequency=1, extract_compound=True, extract_pos_feature=False, extract_determiner=False, postprocessing=None, logpath=None): self.max_left_length = max_left_length self.max_right_length = max_right_length self.lrgraph = None self.verbose = verbose self.min_num_of_features = min_num_of_features self.max_frequency_when_noun_is_eojeol = max_frequency_when_noun_is_eojeol self.eojeol_counter_filtering_checkpoint = eojeol_counter_filtering_checkpoint self.min_eojeol_frequency = min_eojeol_frequency self.extract_compound = extract_compound self.extract_pos_feature = extract_pos_feature self.extract_determiner = extract_determiner self.logpath = logpath if logpath: check_dirs(logpath) if not postprocessing: postprocessing = [ 'detaching_features', 'ignore_features', 'ignore_NJ' ] elif isinstance(postprocessing) == str: postprocessing = [postprocessing] self.postprocessing = postprocessing if not predictor_headers: predictor_headers = self._set_default_predictor_header() self._load_predictor(predictor_headers) @property def is_trained(self): return self.lrgraph def _set_default_predictor_header(self): if self.verbose: print('[Noun Extractor] use default predictors') dirname = '/'.join( os.path.abspath(__file__).replace('\\', '/').split('/')[:-2]) predictor_header = [ '{}/trained_models/noun_predictor_ver2'.format(dirname) ] return predictor_header def _load_predictor(self, headers): if type(headers) == str: headers = [headers] pos, neg = set(), set() for header in headers: # load positive features such as Josa pos_path = '{}_pos'.format(header) with open(pos_path, encoding='utf-8') as f: pos.update({feature.strip() for feature in f}) # load negative features such as ending (Eomi) neg_path = '{}_neg'.format(header) with open(neg_path, encoding='utf-8') as f: neg.update({feature.strip() for feature in f}) # common features such as -은 (조사/어미), -라고(조사/어미) common = pos.intersection(neg) # remove common features from pos and neg pos = {feature for feature in pos if not (feature in common)} neg = {feature for feature in neg if not (feature in common)} if self.verbose: print('[Noun Extractor] num features: pos={}, neg={}, common={}'. format(len(pos), len(neg), len(common))) self._pos_features = pos self._neg_features = neg self._common_features = common def _append_features(self, feature_type, features): def check_feature_size(): return (len(self._pos_features), len(self._neg_features), len(self._common_features)) # size before n_pos, n_neg, n_common = check_feature_size() if feature_type == 'pos': commons = {f for f in features if (f in self._neg_features)} self._pos_features.update( {f for f in features if not (f in commons)}) elif feature_type == 'neg': commons = {f for f in features if (f in self._pos_features)} self._neg_features.update( {f for f in features if not (f in commons)}) elif feature_type == 'common': commons = features else: raise ValueError( 'Feature type was wrong. Choice = [pos, neg, common]') self._common_features.update(commons) # size after n_pos_, n_neg_, n_common_ = check_feature_size() if self.verbose: message = 'pos={} -> {}, neg={} -> {}, common={} -> {}'.format( n_pos, n_pos_, n_neg, n_neg_, n_common, n_common_) print('[Noun Extractor] features appended. 
{}'.format(message)) def train_extract(self, sentences, min_noun_score=0.3, min_noun_frequency=1, min_eojeol_frequency=1, reset_lrgraph=True): self.train(sentences) return self.extract(min_noun_score, min_noun_frequency, reset_lrgraph) def train(self, sentences): if self.verbose: print('[Noun Extractor] counting eojeols') eojeol_counter = EojeolCounter( sentences, self.min_eojeol_frequency, max_length=self.max_left_length + self.max_right_length, filtering_checkpoint=self.eojeol_counter_filtering_checkpoint, verbose=self.verbose) self._num_of_eojeols = eojeol_counter._count_sum self._num_of_covered_eojeols = 0 if self.verbose: print('[Noun Extractor] complete eojeol counter -> lr graph') self.lrgraph = eojeol_counter.to_lrgraph(self.max_left_length, self.max_right_length) if self.verbose: print('[Noun Extractor] has been trained. mem={} Gb'.format( '%.3f' % get_process_memory())) def _extract_determiner(self): raise NotImplemented def extract_domain_pos_features(self, append_extracted_features=True, noun_candidates=None, ignore_features=None, min_noun_score=0.3, min_noun_frequency=100, min_pos_score=0.3, min_pos_feature_frequency=1000, min_num_of_unique_lastchar=4, min_entropy_of_lastchar=0.5, min_noun_entropy=1.5): if self.verbose: print( '[Noun Extractor] batch prediction for extracting pos feature') if not noun_candidates: noun_candidates = self._noun_candidates_from_positive_features() prediction_scores = self._batch_predicting_nouns( noun_candidates, min_noun_score) self.lrgraph.reset_lrgraph() self._pos_features_extracted = extract_domain_pos_features( prediction_scores, self.lrgraph, self._pos_features, ignore_features, min_noun_score, min_noun_frequency, min_pos_score, min_pos_feature_frequency, min_num_of_unique_lastchar, min_entropy_of_lastchar, min_noun_entropy) if append_extracted_features: self._append_features('pos', self._pos_features_extracted) if self.verbose: print('[Noun Extractor] {} pos features were extracted'.format( len(self._pos_features_extracted))) def extract(self, min_noun_score=0.3, min_noun_frequency=1, reset_lrgraph=True): # reset covered eojeol count self._num_of_covered_eojeols = 0 # base prediction noun_candidates = self._noun_candidates_from_positive_features() if self.extract_pos_feature: if self.verbose: print('[Noun Extractor] extract and append pos features') self.extract_domain_pos_features(noun_candidates) prediction_scores = self._batch_predicting_nouns( noun_candidates, min_noun_score) if self.logpath: with open(self.logpath + '_prediction_score.log', 'w', encoding='utf-8') as f: f.write('noun score frequency\n') for word, score in sorted(prediction_scores.items(), key=lambda x: -x[1][1]): f.write('{} {} {}\n'.format(word, score[0], score[1])) # E = N*J+ or N*Posi+ if self.extract_compound: candidates = { l: sum(rdict.values()) for l, rdict in self.lrgraph._lr.items() if len(l) >= 4 } compounds = self.extract_compounds(candidates, prediction_scores, min_noun_score) else: compounds = {} # combine single nouns and compounds nouns = { noun: score for noun, score in prediction_scores.items() if score[1] >= min_noun_score } nouns.update(compounds) # frequency filtering nouns = { noun: score for noun, score in nouns.items() if score[0] >= min_noun_frequency } nouns = self._post_processing(nouns, prediction_scores, compounds) if self.verbose: print( '[Noun Extractor] {} nouns ({} compounds) with min frequency={}' .format(len(nouns), len(compounds), min_noun_frequency), flush=True) print('[Noun Extractor] flushing ... 
', flush=True, end='') self._check_covered_eojeols(nouns) self._nouns = nouns if reset_lrgraph: # when extracting predicates, do not reset lrgraph. # the remained lrgraph is predicate (stem - ending) graph self.lrgraph.reset_lrgraph() nouns_ = { noun: NounScore(score[0], score[1]) for noun, score in nouns.items() } return nouns_ def _get_nonempty_features(self, word, features): return [ r for r, _ in features if (((r in self._pos_features) and (not self._exist_longer_pos(word, r))) or ( (r in self._neg_features) and (not self._exist_longer_neg(word, r)))) ] def _exist_longer_pos(self, word, r): for e in range(len(word) - 1, -1, -1): if (word[e:] + r) in self._pos_features: return True return False def _exist_longer_neg(self, word, r): for e in range(len(word) - 1, -1, -1): if (word[e:] + r) in self._neg_features: return True return False def predict(self, word, min_noun_score=0.3, debug=False): # scoring features = self.lrgraph.get_r(word, -1) pos, common, neg, unk, end = self._predict(word, features) base = pos + neg score = 0 if base == 0 else (pos - neg) / base support = pos + end + common if score >= min_noun_score else neg + end + common features_ = self._get_nonempty_features(word, features) n_features_ = len(features_) # debug code if debug: print('pos={}, common={}, neg={}, unk={}, end={}, n_features_={}'. format(pos, common, neg, unk, end, n_features_)) if n_features_ > self.min_num_of_features: return support, score else: # exception case sum_ = pos + common + neg + unk + end if sum_ == 0: return support, 0 # exception. frequent nouns may have various positive R such as Josa if ((end > self.max_frequency_when_noun_is_eojeol) and (pos >= neg)): return support, score if (common > 0 or pos > 0) and (end / sum_ >= 0.3) and (common >= neg): # 아이웨딩 + [('', 90), ('은', 3), ('측은', 1)] # 은 common / 대부분 단일어절 / 측은 unknown. # 아이엠텍 + [('은', 2), ('', 2)] support = pos + common + end return (support, support / sum_) # 경찰국 + [(은, 1), (에, 1), (에서, 1)] -> {은, 에} first_chars = set() for r, _ in features: if not r: continue if r in self._pos_features or r in self._common_features: if not self._exist_longer_pos(word, r): first_chars.add(r[0]) if not (r in self._pos_features or r in self._common_features): first_chars.add(r[0]) if len(first_chars) >= 2: support = pos + common + end return (support, support / sum_) # Handling for post-processing in NounExtractor # Case 1. # 아이러브영주사과 -> 아이러브영주사 + [(과,1)] (minimum r feature 적용해야 하는 케이스) : 복합명사 # 아이러브영주사과 + [('', 1)] 이므로, 후처리 이후 '아이러브영주사' 후보에서 제외됨 # Case 2. 
# 아이였으므로 -> 아이였으므 + [(로, 2)] (minimum r feature 적용) # "명사 + Unknown R" 로 후처리 return (support, 0) def _predict(self, word, features): pos, common, neg, unk, end = 0, 0, 0, 0, 0 for r, freq in features: if r == '': end += freq continue if self._exist_longer_pos(word, r): # ignore continue if self._exist_longer_neg(word, r): # negative -다고 neg += freq continue if r in self._common_features: common += freq elif r in self._pos_features: pos += freq elif r in self._neg_features: neg += freq else: unk += freq return pos, common, neg, unk, end def _noun_candidates_from_positive_features(self, condition=None): def satisfy(word, e): return word[:e] == condition # noun candidates from positive featuers such as Josa N_from_J = {} for r in self._pos_features: for l, c in self.lrgraph.get_l(r, -1): # candidates filtering for debugging # condition is first chars in L if not condition: N_from_J[l] = N_from_J.get(l, 0) + c continue # for debugging if not satisfy(l, len(condition)): continue N_from_J[l] = N_from_J.get(l, 0) + c return N_from_J def _batch_predicting_nouns(self, noun_candidates, min_noun_score=0.3): prediction_scores = {} n = len(noun_candidates) for i, word in enumerate(sorted(noun_candidates, key=lambda x: -len(x))): if self.verbose and i % 1000 == 999: percentage = '%.3f' % (100 * (i + 1) / n) print('\r -- batch prediction {} % of {} words'.format( percentage, n), flush=True, end='') # base prediction support, score = self.predict(word, min_noun_score) prediction_scores[word] = (support, score) # if their score is higher than min_noun_score, # remove eojeol pattern from lrgraph if score >= min_noun_score: for r, count in self.lrgraph.get_r(word, -1): # remove all eojeols that including word at left-side. # we have to assume that pos, neg features are incomplete self.lrgraph.remove_eojeol(word + r, count) # if (r == '' or # (r in self._pos_features) or # (r in self._common_features)): # self.lrgraph.remove_eojeol(word+r, count) if self.verbose: print( '\r[Noun Extractor] batch prediction was completed for {} words' .format(n), flush=True) return prediction_scores def extract_compounds(self, candidates, prediction_scores, min_noun_score=0.3): noun_scores = { noun: len(noun) for noun, score in prediction_scores.items() if score[1] > min_noun_score and len(noun) > 1 } self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores) candidates = { l: sum(rdict.values()) for l, rdict in self.lrgraph._lr.items() if (len(l) >= 4) and not (l in noun_scores) } n = len(candidates) compounds_scores = {} compounds_counts = {} compounds_components = {} for i, (word, count) in enumerate( sorted(candidates.items(), key=lambda x: -len(x[0]))): if self.verbose and i % 1000 == 999: percentage = '%.2f' % (100 * i / n) print('\r -- check compound {} %'.format(percentage), flush=True, end='') # skip if candidate is substring of longer compound if candidates.get(word, 0) <= 0: continue tokens = self._compound_decomposer.tokenize(word, flatten=False)[0] compound_parts = self._parse_compound(tokens) if compound_parts: # store compound components noun = ''.join(compound_parts) compounds_components[noun] = compound_parts # cumulate count and store compound score compound_score = max((prediction_scores.get(t, (0, 0))[1] for t in compound_parts)) compounds_scores[noun] = max(compounds_scores.get(noun, 0), compound_score) compounds_counts[noun] = compounds_counts.get(noun, 0) + count # reduce frequency of substrings for e in range(2, len(word)): subword = word[:e] if not subword in candidates: continue 
candidates[subword] = candidates.get(subword, 0) - count # eojeol coverage self.lrgraph.remove_eojeol(word) if self.verbose: print( '\r[Noun Extractor] checked compounds. discovered {} compounds' .format(len(compounds_scores))) compounds = { noun: (score, compounds_counts.get(noun, 0)) for noun, score in compounds_scores.items() } self._compounds_components = compounds_components return compounds def decompose_compound(self, word): tokens = self._compound_decomposer.tokenize(word, flatten=False)[0] compound_parts = self._parse_compound(tokens) return (word, ) if not compound_parts else compound_parts def _parse_compound(self, tokens): """Check Noun* or Noun*Josa""" # format: (word, begin, end, score, length) for token in tokens[:-1]: if token[3] <= 0: return None # Noun* + Josa if len(tokens) >= 3 and tokens[-1][0] in self._pos_features: return tuple(t[0] for t in tokens[:-1]) # all tokens are noun if tokens[-1][3] > 0: return tuple(t[0] for t in tokens) # else, not compound return None def _post_processing(self, nouns, prediction_scores, compounds): def print_status(method, nouns, removals): n_after = len(nouns) n_before = n_after + len(removals) print('[Noun Extractor] postprocessing {} : {} -> {}'.format( method, n_before, n_after)) logpath = self.logpath + '_postprocessing.log' if self.logpath else None # initialize if logpath: with open(logpath, 'w', encoding='utf-8') as f: f.write('') for method in self.postprocessing: if method == 'detaching_features': logheader = '## Ignore noun candidates from detaching pos features\n' nouns, removals = detaching_features(nouns, self._pos_features, logpath, logheader) if self.verbose: print_status(method, nouns, removals) elif method == 'ignore_features': features = {f for f in self._pos_features} # features.update(self._neg_features) features.update(self._common_features) nouns, removals = ignore_features(nouns, features, logpath) if self.verbose: print_status(method, nouns, removals) elif method == 'ignore_NJ': nouns, removals = check_N_is_NJ(nouns, self.lrgraph, logpath=logpath) if self.verbose: print_status(method, nouns, removals) return nouns def _check_covered_eojeols(self, nouns): self.lrgraph.reset_lrgraph() noun_candidates = self._noun_candidates_from_positive_features() n = len(noun_candidates) for i, word in enumerate(sorted(noun_candidates, key=lambda x: -len(x))): if self.verbose and i % 1000 == 999: percentage = '%.3f' % (100 * (i + 1) / n) print( '\r[Noun Extractor] flushing ... {} %'.format(percentage), flush=True, end='') if not (word in nouns): continue if len(word) > 1: for r, count in self.lrgraph.get_r(word, -1): # remove all eojeols that including word at left-side. # we have to assume that pos, neg features are incomplete self.lrgraph.remove_eojeol(word + r, count) self._num_of_covered_eojeols += count else: # a syllable noun is exception; remove only N + pos feature if (r == '' or (r in self._pos_features) or (r in self._common_features)): self.lrgraph.remove_eojeol(word + r, count) self._num_of_covered_eojeols += count if self.verbose: print('\r[Noun Extractor] flushing was done. mem={} Gb{}'.format( '%.3f' % get_process_memory(), ' ' * 20), flush=True) coverage = '%.2f' % (100 * self._num_of_covered_eojeols / self._num_of_eojeols) print('[Noun Extractor] {} % eojeols are covered'.format(coverage), flush=True)
class LRNounExtractor_v2: def __init__(self, l_max_length=10, r_max_length=9, predictor_headers=None, verbose=True, min_num_of_features=1, max_count_when_noun_is_eojeol=30, eojeol_counter_filtering_checkpoint=0, extract_compound=True, extract_determiner=False, extract_josa=False): self.l_max_length = l_max_length self.r_max_length = r_max_length self.lrgraph = None self.verbose = verbose self.min_num_of_features = min_num_of_features self.max_count_when_noun_is_eojeol = max_count_when_noun_is_eojeol self.eojeol_counter_filtering_checkpoint = eojeol_counter_filtering_checkpoint self.extract_compound = extract_compound if not predictor_headers: predictor_headers = self._set_default_predictor_header() self._load_predictor(predictor_headers) @property def is_trained(self): return self.lrgraph def _set_default_predictor_header(self): if self.verbose: print('[Noun Extractor] use default predictors') dirname = '/'.join(os.path.abspath(__file__).replace('\\', '/').split('/')[:-2]) predictor_header = ['{}/trained_models/noun_predictor_ver2'.format(dirname)] return predictor_header def _load_predictor(self, headers): if type(headers) == str: headers = [headers] pos, neg = set(), set() for header in headers: # load positive features such as Josa pos_path = '{}_pos'.format(header) with open(pos_path, encoding='utf-8') as f: pos.update({feature.strip() for feature in f}) # load negative features such as ending (Eomi) neg_path = '{}_neg'.format(header) with open(neg_path, encoding='utf-8') as f: neg.update({feature.strip() for feature in f}) # common features such as -은 (조사/어미), -라고(조사/어미) common = pos.intersection(neg) # remove common features from pos and neg pos = {feature for feature in pos if not (feature in common)} neg = {feature for feature in neg if not (feature in common)} if self.verbose: print('[Noun Extractor] num features: pos={}, neg={}, common={}'.format( len(pos), len(neg), len(common))) self._pos_features = pos self._neg_features = neg self._common_features = common def train_extract(self, sentences, minimum_noun_score=0.3, min_count=1, min_eojeol_count=1, reset_lrgraph=True): self.train(sentences, min_eojeol_count) return self.extract(minimum_noun_score, min_count, reset_lrgraph) def train(self, sentences, min_eojeol_count=1): if self.verbose: print('[Noun Extractor] counting eojeols') eojeol_counter = EojeolCounter(sentences, min_eojeol_count, max_length=self.l_max_length + self.r_max_length, filtering_checkpoint=self.eojeol_counter_filtering_checkpoint, verbose=self.verbose) self._num_of_eojeols = eojeol_counter._count_sum self._num_of_covered_eojeols = 0 if self.verbose: print('[Noun Extractor] complete eojeol counter -> lr graph') self.lrgraph = eojeol_counter.to_lrgraph( self.l_max_length, self.r_max_length) if self.verbose: print('[Noun Extractor] has been trained. 
mem={} Gb'.format( '%.3f'%get_process_memory())) def _extract_determiner(self): raise NotImplemented def _extract_josa(self): raise NotImplemented def extract(self, minimum_noun_score=0.3, min_count=1, reset_lrgraph=True): # reset covered eojeol count self._num_of_covered_eojeols = 0 # base prediction noun_candidates = self._noun_candidates_from_positive_features() prediction_scores = self._batch_prediction_order_by_word_length( noun_candidates, minimum_noun_score) # E = N*J+ or N*Posi+ if self.extract_compound: candidates = {l:sum(rdict.values()) for l,rdict in self.lrgraph._lr.items() if len(l) >= 4} compounds = self.extract_compounds( candidates, prediction_scores, minimum_noun_score) else: compounds = {} # combine single nouns and compounds nouns = {noun:score for noun, score in prediction_scores.items() if score[0] >= minimum_noun_score} nouns.update(compounds) # frequency filtering nouns = {noun:score for noun, score in nouns.items() if score[1] >= min_count} nouns = self._post_processing(nouns, prediction_scores, compounds) if self.verbose: print('[Noun Extractor] {} nouns ({} compounds) with min count={}'.format( len(nouns), len(compounds), min_count), flush=True) coverage = '%.2f' % (100 * self._num_of_covered_eojeols / self._num_of_eojeols) print('[Noun Extractor] {} % eojeols are covered'.format(coverage), flush=True) if self.verbose: print('[Noun Extractor] flushing ... ', flush=True, end='') self._nouns = nouns if reset_lrgraph: # when extracting predicates, do not reset lrgraph. # the remained lrgraph is predicate (root - ending) graph self.lrgraph.reset_lrgraph() if self.verbose: print('done. mem={} Gb'.format('%.3f'%get_process_memory())) nouns_ = {noun:NounScore(score[1], score[0]) for noun, score in nouns.items()} return nouns_ def _get_nonempty_features(self, word, features): return [r for r, _ in features if ( ( (r in self._pos_features) and (not self._exist_longer_pos(word, r)) ) or ( (r in self._neg_features) and (not self._exist_longer_neg(word, r)) ) )] def _exist_longer_pos(self, word, r): for e in range(len(word)-1, -1, -1): if (word[e:]+r) in self._pos_features: return True return False def _exist_longer_neg(self, word, r): for e in range(len(word)-1, -1, -1): if (word[e:]+r) in self._neg_features: return True return False def predict(self, word, minimum_noun_score=0.3, debug=False): # scoring features = self.lrgraph.get_r(word, -1) pos, common, neg, unk, end = self._predict(word, features) base = pos + neg score = 0 if base == 0 else (pos - neg) / base support = pos + end + common if score >= minimum_noun_score else neg + end + common # debug code if debug: print(pos, common, neg, unk, end) features_ = self._get_nonempty_features(word, features) if len(features_) > self.min_num_of_features: return score, support else: # exception case sum_ = pos + common + neg + unk + end if sum_ == 0: return 0, support # exception. frequent nouns may have various positive R such as Josa if ((end > self.max_count_when_noun_is_eojeol) and (neg >= pos) ): return score, support if (common > 0 or pos > 0) and (end / sum_ >= 0.3) and (common >= neg): # 아이웨딩 + [('', 90), ('은', 3), ('측은', 1)] # 은 common / 대부분 단일어절 / 측은 unknown. 
# 아이엠텍 + [('은', 2), ('', 2)] support = pos + common + end return (support / sum_, support) # 경찰국 + [(은, 1), (에, 1), (에서, 1)] -> {은, 에} first_chars = set() for r, _ in features: if not r: continue if r in self._pos_features or r in self._common_features: if not self._exist_longer_pos(word, r): first_chars.add(r[0]) if not (r in self._pos_features or r in self._common_features): first_chars.add(r[0]) if len(first_chars) >= 2: support = pos + common + end return (support / sum_, support) # Handling for post-processing in NounExtractor # Case 1. # 아이러브영주사과 -> 아이러브영주사 + [(과,1)] (minimum r feature 적용해야 하는 케이스) : 복합명사 # 아이러브영주사과 + [('', 1)] 이므로, 후처리 이후 '아이러브영주사' 후보에서 제외됨 # Case 2. # 아이였으므로 -> 아이였으므 + [(로, 2)] (minimum r feature 적용) # "명사 + Unknown R" 로 후처리 return (0, support) def _predict(self, word, features): pos, common, neg, unk, end = 0, 0, 0, 0, 0 for r, freq in features: if r == '': end += freq continue if self._exist_longer_pos(word, r): # ignore continue if self._exist_longer_neg(word, r): # negative -다고 neg += freq continue if r in self._common_features: common += freq elif r in self._pos_features: pos += freq elif r in self._neg_features: neg += freq else: unk += freq return pos, common, neg, unk, end def _noun_candidates_from_positive_features(self, condition=None): def satisfy(word, e): return word[:e] == condition # noun candidates from positive featuers such as Josa N_from_J = {} for r in self._pos_features: for l, c in self.lrgraph.get_l(r, -1): # candidates filtering for debugging # condition is first chars in L if not condition: N_from_J[l] = N_from_J.get(l,0) + c continue # for debugging if not satisfy(l, len(condition)): continue N_from_J[l] = N_from_J.get(l,0) + c # sort by length of word N_from_J = sorted(N_from_J.items(), key=lambda x:-len(x[0])) return N_from_J def _batch_prediction_order_by_word_length(self, noun_candidates, minimum_noun_score=0.3): prediction_scores = {} n = len(noun_candidates) for i, (word, _) in enumerate(noun_candidates): if self.verbose and i % 1000 == 999: percentage = '%.3f' % (100 * (i+1) / n) print('\r -- batch prediction {} % of {} words'.format( percentage, n), flush=True, end='') # base prediction score, support = self.predict(word, minimum_noun_score) prediction_scores[word] = (score, support) # if their score is higher than minimum_noun_score, # remove eojeol pattern from lrgraph if score >= minimum_noun_score: for r, count in self.lrgraph.get_r(word, -1): if r == '' or (r in self._pos_features) or (r in self._common_features): self.lrgraph.remove_eojeol(word+r, count) self._num_of_covered_eojeols += count if self.verbose: print('\r[Noun Extractor] batch prediction was completed for {} words'.format( n), flush=True) return prediction_scores def extract_compounds(self, candidates, prediction_scores, minimum_noun_score=0.3): noun_scores = {noun:len(noun) for noun, score in prediction_scores.items() if score[0] > minimum_noun_score and len(noun) > 1} self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores) candidates = {l:sum(rdict.values()) for l,rdict in self.lrgraph._lr.items() if (len(l) >= 4) and not (l in noun_scores)} n = len(candidates) compounds_scores = {} compounds_counts = {} compounds_components = {} for i, (word, count) in enumerate(sorted(candidates.items(), key=lambda x:-len(x[0]))): if self.verbose and i % 1000 == 999: percentage = '%.2f' % (100 * i / n) print('\r -- check compound {} %'.format(percentage), flush=True, end='') # skip if candidate is substring of longer compound if candidates.get(word, 0) <= 0: 
continue tokens = self._compound_decomposer.tokenize(word, flatten=False)[0] compound_parts = self._parse_compound(tokens) if compound_parts: # store compound components noun = ''.join(compound_parts) compounds_components[noun] = compound_parts # cumulate count and store compound score compound_score = max((prediction_scores.get(t, (0,0))[0] for t in compound_parts)) compounds_scores[noun] = max(compounds_scores.get(noun,0), compound_score) compounds_counts[noun] = compounds_counts.get(noun,0) + count # reduce frequency of substrings for e in range(2, len(word)): subword = word[:e] if not subword in candidates: continue candidates[subword] = candidates.get(subword, 0) - count # eojeol coverage self.lrgraph.remove_eojeol(word) self._num_of_covered_eojeols += count if self.verbose: print('\r[Noun Extractor] checked compounds. discovered {} compounds'.format( len(compounds_scores))) compounds = {noun:(score, compounds_counts.get(noun,0)) for noun, score in compounds_scores.items()} self._compounds_components = compounds_components return compounds def decompose_compound(self, word): tokens = self._compound_decomposer.tokenize(word, flatten=False)[0] compound_parts = self._parse_compound(tokens) return (word, ) if not compound_parts else compound_parts def _parse_compound(self, tokens): """Check Noun* or Noun*Josa""" # format: (word, begin, end, score, length) for token in tokens[:-1]: if token[3] <= 0: return None # Noun* + Josa if len(tokens) >= 3 and tokens[-1][0] in self._pos_features: return tuple(t[0] for t in tokens[:-1]) # all tokens are noun if tokens[-1][3] > 0: return tuple(t[0] for t in tokens) # else, not compound return None def _post_processing(self, nouns, prediction_scores, compounds): # TODO # Not Implemented return nouns
class ErineTokenizer(object): def __init__(self, config): self.basic_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') if not ( os.path.isfile(config.vocab_path) and os.path.isfile(config.word2vec_path) ): assert config.data_path and os.path.isfile(config.data_path), '[{}] 위치에 학습할 파일이 없습니다.'.format(config.data_path) noun_dict, word2vec_model = build_vocab(config) else: noun_dict = open_pickle(config.vocab_path) word2vec_model = Word2Vec.load(config.word2vec_path) self.config = config self.word_tokenizer = MaxScoreTokenizer(noun_dict) self.index2word = [unk_token] + word2vec_model.wv.index2word word2index = {} for index, word in enumerate(self.index2word): word2index[word] = index self.word2index = word2index self.pad_word_id = 0 self.word_vec_dim = word2vec_model.vector_size self.word_vocab_size = len(word2index) unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float) embedding = word2vec_model.wv.vectors self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float)) self.idx2tags = ['I', 'B'] self.tags2idx = {name: idx for idx, name in enumerate(self.idx2tags)} self.tag_size = len(self.idx2tags) self.vocab_size = self.basic_tokenizer.vocab_size self.pad_token_id = self.basic_tokenizer.pad_token_id self.unk_token_id = self.basic_tokenizer.unk_token_id def reset_tokenizer(self, data): noun_dict, word2vec_model = build_vocab(self.config, data) self.word_tokenizer = MaxScoreTokenizer(noun_dict) self.index2word = [unk_token] + word2vec_model.wv.index2word word2index = {} for index, word in enumerate(self.index2word): word2index[word] = index self.word2index = word2index self.pad_word_id = 0 self.word_vec_dim = word2vec_model.vector_size self.word_vocab_size = len(word2index) unknown_emb = np.zeros((1, self.word_vec_dim), dtype=float) embedding = word2vec_model.wv.vectors self.embedding = torch.from_numpy(np.concatenate([unknown_emb, embedding], axis=0).astype(np.float)) def tokenize(self, sentence: str): tokens = [] temp = '' for i in range(len(sentence)): if sentence[i] == spc_token: temp += sentence[i] continue temp += sentence[i] tokens.append(temp) temp = '' return tokens def encode(self, tokens: list, max_length=None): temp_tokens = [] for token in tokens: token = token.replace(spc_token, '') temp_tokens.append(token) sentence = spc_token.join(temp_tokens) if max_length is not None: token_ids = self.basic_tokenizer.encode(sentence, max_length=max_length, pad_to_max_length=True, add_special_tokens=False, truncation=True) else: token_ids = self.basic_tokenizer.encode(sentence, add_special_tokens=False) return token_ids def word_encode(self, sentence, max_length=None): sentence = sentence.replace(spc_token, '') word_tokens = self.word_tokenizer.tokenize(sentence) word_token_ids = [] for idx, word in enumerate(word_tokens): temp_ids = [] if word in self.word2index: temp_ids.append(self.word2index[word]) padding = [self.pad_word_id] * (len(word) - len(temp_ids)) temp_ids = temp_ids + padding word_token_ids.extend(temp_ids) if max_length is not None: word_token_ids = word_token_ids[:max_length] padding = [self.pad_word_id] * (max_length - len(word_token_ids)) word_token_ids = word_token_ids + padding return word_token_ids def decode(self, token_ids: list(), lables: list()): sentences = [] for word_tokens, word_labels in zip(token_ids, lables): sentence = "" for token_id, label in zip(word_tokens, word_labels): if self.idx2tags[label] == 'B': sentence += " " if token_id == self.basic_tokenizer.cls_token_id or token_id == 
self.basic_tokenizer.sep_token_id: continue sentence += self.basic_tokenizer.convert_ids_to_tokens(token_id) sentences += [sentence] return sentences def get_labels(self, tokens: list, max_length=None): labels = [1 if spc_token in token else 0 for token in tokens] if max_length is not None: labels = labels[:max_length] labels = labels length = len(labels) if max_length is not None and len(labels) < max_length: pad = [0] * (max_length - len(labels)) labels = labels + pad return labels, length def parse(self, sentence: str, max_length=None): sentence = sentence.strip() tokens = self.tokenize(sentence) token_ids = self.encode(tokens, max_length) word_token_ids = self.word_encode(sentence, max_length) lables, length = self.get_labels(tokens, max_length) return token_ids, word_token_ids, lables, length def get_id(self, token: str): if token in self.basic_tokenizer.get_vocab(): return self.basic_tokenizer.get_vocab()[token] else: return self.basic_tokenizer.unk_token_id
class LRNounExtractor_v2: def __init__(self, l_max_length=10, r_max_length=9, predictor_headers=None, verbose=True, min_num_of_features=1, max_count_when_noun_is_eojeol=30): self.l_max_length = l_max_length self.r_max_length = r_max_length self.lrgraph = None self.verbose = verbose self.min_num_of_features = min_num_of_features self.max_count_when_noun_is_eojeol = max_count_when_noun_is_eojeol if not predictor_headers: predictor_headers = self._set_default_predictor_header() self._load_predictor(predictor_headers) @property def is_trained(self): return self.lrgraph def _set_default_predictor_header(self): if self.verbose: print('[Noun Extractor] use default predictors') dirname = '/'.join(os.path.abspath(__file__).replace('\\', '/').split('/')[:-2]) predictor_header = ['{}/trained_models/noun_predictor_ver2'.format(dirname)] return predictor_header def _load_predictor(self, headers): if type(headers) == str: headers = [headers] pos, neg = set(), set() for header in headers: # load positive features such as Josa pos_path = '{}_pos'.format(header) with open(pos_path, encoding='utf-8') as f: pos.update({feature.strip() for feature in f}) # load negative features such as ending (Eomi) neg_path = '{}_neg'.format(header) with open(neg_path, encoding='utf-8') as f: neg.update({feature.strip() for feature in f}) # common features such as -은 (조사/어미), -라고(조사/어미) common = pos.intersection(neg) # remove common features from pos and neg pos = {feature for feature in pos if not (feature in common)} neg = {feature for feature in neg if not (feature in common)} if self.verbose: print('[Noun Extractor] num features: pos={}, neg={}, common={}'.format( len(pos), len(neg), len(common))) self._pos_features = pos self._neg_features = neg self._common_features = common def train_extract(self, sentences, minimum_noun_score=0.3, min_count=1, min_eojeol_count=1): self.train(sentences, min_eojeol_count) return self.extract(minimum_noun_score, min_count) def train(self, sentences, min_eojeol_count=1): if self.verbose: print('[Noun Extractor] counting eojeols') eojeol_counter = EojeolCounter(sentences, min_eojeol_count, max_length=self.l_max_length + self.r_max_length) self._num_of_eojeols = eojeol_counter._count_sum self._num_of_covered_eojeols = 0 if self.verbose: print('[Noun Extractor] complete eojeol counter -> lr graph') self.lrgraph = eojeol_counter.to_lrgraph( self.l_max_length, self.r_max_length) if self.verbose: print('[Noun Extractor] has been trained.') def extract(self, minimum_noun_score=0.3, min_count=1): # base prediction noun_candidates = self._noun_candidates_from_positive_features() prediction_scores = self._batch_prediction_order_by_word_length( noun_candidates, minimum_noun_score) # E = N*J+ or N*Posi+ candidates = {l:sum(rdict.values()) for l,rdict in self.lrgraph._lr.items() if len(l) >= 4} compounds = self.extract_compounds( candidates, prediction_scores, minimum_noun_score) # combine single nouns and compounds nouns = {noun:score for noun, score in prediction_scores.items() if score[0] >= minimum_noun_score} nouns.update(compounds) # frequency filtering nouns = {noun:score for noun, score in nouns.items() if score[1] >= min_count} nouns = self._post_processing(nouns, prediction_scores, compounds) if self.verbose: print('[Noun Extractor] {} nouns ({} compounds) with min count={}'.format( len(nouns), len(compounds), min_count), flush=True) coverage = '%.2f' % (100 * self._num_of_covered_eojeols / self._num_of_eojeols) print('[Noun Extractor] {} % eojeols are covered'.format(coverage), 
flush=True) if self.verbose: print('[Noun Extractor] flushing ... ', flush=True, end='') self._nouns = nouns self.lrgraph.reset_lrgraph() if self.verbose: print('done') nouns_ = {noun:NounScore(score[1], score[0]) for noun, score in nouns.items()} return nouns_ def _get_nonempty_features(self, word, features): return [r for r, _ in features if ( ( (r in self._pos_features) and (not self._exist_longer_pos(word, r)) ) or ( (r in self._neg_features) and (not self._exist_longer_neg(word, r)) ) )] def _exist_longer_pos(self, word, r): for e in range(len(word)-1, -1, -1): if (word[e:]+r) in self._pos_features: return True return False def _exist_longer_neg(self, word, r): for e in range(len(word)-1, -1, -1): if (word[e:]+r) in self._neg_features: return True return False def predict(self, word, minimum_noun_score=0.3, debug=False): # scoring features = self.lrgraph.get_r(word, -1) pos, common, neg, unk, end = self._predict(word, features) base = pos + neg score = 0 if base == 0 else (pos - neg) / base support = pos + end + common if score >= minimum_noun_score else neg + end + common # debug code if debug: print(pos, common, neg, unk, end) features_ = self._get_nonempty_features(word, features) if len(features_) > self.min_num_of_features: return score, support else: # exception case sum_ = pos + common + neg + unk + end if sum_ == 0: return 0, support # exception. frequent nouns may have various positive R such as Josa if ((end > self.max_count_when_noun_is_eojeol) and (neg >= pos) ): return score, support if (common > 0 or pos > 0) and (end / sum_ >= 0.3) and (common >= neg): # 아이웨딩 + [('', 90), ('은', 3), ('측은', 1)] # 은 common / 대부분 단일어절 / 측은 unknown. # 아이엠텍 + [('은', 2), ('', 2)] support = pos + common + end return (support / sum_, support) # 경찰국 + [(은, 1), (에, 1), (에서, 1)] -> {은, 에} first_chars = set() for r, _ in features: if not r: continue if r in self._pos_features or r in self._common_features: if not self._exist_longer_pos(word, r): first_chars.add(r[0]) if not (r in self._pos_features or r in self._common_features): first_chars.add(r[0]) if len(first_chars) >= 2: support = pos + common + end return (support / sum_, support) # Handling for post-processing in NounExtractor # Case 1. # 아이러브영주사과 -> 아이러브영주사 + [(과,1)] (minimum r feature 적용해야 하는 케이스) : 복합명사 # 아이러브영주사과 + [('', 1)] 이므로, 후처리 이후 '아이러브영주사' 후보에서 제외됨 # Case 2. 
# 아이였으므로 -> 아이였으므 + [(로, 2)] (minimum r feature 적용) # "명사 + Unknown R" 로 후처리 return (0, support) def _predict(self, word, features): pos, common, neg, unk, end = 0, 0, 0, 0, 0 for r, freq in features: if r == '': end += freq continue if self._exist_longer_pos(word, r): # ignore continue if self._exist_longer_neg(word, r): # negative -다고 neg += freq continue if r in self._common_features: common += freq elif r in self._pos_features: pos += freq elif r in self._neg_features: neg += freq else: unk += freq return pos, common, neg, unk, end def _noun_candidates_from_positive_features(self, condition=None): def satisfy(word, e): return word[:e] == condition # noun candidates from positive featuers such as Josa N_from_J = {} for r in self._pos_features: for l, c in self.lrgraph.get_l(r, -1): # candidates filtering for debugging # condition is first chars in L if not condition: N_from_J[l] = N_from_J.get(l,0) + c continue # for debugging if not satisfy(l, len(condition)): continue N_from_J[l] = N_from_J.get(l,0) + c # sort by length of word N_from_J = sorted(N_from_J.items(), key=lambda x:-len(x[0])) return N_from_J def _batch_prediction_order_by_word_length(self, noun_candidates, minimum_noun_score=0.3): prediction_scores = {} n = len(noun_candidates) for i, (word, _) in enumerate(noun_candidates): if self.verbose and i % 1000 == 999: percentage = '%.3f' % (100 * (i+1) / n) print('\r -- batch prediction {} % of {} words'.format( percentage, n), flush=True, end='') # base prediction score, support = self.predict(word, minimum_noun_score) prediction_scores[word] = (score, support) # if their score is higher than minimum_noun_score, # remove eojeol pattern from lrgraph if score >= minimum_noun_score: for r, count in self.lrgraph.get_r(word, -1): if r == '' or (r in self._pos_features): self.lrgraph.remove_eojeol(word+r, count) self._num_of_covered_eojeols += count if self.verbose: print('\r[Noun Extractor] batch prediction was completed for {} words'.format( n), flush=True) return prediction_scores def extract_compounds(self, candidates, prediction_scores, minimum_noun_score=0.3): noun_scores = {noun:len(noun) for noun, score in prediction_scores.items() if score[0] > minimum_noun_score and len(noun) > 1} self._compound_decomposer = MaxScoreTokenizer(scores=noun_scores) candidates = {l:sum(rdict.values()) for l,rdict in self.lrgraph._lr.items() if (len(l) >= 4) and not (l in noun_scores)} n = len(candidates) compounds_scores = {} compounds_counts = {} compounds_components = {} for i, (word, count) in enumerate(sorted(candidates.items(), key=lambda x:-len(x[0]))): if self.verbose and i % 1000 == 999: percentage = '%.2f' % (100 * i / n) print('\r -- check compound {} %'.format(percentage), flush=True, end='') # skip if candidate is substring of longer compound if candidates.get(word, 0) <= 0: continue tokens = self._compound_decomposer.tokenize(word, flatten=False)[0] compound_parts = self._parse_compound(tokens) if compound_parts: # store compound components noun = ''.join(compound_parts) compounds_components[noun] = compound_parts # cumulate count and store compound score compound_score = max((prediction_scores.get(t, (0,0))[0] for t in compound_parts)) compounds_scores[noun] = max(compounds_scores.get(noun,0), compound_score) compounds_counts[noun] = compounds_counts.get(noun,0) + count # reduce frequency of substrings for e in range(2, len(word)): subword = word[:e] if not subword in candidates: continue candidates[subword] = candidates.get(subword, 0) - count # eojeol coverage 
self.lrgraph.remove_eojeol(word) self._num_of_covered_eojeols += count if self.verbose: print('\r[Noun Extractor] checked compounds. discovered {} compounds'.format( len(compounds_scores))) compounds = {noun:(score, compounds_counts.get(noun,0)) for noun, score in compounds_scores.items()} self._compounds_components = compounds_components return compounds def decompose_compound(self, word): tokens = self._compound_decomposer.tokenize(word, flatten=False)[0] compound_parts = self._parse_compound(tokens) return (word, ) if not compound_parts else compound_parts def _parse_compound(self, tokens): """Check Noun* or Noun*Josa""" # format: (word, begin, end, score, length) for token in tokens[:-1]: if token[3] <= 0: return None # Noun* + Josa if len(tokens) >= 3 and tokens[-1][0] in self._pos_features: return tuple(t[0] for t in tokens[:-1]) # all tokens are noun if tokens[-1][3] > 0: return tuple(t[0] for t in tokens) # else, not compound return None def _post_processing(self, nouns, prediction_scores, compounds): # TODO # Not Implemented return nouns
# load the data
data = ""
for x in range(len(arr2)):
    data += arr2[x]

iH = isHangul(data)
pprint(iH)

# clean the data
parse = re.sub("[^0-9a-zA-Z\\s]+[^ ㄱ - ㅣ 가-힣]", "", data)
parse = parse.lower().split()
# print(parse)

for x in range(len(parse)):
    parse[x] = re.sub("[^ ㄱ - ㅣ 가-힣]+", "", parse[x])
    try:
        ay = tokenizer.tokenize(parse[x])
        if ay == boundmorpheme:
            parse[x] = ""
        else:
            parse[x] = ay
    except:
        parse[x] = re.sub("[^ ㄱ - ㅣ 가-힣]+", "", parse[x])

parses = []
for x in range(len(parse)):
    try:
        parses.append(parse[x][0])
    except:
        continue

# presentation: token frequency counts
counts = Counter(parses)
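# Small follow-up sketch (not part of the original script): inspect the most
# frequent tokens collected above.
for word, count in counts.most_common(20):
    print(word, count)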
cohesion_score = {
    word: score.cohesion_forward
    for word, score in words.items()
}
tokenizer = MaxScoreTokenizer(scores=cohesion_score)

# ================= LDA train start ========================
# Generate LDAModel
#   k      = the number of topics
#   alpha  = document-topic prior
#   eta    = topic-word prior
#   min_cf = minimum corpus frequency of a word
model = tp.LDAModel(k=10, alpha=0.1, eta=0.01, min_cf=5)

for i in raw_chat:
    model.add_doc(tokenizer.tokenize(i))

# check the number of words and the vocabulary; prepare the training
model.train(0)
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print('Vocab size:', model.num_vocabs)

# train for 200 iterations
for i in range(200):
    print('Iteration {}\tLL per word: {}'.format(i, model.ll_per_word))
    model.train(1)

# print the trained topics
for i in range(model.k):
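# The snippet above is cut off inside its final loop. A hedged completion sketch,
# assuming tomotopy's get_topic_words accessor is the intended way to print topics:
for i in range(model.k):
    print('Topic #{}'.format(i))
    for word, prob in model.get_topic_words(i, top_n=10):
        print('  {}\t{:.4f}'.format(word, prob))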