class koNounExtract(object):
    def __init__(self):
        self.util = Util()
        self.kolastChar = koLastCharCheck()
        self.rootDirPath = self.util.getRootPath("SmiToText.SmiToText")

    def dictGenerate(self, sentence):
        # The original method body is missing from this excerpt; delegating to
        # mecabDictGenerate (below), which builds the returned mecabDictLine,
        # is an assumed reconstruction.
        return mecabDictGenerate().dictGenerate(sentence)
def extract_multi_noun(text, item_counter=0):
    text = text.strip()
    for text_array in text.split("\n"):
        text_array = text_array.strip()
        # NOTE: this dict is reset for every line, so as written only the
        # nouns of the last line survive to the return below.
        line_array_multi_noun_score = {}

        # Strip e-mail addresses and URLs before extraction.
        text_array = re.sub(r'[\w.-]+@[\w.-]+\.\w+', '', text_array)
        text_array = re.sub(
            r'(http|ftp|https)://([\w+?\.\w+])+([a-zA-Z0-9\~\!\@\#\$\%\^\&\*\(\)_\-\=\+\\\/\?\.\:\;\'\,]*)?',
            '', text_array)

        multi_noun_list, multi_noun_list_score = extract_mecab_multi_noun(
            text_array, item_counter=item_counter)

        if len(multi_noun_list):
            for index, word in enumerate(multi_noun_list):
                if Util().check_email(word):
                    continue
                add_flag = True
                # Reject words containing punctuation, stray jamo, or spaces.
                for char in word:
                    if char in [
                        "'", "`", ",", "\"", "|", "!", "@", "#", "$", "%",
                        "^", "&", "*", "(", ")", "-", "_", "=", "+", "<",
                        ">", ".", ";", ":",
                        "ㄱ", "ㄴ", "ㄲ", "ㅂ", "ㅃ", "ㅈ", "ㅉ", "ㄷ", "ㄸ",
                        "ㅁ", "ㅇ", "ㄹ", "ㅎ", "ㅅ", "ㅆ", "ㅍ", "ㅊ", "ㅌ",
                        "ㅋ", "ㅛ", "ㅕ", "ㅑ", "ㅐ", "ㅔ", "ㅗ", "ㅓ", "ㅏ",
                        "ㅣ", "ㅠ", "ㅜ", "ㅡ", " "
                    ]:
                        add_flag = False
                # Reject '기자' ("reporter"), today's date ("<day>일"),
                # empty words, and single characters.
                if word == '기자' or word == str(date.today().day) + '일' \
                        or word.strip() == "" \
                        or len(word.strip()) == 1:
                    add_flag = False
                if check_en_stopword(word):
                    add_flag = False
                if add_flag:
                    word_score = {word: multi_noun_list_score[word]}
                    line_array_multi_noun_score.update(word_score)

    return sorted_dict(line_array_multi_noun_score)
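# Usage sketch (hedged): extract_mecab_multi_noun() and sorted_dict() are
# defined elsewhere in this module, so the scores below are illustrative only.
#
#   scores = extract_multi_noun("삼성전자가 새로운 인공지능 반도체를 공개했다.")
#   # -> e.g. {'인공지능 반도체': 2.0, '삼성전자': 1.0}  (noun -> score, sorted)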
class mecabDictGenerate(object):
    def __init__(self):
        self.util = Util()
        self.kolastChar = koLastCharCheck()
        self.rootDirPath = self.util.getRootPath("SmiToText.SmiToText")

    def dictGenerate(self, word, posTag="NNG", kind=''):
        isLastChar = self.kolastChar.lastKoTextCheck(word)
        if kind:
            kind = kind + '-'

        # Target line format, e.g.: 포커스 인,,,,NNG,*,T,포커스 인,*,*,*,*
        # The seventh field is T when the word ends in a final consonant
        # (jongseong), F otherwise.
        if isLastChar == 1:
            mecabDictLine = word + ",,,," + posTag + ",*,T," + kind + word + ",*,*,*,*"
        else:
            mecabDictLine = word + ",,,," + posTag + ",*,F," + kind + word + ",*,*,*,*"
        return mecabDictLine

    def mecab_dict_gen_from_file(self, input_filename, output_filename, posTag='NNG', kind=''):
        read_file = open(input_filename, mode='r', encoding='utf-8')
        write_file = open(output_filename, mode='w', encoding='utf-8')

        linenum = 0
        while True:
            word = read_file.readline().strip()
            linenum += 1
            if not word:
                break
            mecabDictLine = self.dictGenerate(word, posTag=posTag, kind=kind)
            print(mecabDictLine)
            write_file.writelines(mecabDictLine + "\n")

        print("LINE NUMBER END : ", linenum)
        write_file.close()
        read_file.close()
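# Usage sketch: dictGenerate() emits one line of a MeCab-ko user-dictionary
# CSV. The T/F field follows the jongseong check above; '주식' as `kind` is an
# illustrative label, not a value from the original.
#
#   gen = mecabDictGenerate()
#   gen.dictGenerate('포커스 인')             # -> '포커스 인,,,,NNG,*,T,포커스 인,*,*,*,*'
#   gen.dictGenerate('코스피', kind='주식')   # -> '코스피,,,,NNG,*,F,주식-코스피,*,*,*,*'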
def remove_stopword(multi_noun, multi_noun_score, stop_word=None):
    # Default to None (not a mutable []) and check it before len(),
    # which would otherwise raise a TypeError on None.
    if stop_word is None or len(stop_word) == 0:
        stop_word = all_stop_word

    check_multi_noun = []
    check_multi_noun_score = {}
    for noun in multi_noun:
        # Keep nouns that are not stopwords, not integers, and not
        # sentence-final forms ending in '니다' or '이다'.
        if noun not in stop_word \
                and not Util().is_int(noun) \
                and not str(noun).endswith('니다') \
                and not str(noun).endswith('이다'):
            check_multi_noun.append(noun)
            check_multi_noun_score[noun] = multi_noun_score[noun]

    return sorted_dict(check_multi_noun_score)
class koSoySpacing(object):
    def __init__(self):
        self.util = Util()

    def train(self, filename):
        # NOTE: `filename` is unused as written; corpus and model paths are
        # hardcoded below.
        verbose = False
        mc = 10    # min_count
        ft = 0.3   # force_abs_threshold
        nt = -0.3  # nonspace_threshold
        st = 0.3   # space_threshold

        model = CountSpace()

        rootDirPath = self.util.getRootPath("SmiToText.SmiToText")
        corpus_fname = rootDirPath + os.path.sep + "data" + os.path.sep \
                       + "koDetokenizerData" + os.path.sep + "ko_law_common_space.txt"
        model_fname = rootDirPath + os.path.sep + "kosoy-models" + os.path.sep + "soyspacing.model"

        # Training
        # model.train(corpus_fname)
        # model.save_model(model_fname, json_format=False)

        # Load a previously trained model
        model.load_model(model_fname, json_format=False)

        # sent = '이건진짜좋은영화 라라랜드진짜좋은영화'
        # sent = '그일단그구성원인사람들과,,'
        sent = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'
        sent_input = sent.replace(" ", "")

        # Correction with explicit parameters
        sent_output_1, tags = model.correct(doc=sent_input,
                                            verbose=verbose,
                                            force_abs_threshold=ft,
                                            nonspace_threshold=nt,
                                            space_threshold=st,
                                            min_count=mc)
        # Correction with default parameters
        sent_output_2, tags = model.correct(sent_input)

        print(sent)
        print(sent_output_1)
        print(sent_output_2)
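# Training sketch (hedged): the train/save calls are commented out in train()
# above; this runs the same CountSpace flow end-to-end. The corpus path is the
# one used above; the import is soyspacing's documented entry point.
#
#   from soyspacing.countbase import CountSpace
#
#   model = CountSpace()
#   model.train('data/koDetokenizerData/ko_law_common_space.txt')  # space-annotated corpus
#   model.save_model('kosoy-models/soyspacing.model', json_format=False)
#   corrected, tags = model.correct('이건진짜좋은영화')  # restores spaces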
def check_stopword(multi_noun, multi_noun_score, stop_word=None):
    if stop_word is None or len(stop_word) == 0:
        stop_word = all_stop_word

    check_multi_noun = []
    check_multi_noun_score = {}
    for noun in multi_noun:
        # A noun survives only if neither its space-separated tokens nor its
        # space-stripped form appear in the stopword set.
        if len(set(stop_word).difference(noun.split())) == len(stop_word) \
                and len(set(stop_word).difference([noun.replace(' ', '')])) == len(stop_word) \
                and not Util().is_int(noun) \
                and not str(noun).endswith('니다') \
                and not str(noun).endswith('이다'):
            check_multi_noun.append(noun)
            check_multi_noun_score[noun] = multi_noun_score[noun]

    return sorted_dict(check_multi_noun_score)
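# The two filters differ in how they match stopwords (illustrative values;
# remove_stopword checks whole-noun membership, check_stopword also checks
# the noun's space-separated tokens):
#
#   nouns = ['기자 회견', '회견']
#   scores = {'기자 회견': 2.0, '회견': 1.0}
#   remove_stopword(nouns, scores, stop_word=['기자'])
#   # -> keeps both: '기자 회견' as a whole is not in the stopword list.
#   check_stopword(nouns, scores, stop_word=['기자'])
#   # -> drops '기자 회견': its token '기자' is a stopword.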
import argparse
import os

from nltk.tokenize import sent_tokenize

from SmiToText.tokenizer.nltk import nltkSentTokenizer
from SmiToText.util.util import Util
from SmiToText.tokenizer import mecab

'''
Noun extractor built on Mecab, using its POS tagger.
'''

util = Util()


def expect_nng_text(text):
    nng_list = []
    vv_list = []
    replace_char_list = [
        '[', ']', '\'', '\"', ')', '(', '「', '」', '-', '’', ':', '/',
        '”', '“', '〃', '~', 'ㆍ', '○', '◇', '△', '〈', '〉', '·', '+',
        '…', '#', '='
    ]
    check_word_end = [',', '.', ':', ';', '!', '?', '\"', '\'']
    check_vv = [
        'EC', 'VV', 'X',
def __init__(self):
    self.util = Util()
        else:
            features.append('eos')

        return features

    def sent2words(self, sent):
        return [word for word, tag in sent]

    def sent2tags(self, sent):
        return [tag for word, tag in sent]

    def sent2features(self, sent):
        return [self.word2features(sent, i) for i in range(len(sent))]


if __name__ == '__main__':
    util = Util()
    rootDirPath = util.getRootPath('SmiToText.SmiToText')

    koSpaceCheck = koCrfSpacing()

    # Training: convert raw spaced text into corpus files.
    koSpaceCheck.raw2corpus(
        rootDirPath + '/data/koDetokenizerData/ko_law_common_space.txt',
        rootDirPath + '/data/koDetokenizerData/ko_law_common_space.txt.copus')
    koSpaceCheck.raw2corpus(
        rootDirPath + '/data/koDetokenizerData/kospacing_Train.txt',
        rootDirPath + '/data/koDetokenizerData/kospacing_Train.txt.copus')

    # raw_train.txt contains some long text
    train_sents = koSpaceCheck.corpus2sent(
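# Training sketch (hedged): the corpus2sent() call above is cut off in this
# excerpt. Once it yields (word, tag) sequences, sent2features()/sent2tags()
# can feed a CRF trainer; python-crfsuite is assumed here, along with the
# model output path, neither of which appears in the original.
#
#   import pycrfsuite
#
#   trainer = pycrfsuite.Trainer(verbose=False)
#   for sent in train_sents:
#       trainer.append(koSpaceCheck.sent2features(sent),
#                      koSpaceCheck.sent2tags(sent))
#   trainer.train(rootDirPath + '/kocrf-models/kospacing.crfsuite')  # assumed path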
def __init__(self):
    self.util = Util()
    self.kolastChar = koLastCharCheck()
    self.rootDirPath = self.util.getRootPath("SmiToText.SmiToText")