def classify(document, label_dict, tfidf_train_vect):
    # Morphological analyzer (Mecab)
    mecab = Mecab(dicpath='C:\\mecab\\mecab-ko-dic')
    # Rebuild the input document from nouns of length 2 or more only
    document = [
        i[0] for i in mecab.pos(document)
        if (i[1] == "NNG" or i[1] == "NNP") and len(i[0]) > 1
    ]
    document = " ".join(document)

    clf = load_clf()  # load the classification model
    X = tfidf_train_vect.transform([document])  # vectorize the input with the TF-IDF vectorizer fitted on the training data
    y = clf.predict(X)[0]          # predicted class of the input document
    proba = clf.predict_proba(X)   # class probabilities for the input document
    proba_max = np.max(proba)      # highest class probability
    return label_dict[y], proba_max, y
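# Hedged usage sketch for classify() above. It assumes `tfidf_train_vect` is a
# TfidfVectorizer already fitted on the training corpus, `label_dict` maps class
# indices to names, and load_clf() (not shown here) returns the fitted classifier.
# The sample sentence and label names are illustrative only.
#
#     label_dict = {0: "economy", 1: "sports"}
#     label, proba_max, class_idx = classify("코스피 지수가 사상 최고치를 경신했다",
#                                            label_dict, tfidf_train_vect)
#     print(label, proba_max)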
def read_doc(file_name):
    with open(file_name) as f:
        doc = f.read()
    tagger = Mecab()
    sentences = []  # [n_sentences, n_words]
    tags = []
    start_sentence = 0
    for sep in rLINE_SEP.finditer(doc):
        sentence = doc[start_sentence:sep.start(0)]
        sentence = clear_str(sentence)
        start_sentence = sep.end(0)
        if len(sentence) < 10:
            continue
        poss = tagger.pos(sentence)
        sentences.append([word for word, _ in poss])
        tags.append([tag for _, tag in poss])
    return sentences, tags
def gen_summary(text, max_length):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    mecab = Mecab()
    text = re.sub('[0-9]', '', text)
    text = mecab.nouns(text)
    text = ['GO'] + text
    text = empty_remover(text)
    if len(text) >= max_length:
        text = text[0:max_length]
    else:
        text = text + ["PAD"] * (max_length - len(text))
    text = text[0:max_length]
    counter_konlpy += 1
    sys.stdout.write("\rParsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(text)
def __init__(self, vocab_path="./transformer/vocbulary.voc", tagger=Mecab()):
    # tagger: morphological analyzer -> Mecab() (default)
    self.tagger = tagger
    # load the vocabulary in its different forms
    (self.vocab, self.word2idx,
     self.idx2word, self.vocab_len) = self._load_vocab_file(vocab_path)
    # start, end, unk tokens
    self.STD = "<START>"
    self.END = "<END>"
    self.UNK = "<UNK>"
    # indices of the start, end, unk tokens
    self.STD_IDX = self.word2idx[self.STD]
    self.END_IDX = self.word2idx[self.END]
    self.UNK_IDX = self.word2idx[self.UNK]
def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    # global stopwords
    s = re.sub('[0-9]', '', s)
    mecab = Mecab()
    result = []
    try:
        result = mecab.nouns(s)
    except ValueError:
        result = []
    if len(result) > 1000:
        result = result[0:1000]
    counter_konlpy += 1
    sys.stdout.write("\rParsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(result)
def find_common_topic(self):
    """
    Finds the common topic of the overall conversation and stores it into a csv file.
    :return: void. csv file will be generated
    """
    # Get conversation
    self._rewind()
    all_conversations = self._preprocess(self.get_all_conversations())

    # perform nlp on all words of the conversation
    mecab = Mecab()
    category = ['NNP', 'NNG']
    keywords = [
        classification[0]
        for classification in mecab.pos(str(all_conversations))
        if classification[1] in category
    ]
    freq = Counter(keywords).most_common(300)
    return freq
def extract_key_phrases(text):
    """Return a set of key phrases.

    :param text: A string.
    """
    t = Mecab()
    tags_ko = t.pos(text)
    textlist = [x[0] for x in tags_ko]
    tags_ko = filter_for_tags(tags_ko)
    tags_ko = normalize(tags_ko)
    word_set_list = list(tags_ko)
    graph = build_graph(word_set_list)
    calculated_page_rank = nx.pagerank(graph, weight='weight')
    # most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank,
                        key=calculated_page_rank.get, reverse=True)
    return keyphrases[0:3]
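# The helpers filter_for_tags(), normalize() and build_graph() used above are defined
# elsewhere. Below is a minimal build_graph() sketch compatible with
# nx.pagerank(graph, weight='weight'): it connects every pair of unique words with
# unit weight. This is an assumption for illustration only; the original
# implementation may instead weight edges by co-occurrence within a window.
import itertools
import networkx as nx

def build_graph(nodes):
    """Return an undirected graph over the unique words with unit edge weights."""
    graph = nx.Graph()
    unique_nodes = set(nodes)
    graph.add_nodes_from(unique_nodes)
    for a, b in itertools.combinations(unique_nodes, 2):
        graph.add_edge(a, b, weight=1.0)
    return graph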
def __init__(self, input_path=None, max_len=None, min_len=None,
             use_min_cnt=None, word_min_cnt=None,
             load_preprocessed=False, dir_path=None):
    if not load_preprocessed:
        """ This must be changed """
        self.df = pd.read_table(input_path)
        self.max_len, self.min_len = max_len, min_len
        self.use_min_cnt = use_min_cnt
        self.word_min_cnt = word_min_cnt
        self.tagger = Mecab()
    else:
        self.load(dir_path)
def generate_worldloud(text):
    from wordcloud import WordCloud
    from konlpy.tag import Mecab

    phrases = ''
    for phrase in text:
        phrases += phrase + ' '
    mecab = Mecab()
    nouns = mecab.nouns(phrases)
    words = ''
    for word in nouns:
        words += word + ' '
    wordcloud = WordCloud(font_path='/Library/fonts/AppleGothic.ttf',
                          background_color='white', width=600, height=400)
    wordcloud.generate_from_text(words)
    wordcloud.to_file('gachi/static/gachi/images/wordcloud.png')
class NLPParser():
    nlplib = Mecab()

    @staticmethod
    def parse(text):
        corpus = NLPParser.nlplib.pos(text)
        return corpus

    @staticmethod
    def get_sentence(corpus):
        result = []
        sentence = []
        for word in corpus:
            sentence.append(word)
            if not word[1] in ['EF', 'SF']:
                continue
            if len(sentence) > 1:
                result.append(sentence)
            sentence = []
        return result
def __init__(self, args):
    super().__init__(args)

    save_dir = p.join(args.path.embed, self.name)
    self.encoder_path = p.join(save_dir, f"{self.name}.bin")
    self.idf_encoder_path = p.join(save_dir, f"{self.name}_idf.bin")
    self.idf_path = p.join(save_dir, "idf.bin")

    if self.args.model.tokenizer_name == "":
        print("Using Mecab tokenizer")
        mecab = Mecab()
        self.tokenizer = mecab.morphs
    elif self.args.model.tokenizer_name in ["monologg/kobert", "monologg/distilkobert"]:
        print("Using KoBert tokenizer")
        self.tokenizer = KoBertTokenizer.from_pretrained(
            args.model.tokenizer_name).tokenize
    else:
        print("Using AutoTokenizer: ", args.model.tokenizer_name)
        self.tokenizer = AutoTokenizer.from_pretrained(
            args.model.tokenizer_name, use_fast=True).tokenize

    self.b = self.args.retriever.b
    self.k1 = self.args.retriever.k1

    self.encoder = TfidfVectorizer(tokenizer=self.tokenizer,
                                   ngram_range=(1, 2),
                                   use_idf=False,
                                   norm=None)
    self.idf_encoder = TfidfVectorizer(tokenizer=self.tokenizer,
                                       ngram_range=(1, 2),
                                       norm=None,
                                       smooth_idf=False)

    self.dls = np.zeros(len(self.contexts))
    for idx, context in enumerate(self.contexts):
        self.dls[idx] = len(context)
    self.avdl = np.mean(self.dls)

    self.p_embedding = None
    self.idf = None
def remove_particle(training_args):
    """ remove particle

    Args:
        training_args
    """
    # load tokenizers
    mecab = Mecab()
    kkma = Kkma()
    hannanum = Hannanum()

    # load prediction file
    with open(os.path.join(training_args.output_dir, "predictions.json"), "r") as f:
        prediction_json = json.load(f)

    prediction_dict = dict()
    for mrc_id in prediction_json.keys():
        final_predictions = prediction_json[mrc_id]
        pos_tag = mecab.pos(final_predictions)

        # if the prediction ends with a particle (josa), remove it
        if final_predictions[-1] == "의":
            min_len = min(len(kkma.pos(final_predictions)[-1][0]),
                          len(mecab.pos(final_predictions)[-1][0]),
                          len(hannanum.pos(final_predictions)[-1][0]))
            if min_len == 1:
                final_predictions = final_predictions[:-1]
        elif pos_tag[-1][-1] in {"JX", "JKB", "JKO", "JKS", "ETM", "VCP", "JC"}:
            final_predictions = final_predictions[:-len(pos_tag[-1][0])]

        prediction_dict[str(mrc_id)] = final_predictions

    # save final results
    with open(os.path.join(training_args.output_dir, "final_predictions.json"),
              'w', encoding='utf-8') as make_file:
        json.dump(prediction_dict, make_file, indent="\t", ensure_ascii=False)
    print(prediction_dict)
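# Hedged illustration of the particle-removal rule above; the example string is
# illustrative and assumes the standard mecab-ko tag set (topic particle '은' -> JX).
#
#     mecab = Mecab()
#     print(mecab.pos("대한민국은"))  # expected: [('대한민국', 'NNP'), ('은', 'JX')]
#     # '은' falls in the JX set, so remove_particle() would strip it, leaving "대한민국".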
def __init__(self):
    self.twit = Okt()
    self.mecab = Mecab()

    # list of regular expressions used for cleaning
    self.regex_ls = [
        '[\t\n\r\f\v]',
        '\(.+?\)',
        '\[.+?\]',
        '\<.+?\>',
        '◀.+?▶',
        '(?<=▶).+',
        '(?<=▷).+',
        '(?<=※).+',
        '(?<=Copyrights).+',
        '[\w]+@[a-zA-Z]+\.[a-zA-Z]+[\.]?[a-z]*',
        '[가-힣]+기자',
        '[가-힣]+ 기자',
        '[가-힣]+ 선임기자',
        '[가-힣]+ 동아닷컴 기자',
        '[\{\}\[\]\/?,;·:“‘|\)*~`!^\-_+<>@○▲▶■◆\#$┌─┐&\\\=\(\'\"├┼┤│┬└┴┘|ⓒ]',
        '[0-9]+[년월분일시]*',
        '사진=[가-힣]*',
        '사진제공=[가-힣]*'
    ]

    # list of strings to remove and stopword list
    with open('../preprocessing_data/stopword_list.json', 'r', encoding='UTF-8') as f:
        load_file = json.load(f)
    self.word_to_be_cleaned_ls = load_file['clean']
    self.stopword_ls = load_file['stopword']
def extract_docs(csv_path):
    mecab = Mecab()
    sts, labels, tags = [], [], []
    with open(csv_path, encoding='utf8') as f:
        reader = csv.reader(f, delimiter='|', escapechar=':',
                            quoting=csv.QUOTE_NONE, skipinitialspace=True)
        for row in reader:
            doc = clear_str(row[1])
            for st in to_sentences(doc):
                morps = parse(st, mecab)
                if not morps:
                    continue
                sts.append(morps)
                labels.append(row[0])
                tags.append(row[1][:50])
    return sts, labels, tags
def read_text(fin):
    # Read in the preprocessed Wikipedia file.
    corpus_li = []
    mecab = Mecab(dicpath='/opt/local/lib/mecab/dic/mecab-ko-dic')
    for line in open(fin):
        # Normalize to NFKC with unicodedata.normalize to handle
        # broken (mis-encoded) characters.
        line = unicodedata.normalize('NFKC', line)
        try:
            # Add lines whose first character is a digit to the corpus.
            _ = int(line[0])
            corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
        except ValueError:
            # Add lines whose first character is Hangul to the corpus.
            if ord(line[0]) >= ord('가') and ord(line[0]) <= ord('힣'):
                corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
            else:
                pass
    print('# of lines in corpus', len(corpus_li))
    return (corpus_li)
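# Hedged usage sketch for read_text() above; the file name and sample lines are
# illustrative, and the mecab-ko dictionary is assumed to be installed at the
# dicpath hard-coded in the function.
if __name__ == '__main__':
    with open('wiki_sample.txt', 'w', encoding='utf-8') as f:
        f.write('1984년에 설립된 대학교이다.\n')       # starts with a digit
        f.write('한국어 위키백과의 전처리된 문장입니다.\n')  # starts with Hangul
    corpus_li = read_text('wiki_sample.txt')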
def preprocess(
    data_path: str,
    word_index: dict = None,
    num_words: int = 10000,
):
    tokenizer = Mecab()

    # 0. load data
    with open(data_path, "rb") as f:
        data = pickle.load(f)

    # 1. bag-of-words
    vocab, docs = [], []
    for doc in tqdm(data):
        if doc:  # skip nan values in the nsmc data
            try:
                nouns = tokenizer.nouns(doc)
                vocab.extend(nouns)
                docs.append(nouns)
            except:
                continue

    # 2. build vocab
    if not word_index:
        vocab = Counter(vocab)
        vocab = vocab.most_common(num_words)

        # 3. add unknown token
        word_index = {"<UNK>": 0}
        for idx, (word, _) in enumerate(vocab, 1):
            word_index[word] = idx

    index_word = {idx: word for word, idx in word_index.items()}

    # 4. create corpus
    corpus = []
    for doc in docs:
        if doc:
            corpus.append([word_index.get(word, 0) for word in doc])

    return corpus, word_index, index_word
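# Hedged usage sketch for preprocess() above; the pickle file name and the sample
# sentences are illustrative, not part of the original project.
import pickle

if __name__ == "__main__":
    sample_docs = ["영화가 정말 재미있었다", "배우들의 연기가 인상적이었다", None]
    with open("nsmc_sample.pkl", "wb") as f:
        pickle.dump(sample_docs, f)

    corpus, word_index, index_word = preprocess("nsmc_sample.pkl", num_words=100)
    print(word_index)  # {"<UNK>": 0, "영화": 1, ...}
    print(corpus)      # documents encoded as lists of word indices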
def __init__(self, train=None, model=None, recom_raw=None):
    # preprocess the tokenized training file
    train_frame = pd.read_csv(train, header=None)
    token_train = []
    for i in range(len(train_frame)):
        token = train_frame.loc[i, :].values
        token = token.tolist()
        j = -1
        for k in range(len(token)):
            j += 1
            if not isinstance(token[j], str) and math.isnan(token[j]):
                del token[j]
                j = j - 1
        token_train.append(token)

    threshold = 3
    rare_cnt = 0  # count of words whose frequency is below the threshold

    # integer encoding
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(token_train)
    total_cnt = len(tokenizer.word_index)  # number of words
    for key, value in tokenizer.word_counts.items():
        if value < threshold:
            rare_cnt = rare_cnt + 1
    vocab_size = total_cnt - rare_cnt + 2

    self.tokenizer = Tokenizer(vocab_size, oov_token='OOV')
    self.tokenizer.fit_on_texts(token_train)
    self.mecab = Mecab()

    # deep learning model and variables used by the classifier
    self.deeprunning = load_model(model)
    self.category = 0

    # data used by the recommendation model
    self.recom_data = pd.read_csv(recom_raw)
def __init__(self, config):
    self.config = config
    self.device = self.config['device']
    self.examples = list()
    self.iterator = None
    self.tokenizer = Mecab()
    self.SRC = data.Field(tokenize=lambda x: x.split(' '),
                          eos_token='<eos>',
                          pad_token='<pad>',
                          lower=True,
                          batch_first=True,
                          include_lengths=True)
    self.rSRC = data.Field(tokenize=lambda x: x.split(' '),
                           eos_token='<eos>',
                           pad_token='<pad>',
                           lower=True,
                           batch_first=True,
                           include_lengths=True,
                           preprocessing=lambda x: x[::-1])
    self.SRC.vocab = pickle_reader(self.config['src_field_path'])
    self.rSRC.vocab = self.SRC.vocab
class MecabTokenizer(object):
    tokenizer = Mecab()

    @classmethod
    def make_vocab(cls, data_path, save_path):
        texts = read_text(data_path)
        words = [word for text in tqdm(texts)
                 for word in cls.tokenizer.morphs(preprocess_text(text))]
        word_counter = Counter(words)
        vocab = {"[PAD]": 0, "[UNK]": 1}
        idx = 2
        for word, count in word_counter.most_common():
            vocab[word] = idx
            idx += 1
        save_json(save_path, vocab)

    def __init__(self, vocab_path, vocab_size):
        vocab = read_json(vocab_path)
        self.vocab = {key: value for key, value in vocab.items() if value < vocab_size}
        self.vocab_size = len(self.vocab)
        self.pad_token_id = self.vocab.get("[PAD]", None)
        self.unk_token_id = self.vocab.get("[UNK]", None)

    def tokenize(self, text):
        text = preprocess_text(text)
        return self.tokenizer.morphs(text)

    def encode(self, text):
        text = preprocess_text(text)
        tokens = self.tokenize(text)
        ids = [self.vocab.get(token.strip(), self.unk_token_id) for token in tokens]
        return ids

    def encode_plus(self, text):
        preprocessed_text = preprocess_text(text)
        tokens = self.tokenize(preprocessed_text)
        ids = [self.vocab.get(token.strip(), self.unk_token_id) for token in tokens]
        offset = get_offset_mapping(text, tokens)
        return {"w_ids": ids, "offset_mapping": offset}
def __init__(self):
    self.config = {
        'min_count': 5,        # ignore words that appear fewer than 5 times
        'size': 200,           # embed into a 200-dimensional vector space
        'sg': 1,               # 0 uses CBOW, 1 uses skip-gram
        'batch_words': 10000,  # number of words read at a time while building the vocabulary
        'iter': 50,            # number of iterations, roughly the "epochs" of deep learning
        'window': 5,           # context window size
        'workers': multiprocessing.cpu_count(),
    }
    # self.projectdir = os.path.dirname(os.path.dirname(__file__))
    # self.tagger = Mecab(dicpath=self.projectdir + "/install/mecab-ko-dic/dic")
    self.tagger = Mecab()
    # self.tagger = Mecab("/home/mini/work/chatbot2017/install/mecab-ko-dic/dic")
    self.twitter_tagger = Twitter()
    self.ps = PorterStemmer()
    self.title_dict = {}
    self.docs = []
    self.texts_ko = []
    self.sentences = []
    self.doc_file_names = []
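# Hedged note on the config above: the keys 'size' and 'iter' match the gensim < 4.0
# Word2Vec signature; in gensim >= 4.0 they were renamed to 'vector_size' and 'epochs'.
# A minimal training sketch under that assumption (the sentences are illustrative and
# the training call itself is not part of the original snippet):
#
#     from gensim.models import Word2Vec
#     sentences = [["한국어", "자연어", "처리"], ["형태소", "분석", "예시"]]
#     model = Word2Vec(sentences, **self.config)
#     print(model.wv.most_similar("자연어"))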
def __init__(self, data_handler):
    super(NaverNewsCrawler, self).__init__(NaverNewsCrawler)
    self.mecab = Mecab()
    self.data_handler = data_handler
    self.pattern_publisher = r"\s?[가-힣\s]{3,}기자"
    self.pattern_email = r"([\w-]+)@([\w\.-]+)(\.[\w\.]+)"

    driver_path = CONFIG['chromedriver_path']
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36"
    )
    self.driver = webdriver.Chrome(driver_path, chrome_options=options)
    self.driver.implicitly_wait(3)
def find_common(filename):
    """
    Finds common keywords in the given file.
    :param filename: name of file to analyze
    :return: common keywords of the file as a list of (keyword, count) tuples,
             sorted by frequency
    """
    file = sample._open_file(filename)
    keywords = list()
    while True:
        line = file.readline().split(',')
        keywords += line[:-1]
        if line == ['']:
            break
    mecab = Mecab()
    category = ['NNP', 'NNG', 'SL', 'VV', 'VA', 'XR', 'VA+ETM', 'NP+VCP+EC']
    keywords = [classification[0]
                for classification in mecab.pos(str(keywords))
                if classification[1] in category]
    cnt = sorted(Counter(keywords).items(), key=lambda x: x[1], reverse=True)
    return cnt
def make_noun_voca_list(raw_list, konlpy_opt):
    noun_voca_list = []
    assert konlpy_opt in ['Hannanum', 'Kkma', 'Komoran', 'Mecab', 'Okt']
    if konlpy_opt == 'Hannanum':
        nlp = Hannanum()
    elif konlpy_opt == 'Kkma':
        nlp = Kkma()
    elif konlpy_opt == 'Komoran':
        nlp = Komoran()
    elif konlpy_opt == 'Mecab':
        nlp = Mecab()
    elif konlpy_opt == 'Okt':
        nlp = Okt()
    for line in raw_list:
        if type(line).__name__ == 'float' or type(line).__name__ == 'int':
            noun_voca_list.append('')
        else:
            noun_voca_list.append(
                nlp.nouns(' '.join(re.compile('[가-힣0-9]+').findall(line))))
    return noun_voca_list
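# Hedged usage sketch for make_noun_voca_list() above; the sample rows are
# illustrative (a float stands in for a NaN cell as produced by pandas).
sample_rows = ["한국어 형태소 분석기 비교", float("nan"), "자연어 처리 입문 2023"]
print(make_noun_voca_list(sample_rows, "Mecab"))
# -> e.g. [['한국어', '형태소', '분석기', '비교'], '', ['자연어', '처리', '입문']]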
def make_tag(self, contents):
    mecab = Mecab()
    tmp_tag = []
    count = Counter(' ')
    for col in self.collection_review.find({"ISBN": contents},
                                           {"_id": 0, "review_text": 1}):
        pos = mecab.pos(col['review_text'])
        for i in range(1, len(pos), 1):
            if (pos[i][1] == "NNG") | (pos[i][1] == "NNP"):
                print(pos[i][0])
                tmp_tag.append(pos[i][0])
        count = count + Counter(tmp_tag)
    print(type(count))
    tag = count
    # self.collection_TitleTag.insert({"ISBN": contents, "tag": tag})
    return tag
def get_weight(url, f_l):
    mecab = Mecab()
    d_l = dict()
    for f in f_l:
        # each "<name>2.txt" file holds a Python dict literal; exec loads it into the global name `d`
        exec("d=" + open(os.getcwd() + "/static/dt/" + f + "2.txt").read(), globals())
        d_l[f] = d
    soup = ""
    try:
        a = Article(url, language='ko')
        a.download()
        a.parse()
        soup = a.text
    except:
        return -1
    print(soup)
    me = list()
    for i in mecab.pos(soup):
        if i[1] == "NNG" or i[1] == "NNP":
            me.append(i[0])
    print(me)
    W_l = dict()
    for f in f_l:
        W = 0.0
        for i in d_l[f]:
            if i in me:
                W = W + math.log(d_l[f][i])
        print(W)
        print(len(me), len(d_l[f]))
        if len(me) == 0 or len(d_l[f]) == 0:
            W = 0
        else:
            W = float(int((W / (float((len(me)**0.7) * (len(d_l[f])**0.5))) * (10**6))) / 10.0)
        W_l[f] = W
    return W_l
def expect_single_noun_text_ko(sentence):
    # Define a chunk grammar, or chunking rules, then chunk
    grammar = """
    명사1: {<SL>}
    명사1: {<SN>}
    명사1: {<NNG>}
    명사2: {<NN.*>}
    동사구: {<NP\+VCP\+EF>}
    동사구: {<NP><VCP\+EF>}
    형용사: {<MA.*>*}
    """
    mecab = Mecab()
    postagged_sentence = mecab.pos(sentence)
    nltk_rexp_parser = nltk.RegexpParser(grammar)
    chunks_sentence = nltk_rexp_parser.parse(postagged_sentence)

    extract_noun = []
    extract_noun_score = {}
    for subtree in chunks_sentence.subtrees():
        if subtree.label().startswith('명사'):
            if len(' '.join((e[0] for e in list(subtree)))) > 1:
                noun = ' '.join((e[0] for e in list(subtree)))
                if re.search(r"\s", noun):
                    extract_noun.append(noun)
                    # extract_noun_score[noun] = 0.75
                    if in_dict(extract_noun_score, noun) == False:
                        extract_noun_score[noun] = 0.75
                    else:
                        extract_noun_score[noun] += 0.75
    return sorted_dict(extract_noun_score)
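# in_dict() and sorted_dict() used above are helpers not shown in this snippet.
# Minimal sketches under the assumption that in_dict is a membership test and
# sorted_dict orders the score dictionary by descending score:
def in_dict(dictionary, key):
    """Return True if key is already present in dictionary."""
    return key in dictionary

def sorted_dict(dictionary):
    """Return (key, score) pairs sorted by score in descending order."""
    return sorted(dictionary.items(), key=lambda item: item[1], reverse=True)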
def get_tokenizer(self, tokenizer):
    tokenizer = tokenizer.lower()
    if tokenizer == "mecab":
        tokenizer = Mecab()
    elif tokenizer == "hannanum":
        tokenizer = Hannanum()
    elif tokenizer == "kkma":
        tokenizer = Kkma()
    elif tokenizer == "komoran":
        tokenizer = Komoran()
    elif tokenizer == "okt":  # compare in lowercase; "Okt" could never match after .lower()
        tokenizer = Okt()
    else:
        raise RuntimeError(
            "Tokenizer must be one of Mecab, Hannanum, Kkma, Komoran, Okt."
        )
    return tokenizer
def annotate_example_tootouch(example, table):
    """
    Apr. 2021: Jaehyuk

    Annotate only the information that will be used in our model.
    """
    # tokenizer
    tokenizer = Mecab()

    ann = {'table_id': example['table_id'], 'phase': example['phase']}
    ann['question'] = example['question']
    ann['question_tok'] = [
        str(q).lower() for q in tokenizer.morphs(example['question'])
    ]
    # ann['table'] = {
    #     'header': [annotate(h) for h in table['header']],
    # }
    ann['sql'] = example['sql']
    ann['query'] = copy.deepcopy(example['sql'])

    conds1 = ann['sql']['conds']
    wv_ann1 = []
    for conds11 in conds1:
        wv_ann11 = tokenizer.morphs(str(conds11[2]))
        wv_ann1.append(wv_ann11)

    # Check whether wv_ann exists inside question_tok
    try:
        wvi1_corenlp = check_wv_tok_in_nlu_tok(wv_ann1, ann['question_tok'])
        ann['wvi_corenlp'] = wvi1_corenlp
    except:
        ann['wvi_corenlp'] = None
        ann['tok_error'] = 'SQuAD style st, ed are not found under CoreNLP.'

    return ann
def get_nouns_from_csv(data, stopword, synonym):
    '''
    requirement : pandas, mecab
    data : dataframe type, content of article
    stopword : set type, stopwords
    synonym : dict type, used by the preprocess function
    return : word_list

    For each row of the dataframe, run morphological analysis on the
    'content' column with Mecab and append only the nouns to word_list.
    '''
    mecab = Mecab()  # Mecab morphological analyzer (with a user dictionary added)
    word_list = []
    for idx in tqdm(range(len(data))):
        try:
            nouns = mecab.nouns(data.loc[idx, 'content'])
            nouns = preprocess(nouns=nouns, stopword=stopword, dic=synonym)
            word_list.append(nouns)
        except Exception as e:
            continue
    print("\nNoun Extraction Complete")
    return word_list
def __init__(
    self,
    job_id,
    vocab_file,
    output_dir,
    max_seq_length,
    num_jobs,
    blanks_separate_docs,
    do_lower_case,
    tokenizer_type,
    num_out_files=500,
):
    self._blanks_separate_docs = blanks_separate_docs
    if tokenizer_type == "mecab_wordpiece":
        tokenizer = KoNLPyBertTokenizer(
            konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(), use_tag=False),
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
        )
    elif tokenizer_type == "wordpiece":
        tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                               do_lower_case=do_lower_case)
    self._example_builder = ExampleBuilder(tokenizer, max_seq_length, tokenizer_type)
    self._writers = []
    for i in range(num_out_files):
        if i % num_jobs == job_id:
            output_fname = os.path.join(
                output_dir,
                "pretrain_data.tfrecord-{:}-of-{:}".format(i, num_out_files),
            )
            self._writers.append(tf.io.TFRecordWriter(output_fname))
    self.n_written = 0