def get_raw_sentence(path):
    """Read a text file of sentences, normalize each line, and strip whitespace.

    :param path: path to a UTF-8 text file (BOM tolerated), one sentence per line
    :return: list of cleaned sentence strings
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        sentences = f.readlines()
    # make_raw_sentence is defined elsewhere in this project.
    sentences = [make_raw_sentence(sentence) for sentence in sentences]
    # BUG FIX: the original instantiated Mecab() and called mecab.morphs()
    # with no argument, which raises TypeError at runtime; neither affected
    # the return value, so both are removed.
    return [sentence.strip() for sentence in sentences]
def clean_korean_documents_simple_version(documents):
    """Clean Korean documents in place: strip HTML tags, remove non-Hangul
    characters, then reduce each document to its space-joined morphemes.

    :param documents: mutable list of document strings (mutated and returned)
    :return: the same list, cleaned
    """
    # Strip HTML tags.
    for i, document in enumerate(documents):
        documents[i] = BeautifulSoup(document, 'html.parser').text
    # Keep only Hangul characters and spaces (drops punctuation, digits, Latin).
    for i, document in enumerate(documents):
        documents[i] = re.sub(r'[^ ㄱ-ㅣ가-힣]', '', document)
    # Morpheme extraction. PERF FIX: the original constructed a fresh Mecab()
    # for every document inside the loop; one tagger instance suffices.
    eunjeon = Mecab()
    for i, document in enumerate(documents):
        # e.g. '스토리가 진짜 너무 노잼' -> '스토리 가 진짜 너무 노잼'
        documents[i] = ' '.join(eunjeon.morphs(document))
    return documents
def predict_(request):
    """Django view: embed the GET parameter 'review' with a FastText model,
    classify its sentiment with a pickled sklearn model, and redirect to the
    matching result page ('emotion/1' positive, 'emotion/0' negative).
    """
    if request.method == "GET":
        start = time.time()
        review = request.GET["review"]
        print("Get Review _________________________________")
        pkl = joblib.load('./restapi/reviewSentiment.pkl')
        # .wv extracts the KeyedVectors, so 'model' below IS the vector table.
        model = FastText.load('./restapi/FastText_embedding.model').wv
        print("load Model __________________________________")
        okt = Mecab()
        # BUG FIX: the original had a dead statement `review_text = re.sub`
        # (binding the function object itself); removed.
        review_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", review)
        word_review = ' '.join(okt.morphs(review_text))
        print("preprocessing _______________________")
        feature_vector = np.zeros((100), dtype=np.float32)
        num_words = 0
        # BUG FIX: 'model' is already a KeyedVectors; the original accessed
        # model.wv.index2word / model.wv[w] (double .wv), which only works on
        # gensim versions that alias KeyedVectors.wv to itself.
        index2word_set = set(model.index2word)
        for w in word_review.split():
            if w in index2word_set:
                num_words += 1
                feature_vector = np.add(feature_vector, model[w])
        # Guard: avoid dividing by zero when no token is in the vocabulary
        # (the zero vector is then passed to the classifier as-is).
        if num_words > 0:
            feature_vector = np.divide(feature_vector, num_words)
        print("predict _____________________________________")
        result = pkl.predict([feature_vector])
        print("time :", time.time() - start)  # elapsed seconds
        if result[0] == 1:
            return redirect('emotion/1')
        else:
            return redirect('emotion/0')
class Chuncker:
    """Scores paragraphs by how many character bigrams of a query they contain."""

    def __init__(self):
        self.tagger = Mecab()          # morphological analyzer
        self.Bi_charcter_feature = []  # character bigrams of the last query

    def get_feautre(self, query):
        """Build the character-bigram feature list from *query*.

        (Method name kept as-is — 'feautre' — because external callers use it.)
        """
        self.Bi_charcter_feature = []
        for token in self.tagger.morphs(query):
            # Slide a 2-character window over each morpheme of length >= 2.
            for i in range(1, len(token)):
                self.Bi_charcter_feature.append(str(token[i - 1:i + 1]))

    def get_chunk_score(self, paragraph):
        """Return 1 + (fraction of stored bigram features found in *paragraph*).

        BUG FIX: the original divided by len(self.Bi_charcter_feature) without
        a guard and raised ZeroDivisionError when no features had been
        extracted; now returns the neutral score 1.0 in that case.
        """
        if not self.Bi_charcter_feature:
            return 1.0
        score = 0
        for ch_feat in self.Bi_charcter_feature:
            if paragraph.find(ch_feat) != -1:
                score += 1
        return 1 + score / len(self.Bi_charcter_feature)
def embed(data):
    """Vectorize encode/decode pairs.

    Each sentence in data['encode'] becomes a flattened
    (encode_length x vector_size) matrix; each entry in data['decode'] becomes
    a multi-hot label vector of length label_size.
    """
    tagger = Mecab()
    inputs = []
    labels = []
    for encode_raw in data['encode']:
        tokens = tagger.morphs(encode_raw)
        # Pad with '#' (or truncate) to exactly encode_length tokens.
        padded = [tokens[i] if i < len(tokens) else '#'
                  for i in range(encode_length)]
        if (embed_type == 'onehot'):
            bucket = np.zeros(vector_size, dtype=float).copy()
            rows = [onehot_vectorize(bucket, tok)
                    if tok in model.wv.index2word
                    else np.zeros(vector_size, dtype=float)
                    for tok in padded]
        else:
            # Out-of-vocabulary tokens map to the zero vector.
            rows = [model[tok]
                    if tok in model.wv.index2word
                    else np.zeros(vector_size, dtype=float)
                    for tok in padded]
        inputs.append(np.array(rows).flatten())
    for decode_raw in data['decode']:
        label = np.zeros(label_size, dtype=float)
        np.put(label, decode_raw, 1)
        labels.append(label)
    return inputs, labels
def make_keyword(sample_data): ''' :param sample_data: DataFrame :return: content column의 내용 토큰화-> 상위 10개 키워드 리스트 반환 ''' # 불용어 정의 stopwords = ['어때','아하','어때요','니깐','니까','거든','을까','할까','거든요','많이','조금','습니당','습니다','입니다','니다','여러분','라도','만나','어디', '이렇게','저렇게','은데','한데','응','아직','응응','그래','오키','든요','어떻게','왜','감사','고맙','죄송','랑','이랑','지만','하지만', '화이팅','파이팅','습니다','슴당','아요','에요','예요','아용','가용','바로','그냥','수정','파일','보내','올려','이모티콘', '따로', '다고', '구나', 'ㅠㅠㅠㅠ', 'ㅠㅠㅠ', '잖아', '그거', '부분', '어제', '내일', '오늘', '을까요', '괜찮', '으면', '해야', 'ㅇㅋ', '각자', '이건', '이거', '상관없', '사진', '께서', '드릴게요', '오후', '오전', '우선', '걸로', '이번', '해도', '할까요', '월요일', '화요일', '수요일', '목요일', '금요일', '토요일', '일요일', '까지', '드려요', '너무', '해요', '네네', '오늘', '다음', '아서', '셔서', '올리', '진짜', '오빠', '누나', '언니', '의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다', '다', '고', '을', '하', '있', '게', '보', '없', '세요', '아요', '습니다', '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니', '등', '같', '우리', '때', '년', '가', '한', '지', '어요', '네요', '대하', '오', '말', '일', '그렇', '이나', '위하', '는데', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니', '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하', '때문', '그것', '두', '말하', '알', '그러나', '받', '못하', '일', '그런', '또', '문제', '더', '사회', '많', '그리고', '좋', '크', '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하', '그러', '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내', '내', '경우', '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번', '나', '다른', '어떻', '여자', '개', '전', '들', '사실', '이렇', '점', '싶', '말', '정도', '좀', '원', '잘', '통하', '소리', '놓', '그럼', '혹시', '니다', '에서', '아침', '점심', '저녁', '해서', '어서', '감사', '수고', '저희', '근데', '일단', '나요', '부터', '합니다', '니까', '안녕', '입니다'] file_extension_list=make_file_extension_list("resource\\filename_extension_list.txt") for extension in file_extension_list: x=extension.replace(".","") stopwords.append(x) # 토큰화 및 불용어 제거 tokenizer = Mecab() tokenized = [] for sentence in 
sample_data['Content']: temp = tokenizer.morphs(sentence) # 토큰화 temp = [word for word in temp if not word in stopwords] # 불용어 제거 temp = [word for word in temp if len(word) > 1] tokenized.append(temp) # 전처리한 단어 데이터 데이터 프레임 구조로 변환 vocab = FreqDist(np.hstack(tokenized)) vocab = pd.DataFrame(vocab, {'count': [1]}) vocab = vocab.transpose() vocab = vocab.sort_values(by='count', ascending=False) vocab.reset_index(inplace=True) # 상위 언급 10개 단어 추출 dataf_10 = vocab.iloc[0:10] dataf_10 = dataf_10.reset_index() data10_dic = dataf_10['index'] data10_dic = pd.Series(data10_dic) data10_list = list(data10_dic) print("키워드 추출완료\n") return data10_list,vocab
def isKnown(self, text):
    """Return True iff every morpheme of *text* is known to the MeCab
    dictionary (an empty string is trivially known).
    """
    if len(text) == 0:
        return True
    tagger = Mecab()
    # m = Mecab(dicpath='C:/mecab/mecab-ko-dic')  # (unusable, disabled)
    # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic')  # (unusable, disabled)
    for morpheme in tagger.morphs(text):
        # A tag of 'UNKNOWN' means the dictionary could not identify the
        # morpheme. (Open question from the original author: should tags
        # starting with 'S' count too?)
        if tagger.pos(morpheme)[0][1] == 'UNKNOWN':
            return False
    return True
def train_vector_model(str_buf):
    """Train a Word2Vec model over sentence-split morphemes.

    :param str_buf: iterable of raw text chunks to train on; if None, falls
        back to the module-level train_data_list['encode']
    :return: trained gensim Word2Vec model
    """
    mecab = Mecab()
    # BUG FIX: the original unconditionally overwrote the parameter with the
    # global train_data_list['encode'], silently ignoring the caller's
    # argument. Keep the global only as a fallback.
    if str_buf is None:
        str_buf = train_data_list['encode']
    # POS-tag the concatenated text, then split into sentences wherever a
    # sentence-final punctuation morpheme (tag 'SF') occurs.
    pos1 = mecab.pos(''.join(str_buf))
    pos2 = ' '.join(list(map(lambda x: '\n' if x[1] in ['SF'] else x[0],
                             pos1))).split('\n')
    # Tokenize each sentence into morphemes — these are the word2vec "words".
    morphs = list(map(lambda x: mecab.morphs(x), pos2))
    model = word2vec.Word2Vec(size=vector_size, window=2, min_count=1)
    model.build_vocab(morphs)
    model.train(morphs, epochs=model.epochs, total_examples=model.corpus_count)
    return model
def train_vector_model(str_buf):
    """Build and train a Word2Vec embedding model.

    The training text is taken from the module-level
    train_data_list['encode']; the str_buf argument is overwritten (kept for
    interface compatibility).
    """
    tagger = Mecab()
    str_buf = train_data_list['encode']
    # POS-tag the whole corpus at once.
    tagged = tagger.pos(''.join(str_buf))
    # Re-join tokens, breaking into separate sentences at sentence-final
    # punctuation (tag 'SF').
    joined = ' '.join('\n' if tag in ['SF'] else word for word, tag in tagged)
    sentences = joined.split('\n')
    # Split every sentence into morphemes — the word2vec vocabulary units.
    morphs = [tagger.morphs(sentence) for sentence in sentences]
    model = word2vec.Word2Vec(size=vector_size, window=2, min_count=1)
    model.build_vocab(morphs)
    model.train(morphs, total_examples=model.corpus_count, epochs=model.iter)
    return model
def pos_tag(sentences):
    """Normalize each sentence: strip RE_FILTER-matched characters, then
    replace the sentence with its space-joined morpheme sequence.

    :param sentences: iterable of raw sentence strings
    :return: list of morpheme-joined sentence strings
    """
    tagger = Mecab()
    sentences_pos = []
    for raw in sentences:
        # Remove special characters first so they don't become morphemes.
        cleaned = re.sub(RE_FILTER, "", raw)
        # Join the morpheme list back into one space-separated string.
        sentences_pos.append(" ".join(tagger.morphs(cleaned)))
    return sentences_pos
def inference_embed(data):
    """Embed a single sentence into an (encode_length x vector_size) matrix,
    using one-hot rows or word2vec rows depending on embed_type."""
    tagger = Mecab()
    tokens = tagger.morphs(data)
    # Pad with '#' (or truncate) to exactly encode_length tokens.
    padded = [tokens[i] if i < len(tokens) else '#'
              for i in range(encode_length)]
    if (embed_type == 'onehot'):
        bucket = np.zeros(vector_size, dtype=float).copy()
        rows = [onehot_vectorize(bucket, tok)
                if tok in model.wv.index2word
                else np.zeros(vector_size, dtype=float)
                for tok in padded]
    else:
        # Out-of-vocabulary tokens map to the zero vector.
        rows = [model[tok]
                if tok in model.wv.index2word
                else np.zeros(vector_size, dtype=float)
                for tok in padded]
    return np.array(rows)
# Stopwords filtered out of the tokenized shelter descriptions.
# BUG FIX: the original list contained `'신고자' '혼종'` with a missing comma,
# which Python silently concatenates into the single token '신고자혼종' —
# so neither '신고자' nor '혼종' was actually filtered.
stopwords = [
    '의', '가', '며', '들', '는', '됨', '좀', '걍', '과', '를', '을', '으로',
    '에', '와', '한', '하다', '개체', '으나', '관리', '번호', '면', '함', '쪽',
    '줄', '신고자', '혼종', '고양이', '묘', '발견', '추정', '생후', '개월',
    '남음', '믹스', '구조', '음', '고'
]

# #### 3) Tokenization — morphological analyzer 1: Mecab
# +
tokenizer = Mecab()
tokenized = []
for sentence in text_df['specialmark']:
    temp = tokenizer.morphs(sentence)  # tokenize into morphemes
    temp = [word for word in temp if not word in stopwords]  # drop stopwords
    tokenized.append(temp)

print(tokenized[:30])
# -

new = pd.DataFrame(tokenized)
new

# #### 3) Tokenization — morphological analyzer 2: Okt
sample_data = text_df.copy()

tokenizer2 = Okt()
def __init__(self, inputPath, index=3, words=2, standard=0.3):
    """Extract compound words (multi-noun terms) from a CSV/TXT file or raw
    article text, scoring candidates with TextRank * PMI.

    Originally targeted Python 3.6; requires kss, eunjeon, pandas (migration
    from eunjeon to konlpy was planned). On Linux, mecab-ko-dic installs to
    /usr/local/lib/mecab/dic/mecab-ko-dic.

    :param inputPath: path of a CSV/TXT file; a string longer than 50 chars is
        treated as the article text itself (heuristic below)
    :param index: row index of the text to read from the CSV table
    :param words: number of words forming a compound (default 2)
    :param standard: minimum TR+PMI score for acceptance (tentative: 0.3)
    """
    # Heuristic: a long "path" is assumed to be raw article text.
    if len(inputPath) > 50:
        self.data = inputPath
    else:
        self.data = self.extractText(inputPath)
    # Read the value at row *index* and clean it.
    txt = self.clean_str(self.readValue(self.data, index))
    # target = cleaned full document text
    # NOTE(review): clean_str is applied twice (to txt and again here) —
    # presumably idempotent; confirm against its definition.
    self.target = self.clean_str(txt)
    m = Mecab()
    # m = Mecab(dicpath='C:/mecab/mecab-ko-dic')  # (unusable, disabled)
    # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic')  # (unusable, disabled)
    # wTotal = number of nouns in the whole document
    # fList  = [["s1noun1", "s1noun2", ...], ["s2noun1", ...], ...]
    # mList  = flattened fList with duplicates removed
    # lList  = [["s1morph1", "s1morph2", ...], ["s2morph1", ...], ...]
    self.wTotal = len(m.nouns(self.target))
    self.fList = self.nounExt(kss.split_sentences(self.target))
    self.mList = list(
        dict.fromkeys([item for sublist in self.fList for item in sublist]))
    self.lList = []
    for i in kss.split_sentences(self.target):
        l = []
        for j in m.morphs(i):
            l.append(j)
        self.lList.append(l)
    # N-gram window size in morphemes (character-count N-grams currently
    # unsupported).
    self.ngram = 8
    # Number of words per compound.
    self.nOfWords = words
    # Damping factor for TextRank.
    self.df = 0.85
    # TextRank iteration count (tentative: 16).
    self.defIteration = 16
    # allCW = [["word1", "word2", ...], ["wordA", "wordB", ...], ...]
    # every noun sequence that could form a compound word
    self.allCW = []
    for i in range(len(self.fList)):
        n = self.genCW(self.fList[i])
        for j in n:
            # Document-search alternative (disabled):
            # if self.complexSearch(j, self.target) > 1 and j not in self.allCW:
            # searchSpaceless tries all spacing variants (usable, disabled note
            # kept from original); literal search is the active path.
            if self.searchSpaceless(
                    j, self.target
            ) > 1 and j not in self.allCW:  # literal search (active)
                self.allCW.append(j)
    # Detect and add partially-overlapping compounds (usable, disabled):
    # self.allCW += self.detectRedundant(self.allCW)
    # trdic = {"word1": TR1, "word2": TR2, ...} (legacy method) (active)
    self.trdic = self.calculateTROld(self.mList, self.fList,
                                     self.defIteration)
    # trdic via N-gram method (usable, disabled):
    # self.trdic = self.calculateTR(self.mList, self.lList, self.ngram, self.defIteration)
    # pmiList = [PMI1, PMI2, ...] PMI score of each candidate in allCW
    pmiList = []
    for i in self.allCW:
        pmiList.append(self.getPMI(i, self.wTotal, self.target))
    # trpmiList = geometric mean of the member words' TR scores, multiplied
    # by the compound's PMI.
    trpmiList = []
    for i in range(len(self.allCW)):
        k = self.allCW[i]
        key = 1
        for j in k:
            key *= self.trdic[j]
        key **= (1 / len(k))
        key *= pmiList[i]
        trpmiList.append(key)
    # gluedCW = ["compound1", "compound2", ...] each candidate's parts joined
    gluedCW = []
    for i in self.allCW:
        gluedCW.append(''.join(i))
    # compDict = {"compound1": score1, "compound2": score2, ...}
    # No duplicated compounds:
    if len(self.detectDuplicates(gluedCW)) == 0:
        self.compDict = dict(zip(gluedCW, trpmiList))
    # Duplicates present — merge/resolve them:
    else:
        self.compDict = self.eliminateDuplicates(gluedCW, trpmiList)
    # out = compounds whose score exceeds *standard*
    self.out = []
    for i in self.compDict.items():
        if i[1] > standard:
            self.out.append(i[0])
from eunjeon import Mecab
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf

# Demo corpus: three Korean sentences about tokenization.
docs = [
    '먼저 텍스트의 각 단어를 나누어 토큰화합니다.',
    '텍스트의 단어로 토큰화해야 딥러닝에서 인식됩니다.',
    '토큰화한 결과는 딥러닝에서 사용할 수 있습니다.'
]

m = Mecab()
# Morpheme-split a sentence written entirely without spaces, then rebuild it
# as a spaced string plus a few dummy entries for the Keras tokenizer demo.
x = m.morphs('제가이렇게띄어쓰기를전혀하지않고글을썼다고하더라도글을이해할수있습니다.')
x = [' '.join(x), 'xxx', 'zzz', 'gasdgasdg']
x = np.array(x)
print(x)

print('[MeCab 형태소 분석기]')
print(m.morphs('제가이렇게띄어쓰기를전혀하지않고글을썼다고하더라도글을이해할수있습니다.'))
print('[Keras 문장 단어 분석기]')
print(text_to_word_sequence('제가이렇게띄어쓰기를전혀하지않고글을썼다고하더라도글을이해할수있습니다.'))

token = Tokenizer()
token.fit_on_texts(x)
print("[몇개의 단어]")
print(token.word_counts)  # occurrence count of each word
print("[몇개의 문장]")
print(token.document_count)  # number of documents fitted
print("[각 단어들이 몇 개의 문장에 나오는가]")
print(token.word_docs)  # in how many documents each word appears
print("[각 단어에 매겨진 인덱스 값]")
def __init__(self, inputText, inputCorpus=None):
    """Extract neologisms (unregistered words) from a document.

    Originally Python 3.6; requires eunjeon, pandas (migration from eunjeon
    to konlpy was planned). On Linux, mecab-ko-dic installs to
    /usr/local/lib/mecab/dic/mecab-ko-dic.

    Known limitation (from the original author): a real corpus input is not
    yet supported — the document itself doubles as the corpus.

    :param inputText: full document text
    :param inputCorpus: reference corpus (format undecided; unused in
        practice — falls back to the document itself)
    """
    # doc = cleaned full document string
    self.doc = self.clean_str(inputText)
    # corpus: prefix a space so leading-word searches of ' ' + word work.
    if inputCorpus == None:
        self.corpus = ' ' + self.doc
    else:
        self.corpus = self.clean_str(' ' + inputCorpus)
    # wTotal = total number of eojeols (space-separated words) in the corpus
    self.wTotal = self.corpus.count(' ')
    l = self.doc.split(' ')
    # Keep eojeols that are >= 3 chars after collapsing digit runs to '-'.
    self.eoList = [i for i in l if len(re.sub(r'[0-9]+', '-', i)) >= 3]
    # Collect the inner/partial text of eojeols containing parentheses.
    missed = []
    for i in self.eoList:
        if i.count("(") > 0 and i.count(")") > 0:
            missed.append(i[i.find("(") + 1:i.find(")")])
            continue
        if i.count("(") > 0:
            missed.append(i.split("(", 1)[1])
        if i.count(")") > 0:
            missed.append(i[:-1])
    parenthesisless = [
        x for x in self.eoList if not '(' in x and not ')' in x
    ] + [x for x in self.eoList if '(' in x and ')' in x]
    parenthesisless += missed
    # eoList now excludes half-parenthesized eojeols and includes the text
    # found inside parentheses.
    self.eoList = parenthesisless
    ############################################################################################################################################
    # (section originally marked as missing)
    # Build every possible [LP, UM, RP] split of each eojeol
    # (LP = left part, UM = unknown middle, RP = right part).
    self.posUMpairList = []
    for i in range(len(self.eoList)):
        for j in self.splitEojeol(self.eoList[i]):
            # Register only when RP consists of known words ("KRP").
            # Alternative filter (disabled):
            # if self.isAfterNoun(j[2]) and len(j[1]) > 1:
            if self.isKnown(j[2]):
                self.posUMpairList.append(j)
    # partialEoList: list of all eojeol substrings (usable, disabled)
    # self.partialEoList = []
    # for i in self.eoList:
    #     for j in self.eojeolPart(i):
    #         self.partialEoList.append(j)
    ############################################################################################################################################
    # lplist: for every distinct eojeol of length >= 2, its LP prefixes:
    # [["eo1LP1", "eo1LP2", ...], ["eo2LP1", ...], ...]
    self.lplist = []
    iter = self.eoList[:]
    iter = list(dict.fromkeys(iter))
    for i in iter:
        if len(i) > 1:
            self.lplist.append(self.genLP(i))
    # extnouns: strings presumed to be nouns
    self.extnouns = []
    for i in self.lplist:
        scores = []
        finalscore = 0
        chosen = ''
        for j in range(len(i)):
            # Currently just counts ' ' + prefix occurrences in the corpus;
            # should really search only left parts of eojeols.
            # Known issue 1: the corpus is not cleaned.
            # Known issue 2: already-found nouns should be excluded first.
            scores.append(self.corpus.count(' ' + i[j]) / self.wTotal)
        for j in range(len(scores)):
            if j >= len(scores) - 1:
                chosen = i[j]
                finalscore = scores[j]
                break
            # If the frequency drops sharply from one prefix to the next
            # (e.g. '마스터투자운' -> '마스터투자운용'), pick the prefix just
            # before the drop as the noun.
            if scores[j] > scores[j + 1] * 1.1:
                chosen = i[j]
                finalscore = scores[j]
                break
            finalscore = scores[j]
        # Accept when the rate is at least 2 occurrences per corpus size.
        if finalscore >= 2 / self.wTotal:
            self.extnouns.append(chosen)
    self.extnouns = list(dict.fromkeys(self.extnouns))
    ############################################################################################################################################
    # (section originally marked as missing)
    # Mecab here is used to classify individual characters.
    m = Mecab()
    # m = Mecab(dicpath='C:/mecab/mecab-ko-dic')  # (unusable, disabled)
    # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic')  # (unusable, disabled)
    # Drop splits that cut through non-Hangul runs,
    # e.g. ['신한BN', 'P파리', ...], ['', '320', '0억원에'] etc.
    # Copy first: we must not mutate the list we iterate.
    temp = self.posUMpairList[:]
    for i in self.posUMpairList:
        # Remove when LP is non-empty and both LP's last char and UM's first
        # char are non-Hangul (MeCab tag starting with 'S').
        if len(i[0]) > 0 and m.pos(i[0][-1])[0][1][0] == 'S' and m.pos(
                i[1][0])[0][1][0] == 'S':
            temp.remove(i)
        # Remove when RP is non-empty and both UM's last char and RP's first
        # char are non-Hangul.
        elif len(i[2]) > 0 and m.pos(i[1][-1])[0][1][0] == 'S' and m.pos(
                i[2][0])[0][1][0] == 'S':
            temp.remove(i)
        # Remove when UM contains an unbalanced parenthesis.
        elif '(' in i[1] and ')' not in i[1]:
            temp.remove(i)
        elif ')' in i[1] and '(' not in i[1]:
            temp.remove(i)
    # Result: list of LP+UM+KRP triples.
    self.posUMpairList = temp
    # candidates: final neologism candidates
    self.candidates = []
    for i in self.posUMpairList:
        # Empty KRP: register LP+UM if UM occurs >= 2 times in the corpus.
        if i[2] == '' and self.corpus.count(i[1]) >= 2:
            self.candidates.append(i[0] + i[1])
        # Non-empty KRP: register LP+UM if UM + first morpheme of KRP occurs
        # >= 2 times in the corpus.
        elif i[2] != '' and self.corpus.count(i[1] +
                                              m.morphs(i[2])[0]) >= 2:
            self.candidates.append(i[0] + i[1])
    # Resolve candidates that contain each other by comparing frequencies.
    # NOTE(review): self.candidates[-2] below raises IndexError when fewer
    # than two candidates exist; also i-1 wraps to -1 on the first loop turn.
    temp = []
    for i in range(len(self.candidates) - 1):
        if self.candidates[i] in self.candidates[i + 1]:
            if self.wordFreq(
                    self.candidates[i], self.corpus) > self.wordFreq(
                        self.candidates[i + 1], self.corpus) * 1.1:
                temp.append(self.candidates[i])
        elif self.candidates[i - 1] in self.candidates[i]:
            if self.wordFreq(self.candidates[i - 1],
                             self.corpus) * 0.9 < self.wordFreq(
                                 self.candidates[i], self.corpus):
                temp.append(self.candidates[i])
        else:
            temp.append(self.candidates[i])
    if self.wordFreq(self.candidates[-2],
                     self.corpus) * 0.9 < self.wordFreq(
                         self.candidates[-1], self.corpus):
        temp.append(self.candidates[-1])
    self.candidates = temp
    self.candidates = list(dict.fromkeys(self.candidates))
    # Mecab here checks whether a candidate is already a registered noun;
    # keep only candidates that are NOT a single known noun.
    temp = []
    for i in self.candidates:
        if len(m.pos(i)) > 1 or m.pos(i)[0][1][0] != 'N':
            temp.append(i)
    self.candidates = temp
from eunjeon import Mecab

tagger = Mecab()
sentence = '아무문장이다.'
print(tagger.morphs(sentence))  # morpheme segmentation
print(tagger.nouns(sentence))   # noun extraction

# Anaconda environment, Python 3.6.
# Plan: rank magnitude-comparable numbers (e.g. "~km") in a 2-D array;
# non-comparable values are marked 0 and ranking starts from 1.
# mecab-ko-dic path: '/usr/local/lib/mecab/dic/mecab-ko-dic'
# Read documentCount articles from three category folders, accumulate their
# raw text, tokenize with MeCab, and build the corpus structures used below.
# NOTE(review): relies on names defined elsewhere in this file:
# documentCount, contentsText, m (a Mecab instance), util, articleMemory.
for i in range(documentCount):
    # Folder layout: files 0-299 Finance, 300-599 Social, 600+ Science.
    if i < 300:
        f = open("Finance/Finance%05d.txt" % i, 'r', -1, "utf-8")
    elif i < 600:
        f = open("Social/Social%05d.txt" % (i - 300), 'r', -1, "utf-8")
    else:
        f = open("Science/Science%05d.txt" % (i - 600), 'r', -1, "utf-8")
    data = f.read()
    f.close()
    # Skip the first 139 characters — presumably a fixed-size file header;
    # TODO confirm against the data files.
    data = data[139:]
    contentsText += data
    # Remove special characters & symbols.
    data = re.sub(
        "[-=+,#/\?:%$.@*\"※~&%!\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
        "", data)
    data = m.morphs(data)
    articleMemory.append(data)

# Remove special characters from the accumulated corpus text.
token = re.sub(
    "[-=+,#/\?:%$.@*\"※~&%!\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
    "", contentsText)  # strip special symbols
# Morpheme segmentation.
temp = m.morphs(token)
# Build the bag-of-words representation.
sequencesText = util.BagOfWords(temp)
# Apply stopword filtering.
sequencesText = util.stopWord(sequencesText[0],
                              sequencesText[1],
                              deleteRate=0.02)
# TF-IDF computation follows.
# One-hot vector demo.
from eunjeon import Mecab

ona_data = [['안녕', '만나서 반가워'], ['넌 누구니', ' 나는 AI봇이란다.'],
            ['피자 주문 할게', '음료도 주문해줘'], ['음료는 뭘로', '콜라로 해줘']]
mecab = Mecab()
# .morphs() splits each joined Q/A pair into morphemes.
train_Data = list(map(lambda x: mecab.morphs(' '.join(x)), ona_data))

import itertools

# Flatten the per-pair token lists into one token stream.
train_data = list(itertools.chain.from_iterable(train_Data))
print(list(train_data))

import numpy as np

# BUG FIX: dtype=np.float was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin float is the documented replacement (same float64 dtype).
bucket = np.zeros(len(train_data), dtype=float)
for word in train_data:
    # One-hot vector for this token: 1 at its first index in the stream.
    bucket_temp = bucket.copy()
    np.put(bucket_temp, train_data.index(word), 1)
    #print(bucket_temp)

# Word to Vector (gensim) demo.
from gensim.models import word2vec

train_data = [train_data]
print(train_data)
model = word2vec.Word2Vec(size=50, window=2, min_count=1, iter=100)
model.build_vocab(train_data)
class Rouge:
    """ROUGE metric computation (ROUGE-N / ROUGE-L / ROUGE-W) with optional
    MeCab morpheme tokenization for Korean text.

    (Continues past this chunk: _compute_ngrams_lcs etc. are defined below.)
    """
    DEFAULT_METRICS = {"rouge-n"}
    DEFAULT_N = 1
    STATS = ["f", "p", "r"]  # f-score, precision, recall
    AVAILABLE_METRICS = {"rouge-n", "rouge-l", "rouge-w"}
    AVAILABLE_LENGTH_LIMIT_TYPES = {"words", "bytes"}
    # Characters kept when cleaning: ASCII alphanumerics and Hangul syllables.
    REMOVE_CHAR_PATTERN = re.compile("[^A-Za-z0-9가-힣]")

    def __init__(
        self,
        metrics=None,
        max_n=None,
        limit_length=True,
        length_limit=1000,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        use_tokenizer=True,
        alpha=0.5,
        weight_factor=1.0,
    ):
        """Configure which ROUGE variants to compute.

        :param metrics: subset of AVAILABLE_METRICS (defaults to {"rouge-n"})
        :param max_n: with "rouge-n", expands to rouge-1..rouge-max_n
        :param limit_length: truncate inputs to length_limit units
        :param length_limit_type: "words" or "bytes"
        :param apply_avg / apply_best: aggregation mode flags
        :param use_tokenizer: tokenize with MeCab morphemes when True
        :param alpha: precision/recall weight in the F-score
        :param weight_factor: ROUGE-W weighting exponent (> 0)
        :raises ValueError: on unknown metric, unknown length_limit_type, or
            non-positive weight_factor
        """
        self.metrics = metrics[:] if metrics is not None else Rouge.DEFAULT_METRICS
        for m in self.metrics:
            if m not in Rouge.AVAILABLE_METRICS:
                raise ValueError("Unknown metric '{}'".format(m))
        self.max_n = max_n if "rouge-n" in self.metrics else None
        # Add all rouge-n metrics (replace "rouge-n" with rouge-1..rouge-max_n).
        if self.max_n is not None:
            index_rouge_n = self.metrics.index("rouge-n")
            del self.metrics[index_rouge_n]
            self.metrics += [
                "rouge-{}".format(n) for n in range(1, self.max_n + 1)
            ]
        self.metrics = set(self.metrics)
        self.limit_length = limit_length
        if self.limit_length:
            if length_limit_type not in Rouge.AVAILABLE_LENGTH_LIMIT_TYPES:
                raise ValueError(
                    "Unknown length_limit_type '{}'".format(length_limit_type))
            self.length_limit = length_limit
            # A limit of 0 means "no limit".
            if self.length_limit == 0:
                self.limit_length = False
            self.length_limit_type = length_limit_type
        self.use_tokenizer = use_tokenizer
        if use_tokenizer:
            self.tokenizer = Mecab()
        self.apply_avg = apply_avg
        self.apply_best = apply_best
        self.alpha = alpha
        self.weight_factor = weight_factor
        if self.weight_factor <= 0:
            raise ValueError("ROUGE-W weight factor must greater than 0.")

    def tokenize_text(self, text):
        # Morpheme tokenization when enabled; otherwise pass through unchanged.
        if self.use_tokenizer:
            return self.tokenizer.morphs(text)
        else:
            return text

    @staticmethod
    def split_into_sentences(text):
        # Sentences are assumed to be newline-separated.
        return text.split("\n")

    @staticmethod
    def _get_ngrams(n, text):
        # Return {ngram_tuple: count} over the token list *text*.
        ngram_set = collections.defaultdict(int)
        max_index_ngram_start = len(text) - n
        for i in range(max_index_ngram_start + 1):
            ngram_set[tuple(text[i:i + n])] += 1
        return ngram_set

    @staticmethod
    def _split_into_words(sentences):
        # Flatten whitespace-split tokens of every sentence into one list.
        return list(itertools.chain(*[_.split() for _ in sentences]))

    @staticmethod
    def _get_word_ngrams_and_length(n, sentences):
        # Return (ngram counts, tokens, number of n-gram start positions).
        assert len(sentences) > 0
        assert n > 0
        tokens = Rouge._split_into_words(sentences)
        return Rouge._get_ngrams(n, tokens), tokens, len(tokens) - (n - 1)

    @staticmethod
    def _get_unigrams(sentences):
        # Return ({token: count}, total token count).
        assert len(sentences) > 0
        tokens = Rouge._split_into_words(sentences)
        unigram_set = collections.defaultdict(int)
        for token in tokens:
            unigram_set[token] += 1
        return unigram_set, len(tokens)

    @staticmethod
    def _compute_p_r_f_score(
        evaluated_count,
        reference_count,
        overlapping_count,
        alpha=0.5,
        weight_factor=1.0,
    ):
        """Return {"f", "p", "r"} from overlap counts; weight_factor != 1.0
        applies the ROUGE-W inverse-power correction to p and r."""
        precision = 0.0 if evaluated_count == 0 else overlapping_count / float(
            evaluated_count)
        if weight_factor != 1.0:
            precision = precision**(1.0 / weight_factor)
        recall = 0.0 if reference_count == 0 else overlapping_count / float(
            reference_count)
        if weight_factor != 1.0:
            recall = recall**(1.0 / weight_factor)
        f1_score = Rouge._compute_f_score(precision, recall, alpha)
        return {"f": f1_score, "p": precision, "r": recall}

    @staticmethod
    def _compute_f_score(precision, recall, alpha=0.5):
        # Weighted harmonic mean; 0.0 when either component is 0.
        return (0.0 if
                (recall == 0.0 or precision == 0.0) else precision * recall /
                ((1 - alpha) * precision + alpha * recall))

    @staticmethod
    def _compute_ngrams(evaluated_sentences, reference_sentences, n):
        """Return (evaluated n-gram count, reference n-gram count,
        clipped overlap count) for ROUGE-N."""
        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
            raise ValueError("Collections must contain at least 1 sentence.")
        evaluated_ngrams, _, evaluated_count = Rouge._get_word_ngrams_and_length(
            n, evaluated_sentences)
        reference_ngrams, _, reference_count = Rouge._get_word_ngrams_and_length(
            n, reference_sentences)
        # Gets the overlapping ngrams between evaluated and reference,
        # clipping each n-gram's contribution at the smaller count.
        overlapping_ngrams = set(evaluated_ngrams.keys()).intersection(
            set(reference_ngrams.keys()))
        overlapping_count = 0
        for ngram in overlapping_ngrams:
            overlapping_count += min(evaluated_ngrams[ngram],
                                     reference_ngrams[ngram])
        return evaluated_count, reference_count, overlapping_count
@staticmethod def _compute_ngrams_lcs(evaluated_sentences, reference_sentences, weight_factor=1.0): def _lcs(x, y): m = len(x) n = len(y) vals = collections.defaultdict(int) dirs = collections.defaultdict(int) for i in range(1, m + 1): for j in range(1, n + 1): if x[i - 1] == y[j - 1]: vals[i, j] = vals[i - 1, j - 1] + 1 dirs[i, j] = "|" elif vals[i - 1, j] >= vals[i, j - 1]: vals[i, j] = vals[i - 1, j] dirs[i, j] = "^" else: vals[i, j] = vals[i, j - 1] dirs[i, j] = "<" return vals, dirs def _wlcs(x, y, weight_factor): m = len(x) n = len(y) vals = collections.defaultdict(float) dirs = collections.defaultdict(int) lengths = collections.defaultdict(int) for i in range(1, m + 1): for j in range(1, n + 1): if x[i - 1] == y[j - 1]: length_tmp = lengths[i - 1, j - 1] vals[i, j] = (vals[i - 1, j - 1] + (length_tmp + 1)**weight_factor - length_tmp**weight_factor) dirs[i, j] = "|" lengths[i, j] = length_tmp + 1 elif vals[i - 1, j] >= vals[i, j - 1]: vals[i, j] = vals[i - 1, j] dirs[i, j] = "^" lengths[i, j] = 0 else: vals[i, j] = vals[i, j - 1] dirs[i, j] = "<" lengths[i, j] = 0 return vals, dirs def _mark_lcs(mask, dirs, m, n): while m != 0 and n != 0: if dirs[m, n] == "|": m -= 1 n -= 1 mask[m] = 1 elif dirs[m, n] == "^": m -= 1 elif dirs[m, n] == "<": n -= 1 else: raise UnboundLocalError("Illegal move") return mask if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: raise ValueError("Collections must contain at least 1 sentence.") evaluated_unigrams_dict, evaluated_count = Rouge._get_unigrams( evaluated_sentences) reference_unigrams_dict, reference_count = Rouge._get_unigrams( reference_sentences) # Has to use weight factor for WLCS use_WLCS = weight_factor != 1.0 if use_WLCS: evaluated_count = evaluated_count**weight_factor reference_count = 0 overlapping_count = 0.0 for reference_sentence in reference_sentences: reference_sentence_tokens = reference_sentence.split() if use_WLCS: reference_count += len( reference_sentence_tokens)**weight_factor hit_mask = 
[0 for _ in range(len(reference_sentence_tokens))] for evaluated_sentence in evaluated_sentences: evaluated_sentence_tokens = evaluated_sentence.split() if use_WLCS: _, lcs_dirs = _wlcs( reference_sentence_tokens, evaluated_sentence_tokens, weight_factor, ) else: _, lcs_dirs = _lcs(reference_sentence_tokens, evaluated_sentence_tokens) _mark_lcs( hit_mask, lcs_dirs, len(reference_sentence_tokens), len(evaluated_sentence_tokens), ) overlapping_count_length = 0 for ref_token_id, val in enumerate(hit_mask): if val == 1: token = reference_sentence_tokens[ref_token_id] if evaluated_unigrams_dict[ token] > 0 and reference_unigrams_dict[token] > 0: evaluated_unigrams_dict[token] -= 1 reference_unigrams_dict[ref_token_id] -= 1 if use_WLCS: overlapping_count_length += 1 if (ref_token_id + 1 < len(hit_mask) and hit_mask[ref_token_id + 1] == 0) or ref_token_id + 1 == len(hit_mask): overlapping_count += overlapping_count_length**weight_factor overlapping_count_length = 0 else: overlapping_count += 1 if use_WLCS: reference_count = reference_count**weight_factor return evaluated_count, reference_count, overlapping_count def get_scores(self, hypothesis, references): if isinstance(hypothesis, str): hypothesis, references = [hypothesis], [references] if type(hypothesis) != type(references): raise ValueError("'hyps' and 'refs' are not of the same type") if len(hypothesis) != len(references): raise ValueError("'hyps' and 'refs' do not have the same length") scores = {} has_rouge_n_metric = (len([ metric for metric in self.metrics if metric.split("-")[-1].isdigit() ]) > 0) if has_rouge_n_metric: scores.update(self._get_scores_rouge_n(hypothesis, references)) # scores = {**scores, **self._get_scores_rouge_n(hypothesis, references)} has_rouge_l_metric = (len([ metric for metric in self.metrics if metric.split("-")[-1].lower() == "l" ]) > 0) if has_rouge_l_metric: scores.update( self._get_scores_rouge_l_or_w(hypothesis, references, False)) # scores = {**scores, 
**self._get_scores_rouge_l_or_w(hypothesis, references, False)} has_rouge_w_metric = (len([ metric for metric in self.metrics if metric.split("-")[-1].lower() == "w" ]) > 0) if has_rouge_w_metric: scores.update( self._get_scores_rouge_l_or_w(hypothesis, references, True)) # scores = {**scores, **self._get_scores_rouge_l_or_w(hypothesis, references, True)} return scores def _get_scores_rouge_n(self, all_hypothesis, all_references): metrics = [ metric for metric in self.metrics if metric.split("-")[-1].isdigit() ] if self.apply_avg or self.apply_best: scores = { metric: {stat: 0.0 for stat in Rouge.STATS} for metric in metrics } else: scores = { metric: [{stat: [] for stat in Rouge.STATS} for _ in range(len(all_hypothesis))] for metric in metrics } for sample_id, (hypothesis, references) in enumerate( zip(all_hypothesis, all_references)): assert isinstance(hypothesis, str) has_multiple_references = False if isinstance(references, list): has_multiple_references = len(references) > 1 if not has_multiple_references: references = references[0] # Prepare hypothesis and reference(s) hypothesis = self._preprocess_summary_as_a_whole(hypothesis) references = ([ self._preprocess_summary_as_a_whole(reference) for reference in references ] if has_multiple_references else [self._preprocess_summary_as_a_whole(references)]) # Compute scores for metric in metrics: suffix = metric.split("-")[-1] n = int(suffix) # Aggregate if self.apply_avg: # average model total_hypothesis_ngrams_count = 0 total_reference_ngrams_count = 0 total_ngrams_overlapping_count = 0 for reference in references: ( hypothesis_count, reference_count, overlapping_ngrams, ) = Rouge._compute_ngrams(hypothesis, reference, n) total_hypothesis_ngrams_count += hypothesis_count total_reference_ngrams_count += reference_count total_ngrams_overlapping_count += overlapping_ngrams score = Rouge._compute_p_r_f_score( total_hypothesis_ngrams_count, total_reference_ngrams_count, total_ngrams_overlapping_count, self.alpha, ) 
for stat in Rouge.STATS: scores[metric][stat] += score[stat] else: # Best model if self.apply_best: best_current_score = None for reference in references: ( hypothesis_count, reference_count, overlapping_ngrams, ) = Rouge._compute_ngrams(hypothesis, reference, n) score = Rouge._compute_p_r_f_score( hypothesis_count, reference_count, overlapping_ngrams, self.alpha, ) if best_current_score is None or score[ "r"] > best_current_score["r"]: best_current_score = score for stat in Rouge.STATS: scores[metric][stat] += best_current_score[stat] # Keep all else: for reference in references: ( hypothesis_count, reference_count, overlapping_ngrams, ) = Rouge._compute_ngrams(hypothesis, reference, n) score = Rouge._compute_p_r_f_score( hypothesis_count, reference_count, overlapping_ngrams, self.alpha, ) for stat in Rouge.STATS: scores[metric][sample_id][stat].append( score[stat]) # Compute final score with the average or the the max if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1: for metric in metrics: for stat in Rouge.STATS: scores[metric][stat] /= len(all_hypothesis) return scores def _get_scores_rouge_l_or_w(self, all_hypothesis, all_references, use_w=False): metric = "rouge-w" if use_w else "rouge-l" if self.apply_avg or self.apply_best: scores = {metric: {stat: 0.0 for stat in Rouge.STATS}} else: scores = { metric: [{stat: [] for stat in Rouge.STATS} for _ in range(len(all_hypothesis))] } for sample_id, (hypothesis_sentences, references_sentences) in enumerate( zip(all_hypothesis, all_references)): assert isinstance(hypothesis_sentences, str) has_multiple_references = False if isinstance(references_sentences, list): has_multiple_references = len(references_sentences) > 1 if not has_multiple_references: references_sentences = references_sentences[0] # Prepare hypothesis and reference(s) hypothesis_sentences = self._preprocess_summary_per_sentence( hypothesis_sentences) references_sentences = ([ self._preprocess_summary_per_sentence(reference) for 
reference in references_sentences ] if has_multiple_references else [ self._preprocess_summary_per_sentence(references_sentences) ]) # Compute scores # Aggregate if self.apply_avg: # average model total_hypothesis_ngrams_count = 0 total_reference_ngrams_count = 0 total_ngrams_overlapping_count = 0 for reference_sentences in references_sentences: ( hypothesis_count, reference_count, overlapping_ngrams, ) = Rouge._compute_ngrams_lcs( hypothesis_sentences, reference_sentences, self.weight_factor if use_w else 1.0, ) total_hypothesis_ngrams_count += hypothesis_count total_reference_ngrams_count += reference_count total_ngrams_overlapping_count += overlapping_ngrams score = Rouge._compute_p_r_f_score( total_hypothesis_ngrams_count, total_reference_ngrams_count, total_ngrams_overlapping_count, self.alpha, self.weight_factor if use_w else 1.0, ) for stat in Rouge.STATS: scores[metric][stat] += score[stat] else: # Best model if self.apply_best: best_current_score = None best_current_score_wlcs = None for reference_sentences in references_sentences: ( hypothesis_count, reference_count, overlapping_ngrams, ) = Rouge._compute_ngrams_lcs( hypothesis_sentences, reference_sentences, self.weight_factor if use_w else 1.0, ) score = Rouge._compute_p_r_f_score( total_hypothesis_ngrams_count, total_reference_ngrams_count, total_ngrams_overlapping_count, self.alpha, self.weight_factor if use_w else 1.0, ) if use_w: reference_count_for_score = reference_count**( 1.0 / self.weight_factor) overlapping_ngrams_for_score = overlapping_ngrams score_wlcs = (overlapping_ngrams_for_score / reference_count_for_score)**( 1.0 / self.weight_factor) if (best_current_score_wlcs is None or score_wlcs > best_current_score_wlcs): best_current_score = score best_current_score_wlcs = score_wlcs else: if best_current_score is None or score[ "r"] > best_current_score["r"]: best_current_score = score for stat in Rouge.STATS: scores[metric][stat] += best_current_score[stat] # Keep all else: for 
reference_sentences in references_sentences: ( hypothesis_count, reference_count, overlapping_ngrams, ) = Rouge._compute_ngrams_lcs( hypothesis_sentences, reference_sentences, self.weight_factor if use_w else 1.0, ) score = Rouge._compute_p_r_f_score( hypothesis_count, reference_count, overlapping_ngrams, self.alpha, self.weight_factor, ) for stat in Rouge.STATS: scores[metric][sample_id][stat].append(score[stat]) # Compute final score with the average or the the max if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1: for stat in Rouge.STATS: scores[metric][stat] /= len(all_hypothesis) return scores def _preprocess_summary_as_a_whole(self, summary): sentences = Rouge.split_into_sentences(summary) # Truncate if self.limit_length: # By words if self.length_limit_type == "words": summary = " ".join(sentences) all_tokens = summary.split() # Counting as in the perls script summary = " ".join(all_tokens[:self.length_limit]) # By bytes elif self.length_limit_type == "bytes": summary = "" current_len = 0 for sentence in sentences: sentence = sentence.strip() sentence_len = len(sentence) if current_len + sentence_len < self.length_limit: if current_len != 0: summary += " " summary += sentence current_len += sentence_len else: if current_len > 0: summary += " " summary += sentence[:self.length_limit - current_len] break else: summary = " ".join(sentences) summary = Rouge.REMOVE_CHAR_PATTERN.sub(" ", summary.lower()).strip() tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub( " ", summary)) preprocessed_summary = [" ".join(tokens)] return preprocessed_summary def _preprocess_summary_per_sentence(self, summary): sentences = Rouge.split_into_sentences(summary) # Truncate if self.limit_length: final_sentences = [] current_len = 0 # By words if self.length_limit_type == "words": for sentence in sentences: tokens = sentence.strip().split() tokens_len = len(tokens) if current_len + tokens_len < self.length_limit: sentence = " ".join(tokens) 
final_sentences.append(sentence) current_len += tokens_len else: sentence = " ".join(tokens[:self.length_limit - current_len]) final_sentences.append(sentence) break # By bytes elif self.length_limit_type == "bytes": for sentence in sentences: sentence = sentence.strip() sentence_len = len(sentence) if current_len + sentence_len < self.length_limit: final_sentences.append(sentence) current_len += sentence_len else: sentence = sentence[:self.length_limit - current_len] final_sentences.append(sentence) break sentences = final_sentences final_sentences = [] for sentence in sentences: sentence = Rouge.REMOVE_CHAR_PATTERN.sub(" ", sentence.lower()).strip() tokens = self.tokenize_text( Rouge.REMOVE_CHAR_PATTERN.sub(" ", sentence)) sentence = " ".join(tokens) final_sentences.append(sentence) return final_sentences
    return sample


# title_data = data[["label", "title"]]
# Balance the dataset: draw 13,000 rows per label, then drop the helper
# index level introduced by the groupby and shuffle the result.
gd = data.groupby('label').apply(sampling_func, n_sample=13000)
gd.index.names = ["temp_label", None]
gd = gd.reset_index(level=[0])
gd = gd.drop(["temp_label"], axis=1)
gd = shuffle(gd)

# Independent copies, one per tokenization strategy being compared.
mecab_processed_data = gd.copy()
etri_processed_data = gd.copy()
soynlp_processed_data = gd.copy()
spm_processed_data = gd.copy()

# Mecab variant: titles become space-joined morphemes.
mecab = Mecab()
mecab_processed_data["title"] = mecab_processed_data["title"].progress_apply(lambda x: " ".join(mecab.morphs(x)))


def concat_text_with_pos(setence):
    """Return the sentence as space-joined 'morpheme/POS-tag' pairs.

    NOTE(review): a fresh Mecab() is constructed on every call, which is
    wasteful for a progress_apply over the whole column.
    """
    tag = Mecab()
    pos = tag.pos(setence)
    temp = []
    for p in pos:
        # p is a (morpheme, tag) tuple from Mecab.pos().
        temp.append(p[0] + "/" + p[1])
    s = ' '.join(temp)
    return s


# ETRI-style variant: titles become 'morpheme/POS' sequences.
etri_processed_data["title"] = etri_processed_data["title"].progress_apply(concat_text_with_pos)

# soynlp variant: unsupervised word extraction (call continues past this chunk).
word_extractor = WordExtractor(
    min_frequency=100,
# Demo: compare the Kkma (konlpy) and Mecab (eunjeon) morphological analyzers.
from konlpy.tag import Kkma

kkma = Kkma()
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
print(kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히','MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
print(kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']

'''
한글 형태소 분석기 중에 가장 속도가 빠른 Mecab은 konlpy 엔진에 포함되어 있지 않다.
아래는 eunjeon 패키지를 이용하여 python에서 mecab을 활용하는 예시이다.
'''

from eunjeon import Mecab  # KoNLPy style mecab wrapper

tagger = Mecab()
print(tagger.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '한', '당신', ',', '연휴', '에', '는', '여행', '을', '가', '봐요']
print(tagger.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히', 'MAG'), ('코딩', 'NNG'), ('한', 'XSA+ETM'), ('당신', 'NP'), (',', 'SC'), ('연휴', 'NNG'), ('에', 'JKB'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가', 'VV'), ('봐요', 'EC+VX+EC')]
print(tagger.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']

'''
2) 정제(Normalization) - https://wikidocs.net/21693
1. 규칙에 기반한 표기가 다른 단어들의 통합
2. 대, 소문자 통합
3. 정규 표현식(Regular Expression)
'''

'''
3) 어간 추출(Stemming) and 표제어 추출(Lemmatization) - https://wikidocs.net/21707
1. 표제어 추출(Lemmatization)
2. 어간 추출(Stemming)
# ## 4.1 Part-of-speech extraction

# In[14]:

# POS-tag every preprocessed review text.
data_pos = [sentence_pos for sentence_pos in (mecab.pos(sentence) for sentence in movie_data['new_text'])]

data_pos[:3]

# ## 4.2 Morphological analysis

# In[15]:

# Flatten all texts into a single list of morphemes.
tokenized_data = [text for sentence in movie_data['new_text'] for text in mecab.morphs(sentence)]

tokenized_data[:10]

# * The 100 most frequent words are shown below.

# In[16]:

# Top 100 words by frequency
top_words = Counter(tokenized_data)
top_words.most_common(100)[:10]

# ## 4.3 Noun extraction
# Build a vocabulary index from crawled Korean text: tokenize with Mecab,
# strip non-alphabetic characters, de-duplicate, and number the tokens.
from eunjeon import Mecab
import json
from collections import OrderedDict
import re

token = Mecab()

with open("output.txt", 'r', encoding='utf-8') as f:  # output.txt -> from crawling.py
    document = f.read()

list_morphs = token.morphs(document)
for k in range(0, len(list_morphs)):
    list_morphs[k] = re.sub(
        '[^a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]', '',
        list_morphs[k])  # Delete everything except English and Korean.

# Using the characteristics of the set to remove duplication.
list_clear = set(list_morphs)
list_clear = list(list_clear)

data = OrderedDict()
index = 1  # To index vocabulary
for i in range(0, len(list_clear)):
    if list_clear[i].strip() == '':  # Remove 'null-element' that is created during conversion from set to list.
        continue
    else:
        # NOTE(review): the increment of `index` is outside this chunk —
        # verify it is bumped after each assignment in the original file.
        data[index] = list_clear[i].strip()
    # Pick the source file by index: articles 0-1919 are Finance,
    # 1920-3839 Social, the rest Science (enclosing loop is outside this chunk).
    if i < 1920:
        f = open("Finance/Finance%05d.txt" % i, 'r', -1, "utf-8")
    elif i < 3840:
        f = open("Social/Social%05d.txt" % (i - 1920), 'r', -1, "utf-8")
    else:
        f = open("Science/Science%05d.txt" % (i - 3840), 'r', -1, "utf-8")
    data = f.read()
    f.close()
    # Strip a fixed-length prefix — presumably boilerplate/header; TODO confirm.
    data = data[139:]
    # Remove punctuation and special symbols before tokenizing.
    data = re.sub(
        "[-=+,#/\?:%$.@*\"※~&%!\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]",
        "",
        data)
    # Strip a fixed-length suffix — presumably footer text; TODO confirm.
    data = data[:-117]
    # Tokenize with Mecab (m) and drop stopwords via the project helper.
    data = m.morphs(data)
    data = util.stopWord(data, stopwoardList)
    articleMemory.append(data)

# Load the answer-set (evaluation) articles.
for i in range(resultDocumentCount):
    if i < 480:
        f = open("Finance/Finance%05d.txt" % (i + 1920), 'r', -1, "utf-8")
    elif i < 960:
        f = open("Social/Social%05d.txt" % (i + 1920 - 480), 'r', -1, "utf-8")
    else:
        f = open("Science/Science%05d.txt" % (i + 1920 - 960), 'r', -1, "utf-8")
    data = f.read()
from eunjeon import Mecab # 단어와 2차원 X축의 값, Y축의 값을 입력받아 2차원 그래프를 그린다 def plot_2d_graph(vocabs, xs, ys): plt.figure(figsize=(8, 6)) plt.scatter(xs, ys, marker='o') for i, v in enumerate(vocabs): plt.annotate(v, xy=(xs[i], ys[i])) sentences = [['안녕', '만나서 반가워'], ['넌 누구니', ' 나는 AI봇이란다.'], ['피자 주문 할게', '음료도 주문해줘'], ['음료는 뭘로', '콜라로 해줘']] mecab = Mecab() sentences = list(map(lambda x: mecab.morphs(' '.join(x)), sentences)) # 문장을 이용하여 단어와 벡터를 생성한다. model = Word2Vec(sentences, size=50, window=2, min_count=1, iter=100) # 단어벡터를 구한다. word_vectors = model.wv vocabs = word_vectors.vocab.keys() word_vectors_list = [word_vectors[v] for v in vocabs] # 단어간 유사도를 확인하다 print(word_vectors.similarity(w1='피자', w2='음료')) from sklearn.decomposition import PCA pca = PCA(n_components=2)
# Preprocess the Korean Wikipedia dump: tokenize each line with Mecab and
# write the space-joined morphemes to a new file.
from eunjeon import Mecab

out_f = './data/prepro_ko_wiki.txt'
in_f = './data/ko_wiki.txt'

me = Mecab()

# encoding='utf-8' avoids cp949 decode errors on Windows.
# Context managers guarantee both files are closed even on error (the
# original left the output handle open if reading raised), and iterating
# the input file streams it line by line instead of loading it all with
# readlines().
with open(out_f, 'wt', encoding='utf-8') as output, \
        open(in_f, 'r', encoding='utf-8') as rf:
    for i, line in enumerate(rf, start=1):
        temp_arr = me.morphs(line)
        # The original encoded/decoded through UTF-8 here, which is a no-op.
        output.write(' '.join(temp_arr) + '\n')
        if i % 10000 == 0:
            print('Preprocessed ' + str(i) + ' articles')

print('Preprocessing complete!')
# konlpy : 설치하기 # http://konlpy.org/ko/v0.5.2/install/#id2 from eunjeon import Mecab # Mecab 함수를 tagger라는 이름으로 사용 tagger = Mecab() # 문장에서 명사만 분류 tagger.nouns("고양이가 냐 하고 울면 나는 녜 하고 울어야지") # 빛 아래 유령 poem = """ 흘러내린 머리카락이 흐린 호박빛 아래 빛난다. 유영하며. 저건가보다. 세월의 힘을 이겨낸 마지막 하나 남은 가로등. 미래의 색, 역겨운 청록색으로 창백하게 바뀔 마지막 가로등 난 유영한다. 차분하게 과거에 살면서 현재의 공기를 마신다. 가로등이 깜빡인다. 나도 깜빡여준다. """ # 문장을 형태소 단위로 끊어줌 tagger.morphs(poem) # 문장을 형태소단위로 끊고, 형태소 마다 품사를 분석 # 이때, ('지우개', 'NNG')등의 형식을 분류되는데, NNG는 일반명사를 뜻 # 자세한 품사태그는 링크를 참고 : https://m.blog.naver.com/PostView.nhn?blogId=aul-_-&logNo=221557243190 print(tagger.pos(poem)) # print(tagger.nouns(poem))