def control(input_msg):
    tagger = Komoran()
    dataset = Dataset('nsmc/ratings.txt', tagger,
                      max_length=MAX_LENGTH, batch_size=BATCH_SIZE)

    Z_DIM = 40
    H_DIM = 300
    C_DIM = 2
    model = RNN_VAE(dataset.num_words, H_DIM, Z_DIM, C_DIM,
                    freeze_embeddings=False, gpu=USE_CUDA, gpu_id=GPU_ID)

    test_data = torch.LongTensor(
        dataset.sentence2idxs(tagger.morphs(input_msg))).unsqueeze(1)

    model.load_state_dict(torch.load('models/vae_epoch_300_400.bin'))
    results = model.controlSentence(test_data, t=0.5)
    return (dataset.idxs2sentence(results[0], no_pad=True),
            dataset.idxs2sentence(results[1], no_pad=True))
def main(base_path, pkl_lst):
    # 'df' avoids shadowing pandas.DataFrame
    df = preprocess(base_path, pkl_lst)

    print('Spacing the document...')
    df = multicore_cpu(df, spacing_doc, n_cores=args.cpu_core, spell=False)

    print('Spell checking...')
    checked_data = multicore_cpu(df, spell_check, n_cores=args.cpu_core, spell=True)
    checked_data.reset_index(drop=True, inplace=True)

    # tokenizing
    print('Tokenizing the document...')
    komoran = Komoran(userdic=args.token_dict)
    checked_data['tokenized_contents'] = checked_data['contents'].apply(
        lambda x: komoran.morphs(x))

    # filter documents shorter than the token-count threshold
    checked_data['doc_length'] = checked_data['tokenized_contents'].apply(len)
    final_data = checked_data.loc[checked_data['doc_length'] > args.token_cnt]
    final_data.reset_index(drop=True, inplace=True)

    # save the output data
    os.makedirs(args.save_path, exist_ok=True)
    with open(os.path.join(args.save_path, 'preprocessed_data.pickle'), 'wb') as f:
        pickle.dump(final_data, f)
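# A hedged sketch of the CLI arguments the function above assumes. The flag
# names mirror the attributes it reads (cpu_core, token_dict, token_cnt,
# save_path), but the defaults here are illustrative guesses, not from the
# original script.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cpu_core', type=int, default=4)
parser.add_argument('--token_dict', type=str, default='user_dic.txt')
parser.add_argument('--token_cnt', type=int, default=10)
parser.add_argument('--save_path', type=str, default='./output')
args = parser.parse_args()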
def main(corpora, output):
    filelist = os.listdir(corpora)
    tagger_stan = Komoran()
    tagger_jeju = Komoran(userdic='userdic.txt')  # TODO: handle the case where no userdic exists

    for file in filelist:
        book = openpyxl.load_workbook(os.path.join(corpora, file))
        sheet = book.get_sheet_by_name("Sheet")
        tagged = (bool(sheet.cell(row=1, column=3).value)
                  and bool(sheet.cell(row=1, column=4).value))
        if not tagged:
            for sample in sheet.rows:
                index = sample[0].row
                try:
                    stan = sample[0].value
                    pos_stan = ' '.join(tagger_stan.morphs(stan))
                    jeju = sample[1].value
                    pos_jeju = ' '.join(tagger_jeju.morphs(jeju))
                except Exception:  # skip rows the tagger cannot process
                    continue
                else:
                    sheet.cell(row=index, column=3).value = pos_stan
                    sheet.cell(row=index, column=4).value = pos_jeju
            book.save(os.path.join(corpora, file))

        filename = file[:file.find('.')]
        if not os.path.exists(output):
            os.makedirs(output)
        output_dir = os.path.join(output, filename + '.txt')
        with open(output_dir, 'w') as output_file:
            for sample in sheet.rows:
                try:
                    # s.value may not be a string, in which case join() raises TypeError
                    line = '\t'.join([s.value for s in sample[:5]]) + '\n'
                except TypeError:
                    continue
                else:
                    output_file.write(line)
        book.close()
def samerank(db, emotion_dict):
    # Writes a preprocessing file for rows whose rank1 is NULL
    # (handles songs tied between 1st and 2nd place).
    komoran = Komoran()
    cursor = db.cursor()
    emotion = ['happy', 'enjoy', 'comfort', 'horror', 'angry', 'sad']
    sql = ("SELECT DISTINCT title, artist, lyrics FROM musicl "
           "WHERE (DATE, ranking) IN "
           "(SELECT DATE, ranking FROM emoti_test WHERE rank1 IS NULL)")
    cursor.execute(sql)
    null_data = cursor.fetchall()
    null_data = pd.DataFrame(null_data, columns=['제목', '가수', '가사'])
    null_data_rating = pd.DataFrame(columns=['제목', '가수', '순위', '수치'])

    for title, singer, lyrics in null_data.values:
        # split the lyrics into morphemes
        lyrics = lyrics.replace('\n', '')
        words_temp = komoran.morphs(lyrics)

        # counters for the six emotion categories
        happy = enjoy = comfort = angry = horror = sad = 0

        # count emotion-dictionary hits in the lyrics
        lyrics_emotion = pd.DataFrame(index=emotion)
        for word in words_temp:
            if word in emotion_dict['happy']:
                happy += 1
            if word in emotion_dict['enjoy']:
                enjoy += 1
            if word in emotion_dict['comfort']:
                comfort += 1
            if word in emotion_dict['angry']:
                angry += 1
            if word in emotion_dict['horror']:
                horror += 1
            if word in emotion_dict['sad']:
                sad += 1

        # sort to see which emotions occurred most often;
        # note: this order must match the `emotion` index (horror before angry) --
        # the original listed angry before horror, which swapped the two counts
        result_emotion = [happy, enjoy, comfort, horror, angry, sad]
        lyrics_emotion[0] = result_emotion
        rating = lyrics_emotion[0].sort_values(ascending=False).index
        value = lyrics_emotion[0].sort_values(ascending=False).values
        # DataFrame.append is a pandas <2.0 API; it was removed in pandas 2.x
        null_data_rating = null_data_rating.append(
            {'제목': title.strip(), '가수': singer,
             '순위': list(rating), '수치': list(value)},
            ignore_index=True)

    null_data_rating.to_excel('data/samepointSong.xlsx', encoding='utf-8')
def kor_tokenizer(list_sentences):
    komoran = Komoran(max_heap_size=1024)
    list_output = []
    for sentence in list_sentences:
        # keep only Hangul syllables and whitespace
        sentence = re.sub(r"[^가-힣\s]", "", sentence)
        tokenized_sentence = komoran.morphs(sentence)
        list_output.append(tokenized_sentence)
    with open('./result/tokens.pickle', 'wb') as f:
        pickle.dump(list_output, f, pickle.HIGHEST_PROTOCOL)
    return list_output
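# A minimal usage sketch for kor_tokenizer above, assuming konlpy is installed
# and the ./result/ directory exists; the sample sentences are illustrative.
sample_sentences = ["코모란은 한국어 형태소 분석기입니다.", "오늘 날씨가 참 좋네요!"]
tokens = kor_tokenizer(sample_sentences)
print(tokens[0])  # morphemes of the first sentence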
class Tagger:
    def __init__(self, mode: str = "nouns"):
        """konlpy POS tagger wrapper; mode is "nouns" or "morphs"."""
        self.tagger = Komoran()
        self.mode = mode

    def __call__(self, *args, **kwargs) -> list:
        if self.mode == "nouns":
            return self.tagger.nouns(*args, **kwargs)
        elif self.mode == "morphs":
            return self.tagger.morphs(*args, **kwargs)
        # the original silently returned None on an unknown mode
        raise ValueError(f"unknown mode: {self.mode}")
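# A minimal usage sketch for the Tagger wrapper above; the input sentence is
# an arbitrary example.
tagger = Tagger(mode="morphs")
print(tagger("아버지가 방에 들어갑니다."))  # list of morphemes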
class DataLoader:
    def __init__(self, path="curse_detection/dataset/long.txt",
                 one_hot=False, max_len=30):
        self.path = path
        self.one_hot = one_hot  # True: [0~1, 0~1], False: 0~1
        self.max_len = max_len
        self.komoran = Komoran()

    def get_data(self):
        x, y = self.load()
        x_train, x_test, y_train, y_test = self.split(x, y)
        return x_train, x_test, y_train, y_test

    @staticmethod
    def one_hot_encoding(y):
        # one-hot encoding
        return np.eye(2)[y.astype("int8")]

    def load(self):
        with open(self.path, 'r', encoding='utf8') as f:
            data = f.read()
        data = data.split('\n')
        x, y = [], []
        for line in data:
            try:
                tmp = self.tokenize('|'.join(line.split('|')[:-1]))
            except UnicodeDecodeError:
                continue
            if len(tmp) > self.max_len:
                continue
            x.append(tmp)
            y.append(line.split('|')[-1].replace('"', ''))
        y = np.array(y, dtype=np.float32)
        if self.one_hot:
            y = self.one_hot_encoding(y)
        return x, y

    @staticmethod
    def split(x, y):
        # shuffle, then train/test split
        x, y = shuffle(x, y)
        return train_test_split(x, y, test_size=0.1)

    def tokenize(self, text):
        return self.komoran.morphs(text)
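# A hedged usage sketch for DataLoader above; it assumes the default dataset
# path exists on disk with 'text|label' lines, as the load() method expects.
loader = DataLoader(one_hot=True)
x_train, x_test, y_train, y_test = loader.get_data()
print(len(x_train), y_train.shape)  # tokenized sentences and one-hot labels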
def vectorize(train, val, test):
    parser = Komoran()

    temp_train = [parser.morphs(doc) for doc in train]
    result_train = [' '.join(tokens) for tokens in temp_train]

    # the validation split carries "[[문단]]" paragraph markers that must be stripped
    temp_val = [parser.morphs(doc.replace("[[문단]] ", "")) for doc in val]
    result_val = [' '.join(tokens) for tokens in temp_val]

    temp_test = [parser.morphs(doc) for doc in test]
    result_test = [' '.join(tokens) for tokens in temp_test]

    vect = CountVectorizer()
    X_train = vect.fit_transform(result_train)
    X_val = vect.transform(result_val)
    X_test = vect.transform(result_test)
    return X_train, X_val, X_test
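# A hedged usage sketch for vectorize above; train_docs/val_docs/test_docs
# are hypothetical lists of raw Korean strings, not data from the original.
train_docs = ["영화가 정말 재미있었다.", "스토리가 지루했다."]
val_docs = ["[[문단]] 배우들의 연기가 좋았다."]
test_docs = ["음악이 인상적이었다."]
X_train, X_val, X_test = vectorize(train_docs, val_docs, test_docs)
print(X_train.shape, X_val.shape, X_test.shape)  # sparse bag-of-words matrices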
def hyeongtae(filename):
    tokenizer = Komoran()
    with open(filename + ".txt", 'r', -1, "utf-8") as f:
        comments = f.read().splitlines()
    # the original never closed the output file; `with` handles that
    with open("tok" + filename + ".txt", 'w', -1, "utf-8") as g:
        for com in comments:
            tok_com_l = tokenizer.morphs(com)
            # write tokens separated by single spaces, one comment per line
            g.write(' '.join(tok_com_l))
            g.write("\n")
def make_token(input_file, output_file):
    komoran = Komoran()
    token_list = []  # renamed from `list`, which shadowed the builtin
    with open(input_file, 'r', encoding="utf-8") as f:
        text = f.readlines()
    for num, line in enumerate(text, start=1):
        sentence = line.strip()
        token_list.append(komoran.morphs(sentence))
        print(num)
    my_json_string = json.dumps(token_list, ensure_ascii=False)
    with open(output_file, "w", encoding="utf-8") as token_txt_file:
        token_txt_file.write(my_json_string)
def run_komoran():
    komoran = Komoran()
    start_time = time.time()
    print('komoran 시작')
    komoran_morphs = komoran.morphs(news1)
    komoran_nouns = komoran.nouns(news1)
    komoran_pos = komoran.pos(news1)
    end_time = time.time()
    print('komoran 끝 - %s 초' % str(end_time - start_time))

    with open('komoran.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('komoran time : %s s\n' % str(end_time - start_time))
        fstream.write('komoran_morphs\n')
        write_list(komoran_morphs, fstream)
        fstream.write('\n\n')
        fstream.write('komoran_nouns\n')
        write_list(komoran_nouns, fstream)
        fstream.write('\n\n')
        fstream.write('komoran_pos\n')
        write_pos(komoran_pos, fstream)
        fstream.write('\n')
def translate(inputSentence):
    encText = urllib.parse.quote(inputSentence)
    data = "source=en&target=ko&text=" + encText
    url = "https://openapi.naver.com/v1/papago/n2mt"
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    response = urllib.request.urlopen(request, data=data.encode("utf-8"))
    rescode = response.getcode()
    if rescode != 200:
        # the original printed "Error Code:" + rescode (an int, so it raised a
        # TypeError) and then fell through to an unbound response_body
        raise RuntimeError("Error Code: " + str(rescode))
    response_body = response.read()

    jsonObject = json.loads(response_body.decode('utf-8'))
    korText = jsonObject.get("message").get("result").get("translatedText")

    komoran = Komoran()
    eng_pos = getEngOrigin.get_eng_origin(inputSentence)
    return [komoran.morphs(korText), komoran.pos(korText)], eng_pos
# Open the stopword file with UTF-8 encoding
with open('../NLP/sample_data/stopword_02.txt', 'rt', encoding='utf-8') as f:
    text = f.read()
stopword = text.split('\n')

# Tokenize with POS tagging (tokenization split by part of speech)
# from konlpy.tag import Kkma
# tokenizer = Kkma()
from konlpy.tag import Okt, Kkma, Komoran

okt = Okt()
kkma = Kkma()
komo = Komoran()

tag_data = []
for sentence in all_data['data']:
    temp_x = komo.morphs(sentence)
    temp_x = [word for word in temp_x if word not in stopword]
    tag_data.append(temp_x)

# sanity-check output
# print('Tokenized samples: ', tag_data[-10:-5])

### Before stopword removal and tokenization ###
#     label  data
# 35  0      몇 비비가 있다 그랬나
# 36  0      내 친구는 뭐 컴활 이런 게 더 어렵다고 그랬나
# 37  0      실추라고 뭐 그랬나
# 39  0      그랬나
# 42  0      선훈이 오빠가 여동생 있다 그랬나
### After stopword removal and tokenization ###
def __init__(self, root, phase='train'):
    print("CustomDataset -> init")
    # count_vectorizer = make_vocab(root)
    self.root = root
    self.phase = phase
    self.labels = {}
    self.label_path = os.path.join(root, self.phase + '_hate.txt')

    with open(self.label_path, 'r', encoding="utf-8") as f:
        temp1 = []
        bias_list = []
        hate_list = []
        # characters stripped from every comment before tokenizing
        # (the original applied one .replace() per character)
        noise_chars = '!.^♡@ㅎㅉ?ㅜㅠ~ㅋㅡㄷㄹㅇ,ㅈ♥ㅁㅊ;ㄴㆍ'
        for line in f.readlines():
            v = line.strip().split('\t')
            w = v[1]
            for ch in noise_chars:
                w = w.replace(ch, '')
            temp1.append(w)
            if phase != 'test':
                bias_list.append(v[2])
                hate_list.append(v[3])

    stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘',
                 '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']

    # split each comment into morphemes and drop stopwords
    comments_list = []
    komoran = Komoran()
    for sentence in temp1:
        temp_x = komoran.morphs(sentence)
        temp_x = [word for word in temp_x if word not in stopwords]
        comments_list.append(temp_x)

    # build a frequency-sorted vocabulary
    vocab = FreqDist(np.hstack(comments_list))
    threshold = 2
    total_cnt = len(vocab)
    rare_cnt = 0
    total_freq = 0
    rare_freq = 0
    for key in vocab.keys():
        total_freq = total_freq + vocab[key]
        if vocab[key] < threshold:
            rare_cnt = rare_cnt + 1
            rare_freq = rare_freq + vocab[key]
    # print('Vocabulary size:', total_cnt)
    # print('Words appearing at most %s times: %s' % (threshold - 1, rare_cnt))
    # print('Share of rare words in the vocabulary:', (rare_cnt / total_cnt) * 100)
    # print('Share of rare-word occurrences overall:', (rare_freq / total_freq) * 100)

    vocab_size = total_cnt - rare_cnt + 2
    vocab = vocab.most_common(vocab_size)
    word_to_index = {word[0]: index + 2 for index, word in enumerate(vocab)}
    word_to_index['pad'] = 0
    word_to_index['unk'] = 1  # was 0 in the original, which collided with 'pad'

    encoded = []
    for line in comments_list:
        temp = []
        for w in line:
            try:
                temp.append(word_to_index[w])
            except KeyError:
                temp.append(word_to_index['unk'])  # map OOV words to 'unk'
        encoded.append(temp)
    # print(encoded[0:5])

    max_len = 74  # fixed pad length
    # print('Longest sentence: %d' % max(len(l) for l in encoded))
    # print('Shortest sentence: %d' % min(len(l) for l in encoded))
    # print('Mean sentence length: %f' % (sum(map(len, encoded)) / len(encoded)))

    # pad every sentence that is shorter than max_len
    for line in encoded:
        if len(line) < max_len:
            line += [word_to_index['pad']] * (max_len - len(line))
    encoded = torch.LongTensor(encoded)
    # print('Max length after padding: %d' % max(len(l) for l in encoded))
    # print('Min length after padding: %d' % min(len(l) for l in encoded))
    # print('Mean length after padding: %f' % (sum(map(len, encoded)) / len(encoded)))

    # comments_vector = []
    # for comment in temp1:
    #     comments_vector.append(count_vectorizer.transform([comment]).toarray()[0])
    # comments_vector = torch.FloatTensor(comments_vector)

    self.comments_vec = encoded   # index-encoded, padded sentences
    self.comments_list = temp1    # original cleaned sentences
    print(len(temp1))
import pandas as pd
from konlpy.tag import Komoran
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import DBSCAN
from gensim.models import Word2Vec

data = pd.read_excel("C:/Users/leevi/Documents/카카오톡 받은 파일/롱패딩_온라인쇼핑몰후기_종합.xlsx")
# `star` was misspelled `start` in the original
dic = {feed_code: {'content': content, 'star': star}
       for feed_code, content, star in zip(data['feed_code'].values,
                                           data['content'].values,
                                           data['star'].values)}

komoran = Komoran()
cv_pos = []  # morphemes of reviews rated 3 stars or higher
cv_neg = []  # morphemes of reviews rated below 3 stars
for key in dic.keys():
    if type(dic[key]['content']) == str:
        dic[key]['morph'] = komoran.morphs(dic[key]['content'])
        if float(dic[key]['star']) >= 3:
            cv_pos.append(dic[key]['morph'])
        else:
            cv_neg.append(dic[key]['morph'])
    else:
        dic[key]['morph'] = None

# gensim 3.x API: size/iter were renamed vector_size/epochs in gensim 4
model = Word2Vec(cv_pos, size=100, window=3, iter=10)
model.wv.most_similar("정말")
hannanum = Hannanum()
kkma = Kkma()

text = args.text
print("-" * 5, "원본 텍스트", "-" * 5)
print(text)

print("-" * 5, "Mecab", "-" * 5)
print(mecab.morphs(text))
print("-" * 5, "Okt", "-" * 5)
print(okt.morphs(text))
print("-" * 5, "Komoran", "-" * 5)
print(komoran.morphs(text))
print("-" * 5, "Hannanum", "-" * 5)
print(hannanum.morphs(text))
print("-" * 5, "Kkma", "-" * 5)
print(kkma.morphs(text))

print("-" * 5, "Khaiii", "-" * 5)
tokens = []
for word in khaiii.analyze(text):
    tokens.extend([str(m).split('/')[0] for m in word.morphs])
print(tokens)

print("-" * 5, "bert-base-multilingual-cased", "-" * 5)
print(tokenizer.tokenize(text))
    Keys.PAGE_DOWN)

komoran = Komoran()
review_num = 0
rank_num = 0
hits = 0
review_text = []
review_rank = []
good = 0
bad = 0

# map each review's morphemes to indices via kodict
for item in driver.find_elements_by_class_name('UD7Dzf'):
    p_list = []
    morph_list = komoran.morphs(item.text)
    for m in morph_list:
        if m in kodict:
            p_list.append(kodict[m])
    print(p_list)
    train_data.append(p_list)
    review_num += 1
print("-" * 80)

for i in range(review_num):
    rank = driver.find_element_by_xpath(
        '//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div/div[2]/div/div['
        + str(i + 1) + ']/div/div[2]/div[1]/div[1]/div/span[1]/div/div')
    rank_string = rank.get_attribute("aria-label")
    # print(rank_string)
komoran = Komoran()
kresult = []
for data in tresult:
    words = data[1]
    # flag for whether analysis and processing completed without problems:
    # True on success, False on failure
    state = True
    for word in words:
        try:
            print(komoran.pos(word))
            pos_tag = komoran.pos(word)[0][1]  # renamed from `type`, which shadowed the builtin
            if pos_tag == 'NNG' or pos_tag == 'NNP':
                kresult.append([data[0], komoran.morphs(word)[0]])
                # flag for presence in the exception dictionary:
                # True if present, False otherwise
                exist = False
                # words in the exception dictionary are filtered out before INSERT
                for exc in excdic:
                    sql = 'SELECT INSTR(%s, %s)'
                    cursor.execute(sql, (word, exc[0]))
                    count = cursor.fetchone()
                    if count[0] != 0:
                        print(word + '은(는) 예외 사전에 존재하는 단어입니다.')
                        exist = True
                        break
                if exist:
kor = Komoran(userdic='./user_dic.txt')

# morphemes to remove (stopwords: particles, endings, fillers)
stopwords = ['의', '가', '이', '은', '들', '는', '과', '도', '를', '으로', '이', 'ㅋ', '자', '에',
             '와', '한', '하다', '을', '다', '에서', '하고', 'ㄴ', 'ㄹ', '아', '하', '있', '았',
             '것', '나', '라', '고', '지', '게', '어', '되', '보', '면', '거', '네', 'ㅁ', '었',
             '아서', '겠', '로', '만', 'ㅂ시다', 'ㄴ가', '는데', 'ㄴ다', '왜', '어서', '어요',
             'ㅂ니다', '으면', '라고', 'ㄴ데', '요', '그렇', '부터', 'ㄴ다고', '처럼', '라는',
             '는지', '습니다', '이다', '죠', '네요', 'ㅡ', '으니', 'ㄴ다는', 'ㄹ까', 'ㄴ지',
             '구나', '그리고', 'ㄴ다는데']

train_data_document = []
for sentence in X_train:
    temp_X = kor.morphs(sentence)  # tokenize
    # drop the particles registered as stopwords
    temp_X = [word for word in temp_X if word not in stopwords]
    train_data_document.append(temp_X)

test_data_document = []
for sentence in X_test:
    temp_X = kor.morphs(sentence)  # tokenize
    # drop the particles registered as stopwords
    temp_X = [word for word in temp_X if word not in stopwords]
    test_data_document.append(temp_X)

# integer-encode the morphemes
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_document)
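# A hedged sketch of the usual next step after fit_on_texts, using the
# standard Keras Tokenizer API; maxlen=30 is an illustrative choice, not a
# value from the original.
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_enc = pad_sequences(tokenizer.texts_to_sequences(train_data_document), maxlen=30)
X_test_enc = pad_sequences(tokenizer.texts_to_sequences(test_data_document), maxlen=30)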
text = "아름답지만 다소 복잡하기도 한 한국어는 전세계에서 13번째로 많이 사용되는 언어입니다." # 코모란 형태소 분석 from konlpy.tag import Komoran Komoran = Komoran() print(Komoran.morphs(text)) print(Komoran.nouns(text)) print(Komoran.pos(text))
from konlpy.tag import Komoran

komoran = Komoran()
print(komoran.morphs('이것은 이밀란이다. 아니 저것은 이별이 아닌가?'))
print(text.split(' '))

# In[6]:

# Komoran
from konlpy.tag import Komoran

# instantiate
komoran = Komoran()
# tokenize with morphs
komoran_tokens = komoran.morphs(text)
print(komoran_tokens)

# In[9]:

# Hannanum
from konlpy.tag import Hannanum

hannanum = Hannanum()
hannanum_tokens = hannanum.morphs(text)
print(hannanum_tokens)

# In[10]:
from konlpy.tag import Komoran
from tensorflow.keras import Model

from model import ClassificationModel, input_shape

if __name__ == "__main__":
    komoran = Komoran()
    model_parent = ClassificationModel()
    model = model_parent.build_model()
    embedding = model_parent.embedding
    model.load_weights("curse_detection/weights-short.h5")
    att_model = Model(inputs=[model.input], outputs=model.layers[10].output)

    while True:
        inp = input(':')
        inp, mask = embedding([komoran.morphs(inp)])
        out = model.predict((inp, mask)).squeeze(1)
        att = att_model.predict((inp, mask))[1].squeeze(2)
        print(att)
        print(out)
from konlpy.tag import Komoran

# create a Komoran morphological-analyzer object
komoran = Komoran()

text = "아버지가 방에 들어갑니다."

# extract morphemes
morphs = komoran.morphs(text)
print(morphs)

# extract morphemes with POS tags
pos = komoran.pos(text)
print(pos)

# extract nouns only
nouns = komoran.nouns(text)
print(nouns)
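# For reference, Komoran typically analyzes this sentence roughly as follows
# (exact segmentation and tags can vary by Komoran/KoNLPy version):
# morphs -> ['아버지', '가', '방', '에', '들어가', 'ㅂ니다']
# pos    -> [('아버지', 'NNG'), ('가', 'JKS'), ('방', 'NNG'),
#            ('에', 'JKB'), ('들어가', 'VV'), ('ㅂ니다', 'EF')]
# nouns  -> ['아버지', '방']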
epochs = 50
model = keras.models.load_model('good_bad_' + str(epochs) + '_epochs.h5')
model.summary()

with open('prediction_input.txt', encoding='utf-8') as p_input:
    input_lines = p_input.readlines()

# encode each line as a sequence of kodict indices
prediction_input = []
for line in input_lines:
    p_list = []
    morph_list = komoran.morphs(line)
    for m in morph_list:
        if m in kodict:
            p_list.append(kodict[m])
    prediction_input.append(p_list)

prediction_input = keras.preprocessing.sequence.pad_sequences(
    prediction_input, value=0, padding='post', maxlen=256)

prediction = model.predict_classes(prediction_input)
for i in range(len(prediction)):
    print(input_lines[i])
    if prediction[i] == 1:
def main():
    # def job():
    conn = pymysql.connect(host='192.168.0.61', user='******', password='******',
                           db='one_db', charset='utf8mb4')
    cursor = conn.cursor()

    sql = 'SELECT ono, originaldata, siteno FROM test_original WHERE state = %s'
    cursor.execute(sql, 'N')
    original = cursor.fetchall()
    print('original data')
    print(original)

    # neologism dictionary for filtering
    sql = 'SELECT word FROM tb_newdic'
    cursor.execute(sql)
    newdic = cursor.fetchall()
    # print('신조어 사전')
    # print(newdic)

    # fetch the exception-dictionary entries
    sql = 'SELECT word FROM tb_excdic'
    cursor.execute(sql)
    excdic = cursor.fetchall()
    print('예외 사전')
    print(excdic)

    originalList = []
    for data in original:
        dataList = list(data)
        for word in newdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))
            count = cursor.fetchone()
            if count[0] != 0:
                print(dataList[1], '에서', word[0], '은(는) 신조어 사전에 존재하는 단어입니다.')
                dataList[1] = dataList[1].replace(word[0], '')
                sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'
                cursor.execute(sql, (dataList[0], word[0], dataList[2]))
                conn.commit()
        for word in excdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))
            count = cursor.fetchone()
            if count[0] != 0:
                print(dataList[1], '에서', word[0], '은(는) 예외 사전에 존재하는 단어입니다.')
                dataList[1] = dataList[1].replace(word[0], '')
        originalList.append(dataList)
    original = originalList

    # analyze with Twitter (renamed Okt in recent konlpy releases)
    from konlpy.tag import Twitter
    twitter = Twitter()
    tresult = []
    for data in original:
        tresult.append([data[0], twitter.nouns(data[1]), data[2]])
        print(twitter.pos(data[1]))

    # check the Twitter analysis results
    print('twitter result')
    print(tresult)

    # analyze with Komoran
    from konlpy.tag import Komoran
    komoran = Komoran()
    kresult = []
    for data in tresult:
        words = data[1]
        # flag for whether analysis and processing completed without problems:
        # True on success, False on failure
        state = True
        for word in words:
            try:
                pos_tag = komoran.pos(word)[0][1]  # renamed from `type`, which shadowed the builtin
                if pos_tag == 'NNG' or pos_tag == 'NNP':
                    kresult.append([data[0], komoran.morphs(word)[0]])
                    # flag for presence in the exception dictionary:
                    # True if present, False otherwise
                    exist = False
                    # words in the exception dictionary are filtered out before INSERT
                    for exc in excdic:
                        sql = 'SELECT INSTR(%s, %s)'
                        cursor.execute(sql, (word, exc[0]))
                        count = cursor.fetchone()
                        if count[0] != 0:
                            print(word + '은(는) 사전의 ' + exc[0] + '와(과) 일치')
                            exist = True
                            break
                    if exist:
                        continue
                    # INSERT only NNG/NNP types into the DB;
                    # rollback on exception, commit otherwise
                    sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'
                    try:
                        if len(komoran.morphs(word)[0]) != 1:
                            cursor.execute(
                                sql, (data[0], komoran.morphs(word)[0], data[2]))
                    except Exception as err:
                        state = False
                        print('ERROR : komoran result의 ' + str(data[0]) + '번 글의 에서 insert 처리 중 오류 발생')
                        print(str(err))
                        conn.rollback()
                    else:
                        conn.commit()
            except Exception as err:
                state = False
                print('ERROR : komoran 키워드 분석 중 오류 발생')
                continue

        ssql = 'UPDATE test_original SET state = %s WHERE ono = %s'
        state = 'Y' if state == True else 'E'
        cursor.execute(ssql, (state, data[0]))
        conn.commit()

    # check the Komoran analysis results
    print('komoran result')
    print(kresult)
    print('-----')
    print('끝')

    # schedule.every().day.at("").do(job)
    #
    # while 1:
    #     schedule.run_pending()
    #     time.sleep(1)
            break
    return percent_list


argc = sys.argv
komoran = Komoran()
f = open(argc[1], 'rt')
list_length = -(int(argc[2]))
stc_dic = {}
read_dic = {}

stc = input()
start = time.time()
stc = komoran.morphs(stc)
percent_list = []

# count morpheme frequencies of the input sentence
for ch in stc:
    if ch not in stc_dic:
        stc_dic[ch] = 1
    else:
        stc_dic[ch] += 1

while True:
    percent = 0
    read = f.readline()
    if not read:
                C_DIM, freeze_embeddings=False, gpu=USE_CUDA, gpu_id=GPU_ID)

test_set = dataset.getTestData(100)
model.load_state_dict(torch.load('models/vae_epoch_300_400.bin'))

for test in test_set:
    results = model.controlSentence(test[0].unsqueeze(1), t=0.5)
    print('Original : ', dataset.idxs2sentence(test[0], no_pad=True))
    print('Positive : ', dataset.idxs2sentence(results[0], no_pad=True))
    print('Negative : ', dataset.idxs2sentence(results[1], no_pad=True))
    print()

tagger = Komoran()
while True:
    sentence = tagger.morphs(input())
    if len(sentence) == 0:
        break
    sentence = dataset.sentence2idxs(sentence).unsqueeze(dim=1)
    results = model.controlSentence(sentence, t=0.5)
    print('Positive : ', dataset.idxs2sentence(results[0], no_pad=True))
    print('Negative : ', dataset.idxs2sentence(results[1], no_pad=True))
    print()
'''

# Hannanum Class
from konlpy.tag import Hannanum
hannanum = Hannanum()
print(hannanum.analyze(u'롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

# Kkma Class
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs(u'공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

# Komoran Class
from konlpy.tag import Komoran
komoran = Komoran()
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

# MeCab installation needed
from konlpy.tag import Mecab
mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

# Twitter Class
# from konlpy.tag import Twitter
# twitter = Twitter()
# print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))
from konlpy.tag import Okt
twitter = Okt()
print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))
# extract common nouns (NNG) only
tagged_text = kkma.pos(text)
[t[0] for t in tagged_text if t[1] == 'NNG']

# In[14]:

# to extract all nouns: kkma.nouns()
kkma.nouns(text)

# ### 3.3 Komoran

# In[3]:

from konlpy.tag import Komoran
komoran = Komoran(max_heap_size=1024)  # heap memory; where the JVM stores variables
print(komoran.morphs(text))  # morphological analysis only

# In[4]:

# POS tagging
print(komoran.pos(text))  # ntags=42

# In[5]:

# extract common nouns (NNG) only
tagged_text = komoran.pos(text)
[t[0] for t in tagged_text if t[1] == 'NNG']

# In[6]:

# to extract all nouns: komoran.nouns()