def clean_csv(dataset_file_dir, merged_file_save_path, ignore_list):
    # requires: import os, csv; from soynlp.tokenizer import RegexTokenizer
    # is_valid_word() is a project-local helper
    sentence_list = []
    for filepath in os.listdir(dataset_file_dir):
        if filepath.endswith(".csv"):
            entire_path = os.path.join(dataset_file_dir, filepath)
            with open(entire_path, newline="") as word_file:
                csv_reader = csv.reader(word_file)
                for row in csv_reader:
                    sentence_list.append(row)

    # tokenize every collected row and drop words found in ignore_list
    tokenized_sentence_list = []
    tokenizer = RegexTokenizer()
    for sentence in sentence_list:
        tokenized_sentence = tokenizer.tokenize(str(sentence))
        clean_sentence = [
            elem for elem in tokenized_sentence
            if is_valid_word(elem, ignore_list)
        ]
        tokenized_sentence_list.append(clean_sentence)

    # write the cleaned, tokenized sentences into a single merged CSV file
    with open(merged_file_save_path, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        for sentence in tokenized_sentence_list:
            writer.writerow(sentence)

def convert_to_vector_list(self, ignore_list, model_length, sentence):
    # requires: from soynlp.tokenizer import RegexTokenizer;
    # from gensim.models import KeyedVectors; import numpy as np;
    # csv_reader is a project-local module providing is_valid_word()
    tokenizer = RegexTokenizer()
    tokenized_sentence = tokenizer.tokenize(str(sentence))
    print(self.key_vector_path)
    kv = KeyedVectors.load(self.key_vector_path, mmap='r')
    clean_sentence = [
        elem for elem in tokenized_sentence
        if csv_reader.is_valid_word(elem, ignore_list)
    ]

    # look up each word's embedding; fall back to a constant 100-dim vector
    # for out-of-vocabulary words
    vector = []
    for elem in clean_sentence:
        try:
            array = kv[elem]
        except KeyError:
            array = [1] * 100
        vector.append(array)

    # tile the sentence's vectors until the list reaches model_length rows,
    # then truncate to exactly model_length (requires a non-empty vector)
    vector_list = []
    while len(vector_list) < model_length:
        vector_list += vector
    if len(vector_list) > model_length:
        vector_list = vector_list[:model_length]
    return np.array(vector_list)

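# A minimal, self-contained sketch (not part of the project) of the
# repeat-then-truncate padding used in convert_to_vector_list above: the
# per-word vectors are tiled until the list holds model_length rows, then cut
# to exactly model_length. The helper name and the toy 3-dimensional vectors
# are illustrative assumptions.
import numpy as np

def pad_by_repetition(vectors, model_length):
    padded = []
    while len(padded) < model_length:
        padded += vectors                    # repeat the whole sentence
    return np.array(padded[:model_length])   # truncate to a fixed length

# two 3-dim word vectors tiled to 5 rows -> shape (5, 3)
print(pad_by_repetition([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], 5).shape)
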
def prepare_corpus(self, ignore_list, model_length, corpus_path):
    # requires: import csv; import numpy as np;
    # from soynlp.tokenizer import RegexTokenizer;
    # w2v and csv_reader are project-local modules
    tokenizer = RegexTokenizer()
    data_list = []
    label_list = []
    myw2v = w2v.word2vec(self.model_path)
    myw2v.load_keyvector(self.key_vector_path)
    with open(corpus_path, newline='') as corpus_file:
        reader = csv.reader(corpus_file)
        for row in reader:
            sentence = row[0]
            label = row[1]
            if label == '1':
                label = [1, 0]  # uncensored data
            else:
                label = [0, 1]  # censored data
            tokenized_sentence = tokenizer.tokenize(str(sentence))
            clean_sentence = [
                elem for elem in tokenized_sentence
                if csv_reader.is_valid_word(elem, ignore_list)
            ]
            vector = [myw2v.get_vector(elem) for elem in clean_sentence]
            print("length: " + str(len(vector)))
            if len(vector) > 0:
                # tile the sentence's vectors up to model_length rows, then truncate
                vector_list = []
                while len(vector_list) < model_length:
                    vector_list += vector
                if len(vector_list) > model_length:
                    vector_list = vector_list[:model_length]
                data_list.append(np.array(vector_list))
                label_list.append(np.array(label))
    train_input = data_list
    train_label = label_list
    return (train_input, train_label)

def word2vec(user_file='./review_01_0005_72378155.txt'):
    # requires: import re; from soynlp.tokenizer import RegexTokenizer;
    # from gensim.models import Word2Vec (gensim 3.x API: size/iter/most_similar)
    tokenizer = RegexTokenizer()

    # read up to 5000 review lines, skipping separator lines
    sents = []
    file = open(user_file, 'r', encoding='UTF-8', newline='')
    while True:
        line = file.readline()
        if not line:  # end of file
            break
        line = re.sub(r'\s*\n', '', line)
        if "-----------------" not in line:
            sents.append(line)
        if len(sents) > 5000:
            break
    file.close()

    tokenized_contents = []
    for sent in sents:
        tokenized_contents.append(tokenizer.tokenize(sent, flatten=True))

    embedding_model = Word2Vec(tokenized_contents, size=100, window=5,
                               min_count=2, workers=4, iter=100, sg=1)

    # interactive query loop: print the 5 most similar words for each input
    while True:
        print("User input : ")
        user_input = input()
        if user_input == "":
            break
        try:
            result = embedding_model.most_similar(positive=[user_input], topn=5)
            for elem in result:
                print(elem)
        except Exception:
            print("ERROR : no results found.")

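# The snippet above uses the gensim 3.x API (size=, iter=, model.most_similar).
# A rough sketch of the equivalent calls under gensim 4.x, where the parameters
# were renamed to vector_size/epochs and similarity queries moved to the .wv
# attribute; the two-sentence toy corpus is illustrative only.
from gensim.models import Word2Vec

toy_corpus = [['영화', '재미', '있다'], ['영화', '지루', '하다']]
model = Word2Vec(toy_corpus, vector_size=100, window=5, min_count=1,
                 workers=4, epochs=10, sg=1)
print(model.wv.most_similar(positive=['영화'], topn=2))
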
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') ==
            ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError(
            "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
                regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이') ==
            ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) ==
            ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError(
            "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".format(
                ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({
        '데이터': 0.4,
        '데이': 0.35,
        '데이터센터': 0.38
    })
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') ==
            ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError(
            "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
                maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests passed\n')

from soynlp.tokenizer import RegexTokenizer
import konlpy

# compare Mecab morpheme analysis with soynlp's RegexTokenizer
tok = konlpy.tag.Mecab()
tokenizer = RegexTokenizer()
print(tok.morphs('동일하게 테스트 중입니다'))
print(tokenizer.tokenize('테스트 중이다'))

def Tokenize(data):
    # tokenize each sentence with soynlp's RegexTokenizer and
    # re-join the tokens with single spaces
    tokenizer = RegexTokenizer()
    output = list(map(lambda x: ' '.join(tokenizer.tokenize(x)), data))
    return output

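# Hypothetical usage of Tokenize above (assumes soynlp is installed and
# RegexTokenizer is imported); the sample sentences are arbitrary.
sample = ['아이고ㅋㅋ진짜?', 'hello world 123']
print(Tokenize(sample))
# each sentence comes back as a single string of space-separated tokens
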
def detail(m_no, current_movie_title):
    # requires: pymysql, urllib.request, BeautifulSoup, pandas as pd, re,
    # PIL.Image, numpy as np, wordcloud.WordCloud, matplotlib.pyplot as plt,
    # flask.render_template, and soynlp's RegexTokenizer / LRNounExtractor
    conn = pymysql.connect(host='127.0.0.1',
                           user='******',
                           password='******',
                           db='movie',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cursor:
            sql = ('select * from current_movie c inner join test t '
                   'on c.current_movie_title = t.title '
                   'where current_movie_title = %s;')
            cursor.execute(sql, (current_movie_title,))
            result = cursor.fetchone()  # fetch a single row

            sql = 'select * from current_movie where current_movie_title = %s;'
            cursor.execute(sql, (current_movie_title,))
            result1 = cursor.fetchone()  # fetch a single row

            sql = 'select * from board where m_no = %s;'
            cursor.execute(sql, (m_no,))
            board = cursor.fetchall()
    finally:
        conn.close()

    if result is not None:
        # collect review texts from the Naver movie review pages
        tmrvl = []
        movieName = result['codem']
        for page in range(1, 200):
            url = ("https://movie.naver.com/movie/bi/mi/review.nhn?code=" +
                   str(movieName) + "&page=" + str(page))
            response = urllib.request.urlopen(url)
            soup = BeautifulSoup(response, 'html.parser')
            table = soup.select('ul.rvw_list_area li a')
            for result3 in table:
                mrv = str(result3.string)
                tmrv = tuple([mrv])
                tmrvl.append(tmrv)

        df = pd.DataFrame(tmrvl)

        def preprocessing(text):
            # remove escaped newline sequences
            text = re.sub(r'\\n', ' ', text)
            return text

        tokenizer = RegexTokenizer()
        stopwords_kr = [
            '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다',
            '많은', '많이', '정말', '너무', '[', ']', '것으로', '했습니다', '했다'
        ]

        sentences = df[0].apply(preprocessing)

        # extract nouns with soynlp
        noun_extractor = LRNounExtractor(verbose=True)
        noun_extractor.train(sentences)
        nouns = noun_extractor.extract()

        # draw the word cloud on top of the mask image
        img = Image.open('IT_Bank_Movie/static/img/cloud.png')
        img_array = np.array(img)
        wordcloud = WordCloud(font_path='/Library/Fonts/NanumBarunGothic.ttf',
                              stopwords=stopwords_kr,
                              background_color='white',
                              mask=img_array,
                              width=800,
                              height=600).generate(' '.join(nouns))
        plt.figure(figsize=(15, 10))
        plt.imshow(wordcloud)
        plt.axis("off")

        url1 = "IT_Bank_Movie/static/wordcloud/" + current_movie_title + ".png"
        wordcloud.to_file(url1)

    return render_template('movie_detail.html',
                           wordInfo=result,
                           board=board,
                           movieInfo=result1)

def __init__(self):
    from soynlp.tokenizer import RegexTokenizer
    self.inst = RegexTokenizer()
    self.OUT_TYPE = [list, str]

def __init__(self, root, phase='train'):
    # requires: import os, torch, numpy as np; from nltk import FreqDist;
    # from konlpy.tag import Komoran; from soynlp.tokenizer import RegexTokenizer
    print("CustomDataset-> init")
    self.root = root
    self.phase = phase
    self.labels = {}
    self.label_path = os.path.join(root, self.phase + '_hate.txt')

    with open(self.label_path, 'r', encoding="utf-8") as f:
        temp1 = []
        bias_list = []
        hate_list = []
        for line in f.readlines():
            v = line.strip().split('\t')
            w = v[1]
            # strip punctuation and stray jamo/symbols from the comment text
            for ch in ['!', '.', '^', '♡', '@', 'ㅎ', 'ㅉ', '?', 'ㅜ', 'ㅠ', '~',
                       'ㅋ', 'ㅡ', 'ㄷ', 'ㄹ', 'ㅇ', ',', 'ㅈ', '♥', 'ㅁ', 'ㅊ',
                       ';', 'ㄴ', 'ㆍ']:
                w = w.replace(ch, '')
            temp1.append(w)
            if phase != 'test':
                bias_list.append(v[2])
                hate_list.append(v[3])

    stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과',
                 '도', '를', '으로', '자', '에', '와', '한', '하다']

    # split each comment into morphemes and drop stopwords
    komoran = Komoran()
    tokenizer = RegexTokenizer()
    comments_list = []
    for sentence in temp1:
        temp_x = komoran.morphs(sentence)
        temp_x = [word for word in temp_x if word not in stopwords]
        comments_list.append(temp_x)

    # build the vocabulary and count rare words (frequency below threshold)
    vocab = FreqDist(np.hstack(comments_list))
    threshold = 2
    total_cnt = len(vocab)
    rare_cnt = 0
    total_freq = 0
    rare_freq = 0
    for key in vocab.keys():
        total_freq = total_freq + vocab[key]
        if vocab[key] < threshold:
            rare_cnt = rare_cnt + 1
            rare_freq = rare_freq + vocab[key]

    # keep the most frequent words; reserve index 0 for 'pad' and 1 for 'unk'
    vocab_size = total_cnt - rare_cnt + 2
    vocab = vocab.most_common(vocab_size)
    word_to_index = {word[0]: index + 2 for index, word in enumerate(vocab)}
    word_to_index['pad'] = 0
    word_to_index['unk'] = 1

    # map each comment to a sequence of word indices
    encoded = []
    for line in comments_list:
        temp = []
        for w in line:
            try:
                temp.append(word_to_index[w])
            except KeyError:
                temp.append(word_to_index['unk'])  # map OOV words to the unk index
        encoded.append(temp)

    # right-pad (or truncate) every sequence to a fixed maximum length
    max_len = 74
    for line in encoded:
        if len(line) < max_len:
            line += [word_to_index['pad']] * (max_len - len(line))
        else:
            del line[max_len:]
    encoded = torch.LongTensor(encoded)

    self.comments_vec = encoded   # index-encoded, padded comments
    self.comments_list = temp1    # original comment strings
    print(len(temp1))

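# A minimal, self-contained sketch (toy data, not the dataset above) of the
# encode-and-pad step used in __init__: build a word-to-index map with 0
# reserved for padding and 1 for unknown tokens, map each tokenized comment to
# indices, then right-pad to a fixed length. max_len=4 is an arbitrary choice.
import torch
from nltk import FreqDist

comments = [['영화', '재미', '있다'], ['영화', '별로']]
vocab = FreqDist(w for comment in comments for w in comment)
word_to_index = {w: i + 2 for i, (w, _) in enumerate(vocab.most_common())}
word_to_index['pad'], word_to_index['unk'] = 0, 1

max_len = 4
encoded = []
for comment in comments:
    ids = [word_to_index.get(w, word_to_index['unk']) for w in comment]
    ids += [word_to_index['pad']] * (max_len - len(ids))   # right-pad
    encoded.append(ids)
print(torch.LongTensor(encoded))   # shape: (2, 4)
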
def db_sentence_2_token_list(database_history_all_users_data_list):
    """
    Description: split each title sentence into tokens with RegexTokenizer and
        store every token back in the slot where the title used to be.
    Input: a list of rows collected from the DB
        ex) input : [['computer', '의', 'Youtube', '채널', '확인하기'], ----]
    Return: result (type = list)
        result : [['computer', 'https://www.youtube.com/', 36, 3],
                  ['의', 'https://www.youtube.com/', 36, 3],
                  ['youtube', 'https://www.youtube.com/', 36, 3],
                  ['채널', 'https://www.youtube.com/', 36, 3],
                  ['확인', 'https://www.youtube.com/', 36, 3],
                  ------ ]
    """
    # requires: from soynlp.tokenizer import RegexTokenizer;
    # kor_or_eng_judge() and token_judge_en_lower_ko_noun() are project-local helpers
    tokenizer = RegexTokenizer()
    result = []

    # read the DB rows one at a time
    for line in database_history_all_users_data_list:
        # each row is a tuple, e.g. ('https://www.youtube.com/', 'YouTube', 36, 3)
        url, title, visit_count, user_count = line

        # split the title text into words
        title_list = tokenizer.tokenize(title)

        # skip empty titles
        if len(title_list) == 0:
            continue

        for word in title_list:
            judgement = kor_or_eng_judge(word)
            if judgement == 0:
                # neither English nor Korean: skip
                pass
            elif judgement == 'en':
                # English: store the lowercased word
                result.append([
                    token_judge_en_lower_ko_noun(word), url, visit_count,
                    user_count
                ])
            elif judgement == 'ko':
                # Korean: store the extracted noun(s)
                nouns = token_judge_en_lower_ko_noun(word)
                if len(nouns) == 1:
                    result.append([nouns[0], url, visit_count, user_count])
                elif len(nouns) == 0:
                    pass
                else:
                    for token_noun in nouns:
                        result.append([token_noun, url, visit_count, user_count])
    return result

def review_cr(urll):
    from selenium import webdriver
    import time
    import pandas as pd
    from bs4 import BeautifulSoup
    from soynlp.tokenizer import RegexTokenizer
    from collections import Counter

    # load the listing page and scroll to the bottom so the reviews render
    url = urll
    driver = webdriver.Chrome(
        'C:/Users/multicampus/PycharmProjects/airbnb_bot/chromedriver')
    driver.implicitly_wait(3)
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # collect the review texts
    reviews = soup.find('div', {
        'id': 'reviews'
    }).findAll('div', {'class': '_czm8crp'})
    review_list = []
    for review in reviews:
        review_list.append(review.string)
    print(review_list)

    df = pd.DataFrame(review_list, columns=['review'])

    # tokenize each review with soynlp's RegexTokenizer
    tokenizer = RegexTokenizer()
    parsed_list = []
    for i in df['review']:
        parsed_list.append(tokenizer.tokenize(i))
    df['review_parsed'] = parsed_list

    # remove punctuation-like stopwords from the token lists
    STOP_WORDS = ['.', '(', ')', '!', '[', ']', '▣', '※']

    def remove_stopwords(tokens):
        return [t for t in tokens if t not in STOP_WORDS]

    df['review_parsed'] = df['review_parsed'].apply(remove_stopwords)

    # count token frequencies and return the 20 most common tokens
    faq_answer_parsed_lst = [
        y for x in df['review_parsed'].to_list() for y in x
    ]
    counter = Counter(faq_answer_parsed_lst).most_common(20)
    print(counter)
    return counter