def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    # stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    df['content_token'] = df.progress_apply(
        lambda x: [t[0] for t in okt.pos(x['content'], stem=True)
                   if t[1] in ['Noun', 'Verb', 'Adjective']
                   and t[0] not in stopwords
                   and len(t[0]) != 1],
        axis=1)
    df['title_token'] = df.progress_apply(
        lambda x: [t[0] for t in okt.pos(x['title'], stem=True)
                   if t[1] in ['Noun', 'Verb', 'Adjective']
                   and t[0] not in stopwords
                   and len(t[0]) != 1],
        axis=1)
    return df
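# Hedged usage sketch for tokenize_okt_noscreen. It assumes the helpers
# call_userword() and load_wordset() and the stopword file above exist, and
# that pandas and tqdm are installed; the sample DataFrame is made up.
# progress_apply only exists after tqdm registers itself with pandas:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # enables DataFrame.progress_apply used inside the function

sample_df = pd.DataFrame({
    'title': ['가상통화 시세 급등'],
    'content': ['전문가들은 당분간 변동성이 크다고 전망했다'],
})
tokenized_df = tokenize_okt_noscreen(sample_df)
print(tokenized_df[['title_token', 'content_token']])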
class RawTaggerReader:
    def __init__(self, filepath, tagger=None):
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s:
                    continue
                yield self.tagger.pos(s)
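# Hedged usage sketch for RawTaggerReader: stream POS-tagged chunks from a
# UTF-8 text file. 'sample_corpus.txt' is a made-up filename; re and
# ckonlpy's Twitter must already be imported for the class to work.
reader = RawTaggerReader('sample_corpus.txt')
for tagged in reader:
    print(tagged)  # each item is a list of (word, tag) tuples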
class RawTagger:
    def __init__(self, textIter, tagger=None):
        # Preprocessing step 1 (extend the morpheme analyzer's dictionary):
        # register words the analyzer fails to recognize as single tokens.
        with open('형태소 보완.txt') as f:
            dd = f.read()
        a = dd.split('\n')
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
        self.tagger.add_dictionary(a, 'Noun')
        if type(textIter) == str:
            self.textIter = textIter.split('\n')
        else:
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s:
                    continue
                yield self.tagger.pos(s)
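# Hedged usage sketch for RawTagger: since __init__ splits a plain string on
# newlines, a multi-line string can be passed directly. The user dictionary
# file '형태소 보완.txt' must exist in the working directory.
raw_tagger = RawTagger('첫 번째 문장입니다. 두 번째 문장입니다.\n세 번째 문장!')
for tagged in raw_tagger:
    print(tagged)  # one list of (word, tag) tuples per sentence-like chunk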
class Social_analysis():
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))
        hashtags_list = []
        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list
        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            for i in parsed:
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in exception_list):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} excluded'.format(i[0]))
                else:
                    print('{} is not Hangul.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
        nav_list = noun_list + adj_list + verb_list
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
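# Hedged usage sketch for Social_analysis: register a couple of custom nouns
# and run morph_pos on a toy text list. The imports the class relies on
# (ckonlpy's Twitter, numpy, re, copy, gc, pickle, sys) are assumed present.
sa = Social_analysis()
sa.add_keyword_dic(['가상통화', ('이체해줘', 'Noun')])
morphs, nav, nouns, adjs, verbs = sa.morph_pos(['물 맛이 정말 좋다'])
print(nouns)  # one list of nouns per input document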
def token(self, title, ccontent, creplies):
    memory = psutil.Process(os.getpid())
    T_OR_title = []
    T_title = []
    T_OR_ccontent = []
    T_ccontent = []
    T_OR_creplies = []
    T_creplies = []
    twitter = Okt()  # initialize to use the customized Twitter dictionary
    twitter.add_dictionary('백래시', 'Noun')
    twitter.add_dictionary('문재앙', 'Noun')
    #### tokenize titles
    for i in range(len(title)):
        a = twitter.pos(title[i])
        b = []
        for j in range(len(a)):
            # drop 'Punctuation' tuples that cause errors downstream
            if a[j][1] != 'Punctuation':
                b.append(a[j])
        T_OR_title.append(b)
        T_title.append(twitter.morphs(title[i]))

        #### tokenize ccontent
        try:
            c = twitter.pos(str(ccontent[i]))
            d = []
            for w in range(len(c)):
                # drop 'Punctuation' tuples that cause errors downstream
                if c[w][1] != 'Punctuation':
                    d.append(c[w])
            T_OR_ccontent.append(d)
            T_ccontent.append(twitter.morphs(str(ccontent[i])))
        except RuntimeError as e:
            T_OR_ccontent.append('')
            T_ccontent.append(twitter.morphs(''))

        ### tokenize replies
        if type(creplies[i]) == str:  # a single reply given as a string
            a = [creplies[i]]  # wrap the string in a list
            e = twitter.pos(str(a))
            f = []
            for u in range(len(e)):
                # keep everything except Punctuation and KoreanParticle tokens
                if e[u][1] not in ('Punctuation', 'KoreanParticle'):
                    f.append(e[u])
            T_OR_creplies.append(f)
            T_creplies.append(twitter.morphs(str(a)))
        else:
            temp = []
            temp2 = []
            for n in range(len(creplies[i])):  ### replies given as a list
                h = twitter.pos(creplies[i][n])
                x = []
                for z in range(len(h)):
                    # keep everything except Punctuation and KoreanParticle tokens
                    if h[z][1] not in ('Punctuation', 'KoreanParticle'):
                        x.append(h[z])
                temp.append(x)
                temp2.append(twitter.morphs(creplies[i][n]))
            T_OR_creplies.append(temp)
            T_creplies.append(temp2)
    return T_OR_title, T_title, T_OR_ccontent, T_ccontent, T_OR_creplies, T_creplies
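# Hedged standalone illustration of the tag filter used in token() above:
# drop Punctuation and KoreanParticle tokens from a single POS-tagged sentence
# (uses konlpy's Okt directly; the sentence is made up).
from konlpy.tag import Okt

okt_tagger = Okt()
sample_pos = okt_tagger.pos('오류ㅋㅋ 그래도 재밌다!')
filtered = [t for t in sample_pos if t[1] not in ('Punctuation', 'KoreanParticle')]
print(filtered)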
class Social_analysis():
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def DB_to_table(self, DBname='intake', keyword='intake'):
        self.query = \
            """
            SELECT keyword, created_at, post_name, main_text, current_url
            FROM NaverBlogReview
            WHERE keyword = '{}'
            """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433",
            "gh", "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.to_numpy()  # DataFrame.as_matrix() no longer exists in current pandas

    # def hashtags_split(self, hashtags):
    #     hashtags_split = []
    #     for i in hashtags:
    #         hashtags_split.append(i.split('/'))
    #
    #     hashtags_list = []
    #
    #     for i in hashtags_split:
    #         temp = []
    #         for j in i:
    #             if self.isHangul(j):
    #                 t_hashtags = j.translate(self.non_bmp_map)
    #                 temp.append(t_hashtags)
    #         hashtags_list.append(temp)
    #     self.hashtags_list = hashtags_list
    #
    #     return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            for i in parsed:
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in exception_list):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} excluded'.format(i[0]))
                else:
                    print('{} is not Hangul.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
        nav_list = noun_list + adj_list + verb_list
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
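# Hedged sketch of the substitution-list format word_substitute expects: each
# rule maps several surface forms ('sub_words') onto one canonical token
# ('main'). The tokens and documents below are illustrative only.
sub_rules = [
    {'main': '비비크림', 'sub_words': ['비비', 'BB']},
    {'main': '파운데이션', 'sub_words': ['파데']},
]
docs = [['비비', '발림성', '좋다'], ['파데', '추천']]
print(Social_analysis().word_substitute(docs, sub_rules))
# -> [['비비크림', '발림성', '좋다'], ['파운데이션', '추천']]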
from matplotlib import font_manager, rc
from PIL import Image
import numpy as np
import nltk
from wordcloud import WordCloud
from ckonlpy.tag import Twitter

font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

tagger = Twitter()
for add in add_noun:
    tagger.add_dictionary(add, 'Noun')

text = open(keyword + '.txt', encoding='utf-8-sig').read()
tokens = tagger.pos(text)

wordlist = []
for word in tokens:
    if word[1] in ['Noun']:
        if word[0] not in no_word:
            wordlist.append(word[0])

words = nltk.Text(wordlist, name=keyword)
top_words = words.vocab().most_common(1000)
words_dic = dict(top_words)

mask = np.array(Image.open("shape.png"))
wordcloud = WordCloud(
    font_path='c:/Windows/Fonts/malgun.ttf',
    mask=mask,
filtered_content = re.sub(r'[^\.\?\!\w\d\s]', '', filtered_content)
print(filtered_content)

from ckonlpy.tag import Twitter

twitter = Twitter()
# Register out-of-vocabulary words such as 가상통화 and 아시아경제 in the dictionary
twitter.add_dictionary('가상통화', 'Noun')
twitter.add_dictionary('아시아경제', 'Noun')
twitter.add_dictionary('한동안', 'Noun')
twitter.add_dictionary('블리클리', 'Noun')
twitter.add_dictionary('공동창립자', 'Noun')
twitter.add_dictionary('부크바', 'Noun')

# Morphological analysis
twitter_morphs = twitter.pos(filtered_content)

# Extract nouns only
Noun_words = []
for word, pos in twitter_morphs:
    if pos == 'Noun':
        Noun_words.append(word)
print(Noun_words)

# Build a separate stopword list:
# drop words unrelated to the article body (the outlet name 아시아경제, the reporter's name, 기자)
# and high-frequency but uninformative words (못, 것, 수, 까지)
stopwords = ['아시아경제', '김철현', '기자', '못', '것', '수', '까지']

# Deduplicate the extracted nouns
unique_noun_words = set(Noun_words)
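# Hedged continuation sketch: count noun frequencies while dropping the
# stopwords defined above (collections.Counter is from the standard library).
from collections import Counter

noun_counts = Counter(w for w in Noun_words if w not in stopwords)
print(noun_counts.most_common(20))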
# Load the Twitter tokenizer and register the brand dictionary
twitter = Twitter()
twitter.add_dictionary(brand_set, 'Noun')

# Load the original (unmodified) tokenizer
ori_twitter = original_Twitter()

#### emoji2text
preprocessed_text = emoji2text(review_DB.text)

#### uppercase & product mapping
preprocessed_text = pd.Series(preprocessed_text).apply(lambda x: x.upper())
bow = preprocessed_text.apply(lambda x: '◈'.join([
    token + '╹' + pos if pos in ['Noun']
    else ori_twitter.pos(token, stem=True, norm=True)[0][0] + '╹' +
         ori_twitter.pos(token, stem=True, norm=True)[0][1]
    for token, pos in twitter.pos(x)
]))

### token normalizing
p_normal = {
    'BB╹Alpha': '비비크림╹Noun',
    'CC╹Alpha': '씨씨크림╹Noun',
    '비비╹Noun': '비비크림╹Noun',
    '씨씨╹Noun': '씨씨크림╹Noun',
    '파데╹Noun': '파운데이션╹Noun',
    '쟂빛╹Noun': '잿빛╹Noun',
    '비씨╹Noun': '비씨데이션╹Noun'
}
bow = bow.apply(lambda x: '◈'.join([
    p_normal[token] if token in p_normal.keys() else token
    for token in x.split('◈')
]))
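# Hedged sketch of how the '◈'/'╹'-delimited bag-of-words built above can be
# unpacked back into (token, tag) pairs, e.g. for counting or filtering.
token_lists = bow.apply(
    lambda x: [tuple(t.split('╹')) for t in x.split('◈') if t])
print(token_lists.head())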
class Social_analysis():
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def DB_to_table(self, DBname='intake', keyword='intake'):
        import pymssql
        import pandas.io.sql as pdsql
        import pandas as pd
        self.query = \
            """
            SELECT user_id, created_at, main_text, hashtags, comments, likes, current_url
            FROM instaPost
            WHERE keyword = '{}'
            """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433",
            "gh", "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.to_numpy()  # DataFrame.as_matrix() no longer exists in current pandas

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))
        hashtags_list = []
        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list
        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            nav_temp = []
            for i in parsed:
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in exception_list):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                            nav_temp.append(i[0])
                    else:
                        print('{} excluded'.format(i[0]))
                else:
                    print('{} is not Hangul.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def pos_extractor(self, parsed, exception_list=['맛', '밥', '물', '몸']):
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        for i in parsed:
            n_temp = []
            adj_temp = []
            verb_temp = []
            nav_temp = []
            if self.isHangul(i[0]):
                if (len(i[0]) > 1) or (i[0] in exception_list):
                    if i[1] == 'Noun':
                        n_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Verb':
                        verb_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Adjective':
                        adj_temp.append(i[0])
                        nav_temp.append(i[0])
                else:
                    print('{} excluded'.format(i[0]))
            else:
                print('{} is not Hangul.'.format(i[0]))
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)
        return nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def join_underbar(self, morph_list):
        all_list = []
        post_list = []
        for i in morph_list:
            for j in i:
                post_list.append(j[0] + '_' + j[1])
            all_list.append([(' , ').join(post_list)])
            post_list = []
        all_list = np.array(all_list)
        return all_list

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0

    def convert_list(self, *tokenized_list):
        input_length = len(tokenized_list)
        lists = [[] for i in range(input_length)]
        for idx, li in enumerate(tokenized_list):
            for j in li:
                lists[idx].append(['/'.join(j)])
        converted_array = np.array(lists[0])
        for idx in range(input_length):
            try:
                converted_array = np.concatenate(
                    (converted_array, lists[idx + 1]), axis=1)
            except Exception as e:
                print(e, 'done')
        return converted_array

    def make_df(self, converted_array):
        df = pd.DataFrame(np.hstack(
            (self.raw_data[:, :3], converted_array, self.raw_data[:, 3:])),
            index=None)
        return df

    # True if the text contains at least one of the keywords
    def word_check_or(self, text, keywords):
        if any(word in text for word in keywords):
            return True
        else:
            return False

    # True only if the text contains every keyword
    def word_check_and(self, text, keywords):
        if all(word in text for word in keywords):
            return True
        else:
            return False

    def word_check(self, method='and', keywords=[], df=None,
                   column_name=None, filter_TF=True):
        if method == 'and':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_and(x, keywords))
            return df[df.flag == filter_TF]
        if method == 'or':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_or(x, keywords))
            return df[df.flag == filter_TF]
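# Hedged usage sketch for word_check: keep only rows whose text contains all
# of the given keywords. Assumes pandas is imported as pd; the DataFrame and
# keywords below are illustrative.
posts = pd.DataFrame({'main_text': ['인테이크 단백질 쉐이크 후기', '오늘 날씨 좋다']})
sa = Social_analysis()
print(sa.word_check(method='and', keywords=['단백질', '쉐이크'],
                    df=posts, column_name='main_text'))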
from konlpy.tag import Komoran
from konlpy.tag import Okt
from ckonlpy.tag import Twitter

okt = Okt()
twitter = Twitter()

sentence = 'IBK기업은행 '
sentences = '소은지국민은행계좌로30만원이체해줘'

komoran = Komoran()
twitter.add_dictionary('이체해줘', 'Noun')
twitter.add_dictionary('KB 국민은행', 'Noun')
komoran = Komoran(userdic="C:/Users/ADMIN/Desktop/dic.txt")

print(twitter.pos(sentence, stem=True))
print(twitter.pos(sentences, stem=True))
print(komoran.pos(sentence))
print(komoran.pos(sentences))

arr = komoran.pos(sentence)
for word, tag in arr:
    if tag == 'VV':
        print("|||||||")
        print(word, tag)
    if tag == 'JKO' or tag == 'JKB' or tag == 'JKS':
        print("|||||||")

brr = komoran.pos(sentences)
for word, tag in brr:
    if tag == 'VV' or tag == 'XSV':