def classify(text):
    if detect_language(text) != 'ko':
        language = detect_language(text)
        text = translater(text)
    else:
        language = 'ko'
    text = spellchecker(text)
    word = []
    word.append(text)
    tokenizer = Twitter()
    word = [tokenizer.morphs(row) for row in word]
    # from here on, the pickled Keras tokenizer replaces the morph tokenizer
    with open('./model/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    sequences_test = tokenizer.texts_to_sequences(word)
    data_int_t = pad_sequences(sequences_test, padding='pre', maxlen=(MAX_SEQUENCE_LENGTH - 5))
    data_test = pad_sequences(data_int_t, padding='post', maxlen=MAX_SEQUENCE_LENGTH)
    model = load_model('./model/train_model.h5')
    y_prob = model.predict(data_test)
    for n, prediction in enumerate(y_prob):
        pred = y_prob.argmax(axis=-1)[n]
        if pred < 2.0:
            return "질문을 이해하지 못했어요. 다시 입력해주세요."
        if language == 'ko':
            return classes[pred]
        return translater(classes[pred], language)
def naver_card_info():
    url = "https://card.search.naver.com/card.naver?singleCardId=20"
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    div_tags = soup.find("div", {"class": "_detail_1 sum_one sum_one_v1 _tab_detail"})
    tr_tags = div_tags.find_all("tr")
    del tr_tags[0]
    t = list(tr_tags[1].strings)
    lists = []
    for temp in t:
        if temp == '\n':  # skip bare newlines (the original `del temp` only unbound the loop variable)
            continue
        lists.append(temp)
    twitter = Twitter()
    nouns = twitter.nouns(' '.join(lists))
    pos = twitter.pos(' '.join(lists))
    morph = twitter.morphs(' '.join(lists))
    phrases = twitter.phrases(' '.join(lists))
    count = Counter(nouns)
    print(lists)
    print(pos)
    print(morph)
    print(phrases)
    print(nouns)
    print(count)
class InputProcessor:
    def __init__(self):
        self.twitter = Twitter()

    def morphs(self, input_line):
        morphs = self.twitter.morphs(input_line)
        return morphs
class CharBaseDataset(Dataset):
    def __init__(self, args, tokenizer, mode):
        super(CharBaseDataset, self).__init__()
        self.tokenizer = tokenizer
        self.word_tokenizer = Twitter()
        self.maxlen = 128
        if "train" in mode:
            data_path = os.path.join(args.data_dir, args.task, args.train_file)
        elif "dev" in mode:
            data_path = os.path.join(args.data_dir, args.task, args.dev_file)
        elif "test" in mode:
            data_path = os.path.join(args.data_dir, args.task, args.test_file)
        self.dataset = pd.read_csv(data_path, encoding="utf8", sep="\t")
        if "small" in mode:
            self.dataset = self.dataset[:10000]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        txt = str(self.dataset.at[idx, "review"])
        data = self.tokenizer(txt, pad_to_max_length=True, max_length=self.maxlen, truncation=True)
        char_token = self.tokenizer._tokenize(txt)
        word_token = self.word_tokenizer.morphs(txt)
        input_ids = torch.LongTensor(data["input_ids"])
        token_type_ids = torch.LongTensor(data["token_type_ids"])
        attention_mask = torch.LongTensor(data["attention_mask"])
        label = self.dataset.at[idx, "rating"]
        return (input_ids, attention_mask, token_type_ids, label), [txt, char_token, word_token]
def korean_morph(text):
    twitter = Twitter()
    s = twitter.morphs(str(unicode(text)))
    s = ' '.join(s)
    return s
def tokenizer(self, sentence, vocab_flag):
    tw = Twitter()
    tokens = tw.morphs(sentence.strip())  # renamed from `str` so the builtin is not shadowed
    analysis = []
    for m in tokens:
        analysis.append(m)
    self.word_embed_dic.append(analysis)
    return [w for w in analysis if w]
def Calculate(input_string, S_DB):
    f = open(input_string, 'r')
    line = f.readline()
    number_of_bad = S_DB['# of bad,good case'][0]
    number_of_good = S_DB['# of bad,good case'][1]
    sum_of_bad = 0
    sum_of_good = 0
    string_buf = []
    R_DB = []
    twitter = Twitter()
    prob_bad = math.log(number_of_bad / (number_of_good + number_of_bad))    # share of negative reviews (log scale)
    prob_good = math.log(number_of_good / (number_of_good + number_of_bad))  # share of positive reviews (log scale)
    while True:
        line = f.readline()
        line_original = line.rstrip()
        if not line:
            break
        line = twitter.morphs(line)
        line.pop()  # drop the trailing '\n' token
        string_buf = []
        sum_of_bad = 1
        sum_of_good = 1
        for j in range(1, len(line)):
            if line[j] in string_buf:  # words repeated within one review are not counted again
                continue
            string_buf.append(line[j])
            buf = S_DB.get(line[j])
            if buf is None:  # ignore words that are not in the dictionary
                continue
            # add 1 so that dictionary words with zero frequency do not break the log scale
            p_bad = math.log(buf[0] + 1) - math.log(number_of_bad)
            p_good = math.log(buf[1] + 1) - math.log(number_of_good)
            sum_of_bad += p_bad
            sum_of_good += p_good
        sum_of_bad += prob_bad
        sum_of_good += prob_good
        if sum_of_bad > sum_of_good:
            R_DB.append([line_original, 0])
        elif sum_of_good > sum_of_bad:
            R_DB.append([line_original, 1])
        else:
            R_DB.append([line_original, -1])
    f.close()
    return R_DB
def word_preprocessor(sent):
    twt = Twitter()
    sent = re.sub(r'[\,\<\>\(\)\+\-\=\&\@\#\$]', '', sent)
    sent = re.sub(r'\.{2,}', ' .. ', sent)
    sent = re.sub(r'\~+', ' ~ ', sent)
    sent = re.sub(r'\!+', ' ! ', sent)
    sent = re.sub(r'\?+', ' ? ', sent)
    sent = re.sub(r'(ac)', ' 99', sent)
    sent = re.sub(r'(mv)', ' 88', sent)
    sent = re.sub(r'ㅋ{1,}|ㅎ{1,}', 'ㅋ', sent)
    # the original pattern used '\' between the last alternatives, which was almost certainly a typo for '|'
    sent = re.sub(r'ㅜ{1,}|ㅠ{1,}|ㅠㅜ|ㅜㅠ|ㅡㅜ|ㅜㅡ|ㅡㅠ|ㅠㅡ', 'ㅠㅠ', sent)
    sent = " ".join(twt.morphs(sent))
    return sent
def prepro_like_morphlized(data):
    # create the morphological analyzer object
    morph_analyzer = Twitter()
    # list that will collect the tokenized sentences
    result_data = list()
    # tokenize every sentence in the data
    for seq in tqdm(data):
        # Twitter.morphs returns a list of tokens; rejoin them with spaces
        # morphlized_seq = " ".join(morph_analyzer.morphs(seq.replace(' ', '')))
        morphlized_seq = " ".join(morph_analyzer.morphs(seq))  # keeping the original spacing works much better
        result_data.append(morphlized_seq)
    return result_data
class AnalysisDiction:
    """
    This class analyzes Korean texts using the kkma and twitter dictionaries.
    """
    def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
        """
        Allocate a kkma and/or twitter dictionary instance
        :param on_kkma: create a kkma instance
        :param on_twitter: create a twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        Analyze with kkma. Behaves differently depending on the mode.
        :param string_data: string data for analysis
        :param mode: analysis mode
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        # use == rather than `is` for string comparison
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        Analyze with twitter. Behaves differently depending on the mode.
        :param string_data: string data for analysis
        :param mode: analysis mode
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
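# A minimal usage sketch for the AnalysisDiction class above, assuming the class is
# defined (or imported) in the current module; the sample sentence is arbitrary.
from konlpy.tag import Kkma, Twitter  # required by the constructor

analyzer = AnalysisDiction(on_kkma=True, on_twitter=True)
sentence = '한글 형태소 분석기 테스트 문장입니다'
print(analyzer.analyzer_twitter(sentence, 'morphs'))   # morpheme list from Twitter
print(analyzer.analyzer_kkma(sentence, 'pos'))         # (morpheme, tag) pairs from Kkma
print(analyzer.analyzer_twitter(sentence, 'unknown'))  # unrecognized mode -> False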
def regexp(texts):
    twt = Twitter()
    container = []
    for i, sent in enumerate(texts):
        if i % 200000 == 0:
            print(i)
        sent = re.sub(r'[\,\<\>\(\)\+\-\=\&\@\#\$]', '', sent)
        sent = re.sub(r'\.{2,}', ' .. ', sent)
        sent = re.sub(r'\~+', ' ~ ', sent)
        sent = re.sub(r'\!+', ' ! ', sent)
        sent = re.sub(r'\?+', ' ? ', sent)
        sent = re.sub(r'(ac)', ' 99', sent)
        sent = re.sub(r'(mv)', ' 88', sent)
        sent = re.sub(r'ㅋ{1,}|ㅎ{1,}', 'ㅋ', sent)
        # as in word_preprocessor, '\' between the last alternatives is assumed to be a typo for '|'
        sent = re.sub(r'ㅜ{1,}|ㅠ{1,}|ㅠㅜ|ㅜㅠ|ㅡㅜ|ㅜㅡ|ㅡㅠ|ㅠㅡ', 'ㅠㅠ', sent)
        container.append(" ".join(twt.morphs(sent)))
    return container
def load_skt_nugu_samples(data_dir):
    answers = []
    questions = []
    vocab = set()
    twitter = Twitter()
    with open(os.path.join(data_dir, 'question_samples.txt')) as lines:
        for line in tqdm(lines, desc=lines.name, mininterval=0.5):
            key, value = line.strip().split('\t')
            tokens = twitter.morphs(value)
            questions.append(tokens)
            answers.append(key)
            vocab.update(tokens)
    return questions, answers, vocab
def update_tokenize(petition_id, petition_content, tokenizing_status):
    tokenizing_status = get_crawled_status(petition_id)
    twitter = Twitter()  # Twitter tokenizer for tokenizing
    token_content = twitter.morphs(petition_content)
    token_content_str = ' '.join(token_content).strip()
    sql = "UPDATE simanalysis SET token = %s, tokenizing_status = %s where id = \"%s\""
    # e.g. UPDATE simanalysis SET token = "ddjdjdjdkak", tokenizing_status = 0 where id = 21
    try:
        curs.execute(sql, (token_content_str, tokenizing_status, petition_id))
        conn.commit()
    except:
        print("# update except!!")
        sys.exit()
def insert_tokenize(petition_id, petition_content, tokenizing_status):
    tokenizing_status = get_crawled_status(petition_id)
    twitter = Twitter()  # Twitter tokenizer for tokenizing
    token_content = twitter.morphs(petition_content)
    token_content_str = ' '.join(token_content).strip()
    sql = "INSERT INTO simanalysis (id, token, tokenizing_status) VALUES (%s, %s, %s)"
    # print(sql, (petition_id, token_content_str, tokenizing_status))
    try:
        curs.execute(sql, (petition_id, token_content_str, tokenizing_status))
        conn.commit()
    except:
        print("# insert except!!")
        sys.exit()
def parse_file(lines, vocab):
    twitter = Twitter()
    data = dict()
    for i, line in tqdm(enumerate(lines), desc=lines.name, mininterval=0.5):
        line = line.strip().split('\t')
        if len(line) != 2:  # use != rather than `is not` for integer comparison
            continue
        key, value = line
        if data.get(key) is None:
            data[key] = []
        if len(data[key]) == maximum:
            continue
        tokens = twitter.morphs(value)
        data[key].append(tokens)
        vocab.update(tokens)
    return data
def preproLikeMorphlized(data):
    # create the morphological analyzer object
    morphAnalyzer = Twitter()
    # list that will collect the tokenized sentences
    result_data = list()
    # tokenize every sentence in the data
    for seq in data:
        # Twitter.morphs returns a list of tokens; the original spacing is stripped
        # first and the tokens are rejoined with spaces
        morphlizedSeq = " ".join(morphAnalyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlizedSeq)
    return result_data
def main():
    with open(settings.VERSION_JSON, "r") as jsonFile:
        data = json.load(jsonFile)
    VERSION = data['version']
    with open("new_data.json", "r") as jf:
        dt = json.load(jf)
    text = dt['text']
    x_arr = []
    t = Twitter()
    vocab_fn = settings.VOCAB_FILENAME.format(VERSION)
    vocab_file = os.path.join(settings.DATA_DIR, vocab_fn)
    jobj = json.loads(open(vocab_file).read())
    arr = list()
    tokens_ko = t.morphs(text)
    for word in tokens_ko:
        try:
            tmp = jobj[word]
            arr.append(tmp)
        except KeyError:
            pass
    temp_arr = np.asarray(arr)
    x_arr.append(temp_arr)
    x_test = np.asarray(x_arr, dtype=object)
    print('Pad sequences (samples x time)')
    x_test = sequence.pad_sequences(x_test, maxlen=settings.MAX_LENGTH)
    print('x_test shape:', x_test.shape)
    mod_load_fn = settings.MODEL_FILENAME.format(VERSION)
    mod_load_path = os.path.join(settings.OUTPUT_DIR, mod_load_fn)
    model = load_model(mod_load_path)
    classes = model.predict(x_test, batch_size=settings.BATCH_SIZE)
    print(classes)
def get_tags(open_text_file):
    nlp = Twitter()
    nouns_list = []
    token_list = []  # fixed typo: was `toekn_list`
    i = 0
    # for line in tqdm(open_text_file):
    for line in open_text_file:
        print(line)
        text = line
        text = regex.sub(u"[\n]", " ", text)
        n = nlp.nouns(text)
        token = nlp.morphs(text)
        for value in n:
            nouns_list.append(value)
        for j in token:
            token_list.append(j)
        # if i == 400:
        #     break
        # else:
        #     i += 1
    return nouns_list, token_list
def w2v(size):
    path = 'C:\\Users\\kwk51\\Desktop\\wikiextractor\\text\\'
    text_dir = ['AA', 'AB', 'AC']
    # text_dir = ['AA', 'AB', 'AC', 'AD', 'AE', 'AF']
    corpus = []
    for directory in text_dir:
        for file_num in range(size):
            wiki_object = open(path + directory + '\\wiki_' + str(file_num), encoding="utf-8")
            for line in wiki_object:
                if line == '\n':
                    continue
                if "doc" in line:
                    continue
                corpus.append(line)
    twitter = Twitter()
    tokenized_list = []
    for token in corpus:
        tokenized_list.append(twitter.morphs(token))
    # Word2Vec is implemented as a neural network.
    # sg=1: skip-gram, the center word predicts the `window` words on each side (8 target words here)
    # sg=0: CBOW, the context predicts the single center word
    model = Word2Vec(tokenized_list, size=100, window=4, min_count=10, workers=1, iter=100, sg=1)
    model.save('w2v.model')
    # print(model.most_similar(positive=['여자', '왕'], negative=['남자'], topn=30))
    # print(model.similarity('남자', '여자'))
    return model.most_similar(positive=['여자', '왕'], negative=['남자'], topn=30)
def dobby():
    query = request.values.get('query', 'default')
    conn = pymysql.connect(host=info.host, user=info.user, password=info.password,
                           db=info.db, charset=info.charset)
    curs = conn.cursor()
    tagger = Twitter()
    query = tagger.morphs(query)
    query = [' '.join(query)]
    sql = "SELECT * FROM dobby;"
    curs.execute(sql)
    rows = curs.fetchall()
    rows = list(rows)
    # TF-IDF
    data = pd.DataFrame(rows, columns=['id', 'question', 'prep_question', 'answer'])
    prep_question = list(data["prep_question"])
    prep_question = query + prep_question
    tfidfv = TfidfVectorizer().fit(prep_question)
    tf_idf_mat = tfidfv.transform(prep_question).toarray()
    doc0 = np.array(tf_idf_mat[0])
    tf_idf_mat = tf_idf_mat[1:]
    max_similarity = -1
    max_similarity_index = -1
    for idx, doc in enumerate(tf_idf_mat):
        doc = np.array(doc)
        if max_similarity < cos_sim(doc0, doc):
            max_similarity = cos_sim(doc0, doc)
            max_similarity_index = idx
    result = rows[max_similarity_index][3]
    return result
class Tokenizer:
    def __init__(self):
        self.t = Twitter()

    def tokenize(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        token_list = []
        for num, input in enumerate(token):
            if token[num] in scores:
                token_list.append(token[num])
            else:
                twit_token = self.t.morphs(token[num])
                token_list = token_list + twit_token
        return token_list

    def noun_extract(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)
        for num, input in enumerate(token):
            if token[num] in scores:
                noun_list.append(token[num])
            else:
                twit_token = self.t.nouns(token[num])
                noun_list = noun_list + twit_token
        diff_noun_list = list(set(noun_list) - set(compared_noun_list))
        diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))
        # filter instead of popping while iterating (the original mutated noun_list inside the loop)
        noun_list = [noun for noun in noun_list if noun not in diff_noun_list]
        return noun_list

    def noun_extract_dup(self, sentence, score_dic):
        scores = score_dic
        tokenizer = MaxScoreTokenizer(scores=scores)
        token = tokenizer.tokenize(sentence)
        noun_list = []
        compared_noun_list = self.t.nouns(sentence)
        for num, input in enumerate(token):
            if token[num] in scores:
                noun_list.append(token[num])
            else:
                twit_token = self.t.nouns(token[num])
                noun_list = noun_list + twit_token
        diff_noun_list = list(set(noun_list) - set(compared_noun_list))
        diff_noun_list = list(set(diff_noun_list) - set(score_dic.keys()))
        noun_list = list(set(noun_list) - set(diff_noun_list))
        return noun_list

    def noun_counter(self, sentence, score_dic, word):
        noun_list = self.noun_extract(sentence, score_dic)
        number = 0
        for num, input in enumerate(noun_list):
            if input == word:
                number = number + 1
        return number
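# A minimal usage sketch for the Tokenizer class above. It assumes soynlp's
# MaxScoreTokenizer is importable and that the class is defined in the current module;
# the score dictionary entries below are made-up illustration values.
score_dic = {'금수강산': 0.9, '금수저': 0.8}
tok = Tokenizer()
sentence = '우리나라 금수강산 이라서 그런가 금수저가 많네요'
print(tok.tokenize(sentence, score_dic))               # dictionary hits kept whole, the rest morph-split
print(tok.noun_extract_dup(sentence, score_dic))       # nouns, with out-of-dictionary extras filtered out
print(tok.noun_counter(sentence, score_dic, '금수저'))  # occurrence count of one noun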
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from konlpy.corpus import kobill
from konlpy.tag import Twitter

t = Twitter()
fields_ko = kobill.fileids()
docs_ko = kobill.open('1809890.txt').read()
tokens_ko = t.morphs(docs_ko)
print(isinstance(tokens_ko, list))
print(tokens_ko)

# The tokens are split correctly, but the saved vectors end up per single character:
# Word2Vec expects a list of sentences (each a list of tokens), so a flat token list
# is read as many one-token "sentences" of characters. Passing [tokens_ko] instead
# would train on whole tokens.
embedding = word2vec.Word2Vec(tokens_ko, size=5, window=1, negative=3, min_count=1)
embedding.wv.save_word2vec_format('my.sample', binary=False)
model = KeyedVectors.load_word2vec_format('my.sample', binary=False, encoding='utf-8')
print(model.most_similar('육'))
from konlpy.tag import Twitter  # Twitter starts with a capital T, so it is a class

twitter = Twitter()
maillist = twitter.pos("고구려의 영역은 어디까지일까", norm=True, stem=True)
print(maillist)

print(twitter.morphs(u' '))  # the u'' prefix marks a unicode string; unnecessary in Python 3, where str is already unicode
print(twitter.morphs(' 정부와 기업이 함께 근로자의 휴가비를 지원 '))  # splits the sentence into morphemes
print(twitter.nouns('직장 내 자유로운 휴가 분위기를 조성하고 일과 휴식이 균형을 이루는 근무 여건을 만들기 위해 지난해부터 시행한 사업이다. '))  # extracts nouns
print(twitter.pos('이것도 되나욬ㅋㅋㅋ'))  # each word and its part-of-speech tag
print(twitter.pos('이것도 되나욬ㅋㅋㅋ', norm=True))  # the norm option normalizes to standard spellings
print(twitter.pos('이것도 되나욬ㅋㅋㅋ', norm=True, stem=True))  # the stem option reduces words to their root form
# The last form is the one used most frequently
word_set = set()
word_dict = dict()
count = 0
requester = Requester(url, limit=1000)
res = requester.next()
while res is not None:
    for data in res:
        song_info = data['song_info']
        lyric = song_info['lyric']
        split_lyric = twitter.morphs(lyric)
        word_set.update(split_lyric)
        for _str in split_lyric:
            if _str not in word_dict:
                word_dict[_str] = 1
            else:
                word_dict[_str] += 1
    count += 1
    print('[{}] word_set : {}'.format(count, len(word_set)))
    res = requester.next()
print(word_set)
from konlpy.tag import Twitter

twitter = Twitter()
a = '책가방을메고 학교에가서 공부를 하자'
b = '우리나라 금수강산 이라서 그런가 금수저가 많네요'
mo = twitter.morphs(a)
mo2 = twitter.morphs(b)
po = twitter.pos(a)
ta = []
ta.append(mo)
ta.append(mo2)
print(ta)
print(len(ta))
# unpack the (word, tag) pairs from the pos() result; the original looped over `ta`,
# which holds plain morph lists and cannot be unpacked into two values
for word, tag in po:
    print(word)
    print(tag)
'''
Created on 2018. 1. 28.

@author: hillk
'''
import sys

from konlpy.tag import Twitter

twitter = Twitter()
print('한글 문장을 입력하세요.')
try:
    while True:
        sys.stdout.write('>> ')
        sys.stdout.flush()
        text = sys.stdin.readline().strip()
        if text:
            answer = twitter.morphs(text)
            print(answer)
except KeyboardInterrupt:
    pass
from konlpy.tag import Twitter  # needed for Twitter(); missing from the original imports
from konlpy.utils import pprint
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

twitter = Twitter()
# note: stock NLTK does not appear to ship a Korean stopword list, so this call may fail
# unless a 'korean' stopword file has been added locally
stop_words = set(stopwords.words('korean'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])  # remove this line if you need the punctuation tokens

# TEXT = u'S8에서 Bixby가 왜 광고에 나오는 것처럼 추천기능이 작동하지 않는거니'
TEXT = u'갤럭시에서 Bixby 기능이 왜 광고에 나오는 것처럼 추천기능이 작동하지 않는거니'

print("=== Parse phrase(tokenize) to morphemes. ===")
print(twitter.morphs(TEXT))

# print("=== Nouns extract ===")
# print(twitter.nouns(TEXT))
# print("=== Phrase extract ===")
# print(twitter.phrases(TEXT))

print("=== Post tagger ===")
print(twitter.pos(TEXT))

print("=" * 10 + ' Remove stopwords(& lower) Test: ' + "=" * 10)
print([(i.lower(), j) for i, j in twitter.pos(TEXT) if i.lower() not in stop_words])

print("=== Post tagger with normalization & stemming(lemmatization) ===")
print([(i.lower(), j) for i, j in twitter.pos(TEXT, norm=True, stem=True) if i.lower() not in stop_words])
# -*- coding: utf-8 -*-
from konlpy.tag import Twitter
import nltk
import sys

# Python 2 only: force the default encoding to utf-8
reload(sys)
sys.setdefaultencoding('utf-8')

twitter = Twitter()
print(twitter.morphs(u'한글형태소분석기 테스트 중 입니다'))  # morphemes
print(twitter.nouns(u'한글형태소분석기 테스트 중 입니다!'))  # nouns
print(twitter.pos(u'한글형태소분석기 테스트 중 입니다.'))    # (morpheme, tag) pairs


def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data


def tokenize(doc):
    # norm and stem are optional
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]


def term_exists(doc):
    return {'exists({})'.format(word): (word in set(doc)) for word in selected_words}
    if diffchar(l[idx - 1], l[idx]):
        l.insert(idx, ' ')
        idx += 1
    l = (''.join(l)).split()
    # lowercase every token
    l = [c.lower() for c in l]
    # keep only Korean tokens and English words found in the dictionary
    l = [c for c in l if (iskor(c[0]) or (c in eng))]
    # konlpy
    idx = 0
    while idx < len(l):
        if iskor(l[idx][0]):
            sp = twitter.morphs(l[idx])
            if idx == 0:
                l = sp + l[idx + 1:]
            else:
                l = l[:idx] + sp + l[idx + 1:]
            idx += len(sp)
        else:
            idx += 1
    # konlpy
    result = ' '.join(l) + '\n'
    # preprocessing failed; fall back to the raw field
    if len(result) == 1:
        result = line.split('\x01')[2]
    f.write(result)
    cnt += 1
    if cnt % 100000 == 0:
def Learn(input_string):
    f = open(input_string, 'r')
    line = f.readline()
    T_DB = []
    S_DB = {}             # dictionary to return
    number_of_bad = 0     # number of negative reviews
    number_of_good = 0    # number of positive reviews
    string_buf = []
    twitter = Twitter()
    while True:
        line = f.readline()
        if not line:
            break
        line = twitter.morphs(line)
        line.pop()            # drop the trailing '\n' token
        line[0] = line.pop()  # move the positive/negative score to the front of the list
        string_buf = []
        if line[0] == '0':
            number_of_bad += 1
        else:
            number_of_good += 1
        # Register each morpheme in T_DB temporarily as [morpheme, negative, positive].
        # e.g. if '아름다움' appears in a positive review it is registered as ['아름다움', 0, 1].
        for i in range(1, len(line)):
            if line[i] in string_buf:  # a word that occurs more than once in a single review is registered only once
                continue
            string_buf.append(line[i])
            if line[0] == '0':
                T_DB.append([line[i], 1, 0])
            else:
                T_DB.append([line[i], 0, 1])
    T_DB.sort(reverse=True)
    # register the counts of negative and positive reviews first
    S_DB['# of bad,good case'] = [number_of_bad, number_of_good]
    len_db = len(T_DB)
    bad = 0
    good = 0
    string = T_DB[1][0]
    # sum the counts of morphemes registered multiple times in T_DB and store one entry per morpheme in S_DB
    for i in range(1, len_db - 1):
        buf = T_DB[i][0]
        if string == buf:
            bad += T_DB[i][1]
            good += T_DB[i][2]
            if i != len_db - 1:
                continue
        else:
            S_DB[string] = [bad, good]
            string = buf
            bad = T_DB[i][1]
            good = T_DB[i][2]
        if i == len_db - 1:
            if bad != 0 and good != 0:
                S_DB[string] = [bad, good]
    f.close()
    return S_DB
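# A minimal sketch of how Learn() above and Calculate() earlier in this section appear
# to fit together: Learn() builds the per-word count dictionary from a labelled review
# file and Calculate() scores a second file with it. The file names are hypothetical;
# each input line is expected to end with a 0/1 sentiment label.
S_DB = Learn('train_reviews.txt')
R_DB = Calculate('test_reviews.txt', S_DB)
for review, label in R_DB[:5]:
    print(label, review)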
#!usr/bin/env python
# File name....: text-analysis-2.py
# Module name..: 4 - Text Analysis & Visualization
# Author.......: Buomsoo Kim, Jinsoo Park
'''
This program demonstrates how to analyze text. It uses Twitter in the konlpy module
and handles the full movie review data collected during Module 3.
This file is provided for educational purposes and can be distributed for class use.
Copyright (c) by Jinsoo Park, Seoul National University. All rights reserved.
'''

# import the module needed for Korean text analysis (konlpy)
from konlpy.tag import Twitter  # Twitter morphological analyzer

# before the analysis, load the movie review data collected by the crawler
file = open('data-full.txt', 'r', encoding='utf-8')  # open the movie review file
lines = file.readlines()  # read every line of the review file with readlines()
file.close()              # close the file
# print(lines)

# create the Twitter analyzer
twitter = Twitter()

# run morphological analysis on a sentence
tokens = twitter.morphs(lines[1])  # morphological analysis of the first review sentence (lines[1])
print(tokens)                      # print the analysis result

# ///// END OF text-analysis-2 ////////////////////////////////////////////////