def generate_per_file(self, file_name, path_list, all_reg):
    kkma = Kkma()
    with open(file_name, 'w') as file_write:
        for path in path_list:
            print path + "-start"
            for item in self.corpus_generator(path):
                '''
                item = item.decode('mbcs').encode('utf-8')
                sliced = item[:len(item)-1]
                subed = all_reg.sub('', sliced)
                if len(subed) != 0:
                    continue
                file_write.write((item + '\n'))
                '''
                for sub_item in kkma.sentences(item.decode('utf-8')):
                    sliced = sub_item[:len(sub_item)-1]
                    subed = all_reg.sub('', sliced.encode('utf-8'))
                    if len(subed) != 0:
                        continue
                    if '.' in sub_item:
                        if self.whatisthis(sub_item) == "not":
                            file_write.write(sub_item.encode('utf-8') + '\n')
                        else:
                            file_write.write(sub_item + '\n')
            print path + "-end"
def tagPOS(filename):
    try:
        # Read file
        f = open(filename, 'r')
        text = f.read().decode('utf-8')   # read file as utf8 decoded
        f.close()

        # tagging
        from konlpy.tag import Kkma
        #from konlpy.utils import pprint
        kkma = Kkma()
        print ('now tagging...')
        tagged = kkma.pos(text)

        # Write tagged file
        (path, fnameExt) = os.path.split(filename)
        (fname, fext) = os.path.splitext(fnameExt)
        tagged_file = fname + '_' + 'tagged' + fext
        fw = open(tagged_file, 'w')
        for line in tagged:
            strs = "\t".join(x for x in line).encode('utf-8')
            fw.write(strs + "\n")
        fw.close()
        print '%s is created' % (tagged_file)
    except:
        print '\nERROR:'
        print '"%s" is not a valid text\nCheck your text file\nor file path\n' % (filename)
        sys.exit(1)
def get_tags(text, ntags=50, multiplier=30):
    # adjust the multiplier value to control the font size
    # h = Hannanum()
    r = lambda: random.randint(0, 255)
    color = lambda: (r(), r(), r())
    h = Kkma()
    text = unicode(text, 'utf-8')
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
def _kkma_parse(self, str_arr, tag_combine=True):
    """
    :param str_arr: list of strings to tag
    :return: flattened list of tagged tokens
    """
    kkma = Kkma()
    return_arr = []
    for data in str_arr:
        return_arr = return_arr + self._flat(kkma.pos(str(data)), tag_combine=tag_combine)
    return return_arr
class AnalysisDiction:
    """
    This class is for analysis of Korean texts using the kkma and twitter dictionaries
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate kkma or twitter diction instance
        :param on_kkma: whether to create a kkma instance
        :param on_twitter: whether to create a twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method is for kkma. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for twitter. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analysis mode ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
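A minimal usage sketch for the class above (an illustration, not part of the original snippet; it assumes Kkma and Twitter are imported from konlpy.tag):

# Hypothetical usage sketch (assumes: from konlpy.tag import Kkma, Twitter)
analyzer = AnalysisDiction(on_kkma=True, on_twitter=True)
print(analyzer.analyzer_kkma(u'저는 대학생입니다.', 'nouns'))      # noun list from Kkma
print(analyzer.analyzer_twitter(u'저는 대학생입니다.', 'pos'))     # POS tuples from Twitter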
def __init__(self):
    self.kkma = Kkma()
    self.conn = sqlite3.connect('yebi.db')
    self.cursor = self.conn.cursor()
    self.count = 20

    reload(sys)
    sys.setdefaultencoding('utf-8')
def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
    """
    Allocate kkma or twitter diction instance
    :param on_kkma: whether to create a kkma instance
    :param on_twitter: whether to create a twitter instance
    """
    if on_kkma is True:
        self.kkma = Kkma()
    if on_twitter is True:
        self.twitter = Twitter()
def getKeywords(src):
    kkma = Kkma()
    words = kkma.nouns(src)
    words = list(set(words))
    words_calc = []
    words_num = len(words)

    for word in words:
        if not word.isdigit() and not u'서울' in word and re.match('(.*)?\d+(.*)?', word) is None:
            word_count = src.count(word)
            word_idf = word_count * math.log(len(word))
            if word_idf > 1:
                words_calc.append((word, word_idf))

    words_sort = sorted(words_calc, key=lambda w: w[1], reverse=True)
    words_real = []
    for word in words_sort:
        words_real.append(word[0])

    print (" / ".join(words_real[:5])).encode('utf-8')
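A hedged usage sketch for getKeywords (not from the original; it assumes a utf-8 source file and that re, math and Kkma are imported as in the rest of the project):

# Hypothetical usage sketch (illustration only)
article = u'진흥원은 간담회를 열고 서민금융 지원 방향을 논의했다. 간담회에는 전문가들이 참석했다.'
getKeywords(article)   # prints up to five keywords joined by " / "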
def SortSentence(filename):
    # Read file
    f = open(filename, 'r')
    text = f.read().decode('utf-8')   # read file as utf8 decoded
    f.close()

    # tagging
    from konlpy.tag import Kkma
    #from konlpy.utils import pprint
    kkma = Kkma()
    print ('now dividing sentences...')
    tagged = kkma.sentences(text)

    # Write tagged file
    (path, fnameExt) = os.path.split(filename)
    (fname, fext) = os.path.splitext(fnameExt)
    tagged_file = fname + '_' + 'sentence' + fext
    fw = open(tagged_file, 'w')
    for line in tagged:
        strs = line.encode('utf-8')
        fw.write(strs + "\n")
    fw.close()
    print '%s is created' % (tagged_file)
#!/usr/bin/python
# vim: set fileencoding=utf8 :

from konlpy.tag import Kkma
from konlpy.utils import pprint
from convert import convert
import fileinput

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
#poss = kkma.pos(u'(오류보고는) "실행환경", 에러메세지와함께 설명을 최대한상세히!^^')

for line in fileinput.input():
    poss = kkma.pos(unicode(line, "utf-8"))
    for tup in poss:
        print tup[0],
        print convert(tup[1])
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'
pprint(kkma.nouns(string))
def excel_noun(): def excel_write(row_val, column_val, data): new_sheet.cell(row = row_val, column = column_val, value="%s" %data) wb=load_workbook('reference.xlsx') sheetList = wb.get_sheet_names() sheet = wb.get_sheet_by_name(sheetList[0]) row_count = sheet.get_highest_row() new_sheet = wb.create_sheet(title='extraction') for i in range(2, row_count): if sheet.row_dimensions[i].visible : pass else : excel_write(i,1,'') new_sheet.row_dimensions[i].hidden = True #new_sheet.row_dimensions[i].outlineLevel = 1 continue noun_val = "" full_qua = "" cellValue_name = sheet.cell(row=i, column=1).value cellValue = sheet.cell(row=i, column=2).value try : QUA = cellValue.count(u'\u201c') except : continue if QUA != -1: if QUA == 1 : START_QUA = cellValue.find(u"\u201c") + 1 # position of first quatation mark CELL_VALUE_LEN = len(cellValue) cellValue_re = cellValue[START_QUA:CELL_VALUE_LEN] END_QUA = cellValue_re.find(u"\u201d") # position of last quatation mark cellValue_final = cellValue_re[0:END_QUA] print str(i) + " "+ cellValue_name + " " + cellValue_final kkma = Kkma() #pprint (kkma.nouns(cellValue_final)) s = (kkma.nouns(cellValue_final)) for j in range(0,len(s)): noun_val = noun_val + s[j].encode('utf-8') + ',' excel_write(i, 1, cellValue_name) excel_write(i, 2, cellValue_final) excel_write(i, 3, noun_val) elif QUA == 0 : #print str(i) + " " + cellValue ANOTHER_QUA = cellValue.find("\"") + 1 # position of first quatation mark ANOTHER_QUA_LEN = len(cellValue) another_cellValue = cellValue[ANOTHER_QUA:ANOTHER_QUA_LEN] ANOTHER_END_QUA = another_cellValue.find("\"") another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA] #print str(i) + " " + cellValue_name + " " + another_cellValue_final kkma = Kkma() #pprint (kkma.nouns(cellValue_final)) s = (kkma.nouns(another_cellValue_final)) for j in range(0,len(s)): noun_val = noun_val + s[j].encode('utf-8') + ',' excel_write(i, 1, cellValue_name) excel_write(i, 2, another_cellValue_final) excel_write(i, 3, noun_val) elif QUA > 1 : #print str(i) + " " + str(QUA) for q in range(0,QUA): arr = cellValue.split(u"\u201d") arr_start_qua = arr[q].find(u"\u201c") + 1 arr_len = len(arr[q]) arr_cellValue = arr[q][arr_start_qua:arr_len] full_qua = full_qua + arr_cellValue kkma = Kkma() #pprint (kkma.nouns(cellValue_final)) s = (kkma.nouns(arr_cellValue)) for j in range(0,len(s)): noun_val = noun_val + s[j].encode('utf-8') + ',' #print str(i) + " " + arr_cellValue excel_write(i, 1, cellValue_name) excel_write(i, 2, full_qua) excel_write(i, 3, noun_val) wb.save('reference.xlsx')
# 3. install google cloud language
# 4. set your creds and java directories in the os.environ calls
# 5. set the in and out filenames below

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "g_creds.json"
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-14.0.2"

in_filename = "singihan hangari.txt"
out_filename = "singihan out.txt"

translate_client = translate_v2.Client()
translate = translate_client.translate
okt = Okt()
kkma = Kkma()
vocab_list = []

def extract_korean_vocab(in_filename, out_filename):
    with open(in_filename, "r", encoding="utf-8") as korean_text_file_object:
        with open(out_filename, "w", encoding="utf-8") as outfile:
            story = korean_text_file_object.read()
            print("Splitting sentences...")
            sentences = kkma.sentences(story)
            print(f"Parsing words in {len(sentences)} sentences...")
            for sentence in sentences:
                tags = okt.pos(sentence, norm=True, stem=True)
                for t in tags:
                    if t[1] not in ['Foreign', 'Punctuation',
# -*- coding: utf-8 -*-
import csv, json, time
from konlpy.tag import Kkma

origdata = open('seoul2015.tsv', 'r')
data = csv.reader(origdata, delimiter='\t')
output = []
kkma = Kkma()

i = 0
for line in data:
    i += 1
    if(line[8].strip().isdigit()):
        obj = {
            'name': line[7].strip(),
            'sum': int(line[8].strip()) * 1000,
            'categories': [
                line[2].strip(),
                line[3].strip(),
                line[4].strip(),
                line[5].strip()
            ]
        }
        words = kkma.nouns(line[7].strip().decode('utf-8'))
        for j, word in enumerate(words):
            words[j] = word.encode('utf-8')
# print('직전일자 : ', prevDate)
print()
# print(soup.prettify())
# print(soup)
newsbody = soup.find(id="articleBodyContents")
# print(newsbody.contents)

bodystr = ""
try:
    for child in newsbody.children:
        if (isinstance(child, NavigableString) and not isinstance(child, Comment)):
            # print(child.string.strip())
            bodystr += child.string.strip()

    # morphological analysis
    kkma = Kkma()
    # pprint(kkma.nouns(bodystr))
    # pprint(kkma.pos(bodystr))
    wordList = kkma.nouns(bodystr)
    print('k : ', k)
    if k == 0:
        testEntry = wordList
        testIssueDate = issueDate
        testTitle = soup.title.string
        k = k + 1
    else:
        if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) > int(df[df['날짜'] < issueDate].head(1)['종가'])):
            print('up')
            docList.append(wordList)
            classList.append(1)
        else:
# -*- coding: utf-8 -*-
# POS tagging practice
# Tokenize with the Kkma and Twitter morphological analyzers and extract nouns, etc.

from konlpy.tag import Kkma
kkma = Kkma()
print('kkma 문장분리 : ', kkma.sentences(u'안녕하세요. 반갑습니다. 저는 인공지능입니다.'))   # sentences: sentence splitting
print('kkma 명사만추출 : ', kkma.nouns(u'을지로 3가역 주변 첨단빌딩숲 사이에 자리 잡은 커피집'))   # nouns: noun extraction

print('=' * 80)

from konlpy.tag import Twitter
tagger = Twitter()
print('Twitter 명사만 추출 : ', tagger.nouns(u'을지로 3가역 주변 첨단빌딩숲 사이에 자리 잡은 커피집'))
print('Twitter 품사 추출 : ', tagger.pos(u'이것도 처리되나욕ㅋㅋ'))   # pos: part-of-speech tagging
print('Twitter 오타와 원형처리 : ', tagger.pos(u'이것도되나욕ㅋㅋ', norm=True, stem=True))   # norm=True auto-corrects typos; stem=True returns base forms such as '이다'. Both default to False.
# -*- coding: utf-8 -*-
"""
Konlpy: a package that provides Korean morphological analysis
pip install konlpy
"""

#import konlpy
from konlpy.tag import Kkma   # class

kkma = Kkma()   # constructor -> creates the object

# paragraph -> sentences
para = "나는 홍길동 입니다. 나이는 23세 입니다. 대한민국 만세 입니다."
ex_sent = kkma.sentences(para)
ex_sent         # list
# ['나는 홍길동 입니다.', '나이는 23세 입니다.', '대한민국 만세 입니다.']
len(ex_sent)    # 3

# paragraph -> words (nouns)
ex_nouns = kkma.nouns(para)
ex_nouns
# ['나', '홍길동', '나이', '23', '23세', '세', '대한', '대한민국', '민국', '만세']

# paragraph -> POS tags (morphemes)
ex_pos = kkma.pos(para)
ex_pos
type(ex_pos)    # list of (word, tag) tuples
# NNG common noun, NNP proper noun, NP pronoun

nouns = []      # collected nouns
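A hedged continuation of the tutorial above (this filtering loop is not in the original excerpt): keep only the words whose Kkma tag marks a noun.

# Hypothetical continuation: collect NNG/NNP/NP words from ex_pos into nouns
for word, tag in ex_pos:
    if tag in ('NNG', 'NNP', 'NP'):
        nouns.append(word)
print(nouns)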
def test(self, keyword): FLAGS = tf.flags.FLAGS #print("\nParameters:") #for attr, value in sorted(FLAGS.__flags.items()): # print("{}={}".format(attr.upper(), value)) #print("") #x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) #print(x_raw) #print(y_test) x_raw = [keyword] print(keyword) y = [[1,0]] y_test = np.concatenate([y], 0) print(y_test) y_test = np.argmax(y_test, axis=1) kkma=Kkma() x_raw=[" ".join(kkma.morphs(x2)) for x2 in x_raw] print("x_raw",x_raw) # Map data into vocabulary vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) x_test = np.array(list(vocab_processor.transform(x_raw))) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # Get the placeholders from the graph by name input_x = graph.get_operation_by_name("input_x").outputs[0] # input_y = graph.get_operation_by_name("input_y").outputs[0] dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] # Tensors we want to evaluate predictions = graph.get_operation_by_name("output/predictions").outputs[0] # Generate batches for one epoch batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) # Collect the predictions here all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) all_predictions = np.concatenate([all_predictions, batch_predictions]) print(all_predictions[0]) return all_predictions[0] return "text"
    links = html.select('a[class="link_txt"]')

    one_page_data = []
    for link in links:
        link_str = str(link.string)
        one_page_data.append(link_str.strip())

    one_day_data.extend(one_page_data[:10])
    print(date)
    return one_day_data[:5]

data = [Crawling(date)[0] for date in sdate]

kkma = Kkma()

nouns_word = []
for sent in data:
    for noun in kkma.nouns(sent):
        nouns_word.append(noun)

from re import match

nouns_count = {}
for noun in nouns_word:
    if len(noun) > 1 and not(match('^[0-9]', noun)):
        nouns_count[noun] = nouns_count.get(noun, 0) + 1

from collections import Counter   # class
def save_news_words(s_date=None, e_date=None): nouns_tools = [Kkma(), Twitter()] # 분석기 token = NewsTokenization(nouns_tools=nouns_tools) mongo = MongoManager() news = [ DaumNews(s_date.strftime('%Y%m%d')), NaverNews(s_date.strftime('%Y%m%d')) ] # load할 news 목록 while s_date.strftime('%Y%m%d') < e_date.strftime('%Y%m%d'): day = s_date.strftime('%Y%m%d') con = {'date': day} if not mongo.find_one(NewsTokenization.collection, con): articles = token.load_news(day, repository_manager=mongo, news_list=news) print('[NewsTokenization][day: %s][article len: %d]' % (day, len(articles))) # articles = articles[2:3] datas = list() for article in articles: data = dict() data['_id'] = article['_id'] # title(0) + contents(1~) lines = list() lines.append(article['article']['title']) lines.extend(article['article']['contents']) data['raw_data'] = lines data['data_words'] = [[item for item in token.get_words(line)] for line in data['raw_data']] data['data_frequency'] = token.get_term_frequency( data['data_words']) if 'summary' in article['article']: data['raw_label'] = article['article']['summary'] data['label_words'] = [[ item for item in token.get_words(line) ] for line in data['raw_label']] data['label_frequency'] = token.get_term_frequency( data['label_words']) print(data) datas.append(data) else: #save obj = dict() obj['date'] = day obj['article'] = datas mongo.save_one(collection=NewsTokenization.collection, data=obj) else: news_words_list = mongo.find(NewsTokenization.collection, con) for news_words in news_words_list: print('news_words len:', len(news_words)) # day + 1 s_date = s_date + datetime.timedelta(days=1)
content = "" docList = [] for line in reader: content = re.sub('[^0-9a-zA-Zㄱ-힗 .]', ' ', line[2]) reversed_content = ''.join(reversed(content)) for i in range(0, len(content)): if reversed_content[i:i + 2] == '.다': content = ''.join(reversed(reversed_content[i:])) break content = content.replace('.', '. ') kkma = Kkma() noun = ' '.join(kkma.nouns(content)) docList.append(noun) print(noun) tfidf_vectorizer = TfidfVectorizer(min_df=1) tfidf_matrix = tfidf_vectorizer.fit_transform(docList) document_distances = (tfidf_matrix * tfidf_matrix.T) print(document_distances) print("Result: " + str(document_distances.get_shape()[0]) +\ 'x' + str(document_distances.get_shape()[1]))
x = word_tokenize(text)
pos_tag(x)
# [('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]
# In the Penn Treebank POS tag set, PRP is a personal pronoun, VBP a verb, RB an adverb, VBG a present participle, IN a preposition, NNP a proper noun, NNS a plural noun, CC a conjunction, and DT a determiner.

from konlpy.tag import Okt
okt = Okt()
print(okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
print(okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히','Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
print(okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']

from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
print(kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히','MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
print(kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']

'''
Mecab, the fastest of the Korean morphological analyzers, is not bundled with the konlpy engine.
The example below uses the eunjeon package to run Mecab from Python.
'''
from eunjeon import Mecab   # KoNLPy style mecab wrapper
tagger = Mecab()
print(tagger.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '한', '당신', ',', '연휴', '에', '는', '여행', '을', '가', '봐요']
print(tagger.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
Sorted_Dict_Values = sorted(wordInfo.values(), reverse=True)
Sorted_Dict_Keys = sorted(wordInfo, key=wordInfo.get, reverse=True)
plt.bar(range(len(wordInfo)), Sorted_Dict_Values, align='center')
plt.xticks(range(len(wordInfo)), list(Sorted_Dict_Keys), rotation='70')
plt.show()

file = open("test.txt", mode='r', encoding='utf-8')
doc = file.read()
file.close()
print(doc)

kkma = Kkma()
ex_sent = kkma.sentences(doc)
kkma.pos

nouns = []
for sent in ex_sent:
    for noun in kkma.nouns(sent):
        # word preprocessing: keep words of two or more syllables, drop numerals
        if len(str(noun)) >= 2 and not (match('^[0-9]', noun)):
            nouns.append(noun)

print(nouns)

word_count = {}
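A hedged continuation (not in the original excerpt) showing how the word_count dict just created might be filled and inspected:

# Hypothetical continuation: tally noun frequencies and show the top 10
for noun in nouns:
    word_count[noun] = word_count.get(noun, 0) + 1
print(sorted(word_count.items(), key=lambda kv: kv[1], reverse=True)[:10])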
poets = get_reviews()
for poet in poets:
    sentences = poet.split('\n')
    for sentence in sentences:
        try:
            c += Counter(kkma.nouns(sentence))
        except NameError:
            c = Counter(kkma.nouns(sentence))
        except:
            pass

#poets = get_poets()
poets = get_reviews()
kkma = Kkma()

for idx, poet in enumerate(poets):
    tags = []
    for noun in kkma.nouns(poet):
        if noun in TAGS:
            tags.append(noun)

    hash_object = hashlib.sha1(poet.encode('utf-8', 'ignore'))
    hex_dig = hash_object.hexdigest()
    results = collection.find_one({'hex': hex_dig})
    if not results:
        document = {'text': poet, 'index': idx, 'tags': tags, 'hex': hex_dig,
                    'like': 0, 'date': datetime.datetime.utcnow()}
        collection.insert(document)
ip = 'localhost'
id = 'testuser'
pw = 'AsDf1234!'
db = 'qnaboard'

conn = pymysql.connect(ip, id, pw, db, charset="utf8")
curs = conn.cursor()

sql = "select * from board"
curs.execute(sql)
result = curs.fetchall()

i = 0
sen = []
kkma = Kkma()
w_count = {}

for t, c in result:
    s = t + " " + c
    kk = kkma.nouns(s)
    for lst in kk:
        try:
            w_count[lst] += 1
        except:
            w_count[lst] = 1

sorted_w = sorted(w_count.items(), key=operator.itemgetter(1))
print(sorted_w)
conn.close()
def excel_noun(): def excel_write(row_val, column_val, data): new_sheet.cell(row = row_val, column = column_val, value="%s" %data) wb=load_workbook(REFERENCE_EXCEL) sheetList = wb.get_sheet_names() sheet = wb.get_sheet_by_name(sheetList[0]) row_count = sheet.get_highest_row() new_sheet = wb.create_sheet(title='extraction') news_info = {} for i in range(1, row_count): noun_val = "" full_qua = "" cellValue_name = sheet.cell(row=i, column=1).value cellValue = sheet.cell(row=i, column=2).value cellValue_id = sheet.cell(row=i, column=3).value # u201c 'LEFT DOUBLE QUOTATION MARK' # u201d 'RIGHT DOUBLE QUOTATION MARK' try : QUA = cellValue.count(u'\u201c') # u201c 'LEFT DOUBLE QUOTATION MARK' except : continue if QUA != -1: if QUA == 1 : START_QUA = cellValue.find(u"\u201c") + 1 # position of first quatation mark CELL_VALUE_LEN = len(cellValue) cellValue_re = cellValue[START_QUA:CELL_VALUE_LEN] END_QUA = cellValue_re.find(u"\u201d") # position of last quatation mark cellValue_final = cellValue_re[0:END_QUA] #print str(i) + " "+ cellValue_name + " " + cellValue_final kkma = Kkma() #pprint (kkma.nouns(cellValue_final)) s = (kkma.nouns(cellValue_final)) for j in range(0,len(s)): noun_val = noun_val + s[j].encode('utf-8') + ',' news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id) news_info[i]={news_tuple} MyPrettyPrinter().pprint(news_info[i]) excel_write(i, 1, cellValue_name) excel_write(i, 2, cellValue_final) excel_write(i, 3, noun_val) excel_write(i, 4, cellValue_id) elif QUA == 0 : #print str(i) + " " + cellValue ANOTHER_QUA = cellValue.find("\"") + 1 # position of first quatation mark ANOTHER_QUA_LEN = len(cellValue) another_cellValue = cellValue[ANOTHER_QUA:ANOTHER_QUA_LEN] ANOTHER_END_QUA = another_cellValue.find("\"") another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA] #print str(i) + " " + cellValue_name + " " + another_cellValue_final kkma = Kkma() #pprint (kkma.nouns(cellValue_final)) s = (kkma.nouns(another_cellValue_final)) for j in range(0,len(s)): noun_val = noun_val + s[j].encode('utf-8') + ',' news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id) news_info[i]={news_tuple} MyPrettyPrinter().pprint(news_info[i]) excel_write(i, 1, cellValue_name) excel_write(i, 2, another_cellValue_final) excel_write(i, 3, noun_val) excel_write(i, 4, cellValue_id) elif QUA > 1 : #print str(i) + " " + str(QUA) for q in range(0,QUA): arr = cellValue.split(u"\u201d") if arr is not None: try : arr_start_qua = arr[q].find(u"\u201c") + 1 except : continue arr_len = len(arr[q]) arr_cellValue = arr[q][arr_start_qua:arr_len] full_qua = full_qua + arr_cellValue kkma = Kkma() #pprint (kkma.nouns(cellValue_final)) s = (kkma.nouns(arr_cellValue)) for j in range(0,len(s)): noun_val = noun_val + s[j].encode('utf-8') + ',' #print str(i) + " " + arr_cellValue news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id) news_info[i]={news_tuple} MyPrettyPrinter().pprint(news_info[i]) excel_write(i, 1, cellValue_name) excel_write(i, 2, full_qua) excel_write(i, 3, noun_val) excel_write(i, 4, cellValue_id) wb.save(REFERENCE_EXCEL) nt.saveObjectBinaryFast(news_info, DICT_NEWS_INFO)
#*--coding:utf-8--*
import os
import pickle
from konlpy.tag import Kkma
from codecs import open as copen

kkma = Kkma()
feature_list = set()

# extract noun and adjective features from the Q&A data
for i in os.listdir('./DataSet/Leave/'):
    print i
    f = copen('./DataSet/Leave/' + str(i), 'r', 'utf-8')
    temp = f.read().replace('\n', ' ')
    for j in kkma.pos(unicode(temp)):
        if j[1] in ['NNG', 'NNP', 'NNB', 'NP', 'VA']:
            feature_list.add(j[0])
    f.close()

print len(feature_list)

p = open('DataSet/feature_4.txt', 'wb')
pickle.dump(list(feature_list), p)
p.close()
def Training(): for article in article_list: # print(article) # title = article[0] # link = article[1] # newspaper = article[2] kkma = Kkma() try: content, issueDateTime = NateNews.get_content(article['link']) issueDateTime = pd.to_datetime(issueDateTime) # issueDate = time.strftime('%Y-%m-%d', issueDateTime) # issueTime = time.strftime('%H:%M:%S', issueDateTime) issueDate = issueDateTime.date() issueTime = issueDateTime.time() # 형태소 분석 # wordList = kkma.pos(content) # [보통명사 동사 형용사 보조동사 명사추정범주] 필터링 # print(title) # print('wordList : ', wordList) # print(issueDateTime) # print(link) # print(newspaper) # print(issueDate) # print('wordList : ', wordList) wordList = list(getWords(kkma.pos(content))) ws = set(wordList) print('ws : ', ws) dic = {} for word in ws: print('word : ', word) dic.update({word: wordList.count(word)}) print('dic : ', dic) n = 10 listdic = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)[:n] print('listdic : ', listdic) for l in listdic: print('l : ', l) wordList.append(l[0]) baseDate = '' if issueTime > pd.to_datetime('15:30:00').time(): # print('장 마감 이후') baseDate = stockDF[stockDF['datetime'] > issueDate].head(1)['datetime'] else: # print('장 마감 이전') baseDate = stockDF[stockDF['datetime'] >= issueDate].head(1)['datetime'] print('issueTime : ', issueTime) print('baseDate : ', baseDate) # print(type(baseDate)) if issueDate > pd.to_datetime(testSetFromDate).date() or len(baseDate) == 0: # test set testEntry.append({'issueDateTime': issueDateTime, 'wordList': wordList}) else: # trainning set baseDate = pd.Series(baseDate).values[0] # print('해당 일자 주식 확인 : ', baseDate) trainingSet.append({'issueDateTime': issueDateTime, 'wordList': wordList}) print(trainingSet) # print(int(stockDF[stockDF['날짜'] == baseDate]['종가'])) # print(int(stockDF[stockDF['날짜'] < baseDate].tail(1)['종가'])) todayPrice = int(stockDF[stockDF['datetime'] == baseDate]['close']) prevPrice = int(stockDF[stockDF['datetime'] < baseDate].tail(1)['close']) if (todayPrice > prevPrice): # print(baseDate, ' : up') classList.append(1) else: if (todayPrice < prevPrice): # print(baseDate, ' : down') classList.append(0) else: # print(baseDate, ' : hold') classList.append(0) except: pass
from konlpy.tag import Kkma
#"C:/Program Files/Java/jre1.8.0_171/bin/server/jvm.dll"

kkma = Kkma()
print("asdasd")

sentences = kkma.sentences(u'네, 안녕하세요. 반갑습니다.')
print(sentences)

nouns = kkma.nouns(u'명사만을 추출하여 다빈도 분석을 합니다.')
print(nouns)

pos = kkma.pos(u'오류보고는 실행환경, 에러메세지와함께 설명을 최대한상세히!^^')
print(pos)
from konlpy.tag import Kkma

kkma = Kkma()
malist = kkma.pos("아버지 가방에 들어가신다.")
print(malist)
#!/usr/bin/env python3
from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Okt

hannanum = Hannanum()
print('[Hannanum]')
print(hannanum.analyze('롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

kkma = Kkma()
print('[Kkma]')
print(kkma.morphs('공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

komoran = Komoran()
print('[Komoran]')
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

mecab = Mecab()
print('[Mecab]')
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

okt = Okt()
print('[Okt]')
print(okt.morphs(u'단독입찰보다 복수입찰의 경우'))
    p = sum((first['counter'] & second['counter']).values())
    q = sum((first['counter'] | second['counter']).values())
    return p / q if q else 0

def build_graph(sentences):
    graph = networkx.Graph()
    graph.add_nodes_from(sentences)
    for first, second in combinations(sentences, 2):
        weight = occurrence(first[1], second[1])
        if weight:
            graph.add_edge(first[0], second[0], weight=weight)
    return graph

STOPWORDS = get_stopwords()
TAGGER = Kkma()

def get_summarize(text, count=3):
    sentences = [(num, {
        'text': line + '.',
        'counter': Counter(filter(lambda x: x not in STOPWORDS, TAGGER.nouns(line)))
    }) for num, line in enumerate(text.split('. '))]
    pagerank = networkx.pagerank(build_graph(sentences), weight='weight')
    reordered = sorted(pagerank, key=pagerank.get, reverse=True)
    for index in reordered[:count]:
        yield sentences[index][1]['text']
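A hedged usage sketch for get_summarize above (illustration only; it assumes networkx, Counter, combinations and get_stopwords are imported elsewhere in the project, and that the occurrence() helper whose tail opens the excerpt is defined in full):

# Hypothetical usage sketch (not part of the original snippet)
article = '첫 문장입니다. 둘째 문장입니다. 셋째 문장입니다. 넷째 문장입니다'
for sentence in get_summarize(article, count=2):
    print(sentence)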
# -*- coding: utf-8 -*-
"""
Word cloud from the 'new_data.pck' file created in the morning session
 1. read the pickle file
 2. extract nouns: kkma.nouns()
 3. preprocessing: limit word length, drop numbers
 4. WordCloud
"""

import pickle
import konlpy
from konlpy.tag import Kkma
from wordcloud import WordCloud   # class

# create the object
kkma = Kkma()

# 1. read the pickle file: news_data.pck
file = open('../data/new_data.pck', mode='rb')
news_data = pickle.load(file)
news_data
file.close()

len(news_data)    # 11600
type(news_data)   # list

# docs -> sentence
# <error> news_sent = kkma.sentences(news_data)
news_sent = [kkma.sentences(sent)[0] for sent in news_data]
news_sent
from konlpy.tag import Kkma
from konlpy.utils import pprint
from openpyxl import load_workbook, Workbook

dic = {}
kkma = Kkma()

wb = load_workbook(filename='menulist.xlsx', read_only=True)
ws = wb['Sheet1']
for i in range(1, 5897):
    for l, k in kkma.pos(ws['A' + str(i)].value):
        if l not in dic.keys():
            dic[l] = 0
        else:
            dic[l] += 1

wb = Workbook()
dest_filename = "determine.xlsx"
ws1 = wb.active
ws1.title = "result"

num = 1
for l, k in dic.items():
    ws1['A' + str(num)] = l
    ws1['B' + str(num)] = k
    num += 1

wb.save(filename=dest_filename)
def get_intent_type(dbconn, cursor, u_text): try: cursor.execute(f""" SELECT Q_TEXT, A_TEXT, Q_MORPHEMES, INTENT, ENTITIES FROM TBL_AP_QNA_CHAT_SET_LIST """) rows = cursor.fetchall() except Exception as e: print(f'error! >> select_ap_qna_content >> {e}') finally: intent = 0 all_values = [] for row in rows: all_values.append(row) match_intent_list = [] for all_value in all_values: matchPer = round( (SequenceMatcher(None, u_text, all_value[0]).ratio() * 100), 2) if matchPer >= 65: # print(70) match_intent_list.append([matchPer, all_value[3]]) # print(f'[{matchPer}% 일치][Q_type : {all_value[3]}] {all_value[0]} >> {all_value[1]}') # print('match_intent_list', match_intent_list) top = [0, 0] if len(match_intent_list) > 0: for idx, match_intent in enumerate(match_intent_list): if match_intent[0] > top[0]: top = match_intent intent = top[1] # 정의해놓은 대화뭉치가 없는 경우 > 답변을 직접 등록할 수 있도록 유도 # 새로운 질문과 기존 질문들의 유사도 체크하여 높은 유사도의 질문을 (최소 70% 이상) 노출 if intent == 0: kkma = Kkma() # 사용자 질문 명사 u_text_nouns = kkma.nouns(u_text) q_text_nouns_group = [] for all_value in all_values: # 텍스트 뭉치 명사 if all_value[2] != '[]': print('텍스트 뭉치 명사', all_value[2], '|||', all_value[3]) q_text_nouns_group.append( [ast.literal_eval(all_value[2]), all_value[3]]) point_list = [] for q_text_nouns in q_text_nouns_group: match_point = 0 for q_noun in q_text_nouns[0]: for u_noun in u_text_nouns: if q_noun == u_noun: match_point += 1 if match_point > 0: point_list.append([match_point, q_text_nouns[1]]) top = [0, 0] if len(point_list) > 0: for idx, point in enumerate(point_list): if point[0] > top[0]: top = point intent = top[1] # print('point_list', point_list) print('intent', intent) return [intent, all_values]
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'))
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'))
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'))
def __init__(self):
    self.nlp_engine = Kkma()
    self.nlp_engine.pos('시작')
# -*- coding: utf-8 -*-
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'저는 대학생이구요. 소프트웨어 관련학과 입니다.'))
# [저는 대학생이구요., 소프트웨어 관련학과 입니다.]
def get_vs(line):
    korean_analyzer = Kkma()
    return [word for word, tag in korean_analyzer.pos(line)
            if tag in ['VA', 'VX', 'VC']]
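A hedged usage sketch (not from the original): VA, VX and VC are Kkma's adjective, auxiliary-predicate and copula tags, so the call below should return predicate stems.

# Hypothetical usage; assumes: from konlpy.tag import Kkma
print(get_vs('날씨가 정말 좋다'))   # e.g. ['좋']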
# 01 Korean law corpus
from konlpy.corpus import kolaw
c = kolaw.open('constitution.txt').read()
print(c[:10])

#%%
from konlpy.corpus import kobill
d = kobill.open('1809890.txt').read()
print(d[:15])

#%%
# Dictionary
## sentence splitting, nouns, POS tagging
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

#%%
pprint(kkma.sentences('네 안녕하세요. 반갑습니다'))
pprint(kkma.nouns('질문이나 건의사항은 여기에 남겨주세요.'))
pprint(kkma.pos('우리는 데이터 과학자입니다. 멋진 과학자입니다.'))

#%%
# 02. Document exploration
from collections import Counter
from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot
def test2_kor(): train_df = pd.read_table( 'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in/ratings_train.txt' ) test_df = pd.read_table( 'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in/ratings_test.txt' ) print(train_df.head()) print('훈련 데이터 샘플의 개수 : {}'.format(len(train_df))) print('테스트 데이터 샘플의 개수 : {}'.format(len(test_df))) tokenizer = Kkma() # .morphs() ---> 너무 느리다. ID = torchtext.data.Field( sequential=False, use_vocab=False) # 실제 사용은 하지 않을 예정 ---> txt파일에 ID column이 있어서... TEXT = torchtext.data.Field( sequential=True, include_lengths=True, use_vocab=True, tokenize=tokenizer.morphs, # 토크나이저로는 Kkma 사용. lower=True, batch_first= True, # batch_firt=True ---> (N,fix_length) False ---> (fix_length,N) fix_length=20) LABEL = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True) # tsv: Tab-separated values if False: train_data, test_data = torchtext.data.TabularDataset.splits( path= 'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in', train='ratings_train.txt', test='ratings_test.txt', format='tsv', fields=[('id', ID), ('text', TEXT), ('label', LABEL)], skip_header=True) else: train_data = torchtext.data.TabularDataset( path= 'D:/BookCodes/tensorflow-ml-nlp-master/4.TEXT_CLASSIFICATION/data_in/ratings_train_small.txt', format='tsv', fields=[('id', ID), ('text', TEXT), ('label', LABEL)], skip_header=True) train_data, test_data = train_data.split(split_ratio=0.7, random_state=random.seed(100)) print('훈련 샘플의 개수 : {}'.format(len(train_data))) print('테스트 샘플의 개수 : {}'.format(len(test_data))) print(vars(train_data[0])) print(train_data.examples[0].id, train_data.examples[0].text, train_data.examples[0].label) TEXT.build_vocab(train_data, min_freq=10, max_size=10000) # build_vocab 단계를 거처야, 단어가 숫자로 mapping된다. print('단어 집합의 크기 : {}'.format(len(TEXT.vocab))) print(TEXT.vocab.stoi) # 단어 dict # DataLoader 만들기 batch_size = 2 if False: train_loader = torchtext.data.Iterator( dataset=train_data, batch_size=batch_size, shuffle=True) # shuffle=True epoch 사이에 shuffle 여부. test_loader = torchtext.data.Iterator(dataset=test_data, batch_size=batch_size) else: # data.BucketIterator ----> padding이 최소화 되도록 한다. train_loader, test_loader = torchtext.data.BucketIterator.splits( (train_data, test_data), batch_size=batch_size, device='cpu', shuffle=False, sort_key=lambda x: len(x.text)) # train_data에 직접 접근 & 직접 단어를 숫자로 mapping 시키기 for i, d in enumerate(train_data): # d: Example print(d.text, TEXT.numericalize(([d.text], 1)), d.label) # tuple([xx],batch_size)을 넘겨야 한다. if i >= 2: break for i, d in enumerate(train_loader): print(i, d.text, d.label ) # d.text[0], d.text[1] ----> Field에서 include_lengths=True로 설정. print(''.join([TEXT.vocab.itos[x] for x in d.text[0][0].numpy()])) if i >= 2: break print('=' * 20) it = iter(train_loader) for i in range(2): batch = next(it) print(batch.text, batch.label)
plt.figure(figsize=(16, 16))
for i in range(len(x)):
    plt.scatter(x[i], y[i])
    plt.annotate(labels[i],
                 xy=(x[i], y[i]),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
plt.show()


# In[69]:

kkma = Kkma()
stopWord_Ingre = {"재료", "계량법", "안내", "조금"}


# In[113]:

mystr = getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6879000_6880000")
mystr += getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6870000_6871000")


# In[ ]:

tokenized = kkma.pos(mystr)
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Twitter()
        self.stopwords = [
            '중인', '만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보",
            "중앙일보", "조선일보", "기자", "아", "휴", "아이구", "아이쿠", "아이고",
            "어", "나", "우리", "저희", "따라", "의해", "을", "를", "에", "의", "가",
        ]

    def url2sentences(self, url):
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)

        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    def text2sentences(self, text):
        sentences = self.kkma.sentences(text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([
                    noun for noun in self.twitter.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1
                ]))
        return nouns
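A hedged usage sketch for SentenceTokenizer (illustration only; it assumes Kkma and Twitter come from konlpy.tag and Article from the newspaper package, as the class implies):

# Hypothetical usage sketch (not part of the original snippet)
tokenizer = SentenceTokenizer()
sentences = tokenizer.text2sentences(u'첫 번째 문장입니다. 두 번째 문장입니다. 네.')
print(sentences)                       # short sentences get merged into the previous one
print(tokenizer.get_nouns(sentences))  # space-joined nouns per sentence, stopwords removed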
__author__ = 'woojin'
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
constitution = kolaw.open('./constitution.txt').read()
print(constitution)

# find which lines contain the word '민주'
r = concordance(u'민주', constitution, show=False)
print("show=False => ", r)
# show=False => returns the positions instead of printing the sentences
# show=True  => prints the matching sentences

# things to consider in text mining: accuracy and speed
from konlpy.tag import Kkma   # Kkma is chosen for accuracy; the very first call takes a while (longer if variables are cleared)
from konlpy.utils import pprint

kkma = Kkma()
text = u'네, 안녕하세요. 반갑습니다.'

# split into sentences
text_s = kkma.sentences(text)
print("text_s => ", text_s)             # returned as a list
print("type(text_s) => ", type(text_s))
print("text_s[0] => ", text_s[0])
print("text_s[-1] => ", text_s[-1])

# tagset: information about the tag formats
kkma = Kkma()
print(kkma.tagset)
# -*- coding: utf-8 -*-
import zmq
import time
import sys

reload(sys)
sys.setdefaultencoding('utf-8')   # set utf8 as a default

# init KoNLPy
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

from multiprocessing import Pool

port = 46000
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind('tcp://127.0.0.1:%s' % port)

while True:
    print 'in the loop'
    # Wait for next request from client
    message = socket.recv()
    result = kkma.nouns(message)
    result = ', '.join(result)
    print '------'
    print result
    socket.send_string(result)   # for socket.send, unicode is not allowed; use send_string
from konlpy.tag import Kkma
from konlpy.corpus import kolaw
from threading import Thread
import jpype

def do_concurrent_tagging(start, end, lines, result):
    jpype.attachThreadToJVM()
    l = [k.pos(lines[i]) for i in range(start, end)]
    result.append(l)
    return

if __name__ == "__main__":
    import time

    print('Number of lines in document:')
    k = Kkma()
    lines = kolaw.open('constitution.txt').read().splitlines()
    nlines = len(lines)
    print(nlines)

    print('Batch tagging:')
    s = time.clock()
    result = []
    l = [k.pos(line) for line in lines]
    result.append(l)
    t = time.clock()
    print(t - s)

    print('Concurrent tagging:')
    result = []
    t1 = Thread(target=do_concurrent_tagging, args=(0, int(nlines/2), lines, result))
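A hedged continuation sketch (the excerpt stops mid-script; the lines below are an assumption about how the second half would run, not the original code):

    # Hypothetical continuation: tag the remaining lines on a second thread and time it
    t2 = Thread(target=do_concurrent_tagging, args=(int(nlines/2), nlines, lines, result))
    s = time.clock()
    t1.start(); t2.start()
    t1.join(); t2.join()
    t = time.clock()
    print(t - s)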
class Crawler: def __init__(self): self.kkma = Kkma() self.conn = sqlite3.connect('yebi.db') self.cursor = self.conn.cursor() self.count = 20 reload(sys) sys.setdefaultencoding('utf-8') def do(self): print '트위터 타임라인 탐색 중.' for x in TwitterFetcher().get_time_line(self.count): user_id = x['user']['id'] print '' print '=' * 80 print '... @%s: %s' % (x['user']['name'], x['text']) t = (user_id, ) self.cursor.execute('select count(*) from users where id=?', t) count_user = self.cursor.fetchone()[0] if count_user == 0: #DB안에 User가 없으면 ( 0 ) name = x['user']['name'] screen_name = x['user']['screen_name'] profile_image = x['user']['profile_image_url_https'] t = (user_id, name, screen_name, profile_image) self.cursor.execute('insert into users values(?, ?, ?, ?)', t) self.conn.commit() print "... 유저 %s를 User 디비에 추가중" % x['user']['name'] i = 1 tweet_id = x['id'] t = (tweet_id, ) self.cursor.execute('select count(*) from tweets where id=?', t) count_tweets = self.cursor.fetchone()[0] print "... 트윗 디비를 검색중" if count_tweets == 0: print "... 아직 디비에 없어요." text = x['text'] created_at = x['created_at'] t = (tweet_id, text, created_at, user_id) self.cursor.execute('insert into tweets values(?, ?, ?, ?)', t) self.conn.commit() print '... %s 추가 중' % x['text'] for n in self.kkma.nouns(x['text']): t = (user_id, n) self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t) count_noun = self.cursor.fetchone() screen_name = x['user']['screen_name'] if count_noun is not None: print "... %s가 명사 \"%s\"의 갯수는 %d회 사용하였습니다." % \ (screen_name, n, count_noun[0]) if count_noun is None: print "... %s가 명사 \"%s\"를 처음 사용하였습니다." % (screen_name, n) #t = (user_id, n) self.cursor.execute('insert into user_nouns values(?, ?, 1)', t) else: self.cursor.execute('update user_nouns set count=count+1 where user_id=? and noun=?', t) else: print "... 이미 디비에 있어요. (그래도 명사를 분석하겠습니다.)" for n in self.kkma.nouns(x['text']): # print "...... %s" % n t = (user_id, n) self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t) count_noun = self.cursor.fetchone() screen_name = x['user']['screen_name'] if count_noun is not None: print "... %s가 명사 \"%s\"의 갯수는 %d회 사용하였습니다." \ % (screen_name, n, count_noun[0]) i += 1
    for link in links:
        cont = link.string
        crawling_news.append(str(cont).strip())

# call the crawler function
crawler_func(5, '20190505')

print('크롤링 news 수 =', len(crawling_news))   # number of crawled news items = 380
print(crawling_news)

# morphological analysis
from konlpy.tag import Kkma
kkma = Kkma()   # object

str_news = str(crawling_news)
print(str_news)

ex_sent = kkma.sentences(str_news)
print(ex_sent)

from re import match

# 1. sentences -> words -> preprocessing -> word count
news_count = {}
for sent in ex_sent:
    ex_noun = kkma.nouns(sent)
    for noun in ex_noun:
        if len(str(noun)) > 1 and not(match("^[0-9]", noun)):
#-*- coding: utf-8 -*-
__author__ = 'KC'

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
'''
konlpy test
'''
from konlpy.tag import Kkma

# create the object
kkma = Kkma()

# paragraph -> sentence extraction
para = "형태소 분석을 시작합니다. 나는 홍길동 이고 age는 28세 입니다."
ex_sen = kkma.sentences(para)
print(ex_sen)   # list

# paragraph -> noun extraction
ex_noun = kkma.nouns(para)
print(ex_noun)
# ['형태소', '분석', '나', '홍길동', '28', '28세', '세']

# paragraph -> morpheme (POS) extraction
ex_pos = kkma.pos(para)
print(ex_pos)
# [('형태소', 'NNG'), ('분석', 'NNG'), ('을', 'JKO'), ('시작하', 'VV'), ('ㅂ니다', 'EFN'), ('.', 'SF'), ('나', 'NP'), ('는', 'JX'), ('홍길동', 'NNG'), ('이', 'VCP'), ('고', 'ECE'), ('age', 'OL'), ('는', 'JX'), ('28', 'NR'), ('세', 'NNM'), ('이', 'VCP'), ('ㅂ니다', 'EFN'), ('.', 'SF')]
], [ """서민금융진흥원은 지난 18일 서울 청계천로 본원에서 제2차 서민금융 전문가 간담회를 개최했다소 19일 밝혔다. 이번 간담회는 서민금융, 복지, 자활사업 등 각 분야 전문가들이 참석한 가운데, 정책서민금융 지원의 방향성에 대해서 의견을 청취하기 위해 마련됐다. 이날 이 원장은 "소득양극화와 고용부진 심화 등으로 서민·취약계층, 자영업자들의 경제적 어려움이 커지는 가운데 사회안전망으로서 서민금융의 역할이 중요한 시점"이라며, "현재 8등급 이하자가 263만명이고 이들중 74%가 연체중인 상황에서 정상적인 금융 이용이 어려운 취약계층에게 꼭 필요한 서민금융 지원을 위해 노력해야 한다"고 강조했다. 이어서 이 원장은 "현장 전문가의 의견을 반영하여 취약계층을 위한 금융과 함께 금융교육, 컨설팅, 종합상담 등 자활기반을 구축하도록 힘쓰겠다"고 밝혔다. 이날 참석자들은 '정책서민금융지원에 대한 방향성'에 대하여 다양한 의견을 제시했다. 진흥원은 이날 간담회의 다양한 제언들을 바탕으로 수요자가 체감할 수 있는 실질적인 방안 마련을 위해 더욱 노력하고, 지속적으로 서민금융 현장의 폭넓은 의견을 청취할 계획이다. """ ] ] # In[3]: kkma = Kkma() sentences = [] list_vec = [] for da in data: print(da) sentences.append(kkma.sentences(da[0])) for s in sentences: for w in s: list_vec.append(kkma.nouns(w)) word_list = [] for l in list_vec: empty_vec = [] for w in l: if len(w) >= 2: empty_vec.append(w)
def main(): # Arguments # parser = argparse.ArgumentParser( description='Pengtai Instagram RNN LSTM Model') parser.add_argument( '-t', '--type', type=str, help="run type Options: 'n' for new | 'o' for overwrite", default='o', nargs='+') # parser.add_argument('-d', '--dest_dir', type=str, help='CSV data file') parser.add_argument('-i', '--input_dir', type=str, help='Input Raw CSV directory') parser.add_argument('-u', '--user_id', type=str, help='Instagram User ID') parser.add_argument('-v', '--version', help='current version', action='store_true') args = parser.parse_args() # End Argparse # # VERSION CONTROL # if args.version: with open(settings.VERSION_JSON, "r") as jsonFile: data = json.load(jsonFile) return print(data['version']) if args.type: if args.type[0] == 'n' and args.type[1]: with open(settings.VERSION_JSON, "r") as jsonFile: data = json.load(jsonFile) data["version"] = args.type[1] with open(settings.VERSION_JSON, "w") as jsonFile: json.dump(data, jsonFile) VERSION = args.type[1] elif args.type[0] == 'o': with open(settings.VERSION_JSON, "r") as jsonFile: data = json.load(jsonFile) VERSION = data["version"] # End VERSION CONTROL # with open('./dic/polarity.csv', 'r', encoding='UTF-8') as file: csvreader = csv.DictReader(file) kosac = [row for row in csvreader] total_arr = [] rowI = 0 rowDict = {} # File List in the directory from the arguments for filename in glob.glob(os.path.join(args.input_dir, '*.csv')): # i = ['id', 'img', 'text', 'has_tag', 'write_date', 'reg_date'] with open(filename, 'r', encoding='UTF-8') as f: csvreader = csv.DictReader(f) # csvreader = csv.reader(f) for row in csvreader: if rowI == 0: rowDict = {"user_id": row['user_id'], "posts": []} else: # print(user_id, row['user_id'], rowDict) if rowDict['user_id'] != row['user_id']: total_arr.append(rowDict) rowDict = {"user_id": row['user_id'], "posts": []} # text preprocess text = re.sub(r'@\w+', '', row['text']) text = re.sub( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text) text = re.sub(r'[\[]|[\]]', '', text) text = re.sub(r'[\r]|[\n]', ' ', text) text = re.sub(r'[.]|[ㆍ]', '', text) text = re.sub(r'#', ' ', text) rowDict['posts'].append({ "datetime": row['write_date'], "text": text }) rowI = rowI + 1 # print(total_arr) trg_res = [item for item in total_arr if item["user_id"] == args.user_id] temp = [] kkma = Kkma() t = Twitter() for post in trg_res[0]['posts']: date = datetime.datetime(int(post['datetime'][0:4]), int(post['datetime'][5:7]), int(post['datetime'][8:10]), int(post['datetime'][11:13]), int(post['datetime'][14:16]), int(post['datetime'][17:19])) text = post['text'] temp.append((date, text)) temp = sorted(temp, key=lambda t: t[0], reverse=False) sentArr = [] newArr = [] tokens_ko = [] index = 0 nounsArr = [] for data in temp: sentPosArr = kkma.pos(data[1]) # sentNouns = kkma.nouns(data[1]) inArr = [] for outA in sentPosArr: # for inA in outA: inArr.append("/".join(outA)) morph_arr = t.morphs(data[1]) morphWords = [word for word in morph_arr if not word in tokens_ko] for word in morphWords: if not word in nounsArr: nounsArr.append(word) tokens_ko.extend(morphWords) newArr.append({"sentence": "", "words": morph_arr, "score": 0}) index = index + 1 sentArr.append(";".join(inArr)) index = 0 for eaSent in sentArr: sentiScore = 0 for corp in kosac: if eaSent.find(corp['ngram']) > -1: if corp['max.value'] == 'NEG': sentiScore = sentiScore - float(corp['max.prop']) elif corp['max.value'] == 'POS': sentiScore = sentiScore + float(corp['max.prop']) 
newArr[index]["sentence"] = eaSent newArr[index]["score"] = sentiScore index = index + 1 # ACO 알고리즘 # doc_ko = " ".join([row[1] for row in temp]) # text_arr = [row[1] for row in temp] # for text in text_arr: # morph_arr = t.morphs(text) # temp = [word for word in morph_arr if not word in tokens_ko] # tokens_ko.extend(temp) print(tokens_ko) ko = nltk.Text(tokens_ko) # For Python 2, input `name` as u'유니코드' # # print(len(set(ko.tokens))) # returns number of unique tokens vocab = dict([(item[0], index + 1) for index, item in enumerate(ko.vocab().items())]) # pprint(vocab) # returns number of tokens (document length) minTimeVal = int(temp[0][0].timestamp()) maxTimeVal = int(temp[len(temp) - 1][0].timestamp() - minTimeVal) tenPow = len(str(int(temp[len(temp) - 1][0].timestamp() - minTimeVal))) tenPow = pow(10, tenPow) index = 0 nodes = [] for data in temp: # print(data[0].utctimetuple) # print(data[0].time()) diffTimeVal = int(data[0].timestamp() - minTimeVal) opt1 = float(diffTimeVal / tenPow) opt2 = float(diffTimeVal / maxTimeVal) print(diffTimeVal, opt1, opt2) nodes.append((opt2, newArr[index]["words"])) index = index + 1 # print(nounsArr) nodes2 = [] for noun in nounsArr: for corp in kosac: hts = "%s/NNG" % (noun) if hts.find(corp['ngram']) > -1: if corp['max.value'] == 'NEG': nodes2.append({ "noun": noun, "score": -float(corp['max.prop']) }) elif corp['max.value'] == 'POS': nodes2.append({ "noun": noun, "score": float(corp['max.prop']) }) print() antCount = len(newArr) rhoVal = 0.3 # ACO 알고리즘 예시 # nodes = [] # for _ in range(20): # x = random.uniform(-10, 10) # y = random.uniform(-10, 10) # nodes.append((x, y)) # def euclidean(a, b): return math.sqrt(pow(a[1] - b[1], 2) + pow(a[0] - b[0], 2)) # world = pants.World(nodes, euclidean) # solver = pants.Solver(rho=rhoVal, )
from konlpy.tag import Kkma
from re import match

file = open("text.txt", mode='r', encoding='utf-8')
doc = file.read()
file.close()
print(doc)

kkma = Kkma()
ex_sent = kkma.sentences(doc)

nouns = []
for sent in ex_sent:
    for noun in kkma.nouns(sent):
        # word preprocessing: keep words of two or more syllables, drop numerals
        if len(str(noun)) >= 2 and not (match('^[0-9]', noun)):
            nouns.append(noun)

#-----------------------------------------------------------------------------------------------------

para = "형태소 분석을 시작합니다. 저는 김유진 입니다."

ex_sent = kkma.sentences(para)
print(len(ex_sent))
print(ex_sent)

ex_nouns = kkma.nouns(para)
print(len(ex_nouns))
print(ex_nouns)

ex_pos = kkma.pos(para)
print(len(ex_pos))
import cx_Oracle
import os
import re
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
import getpass
from konlpy.tag import Kkma

os.environ['NLS_LANG'] = '.UTF8'

f = open(r"D:\user\Desktop\testtest.txt", 'rb')
lines = f.read()
text = lines.decode(encoding='utf-8')

kkma = Kkma()
keyword = ' '.join(kkma.nouns(text))

def makeDictFactory(cursor):
    columnNames = [d[0] for d in cursor.description]
    def createRow(*args):
        return dict(zip(columnNames, args))
    return createRow

def OutputTypeHandler(cursor, name, defaultType, size, precision, scale):
    if defaultType == cx_Oracle.CLOB:
        return cursor.var(cx_Oracle.LONG_STRING, arraysize=cursor.arraysize)
def get_tags(text, ntags=40, multiplier=1):
    h = Kkma()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'tag': n, 'count': c}
            for n, c in count.most_common(ntags)]
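A hedged usage sketch (not from the original; it assumes from konlpy.tag import Kkma and from collections import Counter):

# Hypothetical usage sketch
for tag in get_tags(u'자연어 처리는 재미있습니다. 자연어 처리를 공부합니다.', ntags=5):
    print(tag['tag'], tag['count'])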
# -*- coding: utf8 -*-
import xlrd
from konlpy.utils import pprint
from konlpy.tag import Twitter
from konlpy.tag import Kkma
import json
import codecs

twitter = Twitter()
kkma = Kkma()
result_file = codecs.open('streaming_result.txt', 'w', 'utf-8')

dic_list = []
dic_count = 0
valence_list = []

with xlrd.open_workbook('dic.xlsx') as workbook:
    sheet = workbook.sheet_by_index(1)
    sheet.cell_value(0, 0)
    sheet.nrows
    sheet.ncols
    for row in range(sheet.nrows):
        verb = sheet.cell_value(row, 0)
        if verb == u'' or verb == u'원형' or verb == u'ㄱ' or verb == u'ㄴ' or verb == u'ㄷ' or verb == u'ㅁ' or verb == u'ㅂ' or verb == u'ㅅ' or verb == u'ㅇ' or verb == u'ㅈ' or verb == u'ㅊ' or verb == u'ㅋ' or verb == u'ㅌ' or verb == u'ㅍ' or verb == u'ㅎ' or verb == u'이모티콘' or verb == u'숫자':
            continue
        #dic_list.append([])
        valence_list.append([])
        dic_count = dic_count + 1
class Classifier(metaclass=Singleton): """ Convolutional Neural Networks 모델을 이용하여 Label의 Concept_id를 추측하는 클래스 Attributes ---------- konlpy 형태소 추출기 Hannanum, Kkma, Komoran, Mecab, Okt 설정 가능 자세한 사항은 http://konlpy.org/ 참고 word_dict 단어 Lookup Table concept_dict concept_id Lookup Table model CNN 모델 is_load CNN 모델 및 Lookup Table 로딩 여부 """ def __init__(self): self.konlpy = Kkma() self._dataset = None self.word_dict = None self.concept_dict = None self._x_train = None self._y_train = None self.model = None self.is_load = False def extract_nouns(self, text): """ KoNLPy을 이용하여 명사만 추출 Parameters ---------- text: str 추출할 문장 Returns ------- list of str 추출된 명사 리스트 """ return self.konlpy.nouns(text) def gen_dataset(self, reports): """ Report들에서 XBRL 파일을 추출후 Concept_id와 Label 값을 이용하여 CNN 모델 학습 Parameters ---------- reports: list of Report 추출할 Report 리스트 """ self._extract_dataset(reports) self._gen_word_dict() self._gen_concept_dict() self._gen_x_train() self._gen_y_train() self.is_load = True def _extract_dataset(self, reports: List[Report]): """ Report에 포함된 XBRL 파일에서 Concept_id 와 Label 값 추출 Parameters ---------- reports: list of Report 추출할 Report 리스트 """ if is_notebook(): from tqdm import tqdm_notebook as tqdm else: from tqdm import tqdm dataset = [] for report in tqdm(reports, desc='Extracting concept_id and label_ko', unit='report'): df_fs = analyze_xbrl(report) if df_fs is None: continue for tp in df_fs: df = df_fs[tp] if df is not None: concept_column = find_all_columns(df, 'concept_id')[0] label_ko_column = find_all_columns(df, 'label_ko')[0] for idx in range(len(df)): concept_id = df[concept_column].iloc[idx] label_ko = df[label_ko_column].iloc[idx] if concept_id and label_ko: try: label = self.extract_nouns(label_ko) dataset.append((concept_id, label)) except BaseException: continue self._dataset = dataset def _gen_word_dict(self): """ 단어 Lookup Table 생성 """ word_index = dict() for _, nouns in self._dataset: for noun in nouns: if word_index.get(noun) is None: word_index[noun] = 0 word_index[noun] += 1 word_dict = dict() for idx, (noun, _) in enumerate( sorted(word_index.items(), key=lambda x: x[1], reverse=True)): word_dict[noun] = idx + 1 self.word_dict = word_dict def _gen_concept_dict(self): """ concept_id Lookup Table 생성 """ concepts = set() for concept, _ in self._dataset: concepts.add(concept) concept_dict = dict() for idx, concept in enumerate(concepts): concept_dict[concept] = idx + 1 self.concept_dict = concept_dict def _gen_x_train(self): """ 입력값 변환 """ dataset = [] for concept_id, label_ko in self._dataset: dataset.append([self.word_dict[x] for x in label_ko]) x_train = self.vectorize_sequences(dataset) self._x_train = x_train def _gen_y_train(self): """ 결과값 변환 """ dataset = [self.concept_dict[concept] for concept, _ in self._dataset] y_train = tf.keras.utils.to_categorical(dataset) self._y_train = y_train @property def input_length(self): return len(self.word_dict) + 1 @property def output_length(self): return len(self.concept_dict) + 1 def gen_model(self, units: int = 256, dropout: float = 0.2, epochs: int = 50, batch_size: int = 512): """ Keras를 이용한 CNN 모델 생성 및 학습 Parameters ---------- units: int unit 수 dropout: float dropout rate epochs: int 학습 반복 횟수 batch_size: int batch_size 수 """ model = tf.keras.models.Sequential([ tf.keras.layers.Dense(units, activation='relu', input_shape=(self.input_length, )), tf.keras.layers.Dropout(rate=dropout), tf.keras.layers.Dense(units, activation='relu'), tf.keras.layers.Dense(self.output_length, activation='softmax') ]) 
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) length = int(len(self._x_train) / 5) x_val = self._x_train[:length * 3] partial_x_train = self._x_train[length * 3:length * 4] x_test = self._x_train[length * 4:] y_val = self._y_train[:length * 3] partial_y_train = self._y_train[length * 3:length * 4] y_test = self._y_train[length * 4:] print("\n==========Model Fit==========\n") model.fit(x_val, y_val, epochs=epochs, batch_size=batch_size, validation_data=(partial_x_train, partial_y_train)) print("\n==========Model Evaluation==========\n") model.evaluate(x_test, y_test) self.model = model def vectorize_sequences(self, sequences: List[List[str]]) -> List[List[int]]: """ Label에 포함된 단어를 0과 1의 리스트로 변환""" results = np.zeros((len(sequences), self.input_length)) for i, sequence in enumerate(sequences): results[i, sequence] = 1. return results def save(self, path=None): """ Convolutional Neural Networks 모델 및 Dictionary 저장 Parameters ---------- path: str 데이터 저장 위치 """ if path is None: path = pkg_resources.resource_filename('dart_fss_classifier', 'data/') create_folder(path) file = os.path.join(path, 'dict.json') config = { 'word_dict': self.word_dict, 'concept_dict': self.concept_dict, } model_file = os.path.join(path, 'model.h5') self.model.save(model_file) with open(file, 'w') as outfile: json.dump(config, outfile) def load(self, path: str = None) -> str: """ Convolutional Neural Networks 모델 및 Dictionary 로딩 Parameters ---------- path: str 데이터 위치 """ if path is None: path = pkg_resources.resource_filename('dart_fss_classifier', 'data/') file = os.path.join(path, 'dict.json') if not os.path.isfile(file): raise FileExistsError( "The dictionary does not exist. Please run 'generate_default_dataset_and_cnn_model'." ) model_file = os.path.join(path, 'model.h5') if not os.path.isfile(model_file): raise FileExistsError( "The Keras model does not exist. Please run 'generate_default_dataset_and_cnn_model'." ) self.model = tf.keras.models.load_model(model_file) with open(file) as json_file: data = json.load(json_file) self.word_dict = data['word_dict'] self.concept_dict = data['concept_dict'] self.is_load = True def guess(self, text: str) -> str: """ Concept_id 추측 Method Parameters ---------- text: str Label 명 Returns ------- str 추측한 Concept_id """ if not self.is_load: self.load() data = [] for noun in self.extract_nouns(text): try: word = self.word_dict[noun] data.append(word) except BaseException: pass d = self.vectorize_sequences([data]) prediction = np.argmax(self.model.predict(d)) for key, value in self.concept_dict.items(): if value == prediction: return key return None
# This is a script to test KoNLPy.
# Project started at 01/18/2016. Author by Jaehyun Ahn([email protected])
__author__ = 'Sogo'

from konlpy.tag import Kkma
from collections import Counter

print('Number of lines in document:')
k = Kkma()
f = open('test.txt', 'r')
lines = f.read().splitlines()
nlines = len(lines)
print(nlines)

nouns = [k.nouns(lines[i]) for i in range(0, nlines)]
cnt = Counter()
for i in range(len(nouns)):
    for j in range(len(nouns[i])):
        cnt[nouns[i][j]] += 1

print(cnt.most_common(15))
# let's get words! It's a steal!
print(cnt.most_common(15)[0][0])
print(cnt.most_common(15)[1])
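A hedged alternative (not from the original) showing the same tally written as a single Counter over a flattened noun list:

# Equivalent, more idiomatic counting (illustration only)
flat_nouns = [noun for line in lines for noun in k.nouns(line)]
print(Counter(flat_nouns).most_common(15))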