from konlpy.tag import Komoran

komoran = Komoran()

text = open('190747347803005_191149761096097', 'r').read().decode('utf8')
print text
print '---------------'

# Rebuild sentences from the POS stream: skip punctuation tags
# (SF/SE/SP/SS) and flush on a sentence-final ending (EF).
sentence = []
for i in komoran.pos(text):
    #if i[1] == 'Unknown' or i[1] == 'Punctuation':
    #    continue
    if i[1] in ('SF', 'SE', 'SP', 'SS'):
        continue
    sentence.append(i[0])
    if i[1] == 'EF':
        print ''.join(sentence)
        sentence = []
import sys
from optparse import OptionParser

from konlpy.tag import Komoran

VERBOSE = 0

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("--verbose", action="store_const", const=1,
                      dest="verbose", help="verbose mode")
    (options, args) = parser.parse_args()
    if options.verbose:
        VERBOSE = 1

    komoran = Komoran()
    while 1:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        if not line:
            break
        # Emit one CoNLL-style row per morpheme.
        analyzed = komoran.pos(line)
        seq = 1
        for morph, tag in analyzed:
            tp = [seq, morph, morph, tag, tag, '_', 0, '_', '_', '_']
            print '\t'.join([str(e) for e in tp])
            seq += 1
        print '\n',
from konlpy.tag import Komoran

result = []
komoran = Komoran()

with open("/home/nlpgpu4/data/cm/Attetion_SRL/[naver]train_data.txt") as f:
    for line in f.readlines():
        if line.strip() == "":
            result.append("\n")
            continue
        idx, raw_word, label = line.split()
        # Re-tag the raw word and join the morphemes as morph/TAG|morph/TAG...
        tmp_word = komoran.pos(raw_word.decode("utf-8"))
        word = ""
        for w in tmp_word:
            word += w[0] + "/" + w[1] + "|"
        new_line = (idx + "\t" + word[:-1] + "\t" +
                    raw_word.decode("utf-8") + "\t" + label.strip() + "\n")
        result.append(new_line)
        # break

with open("/home/nlpgpu4/data/cm/Attetion_SRL/[naver]komoran_train_data.txt", "w") as f:
    for r in result:
        f.write(r.encode("utf-8"))

# 34857
from konlpy.tag import Komoran

komoran = Komoran()
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))
print(komoran.nouns(u'오픈소스에 관심 많은 멋진 개발자님들!'))
print(komoran.pos(u'한글형태소분석기 코모란 테스트 중 입니다.'))
# `timeline` (rows of the form (..., tweet_id, text)) and `komoran` are
# assumed to be defined earlier in the script.
result = []
i = 0
twit_id = []
user_timeline = []
sent1 = []
sent2 = []
sent3 = []
sent4 = []
sent5 = []

for content in timeline:
    sentence = content[2]
    user_timeline.append(sentence)
    twit_id.append(str(content[1]))
    result.append(komoran.pos(sentence))
    #print("문장 :", sentence)
    #print(result[i])
    i += 1

'''
sentence = "★마비노기 14주년 기념 축제★ 매주 그리운 NPC와 함께 상상여행을 떠나고, 돌아온 악동 4인방을 도와 축제를 꾸며주세요. 다양한 14주년 이벤트에 참여하면 역대급으로 쏟아지는 푸짐한 선물까지! #마비노기_14주년"
result = komoran.pos(sentence)
print("문장 :", sentence)
print(result)
'''

i = 0
for twit in result:
    #print(twit)
    pass
from konlpy.tag import Komoran, Okt
# Twitter with add_dictionary() matches the customized-KoNLPy (ckonlpy)
# tagger; plain konlpy's Twitter has no add_dictionary(), so this import
# assumes ckonlpy is installed.
from ckonlpy.tag import Twitter

okt = Okt()
twitter = Twitter()
sentence = 'IBK기업은행 '
sentences = '소은지국민은행계좌로30만원이체해줘'
komoran = Komoran()

twitter.add_dictionary('이체해줘', 'Noun')
twitter.add_dictionary('KB 국민은행', 'Noun')
# This user-dictionary instance replaces the default Komoran created above.
komoran = Komoran(userdic="C:/Users/ADMIN/Desktop/dic.txt")

print(twitter.pos(sentence, stem=True))
print(twitter.pos(sentences, stem=True))
print(komoran.pos(sentence))
print(komoran.pos(sentences))

arr = komoran.pos(sentence)
for word, tag in arr:
    if tag == 'VV':
        print("|||||||")
        print(word, tag)
    if tag in ('JKO', 'JKB', 'JKS'):
        print("|||||||")

brr = komoran.pos(sentences)
for word, tag in brr:
    if tag in ('VV', 'XSV'):
        print("|||||||")
        print(word, tag)
from konlpy.tag import Komoran

# Create a Komoran analyzer object
# komoran = Komoran()
# text = "성남에 있는 성심병원의 주소를 알려주세요."
#
# # Extract morphemes
# morphs = komoran.morphs(text)
# print(morphs)
#
# # Extract morphemes with POS tags
# pos = komoran.pos(text)
# print(pos)

# Extract nouns only
# nouns = komoran.nouns(text)
# print(nouns)

# Register new words the analyzer does not recognize.
# User dictionary format: [word] TAB [POS tag], e.g.
# 엔엘피	NNG
komoran = Komoran(userdic='./user_dic.txt')     # user dict 1: hospital name NOT defined
komoran2 = Komoran(userdic='./user_dic_2.txt')  # user dict 2: hospital name defined

text = "용인에 있는 한성의심원병원 전화번호 알려줘!"
pos = komoran.pos(text)
pos2 = komoran2.pos(text)
print(pos)
print(pos2)

# print(get_keywords(pos, without_tag=False))
# print(get_keywords(pos, without_tag=True))
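# For reference, a minimal sketch of what the second dictionary file could
# contain so the hospital name survives analysis as one token; the entry and
# its NNP tag are illustrative assumptions, not the author's actual file:
#
# --- user_dic_2.txt (tab-separated) ---
# 한성의심원병원	NNP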
from konlpy.tag import Kkma, Komoran, Twitter

with open('./doc.txt', 'r', encoding='utf-8') as f:
    contents = f.readlines()

desc = ''
for sent in contents:
    sent = sent.replace('\n', ' ')
    desc += sent

phars = desc.split('.')
#for phar in phars:
    #tokenized(desc)

kkma = Kkma()
article = kkma.sentences(desc)
print(article)

twitter = Twitter()
for sentence in article:
    out_str = twitter.nouns(sentence)
    print(out_str)

komoran = Komoran()  # the original snippet assumes `komoran` exists already
words = komoran.pos(desc, join=True)
#print(words)
#print(phar)

#str_cmp('집합에 관련된 것을 쉽게 처리하기 위해 만든 자료형이다.', '집합(set)은 파이썬 2.3부터 지원하기 시작한 자료형으로')
#main()
from konlpy.tag import Komoran

sentence = '국정 농단 태블릿 PC, 설진욱, 가나다라'

print('# before user dic')
komo = Komoran()
print(komo.pos(sentence))

'''
Create a temporary user-dictionary file to test the code above.
'''

'''
Output:

# before user dic
[('국정', 'NNG'), ('농', 'NNG'), ('단', 'NNG'), ('태블릿 PC', 'NNP'), (',', 'SP'),
 ('설', 'NNB'), ('진', 'NNP'), ('욱', 'NA'), (',', 'SP'), ('가나', 'NNP'), ('다라', 'NNP')]
'''
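# The snippet stops at the "before" case. A minimal sketch of the "after"
# side, assuming a hypothetical tab-separated user_dic.txt with entries
# such as the ones below (word TAB tag):
#
#   설진욱	NNP
#   농단	NNG
komo_dic = Komoran(userdic='./user_dic.txt')
print('# after user dic')
print(komo_dic.pos(sentence))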
import pickle

import numpy as np
from konlpy.tag import Komoran

# FEATURE_FILE_NAME, MODEL_FILE_NAME, SELECT_MORPH, NGRAM and _word_ngrams()
# are assumed to be defined elsewhere in this module.
with open(FEATURE_FILE_NAME, 'rb') as featureFile:
    features = pickle.load(featureFile)
with open(MODEL_FILE_NAME, 'rb') as modelFile:
    classifier = pickle.load(modelFile)

# morphological analyzer
komoran = Komoran()

while True:
    sentence = input("문장을 입력하세요(종료=exit): ")
    if sentence == 'exit':
        break
    analyzed = komoran.pos(sentence)
    selected = []
    for word, morph in analyzed:
        if morph in SELECT_MORPH:
            selected.append(word + '/' + morph)

    # document-term array
    sentFeatures = np.zeros(shape=(1, len(features)))
    selected = _word_ngrams(NGRAM, selected)
    for token in selected:
        featIdx = features.index(token) if token in features else -1
        if featIdx != -1:
            sentFeatures[0, featIdx] += 1  # assumed completion; the source is truncated here
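# The snippet is cut off before the pickled classifier is used. A hedged
# guess at the continuation, assuming a scikit-learn-style model with a
# predict() method (not confirmed by the source):
#
#     label = classifier.predict(sentFeatures)[0]
#     print(label)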
from konlpy.tag import Hannanum, Komoran

# `input_text` is assumed to be defined earlier in the script.

######## use Hannanum ########
hannanum = Hannanum()
normalizations = hannanum.morphs(input_text)
tokenizations = hannanum.pos(input_text)

print('')
print('######## Hannanum ########')
print('')
print('normalization : ')
print(normalizations)
print('')
print('tokenization : ')
print(tokenizations)
print('')

######## use Komoran ########
komoran = Komoran()
normalizations = komoran.morphs(input_text)
tokenizations = komoran.pos(input_text)

print('')
print('######## Komoran ########')
print('')
print('normalization : ')
print(normalizations)
print('')
print('tokenization : ')
print(tokenizations)
print('')
# -*- coding: utf-8 -*-
# Python 2 module; `db2` is assumed to be a module-level _mysql connection
# opened elsewhere, and the selenium `driver` is supplied by the caller.
import re

import selenium.common.exceptions
from konlpy.tag import Komoran


class sfGen():
    def __init__(self, driver, db, taggedSen):
        #self.searchInput = searchInput
        #self.searchInputAnnex = searchInputAnnex
        self.driver = driver
        self.db = db
        self.taggedSen = taggedSen
        self.Komoran = Komoran()
        #self.dictSearchCommon

    def createSearchInputYongeon(self, yongeonPart):
        # collect the predicate part of the argument structure
        index0 = yongeonPart[0]
        index1 = yongeonPart[1]
        searchInput = ""
        for x in self.taggedSen[index0:index1]:
            if "V" in x[1]:
                searchInput = searchInput + x[0]
            elif "J" in x[1]:
                break
            else:
                pass
        searchInput = searchInput + "다"
        searchInputAnnex = {}
        searchInputAnnex["pos"] = "yongeon"
        print searchInput
        print searchInputAnnex
        self.searchInput = searchInput
        self.searchInputAnnex = searchInputAnnex

    def createSearchInputCheeon(self, cheeonPart):
        index0 = cheeonPart[0]
        index1 = cheeonPart[1]
        searchInput = ""
        print index0
        print index1
        for x in self.taggedSen[index0:index1]:
            if re.search(r"J[A-Z]+", x[1]) != None:
                pass
            else:
                searchInput = str(x[0])
        searchInputAnnex = {}
        searchInputAnnex["pos"] = "cheeon"
        print searchInput
        print searchInputAnnex
        self.searchInput = searchInput
        self.searchInputAnnex = searchInputAnnex

    def getSemanticFeature(self):
        dbSearchResult = self.retrieveFromDb(self.searchInput,
                                             self.searchInputAnnex)
        fetched = dbSearchResult.fetch_row()
        # case where there exist search results
        if fetched:
            while fetched:
                print fetched[0][6]
                fetched = dbSearchResult.fetch_row()
        # case where no search result has been found
        else:
            self.dictSearchCommon(self.searchInput, self.searchInputAnnex)
            self.dictSearchCheeon(self.searchInput, self.searchInputAnnex)
            self.insertIntoDb(self.searchInputAnnex)

    # searchInput : str
    # searchInputAnnex : dict
    # returns : _mysql.result object
    def retrieveFromDb(self, searchInput, searchInputAnnex):
        sqlQuery = ""
        sqlQuery = sqlQuery + "SELECT * FROM sf_" + searchInputAnnex["pos"]
        sqlQuery = sqlQuery + " WHERE wordname REGEXP \'" + searchInput + "\'"
        print sqlQuery
        db2.query(sqlQuery)
        result = db2.store_result()
        #self.db.query(sqlQuery)
        #result = self.db.store_result()
        return result

    def dictSearchCommon(self, searchInput, searchInputAnnex):
        self.searchInput = searchInput
        self.searchInputAnnex = searchInputAnnex
        # go to the front page of the Daum Korean dictionary
        self.driver.get("http://dic.daum.net/index.do?dic=kor")
        searchBox = self.driver.find_element_by_class_name("tf_keyword")
        searchInput = unicode(self.searchInput)
        searchBox.send_keys(searchInput)
        self.driver.find_element_by_class_name("btn_search").click()
        if re.search(r"search\.do\?", self.driver.current_url) != None:
            try:
                self.driver.find_element_by_class_name("txt_cleansch").click()
            except selenium.common.exceptions.NoSuchElementException:
                pass
        elems = self.driver.find_elements_by_class_name("fold_open")
        for x in elems:
            # folding unfolded boxes
            try:
                x.find_element_by_class_name("btn_fold").click()
            except selenium.common.exceptions.NoSuchElementException:
                pass

    def dictSearchCheeon(self, searchInput, searchInputAnnex):
        ret = ""
        searchInput = self.searchInput
        elems = self.driver.find_elements_by_class_name("box_word")
        for x in elems:
            retCommon = ""
            # unfolding folded boxes
            try:
                x.find_element_by_class_name("btn_fold").click()
            except selenium.common.exceptions.NoSuchElementException:
                #break
                pass
                #print "pass!"
            # the POS label must be read before it is used below
            try:
                pos = x.find_element_by_class_name("tit_ex")
                #if pos.text not in ["명사", "인칭 대명사"]:
                #    #print "not a substantive!"
                #    break
            except:
                break
            # collecting kkw and kku (break when absent)
            try:
                curl = self.driver.current_url
                kkw = curl.split("wordid=")[1].split("&")[0]
                kku = x.get_attribute("data-supid")
                retCommon = retCommon + kkw + "|" + kku + "|" + pos.text + "|"
            except:
                break
            try:
                retCommon = retCommon + x.find_element_by_class_name(
                    "txt_subword").text + "|"
            except selenium.common.exceptions.NoSuchElementException:
                retCommon = retCommon + searchInput + "|"
            try:
                retCommon = retCommon + x.find_element_by_class_name(
                    "txt_hanja").text + "|"
            except selenium.common.exceptions.NoSuchElementException:
                retCommon = retCommon + searchInput + "|"
            #print retCommon
            descs = x.find_elements_by_class_name("desc_item")
            #retSpecific = ""
            sfCfmd = ""
            descRaw = ""
            if retCommon.split("|")[2] not in ["명사", "의존 명사", "인칭 대명사"]:
                print "not a substantive!"
            else:
                for y in descs:
                    descRaw = descRaw + y.text + "/"
                    for z in y.text.split(".")[:-1]:
                        desc = z.strip()
                        desc = str(desc)
                        sfCdd = ""
                        #elif re.search(r"말한다", desc) != None:
                        #    sfCdd = desc.split(" ")[-2]
                        # handle "...을/를 통틀어 이르는 말" and
                        # "...을/를 높여 이르는 말" patterns before the default
                        if re.search(r"통틀어 이르는 말", desc) != None:
                            if re.search(r".+을", desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("을")
                            elif re.search(r".+를", desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("를")
                        elif re.search(r"높여 이르는 말", desc) != None:
                            if re.search(r".+을", desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("을")
                            elif re.search(r".+를", desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("를")
                        else:
                            sfCdd = desc.split(" ")[-1]
                        #print sfCdd
                        if self.Komoran.pos(sfCdd)[-1][1] == "ETN":
                            sfCfmd = sfCfmd + "행동;"
                        else:
                            sfCfmd = sfCfmd + sfCdd + ";"
                    sfCfmd = sfCfmd + "/"
                #print sfCfmd
                retSpecific = sfCfmd + "|" + descRaw
                ret = ret + retCommon + retSpecific + "\n"
                retCommon = ""
        self.dbInput = ret
        print ret
        return ret

    def dictSearchYongeon(self, searchInput, searchInputAnnex):
        ret = ""
        searchInput = self.searchInput
        elems = self.driver.find_elements_by_class_name("box_word")
        for x in elems:
            # initialising variable
            retCommon = ""
            # unfolding folded boxes
            try:
                x.find_element_by_class_name("btn_fold").click()
            except selenium.common.exceptions.NoSuchElementException:
                pass
            # collecting kkw and kku (if any, break otherwise)
            try:
                curl = self.driver.current_url
                kkw = curl.split("wordid=")[1].split("&")[0]
                kku = x.get_attribute("data-supid")
                retCommon = retCommon + kkw + "|" + kku + "|"
            except:
                break
            try:
                retCommon = retCommon + x.find_element_by_class_name(
                    "txt_subword").text + "|"
            except selenium.common.exceptions.NoSuchElementException:
                retCommon = retCommon + searchInput + "|"
            # collecting hanja (if any)
            try:
                retCommon = retCommon + x.find_elements_by_class_name(
                    "txt_pronounce")[1].text + "|"
            except (selenium.common.exceptions.NoSuchElementException,
                    IndexError):
                try:
                    retCommon = retCommon + self.driver.find_elements_by_class_name(
                        "txt_pronounce")[0].text + "|"
                except selenium.common.exceptions.NoSuchElementException:
                    retCommon = retCommon + "NoHanja|"
            #print retCommon
            # for each description
            retSpecific = ""
            descs = x.find_elements_by_class_name("desc_item")
            for y in descs:
                try:
                    pos = x.find_element_by_class_name("tit_ex")
                    if pos.text not in ["자동사", "타동사", "형용사"]:
                        break
                    #retSpecific = retSpecific + pos.text + "|"
                except selenium.common.exceptions.NoSuchElementException:
                    #print "pass!"
                    break
                #print y.text
                desc = str(y.text)
                try:
                    argStruct = str(desc.split(")")[0].split("(")[1])
                    #print ret1
                    taggedDesc = self.Komoran.pos(argStruct)
                    #print taggedDesc
                    # pattern of case-marking particles; distinguish the
                    # first and second subject markers (JKS0 / JKS1)
                    argStructPat = []
                    for i in range(len(taggedDesc)):
                        if taggedDesc[i][1] == "JKS":
                            if "JKS0" in argStructPat:
                                argStructPat.append("JKS1")
                            else:
                                argStructPat.append("JKS0")
                        elif re.search(r"JK[A-Z]+", taggedDesc[i][1]) != None:
                            argStructPat.append(str(taggedDesc[i][1]))
                    #print argStructPat
                    switch = argStructPat[0]
                    switchIndex = 1
                    argStructRefined = ""
                    for i in range(len(taggedDesc)):
                        if re.search(r"N[A-Z]+", taggedDesc[i][1]) != None:
                            argStructRefined = (argStructRefined +
                                                taggedDesc[i][0] + "/" +
                                                switch[2:] + " ")
                        elif (re.search(r"JK[A-Z]+", taggedDesc[i][1]) != None
                              and switchIndex < len(argStructPat)):
                            switch = argStructPat[switchIndex]
                            switchIndex += 1
                    retSpecific = pos.text + "|" + argStructRefined + "|" + desc
                    #print retSpecific
                    #print "ongoing"
                except (UnicodeDecodeError, IndexError):
                    pass
            ret = ret + retCommon + retSpecific + "\n"
            #elif searchWordtype == "N":
            #    ret = ret + str(desc) + ";"
            retCommon = ""
        self.dbInput = ret
        print ret
        return ret

    def insertIntoDb(self, searchInputAnnex):
        #print self.dbInput
        for x in self.dbInput.split("\n")[:-1]:
            dbInput = x
            print dbInput
            kkw = dbInput.split("|")[0]
            kku = dbInput.split("|")[1]
            # field order differs between the two tables
            if searchInputAnnex["pos"] == "cheeon":
                pos = unicode(dbInput.split("|")[2])
                wordname = dbInput.split("|")[3]
                wordname_hanja = dbInput.split("|")[4]
            elif searchInputAnnex["pos"] == "yongeon":
                pos = dbInput.split("|")[4]
                wordname = dbInput.split("|")[2]
                wordname_hanja = dbInput.split("|")[3]
            sf = dbInput.split("|")[5]
            raw_desc = dbInput.split("|")[6]
            # insert only if no row with this kkw/kku pair exists yet
            sqlQueryBh = ""
            sqlQueryBh = sqlQueryBh + "SELECT * FROM sf_" + searchInputAnnex["pos"]
            sqlQueryBh = sqlQueryBh + " WHERE kkw=\'" + kkw + "\' and kku=\'" + kku + "\'"
            #print sqlQueryBh
            #self.db.query(sqlQueryBh)
            db2.query(sqlQueryBh)
            #result = self.db.store_result()
            result = db2.store_result()
            if len(result.fetch_row()) == 0:
                sqlQuery = "INSERT INTO sf_" + searchInputAnnex["pos"]
                sqlQuery = sqlQuery + " (kkw, kku, pos, wordname, wordname_hanja, sf, raw_desc)"
                sqlQuery = sqlQuery + " VALUES ("
                sqlQuery = sqlQuery + "\'" + kkw + "\'" + ","
                sqlQuery = sqlQuery + "\'" + kku + "\'" + ","
                sqlQuery = sqlQuery + "\'" + pos + "\'" + ","
                sqlQuery = sqlQuery + "\'" + wordname + "\'" + ","
                sqlQuery = sqlQuery + "\'" + wordname_hanja + "\'" + ","
                sqlQuery = sqlQuery + "\'" + sf + "\'" + ","
                sqlQuery = sqlQuery + "\'" + raw_desc + "\'" + ")"
                print sqlQuery
                #self.db.query(sqlQuery)
                db2.query(sqlQuery)
            else:
                print "already inserted"