Example #1
# Python 2: read a UTF-8 file, tag it with Komoran, and print one sentence per line.
from konlpy.tag import Komoran

komoran = Komoran()

text = open('190747347803005_191149761096097', 'r').read().decode('utf8')
print text
print '---------------'

sentence = []
for morph, tag in komoran.pos(text):
    # Skip punctuation tags (SF/SE/SP/SS in the Komoran/Sejong tag set).
    if tag in ('SF', 'SE', 'SP', 'SS'):
        continue
    sentence.append(morph)
    # A final ending (EF) closes the current sentence.
    if tag == 'EF':
        print ''.join(sentence)
        sentence = []
    
    
Example #2
File: tagger.py  Project: dsindex/syntaxnet
# Python 2: read sentences from stdin and print a CoNLL-style 10-column line per morpheme.
import sys
from optparse import OptionParser

from konlpy.tag import Komoran

VERBOSE = 0

if __name__ == '__main__':

    parser = OptionParser()
    parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
    (options, args) = parser.parse_args()

    if options.verbose: VERBOSE = 1

    komoran = Komoran()

    while 1:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        if not line:
            break

        analyzed = komoran.pos(line)
        seq = 1
        for morph, tag in analyzed:
            tp = [seq, morph, morph, tag, tag, '_', 0, '_', '_', '_']
            print '\t'.join([str(e) for e in tp])
            seq += 1
        print '\n',




Example #3
# Python 2: re-tag the [naver] SRL training data with Komoran; each token becomes "morph/TAG|morph/TAG|...".
from konlpy.tag import Komoran

result = []
komoran = Komoran()
with open("/home/nlpgpu4/data/cm/Attetion_SRL/[naver]train_data.txt") as f:
    for line in f.readlines():

        if line.strip() == "" :
            result.append("\n")
            continue
        idx, raw_word, label = line.split()
        tmp_word = komoran.pos(raw_word.decode("utf-8"))
        word = ""
        for w in tmp_word:
            word += w[0]+"/"+w[1] + "|"

        new_line = idx + "\t" + word[:-1] + "\t" + raw_word.decode("utf-8") +"\t"+ label.strip() + "\n"
        result.append(new_line)
        # break

with open("/home/nlpgpu4/data/cm/Attetion_SRL/[naver]komoran_train_data.txt", "w") as f:
    for r in result:
        f.write(r.encode("utf-8"))

Example #4
from konlpy.tag import Komoran

print("Dsds")

komoran = Komoran()

print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))
print(komoran.nouns(u'오픈소스에 관심 많은 멋진 개발자님들!'))
print(komoran.pos(u'한글형태소분석기 코모란 테스트 중 입니다.'))
Example #6
# Tag each tweet in `timeline` with Komoran.
# `timeline` and `komoran` are assumed to be defined earlier in the original script;
# `timeline` holds records whose index 1 is the tweet id and index 2 is the tweet text.
# The two lines below recreate the tagger so the snippet runs on its own.
from konlpy.tag import Komoran

komoran = Komoran()

result = []
i = 0

twit_id = []
user_timeline = []
sent1 = []
sent2 = []
sent3 = []
sent4 = []
sent5 = []

for content in timeline:
  sentence = content[2]
  user_timeline.append(sentence)
  twit_id.append(str(content[1]))
  result.append(komoran.pos(sentence))
  #print("문장 :", sentence)
  #print(result[i])
  i += 1

'''
sentence = "★마비노기 14주년 기념 축제★ 매주 그리운 NPC와 함께 상상여행을 떠나고, 돌아온 악동 4인방을 도와 축제를 꾸며주세요. 다양한 14주년 이벤트에 참여하면 역대급으로 쏟아지는 푸짐한 선물까지! #마비노기_14주년"
result = komoran.pos(sentence)
print("문장 :", sentence)
print(result)
'''


i = 0
for twit in result:
  print(twit)

Example #7

# Compare the customized-KoNLPy Twitter tagger (with user-added words) against Komoran.
# add_dictionary() comes from ckonlpy (customized KoNLPy); the stock konlpy tagger does not provide it.
from konlpy.tag import Okt, Komoran
from ckonlpy.tag import Twitter

okt = Okt()
twitter = Twitter()

sentence = 'IBK기업은행 '
sentences = '소은지국민은행계좌로30만원이체해줘'
komoran = Komoran()

twitter.add_dictionary('이체해줘', 'Noun')
twitter.add_dictionary('KB 국민은행', 'Noun')

komoran = Komoran(userdic="C:/Users/ADMIN/Desktop/dic.txt")

print(twitter.pos(sentence, stem=True))
print(twitter.pos(sentences, stem=True))

print(komoran.pos(sentence))
print(komoran.pos(sentences))

arr = komoran.pos(sentence)
for word, tag in arr:
    if (tag == 'VV'): print("|||||||")
    print(word, tag)
    if (tag == 'JKO' or tag == 'JKB' or tag == 'JKS'): print("|||||||")

brr = komoran.pos(sentences)
for word, tag in brr:

    if (tag == 'VV' or tag == 'XSV'):
        print("|||||||")

    print(word, tag)
Example #8

from konlpy.tag import Komoran

# Create a Komoran morphological analyzer
# komoran = Komoran()
# text = "성남에 있는 성심병원의 주소를 알려주세요."
#
# # Extract morphemes
# morphs = komoran.morphs(text)
# print(morphs)
#
# # Extract morphemes with POS tags
# pos = komoran.pos(text)
# print(pos)

# Extract nouns only
# nouns = komoran.nouns(text)
# print(nouns)

# Add new words that are not recognized by default.
# Each line of the user dictionary is: [word] TAB [POS tag], e.g.
# 엔엘피 NNG
komoran = Komoran(userdic='./user_dic.txt')     # user dictionary 1: hospital name NOT defined
komoran2 = Komoran(userdic='./user_dic_2.txt')  # user dictionary 2: hospital name defined
text = "용인에 있는 한성의심원병원 전화번호 알려줘!"
pos = komoran.pos(text)
pos2 = komoran2.pos(text)
print(pos)
print(pos2)
# print(get_keywords(pos, without_tag=False))
# print(get_keywords(pos, without_tag=True))
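
The comments above describe the Komoran user-dictionary format (one word per line, then a TAB, then the POS tag). A minimal sketch of building and loading such a file, assuming the hospital name from the sentence above should be tagged NNP; the file name user_dic_2.txt matches the example, everything else is illustrative:

from konlpy.tag import Komoran

# Write one [word] TAB [POS tag] entry per line (NNP for the hospital name is an assumption).
with open('./user_dic_2.txt', 'w', encoding='utf-8') as f:
    f.write('한성의심원병원\tNNP\n')

komoran2 = Komoran(userdic='./user_dic_2.txt')
print(komoran2.pos('용인에 있는 한성의심원병원 전화번호 알려줘!'))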
Example #9
# Read doc.txt, split it into sentences with Kkma, and tag it with Twitter and Komoran.
from konlpy.tag import Kkma, Komoran, Twitter

komoran = Komoran()

with open('./doc.txt', 'r', encoding='utf-8') as f:
    contents = f.readlines()
    desc = ''
    for sent in contents:
        sent = sent.replace('\n', ' ')
        desc += sent

phars = desc.split('.')
#for phar in phars:

#tokenized(desc)

kkma = Kkma()
article = kkma.sentences(desc)
print(article)

twitter = Twitter()
for sentence in article:
    out_str = twitter.nouns(sentence)
    print(out_str)

words = komoran.pos(desc, join=True)
#print(words)

#print(phar)

#str_cmp('집합에 관련된 것을 쉽게 처리하기 위해 만든 자료형이다.','집합(set)은 파이썬 2.3부터 지원하기 시작한 자료형으로')
#main()
Example #10
from konlpy.tag import Komoran

sentence = '국정 농단 태블릿 PC, 설진욱, 가나다라'
print('# before user dic')
komo = Komoran()
print(komo.pos(sentence))
'''
    Create a temporary user-dictionary file and test the code above with it.
'''
'''
Output  # before user dic
[('국정', 'NNG'), ('농', 'NNG'),
('단', 'NNG'), ('태블릿 PC', 'NNP'), (',', 'SP'),
('설', 'NNB'), ('진', 'NNP'),
('욱', 'NA'), (',', 'SP'), ('가나', 'NNP'), ('다라', 'NNP')]
'''
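
The first docstring asks the reader to create a temporary file and re-test the code above with a user dictionary. A minimal sketch of one way to do that; the entries, the NNP tags, and the use of tempfile are assumptions added for illustration, not part of the original example.

import os
import tempfile

from konlpy.tag import Komoran

sentence = '국정 농단 태블릿 PC, 설진욱, 가나다라'

# Write a throwaway user dictionary: one [word] TAB [POS tag] entry per line.
entries = '국정농단\tNNP\n설진욱\tNNP\n가나다라\tNNP\n'
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False, encoding='utf-8') as tmp:
    tmp.write(entries)
    dic_path = tmp.name

print('# after user dic')
komo2 = Komoran(userdic=dic_path)
print(komo2.pos(sentence))

os.remove(dic_path)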
Example #11
# Classify an input sentence with a pickled feature list and classifier.
# FEATURE_FILE_NAME, MODEL_FILE_NAME, SELECT_MORPH, NGRAM and _word_ngrams()
# are assumed to be defined earlier in the original script.
import pickle

import numpy as np
from konlpy.tag import Komoran

with open(FEATURE_FILE_NAME, 'rb') as featureFile:
    features = pickle.load(featureFile)

with open(MODEL_FILE_NAME, 'rb') as modelFile:
    classifier = pickle.load(modelFile)

# morphological analyzer
komoran = Komoran()

while True:
    sentence = input("문장을 입력하세요(종료=exit): ")

    if sentence == 'exit':
        break

    analyzed = komoran.pos(sentence)
    selected = []

    for word, morph in analyzed:
        if morph in SELECT_MORPH:
            selected.append(word + '/' + morph)

    #document term array
    sentFeatures = np.zeros(shape=(1, len(features)))

    selected = _word_ngrams(NGRAM, selected)

    for token in selected:
        featIdx = features.index(token) if token in features else -1

        if featIdx != -1:
            sentFeatures[0, featIdx] += 1  # assumed continuation: count the matched feature
Example #12
# Compare Hannanum and Komoran on the same input text.
# `input_text` is assumed to be defined earlier in the original script.
from konlpy.tag import Hannanum, Komoran

########    use Hannanum    ########
hannanum = Hannanum()

normalizations = hannanum.morphs(input_text)
tokenizations = hannanum.pos(input_text)

print('')
print('########    Hannanum    ########')
print('')
print('normalization : ')
print(normalizations)
print('')
print('tokenization : ')
print(tokenizations)
print('')


########    use Komoran    ########
komoran = Komoran()

normalizations = komoran.morphs(input_text)
tokenizations = komoran.pos(input_text)

print('')
print('########    Komoran    ########')
print('')
print('normalization : ')
print(normalizations)
print('')
print('tokenization : ')
print(tokenizations)
print('')
Example #13
File: sfGen.py  Project: allqoow/msnot
# Python 2. `db2` is assumed to be a module-level MySQL connection created elsewhere in the project.
import re

import selenium.common.exceptions


class sfGen():
    def __init__(self, driver, db, taggedSen):
        #self.searchInput = searchInput
        #self.searchInputAnnex = searchInputAnnex
        self.driver = driver
        self.db = db
        self.taggedSen = taggedSen

        from konlpy.tag import Komoran
        self.Komoran = Komoran()

        #self.dictSearchCommon

    def createSearchInputYongeon(self, yongeonPart):
        # fetch the argument structure
        index0 = yongeonPart[0]
        index1 = yongeonPart[1]
        searchInput = ""
        for x in self.taggedSen[index0:index1]:
            if "V" in x[1]:
                searchInput = searchInput + x[0]
            elif "J" in x[1]:
                break
            else:
                pass
        searchInput = searchInput + "다"

        searchInputAnnex = {}
        searchInputAnnex["pos"] = "yongeon"

        print searchInput
        print searchInputAnnex
        self.searchInput = searchInput
        self.searchInputAnnex = searchInputAnnex

    def createSearchInputCheeon(self, cheeonPart):
        index0 = cheeonPart[0]
        index1 = cheeonPart[1]
        searchInput = ""
        print index0
        print index1
        for x in self.taggedSen[index0:index1]:
            if re.search(r"J[A-Z]+", x[1]) != None:
                pass
            else:
                searchInput = str(x[0])

        searchInputAnnex = {}
        searchInputAnnex["pos"] = "cheeon"

        print searchInput
        print searchInputAnnex
        self.searchInput = searchInput
        self.searchInputAnnex = searchInputAnnex

    def getSemanticFeature(self):
        dbSearchResult = self.retrieveFromDb(self.searchInput,
                                             self.searchInputAnnex)

        fetched = dbSearchResult.fetch_row()
        # case where there exist search results
        if fetched:
            while fetched:
                print fetched[0][6]
                fetched = dbSearchResult.fetch_row()
        # case where no search result has been found
        else:
            self.dictSearchCommon(self.searchInput, self.searchInputAnnex)
            self.dictSearchCheeon(self.searchInput, self.searchInputAnnex)
            self.insertIntoDb(self.searchInputAnnex)

        # searchInput 		: str
        # searchInputAnnex  : dict
        # returns 			: _mysql.result object

    def retrieveFromDb(self, searchInput, searchInputAnnex):
        searchInputAnnex["pos"]
        sqlQuery = ""
        sqlQuery = sqlQuery + "SELECT * FROM sf_" + searchInputAnnex["pos"]
        sqlQuery = sqlQuery + " WHERE wordname REGEXP \'" + searchInput + "\'"
        print sqlQuery
        db2.query(sqlQuery)
        result = db2.store_result()
        #self.db.query(sqlQuery)
        #result = self.db.store_result()
        return result

    def dictSearchCommon(self, searchInput, searchInputAnnex):
        self.searchInput = searchInput
        self.searchInputAnnex = searchInputAnnex
        # go to the homepage of the Daum Korean dictionary (Daum 한국어사전)
        # select
        self.driver.get("http://dic.daum.net/index.do?dic=kor")
        searchBox = self.driver.find_element_by_class_name("tf_keyword")
        searchInput = unicode(self.searchInput)
        searchBox.send_keys(searchInput)
        self.driver.find_element_by_class_name("btn_search").click()

        if re.search(r"search\.do\?", self.driver.current_url) != None:
            try:
                self.driver.find_element_by_class_name("txt_cleansch").click()
            except selenium.common.exceptions.NoSuchElementException:
                pass

        elems = self.driver.find_elements_by_class_name("fold_open")
        for x in elems:
            # folding unfolded boxes
            try:
                x.find_element_by_class_name("btn_fold").click()
            except selenium.common.exceptions.NoSuchElementException:
                pass

    def dictSearchCheeon(self, searchInput, searchInputAnnex):
        ret = ""
        searchInput = self.searchInput
        elems = self.driver.find_elements_by_class_name("box_word")

        for x in elems:
            retCommon = ""
            # unfolding folded boxes
            try:
                x.find_element_by_class_name("btn_fold").click()
            except selenium.common.exceptions.NoSuchElementException:
                #break
                pass
                #print "패쓰!"

            # Fetch the POS label first; it is needed when building retCommon below.
            try:
                pos = x.find_element_by_class_name("tit_ex")
                #if pos.text not in ["명사","인칭 대명사"]:
                #	#print "체언이 아니자나!"
                #	break
            except:
                break

            try:
                curl = self.driver.current_url
                kkw = curl.split("wordid=")[1].split("&")[0]
                kku = x.get_attribute("data-supid")
                retCommon = retCommon + kkw + "|" + kku + "|" + pos.text + "|"
            except:
                break

            try:
                retCommon = retCommon + x.find_element_by_class_name(
                    "txt_subword").text + "|"
            except selenium.common.exceptions.NoSuchElementException:
                retCommon = retCommon + searchInput + "|"

            try:
                retCommon = retCommon + x.find_element_by_class_name(
                    "txt_hanja").text + "|"
            except selenium.common.exceptions.NoSuchElementException:
                retCommon = retCommon + searchInput + "|"

            #print retCommon
            descs = x.find_elements_by_class_name("desc_item")
            #retSpecific = ""
            sfCfmd = ""
            descRaw = ""

            if retCommon.split("|")[2] not in ["명사", "의존 명사", "인칭 대명사"]:
                print "체언이 아니자나!"
            else:
                for y in descs:
                    descRaw = descRaw + y.text + "/"
                    for z in y.text.split(".")[:-1]:
                        desc = z.strip()
                        desc = str(desc)
                        sfCdd = ""

                        #elif re.search(r"말한다", desc) != None:
                        #	sfCdd = desc.split(" ")[-2]
                        # patterns like "...부르는", "...이르는", "...일컫는 말" appear right before the target word
                        if re.search(r"통틀어 이르는 말", desc) != None:
                            if re.search(r".+을", desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("을")
                            elif re.search(r".+를",
                                           desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("를")

                        elif re.search(r"높여 이르는 말", desc) != None:
                            if re.search(r".+을", desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("을")
                            elif re.search(r".+를",
                                           desc.split(" ")[-4]) != None:
                                sfCdd = desc.split(" ")[-4].rstrip("를")
                        else:
                            sfCdd = desc.split(" ")[-1]
                        #print sfCdd

                        if self.Komoran.pos(sfCdd)[-1][1] == "ETN":
                            sfCfmd = sfCfmd + "행동;"
                        else:
                            sfCfmd = sfCfmd + sfCdd + ";"
                    sfCfmd = sfCfmd + "/"
                #print sfCfmd
                retSpecific = sfCfmd + "|" + descRaw

                ret = ret + retCommon + retSpecific + "\n"

            retCommon = ""

        self.dbInput = ret
        print ret
        return ret

    def dictSearchYongeon(self, searchInput, searchInputAnnex):
        ret = ""
        searchInput = self.searchInput
        elems = self.driver.find_elements_by_class_name("box_word")

        for x in elems:
            # initialising variable
            retCommon = ""

            # unfolding folded boxes
            try:
                x.find_element_by_class_name("btn_fold").click()
            except selenium.common.exceptions.NoSuchElementException:
                pass

            # collecting kkw and kku (if any, break otherwise)
            try:
                curl = self.driver.current_url
                kkw = curl.split("wordid=")[1].split("&")[0]
                kku = x.get_attribute("data-supid")
                retCommon = retCommon + kkw + "|" + kku + "|"
            except:
                break

            try:
                retCommon = retCommon + x.find_element_by_class_name(
                    "txt_subword").text + "|"
            except selenium.common.exceptions.NoSuchElementException:
                retCommon = retCommon + searchInput + "|"
                pass

            # collecting hanja (if any)
            try:
                retCommon = retCommon + x.find_elements_by_class_name(
                    "txt_pronounce")[1].text + "|"
            except (selenium.common.exceptions.NoSuchElementException,
                    IndexError):
                try:
                    retCommon = retCommon + self.driver.find_elements_by_class_name(
                        "txt_pronounce")[0].text + "|"
                except selenium.common.exceptions.NoSuchElementException:
                    retCommon = retCommon + "NoHanja|"
            #print retCommon

            # for each description
            retSpecific = ""
            descs = x.find_elements_by_class_name("desc_item")
            for y in descs:
                try:
                    pos = x.find_element_by_class_name("tit_ex")
                    if pos.text not in ["자동사", "타동사", "형용사"]:
                        break
                    #retSpecific = retSpecific + pos.text + "|"
                except selenium.common.exceptions.NoSuchElementException:
                    #print "패쓰!"
                    break

                #print y.text
                desc = str(y.text)

                try:
                    argStruct = str(desc.split(")")[0].split("(")[1])
                    #print ret1

                    taggedDesc = self.Komoran.pos(argStruct)
                    #print taggedDesc

                    argStructPat = []
                    for i in range(len(taggedDesc)):
                        if taggedDesc[i][1] == "JKS":
                            if "JKS0" in argStructPat:
                                argStructPat.append("JKS1")
                            else:
                                argStructPat.append("JKS0")
                        elif re.search(r"JK[A-Z]+", taggedDesc[i][1]) != None:
                            argStructPat.append(str(taggedDesc[i][1]))
                    #print argStructPat

                    switch = argStructPat[0]
                    switchIndex = 1
                    argStructRefined = ""
                    for i in range(len(taggedDesc)):
                        if re.search(r"N[A-Z]+", taggedDesc[i][1]) != None:
                            argStructRefined = argStructRefined + taggedDesc[i][0] + "/" + switch[2:] + " "
                        elif re.search(r"JK[A-Z]+", taggedDesc[i][1]) != None and switchIndex < len(argStructPat):
                            switch = argStructPat[switchIndex]
                            switchIndex += 1
                    retSpecific = pos.text + "|" + argStructRefined + "|" + desc
                    #print retSpecific
                    #print "ongoing"
                except (UnicodeDecodeError, IndexError):
                    pass
                ret = ret + retCommon + retSpecific + "\n"
                #elif searchWordtype == "N":
                #	ret = ret + str(desc) + ";"
            retCommon = ""
        self.dbInput = ret
        print ret
        return ret

    def insertIntoDb(self, searchInputAnnex):
        #print self.dbInput
        for x in self.dbInput.split("\n")[:-1]:
            dbInput = x
            print dbInput

            kkw = dbInput.split("|")[0]
            kku = dbInput.split("|")[1]

            if searchInputAnnex["pos"] == "cheeon":
                pos = unicode(dbInput.split("|")[2])
                wordname = dbInput.split("|")[3]
                wordname_hanja = dbInput.split("|")[4]

            elif searchInputAnnex["pos"] == "yongeon":
                pos = dbInput.split("|")[4]
                wordname = dbInput.split("|")[2]
                wordname_hanja = dbInput.split("|")[3]

            sf = dbInput.split("|")[5]
            raw_desc = dbInput.split("|")[6]

            sqlQueryBh = ""
            sqlQueryBh = sqlQueryBh + "SELECT * FROM sf_" + searchInputAnnex[
                "pos"]
            sqlQueryBh = sqlQueryBh + " WHERE kkw=\'" + kkw + "\' and kku=\'" + kku + "\'"
            #print sqlQueryBh
            #self.db.query(sqlQueryBh)
            db2.query(sqlQueryBh)
            #result = self.db.store_result()
            result = db2.store_result()

            if len(result.fetch_row()) == 0:
                sqlQuery = "INSERT INTO sf_" + searchInputAnnex["pos"]
                sqlQuery = sqlQuery + " (kkw, kku, pos, wordname, wordname_hanja, sf, raw_desc)"
                sqlQuery = sqlQuery + " VALUES ("
                sqlQuery = sqlQuery + "\'" + kkw + "\'" + ","
                sqlQuery = sqlQuery + "\'" + kku + "\'" + ","
                sqlQuery = sqlQuery + "\'" + pos + "\'" + ","
                sqlQuery = sqlQuery + "\'" + wordname + "\'" + ","
                sqlQuery = sqlQuery + "\'" + wordname_hanja + "\'" + ","
                sqlQuery = sqlQuery + "\'" + sf + "\'" + ","
                sqlQuery = sqlQuery + "\'" + raw_desc + "\'" + ")"
                print sqlQuery
                #self.db.query(sqlQuery)
                db2.query(sqlQuery)
            else:
                print "already inserted"