Example #1
    def __init__(self, content, con):
        self.content = content
        self.content = [x.strip() for x in self.content.splitlines()]

        self.con = con

        self.pointer_char = "⎆"  #➾ #➸ #❭
        self.wrapper = textwrap.TextWrapper(placeholder="…")

        self.output = []

        self.tipnumber = 0

        self.mecab = mecablight.MeCabLight()
        self.lemmatizer = Lemmatizer()

        self.transliterator = Transliter(academic)

        self.kogrammarlinks = kogrammarlinks.KoGrammarLinks()

        #self.missing_words = []

        self.particle_classes = ['E', 'J', 'S', 'X']

        ######################
        # connect to database and check tables are there

        self.cur = self.con.cursor()
Example #2
    def voice2Text():

        fname = r'/home/ubuntu/handypotter/v2t.txt'

        with open(fname, mode='r', buffering=-1, encoding="UTF-8") as fp:
            text = fp.read()  # the with block closes the file automatically

        mecab = Mecab()

        lemmatizer = Lemmatizer(dictionary_name='default')

        # tag parts of speech and print proper nouns, common nouns, verbs, and adjectives
        tagged_list = mecab.pos(text)
        print(tagged_list)

        tags = ['NNP', 'NNG', 'NP', 'VV', 'VA', 'MAG', 'XR']
        stoptags = [
            'JKS', 'SF', 'XSN', 'EC', 'EP', 'VX', 'NNB', 'EF', 'JX', 'EP+EF',
            'XSV', 'XSA'
        ]

        sentence_token = [t[0] for t in tagged_list if t[1] in tags]
        print(sentence_token)

        return sentence_token
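
A minimal standalone sketch of the tag filtering above, assuming konlpy's Mecab is installed; the sample sentence (borrowed from example #9) is illustrative only:

# hypothetical sketch: keep only content-word tags from mecab.pos() output
from konlpy.tag import Mecab

mecab = Mecab()
tags = ['NNP', 'NNG', 'NP', 'VV', 'VA', 'MAG', 'XR']
tagged_list = mecab.pos('오늘은 비가와서 그런지 매우 우울한 날이다')  # list of (surface, tag) pairs
sentence_token = [t[0] for t in tagged_list if t[1] in tags]
print(sentence_token)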
Example #3
import re  # needed by __call__ below

class myLemmatizer(object):
    def __init__(self):
        from soylemma import Lemmatizer

        self.inst = Lemmatizer()

    def __call__(self, *args):
        docs = []

        for word in args[0]:

            if not re.match(r'NN', word[1]):  # keep noun tags (NN*) as-is; lemmatize everything else
                lem = self.inst.lemmatize(word[0])
                print(f'raw {word}')
                # if re.findall(r'[V*]', lem[1]):
                #     docs.append(lem)
                print(f'candidate {lem}')
                docs.append(lem[0] if lem else word)  # fall back to the original pair if no lemma candidate
            else:
                docs.append(word)
        #print(docs)
        return docs
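
A minimal driver sketch for the class above, assuming konlpy's Mecab supplies the (surface, tag) pairs that __call__ expects; the sentence (borrowed from example #9) is illustrative only:

# hypothetical usage of myLemmatizer
from konlpy.tag import Mecab

mecab = Mecab()
pairs = mecab.pos('시험이 끝나야 놀지 스트레스 받아')  # e.g. [('시험', 'NNG'), ('이', 'JKS'), ...]
lemmas = myLemmatizer()(pairs)  # noun entries kept as-is, other forms lemmatized
print(lemmas)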
Example #4
testing = False

import sys
import os.path
import html
import urllib.parse
import psycopg2
import textwrap

from konlpy.tag import Mecab
mecab = Mecab()
from konlpy.tag import Okt
okt = Okt()

from soylemma import Lemmatizer
lemmatizer = Lemmatizer()

if not testing:
    if len(sys.argv) < 2:
        print("Error, no filename specified.")
        sys.exit(1)
    filename = sys.argv[1]
else:
    filename = os.getcwd() + "/Documents/Korean Learning/korean-english-interlinear/sample.txt"
    #filename = os.getcwd() + "/Documents/Korean Learning/shincheong.txt"

if not os.path.exists(filename):
    print("Error, file not found.")
    sys.exit(1)
with open(filename) as f:
    content = f.read()  # assumed continuation; the original snippet is cut off here
Example #5
from chatspace import ChatSpace
spacer = ChatSpace()
for name in franchise_dict:
    franchise_dict[name] = spacer.space(franchise_dict[name])



franchise_keyword = {}
for i in franchise_dict:
    franchise_keyword[i] = []
    
josa = ['하다','이다','있다'] 
stopwords = ['아', '휴', '아이구', '아이쿠', '아이고', '어', '나', '우리', '저희', '따라', '의해', '을', '를', '에','없다', '의', '가', '으로', '너무','좋다', '로', '주다','에게', '뿐이다', '의거하여', '근거하여', '입각하여', '기준으로', '예하면', '예를 들면', '예를 들자면', '저', '소인', '소생', '저희', '지말고', '하지마', '하지마라', '다른', '물론', '또한', '그리고', '비길수 없다', '해서는 안된다', '뿐만 아니라', '만이 아니다', '만은 아니다', '막론하고', '관계없이', '그치지 않다', '그러나', '그런데', '하지만', '든간에', '논하지 않다', '따지지 않다', '설사', '비록', '더라도', '아니면', '만 못하다', '하는 편이 낫다', '불문하고', '향하여', '향해서', '향하다', '쪽으로', '틈타', '이용하여', '타다', '오르다', '제외하고', '이 외에', '이 밖에', '하여야', '비로소', '한다면 몰라도', '외에도', '이곳', '여기', '부터', '기점으로', '따라서', '할 생각이다', '하려고하다', '이리하여', '그리하여', '그렇게 함으로써', '하지만', '일때', '할때', '앞에서', '중에서', '보는데서', '으로써', '로써', '까지', '해야한다', '일것이다', '반드시', '할줄알다', '할수있다', '할수있어', '임에 틀림없다', '한다면', '등', '등등', '제', '겨우', '단지', '다만', '할뿐', '딩동', '댕그', '대해서', '대하여', '대하면', '훨씬', '얼마나', '얼마만큼', '얼마큼', '남짓', '여', '얼마간', '약간', '다소', '좀', '조금', '다수', '몇', '얼마', '지만', '하물며', '또한', '그러나', '그렇지만', '하지만', '이외에도', '대해 말하자면', '뿐이다', '다음에', '반대로', '반대로 말하자면', '이와 반대로', '바꾸어서 말하면', '바꾸어서 한다면', '만약', '그렇지않으면', '까악', '툭', '딱', '삐걱거리다', '보드득', '비걱거리다', '꽈당', '응당', '해야한다', '에 가서', '각', '각각', '여러분', '각종', '각자', '제각기', '하도록하다', '와', '과', '그러므로', '그래서', '고로', '한 까닭에', '하기 때문에', '거니와', '이지만', '대하여', '관하여', '관한', '과연', '실로', '아니나다를가', '생각한대로', '진짜로', '한적이있다', '하곤하였다', '하', '하하', '허허', '아하', '거바', '와', '오', '왜', '어째서', '무엇때문에', '어찌', '하겠는가', '무슨', '어디', '어느곳', '더군다나', '하물며', '더욱이는', '어느때', '언제', '야', '이봐', '어이', '여보시오', '흐흐', '흥', '휴', '헉헉', '헐떡헐떡', '영차', '여차', '어기여차', '끙끙', '아야', '앗', '아야', '콸콸', '졸졸', '좍좍', '뚝뚝', '주룩주룩', '솨', '우르르', '그래도', '또', '그리고', '바꾸어말하면', '바꾸어말하자면', '혹은', '혹시', '답다', '및', '그에 따르는', '때가 되어', '즉', '지든지', '설령', '가령', '하더라도', '할지라도', '일지라도', '지든지', '몇', '거의', '하마터면', '인젠', '이젠', '된바에야', '된이상', '만큼', '어찌됏든', '그위에', '게다가', '점에서 보아', '비추어 보아', '고려하면', '하게될것이다', '일것이다', '비교적', '좀', '보다더', '비하면', '시키다', '하게하다', '할만하다', '의해서', '연이서', '이어서', '잇따라', '뒤따라', '뒤이어', '결국', '의지하여', '기대여', '통하여', '자마자', '더욱더', '불구하고', '얼마든지', '마음대로', '주저하지 않고', '곧', '즉시', '바로', '당장', '하자마자', '밖에 안된다', '하면된다', '그래', '그렇지', '요컨대', '다시 말하자면', '바꿔 말하면', '즉', '구체적으로', '말하자면', '시작하여', '시초에', '이상', '허', '헉', '허걱', '바와같이', '해도좋다', '해도된다', '게다가', '더구나', '하물며', '와르르', '팍', '퍽', '펄렁', '동안', '이래', '하고있었다', '이었다', '에서', '로부터', '까지', '예하면', '했어요', '해요', '함께', '같이', '더불어', '마저', '마저도', '양자', '모두', '습니다', '가까스로', '하려고하다', '즈음하여', '다른', '다른 방면으로', '해봐요', '습니까', '했어요', '말할것도 없고', '무릎쓰고', '개의치않고', '하는것만 못하다', '하는것이 낫다', '매', '매번', '들', '모', '어느것', '어느', '로써', '갖고말하자면', '어디', '어느쪽', '어느것', '어느해', '어느 년도', '라 해도', '언젠가', '어떤것', '어느것', '저기', '저쪽', '저것', '그때', '그럼', '그러면', '요만한걸', '그래', '그때', '저것만큼', '그저', '이르기까지', '할 줄 안다', '할 힘이 있다', '너', '너희', '당신', '어찌', '설마', '차라리', '할지언정', '할지라도', '할망정', '할지언정', '구토하다', '게우다', '토하다', '메쓰겁다', '옆사람', '퉤', '쳇', '의거하여', '근거하여', '의해', '따라', '힘입어', '그', '다음', '버금', '두번째로', '기타', '첫번째로', '나머지는', '그중에서', '견지에서', '형식으로 쓰여', '입장에서', '위해서', '단지', '의해되다', '하도록시키다', '뿐만아니라', '반대로', '전후', '전자', '앞의것', '잠시', '잠깐', '하면서', '그렇지만', '다음에', '그러한즉', '그런즉', '남들', '아무거나', '어찌하든지', '같다', '비슷하다', '예컨대', '이럴정도로', '어떻게', '만약', '만일', '위에서 서술한바와같이', '인 듯하다', '하지 않는다면', '만약에', '무엇', '무슨', '어느', '어떤', '아래윗', '조차', '한데', '그럼에도 불구하고', '여전히', '심지어', '까지도', '조차도', '하지 않도록', '않기 위하여', '때', '시각', '무렵', '시간', '동안', '어때', '어떠한', '하여금', '네', '예', '우선', '누구', '누가 알겠는가', '아무도', '줄은모른다', '줄은 몰랏다', '하는 김에', '겸사겸사', '하는바', '그런 까닭에', '한 이유는', '그러니', '그러니까', '때문에', '그', '너희', '그들', '너희들', '타인', '것', '것들', '너', '위하여', '공동으로', '동시에', '하기 위하여', '어찌하여', '무엇때문에', '붕붕', '윙윙', '나', '우리', '엉엉', 
'휘익', '윙윙', '오호', '아하', '어쨋든', '만 못하다', '하기보다는', '차라리', '하는 편이 낫다', '흐흐', '놀라다', '상대적으로 말하자면', '마치', '아니라면', '쉿', '그렇지 않으면', '그렇지 않다면', '안 그러면', '아니었다면', '하든지', '아니면', '이라면', '좋아', '알았어', '하는것도', '그만이다', '어쩔수 없다', '하나', '일', '일반적으로', '일단', '한켠으로는', '오자마자', '이렇게되면', '이와같다면', '전부', '한마디', '한항목', '근거로', '하기에', '아울러', '하지 않도록', '않기 위해서', '이르기까지', '되다', '인해', '까닭으로', '이유만으로', '그래서', '이 때문에', '그러므로', '까닭', '있다', '결론을 낼 수 있다', '으로 인하여', '있다', '어떤것', '관계가 있다', '관련이 있다', '연관되다', '어떤것들', '에 대해', '이리하여', '그리하여', '여부', '하기보다는', '하느니', '하면 할수록', '운운', '이러이러하다', '하구나', '하도다', '다시말하면', '다음으로', '에 있다', '에 달려 있다', '우리', '우리들', '오히려', '하기는한데', '어떻게', '어떻해', '어찌됏어', '어때', '어째서', '본대로', '자', '이', '이쪽', '여기', '이것', '이번', '이렇게말하자면', '이런', '이러한', '이와 같은', '요만큼', '요만한 것', '얼마 안 되는 것', '이만큼', '이 정도의', '이렇게 많은 것', '이와 같다', '이때', '이렇구나', '것과 같이', '끼익', '삐걱', '따위', '와 같은 사람들', '부류의 사람들', '왜냐하면', '중의하나', '오직', '오로지', '에 한하다', '하기만 하면', '도착하다', '까지 미치다', '도달하다', '정도에 이르다', '할 지경이다', '결과에 이르다', '관해서는', '여러분', '하고 있다', '한 후', '혼자', '자기', '자기집', '자신', '우에 종합한것과같이', '총적으로 보면', '총적으로 말하면', '총적으로', '대로 하다', '으로서', '참', '그만이다', '할 따름이다', '쿵', '탕탕', '쾅쾅', '둥둥', '봐', '봐라', '아이야', '아니', '와아', '응', '아이', '참나', '년', '월', '일', '령', '영', '일', '이', '삼', '사', '오', '육', '륙', '칠', '팔', '구', '이천육', '이천칠', '이천팔', '이천구', '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉', '령']
from soylemma import Lemmatizer 
lemmatizer = Lemmatizer() 

for i in franchise_keyword:
    for j in franchise_dict[i]:
        j = j.replace(".","")
        j = j.replace('#','') 
        j = j.replace('치킨','')
        j = j.replace(i,'')
        
        for x in j.split():
            if lemmatizer.lemmatize(x) == []:
                if x not in josa and x not in stopwords:  # 'and', not 'or': skip words in either list
                    franchise_keyword[i].append(x)
            else:
                a = lemmatizer.lemmatize(x)
                if len(a)>0:
Example #6
    def __init__(self):
        from soylemma import Lemmatizer

        self.inst = Lemmatizer()
Example #7
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            if line != '':
                result.append(line)
    return result


def remove_tag(_text):
    result = []
    for ch in _text.split(' '):
        result.append(ch.split('/')[0])
    return result


lemmatizer = Lemmatizer()
inputList = get_data_from_file('input.txt')
for q in inputList:
    #ex = '내/NP 가/JKS 이기/VV 면/EC 나/NP 는/JX 비싸/VA ㄴ/ETM 아이스크림/NNG 고르/VA 도록/EC 하/VV ㄹ게/EF ./SF <eot>'
    #ex = '내/NP 가/JKS 잘/MAG 알/VV 는/ETM 디저트/NNG 카페/NNG 가/JKS 있/VV 으니/EC 거기/NP 가/VV 자/EF ./SF <eot>'
    # note: 가자 comes out as 가자고 here
    ex = q
    morp_str = remove_tag(ex)
    print(morp_str)

    result = []
    isMerge = False
    for i in range(0, len(morp_str) - 1):
        conjugateList = lemmatizer.conjugate(morp_str[i], morp_str[i + 1])
        print(conjugateList)
        if len(conjugateList) > 1:
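
A minimal probe of the soylemma calls the loop above depends on, assuming the default dictionary; the morphemes here are illustrative only:

# hypothetical soylemma probe
from soylemma import Lemmatizer

lemmatizer = Lemmatizer()
print(lemmatizer.conjugate('가', '자'))   # candidate surface forms for a stem/ending pair
print(lemmatizer.lemmatize('비싼'))       # candidate (lemma, POS) pairs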
Example #8
class KoInterlinear:
    def __init__(self, content, con):
        self.content = content
        self.content = [x.strip() for x in self.content.splitlines()]

        self.con = con

        self.pointer_char = "&#9094;"  #&#10174; #&#10168; #&#10093;
        self.wrapper = textwrap.TextWrapper(placeholder="…")

        self.output = []

        self.tipnumber = 0

        self.mecab = mecablight.MeCabLight()
        self.lemmatizer = Lemmatizer()

        self.transliterator = Transliter(academic)

        self.kogrammarlinks = kogrammarlinks.KoGrammarLinks()

        #self.missing_words = []

        self.particle_classes = ['E', 'J', 'S', 'X']

        ######################
        # connect to database and check tables are there

        self.cur = self.con.cursor()

    def get_sejongtagset_name(self, tag):
        tag = tag.split("+")[0]  # go with first tag if multiple
        if tag in sejongtagset.SejongTagset:
            return sejongtagset.SejongTagset[tag][0]
        else:
            return ""

    def get_sejongtagset_superclass(self, tag):
        tag = tag.split("+")[0]  # go with first tag if multiple
        if tag in sejongtagset.SejongTagset:
            return sejongtagset.SejongTagset[tag][1]
        else:
            print("Note: can't find " + tag + " in tagset!")
            return ""

    def get_sejongtagset_abbrev(self, tag):
        tag = tag.split("+")[0]  # go with first tag if multiple
        if tag in sejongtagset.SejongTagset:
            return sejongtagset.SejongTagset[tag][2]
        else:
            print("Note: can't find " + tag + " in tagset!")
            return ""

    def get_trans_fetch(self, word, original_word):
        self.cur.execute(
            "SELECT word, def, extradata FROM korean_english WHERE word = %s ORDER BY extradata, wordid;",
            (word, ))
        rows = self.cur.fetchall()
        if type(rows) is list and rows:
            return [(row[1] if word == original_word else "[" + row[0] + "] " +
                     row[1]) for row in rows
                    if row is not None and row[1] is not None and row[1] != ""]
        else:
            return []

    def fetch_phrase_translations(self,
                                  wordstr,
                                  nextwordstr=None,
                                  nextnextwordstr=None):
        if wordstr is not None and wordstr != "" and nextwordstr != "" and nextnextwordstr != "":
            if nextwordstr is None or nextwordstr == "":
                Exception("fetch_phrase_translations sent None for word2.")
            if nextnextwordstr is not None and nextnextwordstr != "":
                self.cur.execute(
                    "SELECT phrase, def AS trans FROM korean_english_phrase2word WHERE word1 = %s AND word2 = %s UNION SELECT phrase, def AS trans FROM korean_english_phrase3word WHERE word1 = %s AND word2 = %s AND word3 = %s;",
                    (wordstr, nextwordstr, wordstr, nextwordstr,
                     nextnextwordstr))
            else:
                self.cur.execute(
                    "SELECT phrase, def AS trans FROM korean_english_phrase2word WHERE word1 = %s AND word2 = %s;",
                    (wordstr, nextwordstr))
        else:
            return []

        rows = self.cur.fetchall()

        if type(rows) is list and rows:
            return_rows = [
                " ".join([row[0], row[1]]) for row in rows
                if row is not None and row[1] is not None and row[1] != ""
            ]
            return return_rows
        return []

    def try_lemmatization_methods(self, branch, original_word, block=True):
        plain_word = self.get_plain_word(branch)

        is_verb = any(morph[1][0] == "V" for morph in branch
                      if len(morph) > 1 and len(morph[1]) > 0)

        if plain_word == "":
            return []

        #print("plain word " + plain_word)

        # first look up the given word in dictionary
        translations = self.get_trans_fetch(plain_word, original_word)
        if translations:
            return translations

        # then try lemmatizing with soylemma (returns multiple), see if in dic
        lemmatized_words = self.lemmatizer.lemmatize(plain_word)
        if lemmatized_words:
            #grab just the words, removing classes
            lemmatized_words = set(lemmatized[0]
                                   for lemmatized in lemmatized_words
                                   if lemmatized)
            #print("Soylemma Lemmatization of " + str(plain_word))
            #print(lemmatized_words)
            for lemmatized_word in lemmatized_words:
                # soylemma sometimes misclassifies a word as a verb and appends 다,
                # so block any lemmatization that added a trailing 다 to a word that
                # didn't already end in 다 unless mecab also tagged the word as a verb
                if block and (
                        not is_verb
                ) and lemmatized_word[-1] == "다" and plain_word[-1] != "다":
                    #print("blocked soylemma's addition of 다 for word "+plain_word+" "+lemmatized_word)
                    continue
                translations.extend(
                    self.get_trans_fetch(lemmatized_word, original_word))
            if translations:
                #print("Found trans with soylemma:")
                #print(translations)
                return translations

        return []

    def list_unique(self, inlist):
        unique = []
        for item in inlist:
            if item not in unique:
                unique.append(item)
        return unique

    def get_translations(self,
                         original_branch,
                         nextbranch=None,
                         nextnextbranch=None):
        original_branch = self.remove_lead_trail_symbols(original_branch)
        if len(original_branch) == 0:
            return []
        original_word = self.get_plain_word(original_branch)

        #try cutting off trailing particle morphs one by one
        branch = original_branch
        cutbranch = branch
        while True:
            translations = self.try_lemmatization_methods(
                cutbranch, original_word)
            if translations:
                #if len(cutbranch) < len(original_branch):
                #print("Found with cutparticle***\n" + str(original_branch) + ": " + str(cutbranch) + ": " + str(translations))
                #else:
                #print("Found with original*\n" + str(original_branch) + ": " + str(cutbranch) + ": " + str(translations))
                return translations
            cutbranch = self.remove_single_trailing_particle(branch)
            if len(cutbranch) < len(branch):
                branch = cutbranch
            else:
                break

        # at this point we should have either all trailing particles off or
        # if only started with all particles, a single particle chunk left

        translations = self.try_lemmatization_methods(branch, original_word)
        if translations:
            #print("Found**\n" + str(original_branch) + ": " + str(branch) + ": " + str(translations))
            return translations

        #this might be handy if there are particles in the middle that are exposed after cuts
        branch_wo_end_particles = self.remove_all_trailing_particles(branch)
        translations = self.try_lemmatization_methods(branch_wo_end_particles,
                                                      original_word)
        if translations:
            #print("Found with branchwoendparticles*********\n" + str(original_branch) + ": " + str(branch_wo_end_particles) + ": " + str(translations))
            return translations

        main_chunks = self.get_main_chunks(branch_wo_end_particles,
                                           ['N', 'V', 'M'])
        main_chunks.extend(self.get_main_chunks(branch_wo_end_particles,
                                                ['M']))
        main_chunks.extend(self.get_main_chunks(branch_wo_end_particles,
                                                ['V']))
        main_chunks.extend(self.get_main_chunks(branch_wo_end_particles,
                                                ['N']))
        main_chunks_unique = self.list_unique(main_chunks)
        main_chunks_NVM_largest_sorted = sorted(main_chunks_unique,
                                                key=self.get_plain_word_len)
        main_chunks_NVM_largest_sorted.reverse()

        if main_chunks_NVM_largest_sorted:
            for main_chunk in main_chunks_NVM_largest_sorted:
                #print("-"+str(main_chunk))

                chunktrans = self.try_lemmatization_methods(
                    main_chunk, original_word)
                if chunktrans:
                    #print("Found with parsed chunks********************************\n" + str(original_branch) + ": " + str(main_chunk) + ": " + str(translations))
                    translations.extend(chunktrans)
                    continue

                #try adding da if it's a verb
                if main_chunk and len(main_chunk[0]) > 1 and len(
                        main_chunk[0][1]) > 0 and main_chunk[0][1][0] == 'V':
                    main_chunk_da = main_chunk + [("다", "EC")]  # copy so main_chunk itself is not mutated
                    translations = self.try_lemmatization_methods(
                        main_chunk_da, original_word)
                    if translations:
                        #print("Found with 다 addition*************************************\n" + str(original_branch) + ": " + str(main_chunk_da) + ": " + str(translations))
                        continue

                #take a morph off
                if len(main_chunk) > 1:
                    shortchunk = main_chunk[0:-1]
                    translations = self.try_lemmatization_methods(
                        shortchunk, original_word)
                if translations:
                    #print("Found with morph off end*****************************************\n" + str(original_branch) + ": " + str(shortchunk) + ": " + str(translations))
                    continue
                # take two off
                if len(main_chunk) > 2:
                    shortchunk = main_chunk[0:-2]
                    translations = self.try_lemmatization_methods(
                        shortchunk, original_word)
                if translations:
                    #print("Found with 2 morph off end*****************************************\n" + str(original_branch) + ": " + str(shortchunk) + ": " + str(translations))
                    continue

                # take char off end of total chunk flattened
                if main_chunk and len(main_chunk[0]) > 1:
                    firstclass = main_chunk[0][1]
                    word = self.get_plain_word(main_chunk)
                    word = word[0:-1]
                    translations = self.try_lemmatization_methods(
                        [(word, firstclass)], original_word)
                    if translations:
                        #print("Found with char off end*****************************************\n" + str(original_branch) + ": " + str([(word, firstclass)]) + ": " + str(translations))
                        continue

        translations = self.list_unique(translations)

        if translations:
            return translations

        ##one last-ditch try at unblocking soylemma's addition of da on end of verbs
        ##that it disagrees with mecab on
        #translations = self.try_lemmatization_methods(branch, original_word, False)
        #if translations:
        #print("Found with ublocked soylemma*********************************************\n" + str(original_branch) + ": " + str(branch) + ": " + str(translations))
        #return translations
        #translations = self.try_lemmatization_methods(branch_wo_end_particles, original_word, False)
        #if translations:
        #print("Found with ublocked soylemma*********************************************\n" + str(original_branch) + ": " + str(branch_wo_end_particles) + ": " + str(translations))
        #return translations

        return []

    def get_main_chunks(self, branch, classes=['M', 'N', 'I', 'V']):
        inside = False
        main_chunks = []
        main_chunk = []
        for i in range(0, len(branch)):
            morph = branch[i]
            if len(morph) > 1 and len(morph[1]) > 0 and morph[1][0] in classes:
                inside = True
                main_chunk.append(morph)
            else:
                if inside:
                    main_chunks.append(main_chunk)
                    main_chunk = []
                inside = False
        if inside:
            main_chunks.append(main_chunk)
        return main_chunks

    def get_plain_branch(self, branch):
        if branch is None:
            return None
        without_symbols = [
            x for x in branch
            if len(x) > 1 and len(x[1]) > 0 and x[1][0] != "S"
        ]
        return without_symbols

    def remove_lead_trail_symbols(self, branch):
        out = branch
        while out and len(out[-1]) > 1 and out[-1][1][0] == 'S':
            out = out[0:-1]
        while out and len(out[0]) > 1 and out[0][1][0] == 'S':
            del out[0]
        return out

    def remove_all_trailing_particles(self, branch):
        out = branch
        while len(out) > 1 and len(
                out[-1]) > 1 and out[-1][1][0] in self.particle_classes:
            out = out[0:-1]
        return out

    def remove_single_trailing_particle(self, branch):
        if len(branch) > 1 and len(
                branch[-1]) > 1 and branch[-1][1][0] in self.particle_classes:
            return branch[0:-1]
        return branch

    def get_plain_word(self, branch):
        if branch is None:
            return None
        without_symbols = [
            (x[0] if len(x) > 1 and len(x[1]) > 0 and x[1][0] != "S" else "")
            for x in branch
        ]
        return "".join(without_symbols)

    def get_plain_word_len(self, branch):
        if branch is None:
            raise Exception("NoneType branch")
        return sum([(len(x[0])
                     if len(x) > 1 and len(x[1]) > 0 and x[1][0] != "S" else 0)
                    for x in branch])

    def end_of_sentence_punc(self, word):
        for morph in word:
            if len(morph) > 1 and len(
                    morph[1]) > 0 and morph[1][0] == 'S' and (
                        "." in morph[0] or "?" in morph[0] or "!" in morph[0]):
                return True
        return False

    def format_passage(self, passage):
        #print("call on: ")
        #print_tree(passage)
        #print(type(passage))
        if type(passage) is list:
            #then presumably we have a passage here, but check it's not empty or weird
            if passage:
                if type(passage[0]) is list:
                    # we have a list of lists, assume passage
                    self.xprint('<div class=wrapper>')
                    self.xprint('    <ol class=sentence>')

                    lenpassage = len(passage)

                    #use this to track where found end of sentence punctuations
                    laststartingpoint = 0

                    for i in range(0, lenpassage):
                        # grab the word we're working on and also the following words
                        # for various matching purposes
                        word = passage[i]
                        nextword = passage[i + 1] if i < lenpassage - 1 else None
                        nextnextword = passage[i + 2] if i < lenpassage - 2 else None

                        # print the passage translation if we're at
                        # end of a sentence or end of the passage
                        if i == lenpassage - 1 or (
                                i > 0 and self.end_of_sentence_punc(word)):
                            trans_link_passage = passage[laststartingpoint:i + 1]
                            laststartingpoint = i + 1
                        else:
                            trans_link_passage = None

                        self.format_word(word, nextword, nextnextword,
                                         trans_link_passage)

                    self.xprint('    </ol>')
                    self.xprint('</div>')
                else:
                    print("error, unexpected passage type" +
                          str(type(passage)))
        elif type(passage) is str:
            self.xprint('<div class=wrapper>')

            if passage != "":
                self.xprint('    <ol class=sentence>')
                self.xprint('    <li>')
                self.xprint('      <ol class=comment>')
                self.xprint(
                    '        <li lang=en_MORPH style="color: var(--page-Noun);">'
                )
                self.xprint(html.escape(passage))
                self.xprint('        </li>')
                self.xprint('      </ol>')
                self.xprint('    </li>')
                self.xprint('    </ol>')

            self.xprint('</div>')
        else:
            print("error, expected list or str, got something else")

    def format_word(self,
                    branch,
                    nextbranch=None,
                    nextnextbranch=None,
                    trans_link_passage=None):
        # example: branch = [('“', 'SSO'), ('톱질', 'NNG'), ('하', 'XSV'), ('세', 'EC')]
        non_symbol_branch = self.get_plain_branch(branch)
        plain_word = self.get_plain_word(branch)

        full_word = "".join([("<span style='color: var(--page-" +
                              self.get_sejongtagset_superclass(x[1]) + ");'>" +
                              html.escape(x[0]) + "</span>") for x in branch])
        full_word_length = sum(len(x[0]) for x in branch)

        transliteration = '<span style="color: var(--page-InterInfo)">' + html.escape(
            self.transliterator.translit(plain_word)) + "</span>"

        krdictlink = (
            '<a title="KRDict" class=diclink target="_blank" ' +
            'rel="noopener noreferrer" href="' +
            "https://krdict.korean.go.kr/eng/smallDic/searchResult?nation=eng&nationCode=6&ParaWordNo=&mainSearchWord="
            + urllib.parse.quote(plain_word) +
            f'">{self.pointer_char}&nbsp;KRDict</a>')
        naverlink = (
            '&nbsp;&nbsp;<a title="Naver" class=diclink target="_blank" ' +
            'rel="noopener noreferrer" href="' +
            "https://en.dict.naver.com/#/search?query=" +
            urllib.parse.quote(plain_word) +
            f'">{self.pointer_char}&nbsp;Naver</a><br>')
        daumlink1 = ('<a title="Daum" class=diclink target="_blank" ' +
                     'rel="noopener noreferrer" href="' +
                     "https://small.dic.daum.net/search.do?q=" +
                     urllib.parse.quote(plain_word) + "&dic=eng" +
                     f'">{self.pointer_char}&nbsp;Daum</a>')
        daumlink2 = (
            '&nbsp;&nbsp;<a title="Phrases" class=diclink target="_blank" ' +
            'rel="noopener noreferrer" href="' +
            "https://small.dic.daum.net/search.do?q=" +
            urllib.parse.quote(plain_word) + "&dic=ee" +
            f'">{self.pointer_char}&nbsp;Phrases</a>')

        pos_info = "-".join(
            [self.get_sejongtagset_abbrev(x[1]) for x in non_symbol_branch])

        pos_info_long = "\n".join([
            ("<span style='color: var(--page-" +
             self.get_sejongtagset_superclass(x[1]) + ");'>" +
             html.escape(x[0]) + " " + self.get_sejongtagset_name(x[1]) +
             "</span>") for x in non_symbol_branch
        ])

        grammarlink_matches = self.kogrammarlinks.search(branch, nextbranch)
        grammarlink_matches_html = ''
        if grammarlink_matches:
            grammarlink_matches_html = '<hr><p>'
            #pos_info = pos_info + "﹡"
            for glkey, gldef, glsource, gllink in grammarlink_matches:
                if glsource == "KRDict search" or glsource == "KRDict" or glsource == "YUF":
                    grammarlink_matches_html = (
                        grammarlink_matches_html +
                        '<a class=diclink onclick="particlesearch(\'' +
                        glkey.lstrip("-~") + '\'); return false;" href="#">' +
                        html.escape(glkey + ' ' + gldef) + ' ' +
                        self.pointer_char + '&nbsp;' + glsource + '</a><br>')
                else:
                    grammarlink_matches_html = (
                        grammarlink_matches_html +
                        '<a class=diclink target="_blank" rel="noopener noreferrer" href="'
                        + gllink + '">' + html.escape(glkey + ' ' + gldef) +
                        ' ' + self.pointer_char + '&nbsp;' + glsource +
                        '</a><br>')
            grammarlink_matches_html = grammarlink_matches_html + '</p>'

        translations = self.get_translations(branch, nextbranch,
                                             nextnextbranch)
        if translations:
            translations_html = ('<hr><p>' +
                                 html.escape(";\n".join(translations)) +
                                 '</p>')
        else:
            translations_html = ''

        phrase_translations = self.fetch_phrase_translations(
            plain_word, self.get_plain_word(nextbranch),
            self.get_plain_word(nextnextbranch))

        if phrase_translations:
            phrase_translations_html = (
                '<hr><p style="font-weight:bold;color: var(--page-Noun);">' +
                html.escape(";\n".join(phrase_translations)) + '</p>')
        else:
            phrase_translations_html = ''

        if trans_link_passage and trans_link_passage[0] and type(
                trans_link_passage[0][0]) is tuple:
            trans_link_passage_text_encoded = urllib.parse.quote(" ".join(
                "".join(tup[0] for tup in twig)
                for twig in trans_link_passage))

            trans_link_passage_html = f'<a class=diclink target="_blank" rel="noopener noreferrer" href="https://papago.naver.com/?sk=ko&tk=en&st={trans_link_passage_text_encoded}">{self.pointer_char}&nbsp;passage Papago</a><br><a class=diclink target="_blank" rel="noopener noreferrer" href="https://translate.google.com/?sl=ko&tl=en&text={trans_link_passage_text_encoded}&op=translate">{self.pointer_char}&nbsp;passage GTrans</a><hr>'

            full_word = full_word + '&nbsp;<span style="color: var(--page-Particle); font-weight: normal;">' + self.pointer_char + "</span>"

        else:
            trans_link_passage_html = ""

        total_lines_remain = 4

        self.wrapper.width = round(full_word_length *
                                   3.5) if full_word_length > 1 else 2 * 3

        if phrase_translations:
            tshort_t = ";\n".join(phrase_translations)
            self.wrapper.max_lines = 2
            phrase_translations_html_short = (
                '<p style="font-weight:bold;color: var(--page-Noun)">' +
                html.escape(self.wrapper.fill(tshort_t)).replace(
                    "\n", "<BR>") + '</p>')
            total_lines_remain = total_lines_remain - self.wrapper.max_lines
        else:
            phrase_translations_html_short = ""

        if translations:
            self.wrapper.max_lines = total_lines_remain
            translations_html_short = html.escape(
                self.wrapper.fill("; ".join(translations))).replace(
                    "\n", "<BR>")  #.replace(": ", ":&nbsp;")
        else:
            translations_html_short = "—"

        description = (html.escape(pos_info) + "<p>" +
                       translations_html_short +
                       phrase_translations_html_short + "</p>")

        tooltip = (trans_link_passage_html + transliteration + '<hr>' +
                   krdictlink + naverlink + daumlink1 + daumlink2 + '<hr><p>' +
                   pos_info_long + '</p>' + translations_html +
                   phrase_translations_html +
                   grammarlink_matches_html).replace("\n", "<br>")

        self.xprint('    <li>')
        self.xprint(
            f'      <ol class=word onclick="showtooltip(event, \'tipnumber{self.tipnumber}\')">'
        )
        self.xprint(f'        <li lang=es>{full_word}</li>')
        self.xprint(
            f'        <li lang=en_MORPH class=tooltip>{description}<span class=tooltiptext id="tipnumber{self.tipnumber}">{tooltip}</span></li><br>'
        )
        self.xprint('      </ol>')
        self.xprint('    </li>')

        self.tipnumber = self.tipnumber + 1

        #sys.stdout.flush()

    def xprint(self, message):
        self.output.append(message)

    def generate(self):
        structured_content = []

        for j in range(len(self.content)):
            line = self.content[j]
            if line == "":
                structured_content.append(line)
            elif line[0] == "#":
                structured_content.append(line[1:])
            else:
                # see https://pypi.org/project/python-mecab-ko/
                structured_content.append(self.mecab.pos(line))

        self.tipnumber = 0

        for passage in structured_content:
            self.format_passage(passage)

        self.cur.close()
        self.con.close()

        return "\n".join(self.output)
Example #9
    ptm.splitter.NLTK(),
    ptm.tokenizer.MeCab(mecab_path),
    #ptm.tokenizer.Komoran(),
    ptm.lemmatizer.SejongPOSLemmatizer(),
    ptm.helper.SelectWordOnly(),
    #ptm.ngram.NGramTokenizer(1, 2, concat=' '))
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

documents = [
    '오늘은 비가와서 그런지 매우 우울한 날이다', '시험이 끝나야 놀지 스트레스 받아ㅠㅠ', '행복한 하루의 끝이라 좋네!'
]

corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)
#result = pipeline.processCorpus(corpus)

result = pipeline.processCorpus(documents)
print(result)

from soylemma import Lemmatizer
lemmatizer = Lemmatizer(dictionary_name='default')
res = lemmatizer.lemmatize('밝은')  # renamed from 're' to avoid shadowing the re module
print('result ' + str(res))

test_list = ['http://www.google.com', "why", "ftpfjdjkwjkjw", "no no!"]
PROTOCOLS = ('http', 'https', 'ftp', 'git')
for s in test_list:
    if s.startswith(PROTOCOLS):  # str.startswith accepts a tuple of prefixes directly
        print("true " + s)
    else:
        print("false " + s)