def sameCheck(input_string1, input_string2):
    # Compare two Unicode Hangul characters and return a similarity score:
    #   0 - onset, vowel and coda all differ
    #   1 - one component matches
    #   2 - two components match
    #   3 - identical character
    # A standalone consonant is treated as a coda (jongseong) and compared
    # through convert_dictionary.
    # Maps a full syllable's coda index to the index a standalone consonant
    # gets from hangul.separate. NOTE(review): assumes that library's index
    # scheme, including the -75 / -54 sentinel values below — confirm.
    convert_dictionary = {
        1: 5, 4: 8, 7: 11, 8: 13, 16: 21, 17: 22, 19: 25, 21: 27,
        22: 0, 23: 2, 24: 3, 25: 4, 26: 5, 27: 6, 0: 0
    }
    var1 = hangul.separate(input_string1)  # (onset, vowel, coda)
    var2 = hangul.separate(input_string2)
    same_point = 0
    if var1[0] == -75:
        # Non-Hangul symbol: only an exact (full) match counts.
        for index in range(0, 3):
            if var1[index] == var2[index]:
                if index == 2:
                    # a matching *absent* coda (0) does not score
                    if var1[index] != 0:
                        same_point += 1
                else:
                    same_point += 1
        if same_point == 3:
            return 3
        else:
            return 0
    else:
        if var2[0] == -54 and var2[1] == 11:
            # Standalone consonant: compare it against the first char's coda.
            if convert_dictionary[var1[2]] == var2[2]:
                return 1
        for index in range(0, 3):
            # Regular Hangul comparison.
            if var1[index] == var2[index]:
                if index == 2:
                    if var1[index] != 0:
                        same_point += 1
                else:
                    same_point += 1
        return same_point
def get_onehot_vector(sent):
    """Encode a sentence as per-character one-hot vectors.

    Each character yields four concatenated 30-dim slots: Hangul
    onset/vowel/coda jamo indices, plus a slot for digits, '-' and space.

    :param sent: input string (or a list whose first element is used)
    :return: numpy array of shape (len(sent), 120), or None on error
    """
    try:
        charset = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ' ']
        digit_pattern = re.compile("[0-9- ]")
        if type(sent) not in [type('str'), type([])]:
            raise Exception("input must be str")
        if type(sent) == type([]):
            sent = sent[0]
        encoded = []
        for ch in sent:
            # one fresh zero vector per slot, per character
            cho_vec = np.zeros([30])
            jung_vec = np.zeros([30])
            jong_vec = np.zeros([30])
            misc_vec = np.zeros([30])
            if digit_pattern.match(ch) is None and hangul.is_hangul(ch):
                jamo = hangul.separate(ch)
                # negative jamo indices are clamped to slot 0
                cho_vec[jamo[0] if jamo[0] > 0 else 0] = 1
                jung_vec[jamo[1] if jamo[1] > 0 else 0] = 1
                jong_vec[jamo[2] if jamo[2] > 0 else 0] = 1
            elif digit_pattern.match(ch):
                misc_vec[charset.index(ch)] = 1
            # note: any other character leaves all four slots at zero
            encoded.append(np.append(cho_vec, [jung_vec, jong_vec, misc_vec]))
        return np.array(encoded)
    except Exception as e:
        print("error on get_onehot_vector : {0}".format(e))
def text2label(string):
    """Convert a batch of strings into per-character jamo label sequences.

    Digits are first spelled out in Hangul, then each syllable is split into
    (cho, joong, jong) and encoded via one_hot into a 72-wide label space:
    onsets from 0, vowels offset by 21, codas offset by 42, space = 70.

    :param string: iterable of input strings
    :return: numpy array of per-string label lists
    """
    # Hangul reading for each digit character
    digit_dict = {
        '0': '영',
        '1': '일',
        '2': '이',
        '3': '삼',
        '4': '사',
        '5': '오',
        '6': '육',
        '7': '칠',
        '8': '팔',
        '9': '구'
    }
    char_length = 72  # total size of the label space
    labels = list()
    for seq in string:
        label = list()
        # `filter` is presumably a module-level compiled regex that strips
        # unwanted characters — TODO confirm (it shadows the builtin name)
        seq = filter.sub('', seq)
        for i, c in enumerate(seq):
            if c == ' ':
                # NOTE(review): a raw index 70 is appended here while other
                # characters get one_hot(...) results — confirm downstream
                # consumers accept this mixed representation
                label.append(70)
            else:
                if c.isdigit():
                    c = digit_dict[c]
                cho, joong, jong = hangul.separate(c)
                label.append(one_hot(cho, char_length))
                label.append(one_hot(joong + 21, char_length))  # vowel block
                if jong:  # coda 0 means "no coda" and is not emitted
                    label.append(one_hot(jong + 42, char_length))
        labels.append(label)
    return np.array(labels)
def String_to_Token_List(string):
    """Tokenize a Korean sentence into integer phoneme/symbol tokens.

    Numbers are first rewritten as Hangul words (4-digit groups, counted
    quantities, then plain numbers), after which each character is mapped:
    0=<EOS> sentinel, 2=space, 71='.', 72=',', 73='?', 74='!', and each
    Hangul syllable becomes three jamo tokens (onset/nucleus/coda offset by
    3 / 3+19 / 3+19+21). A final 1=<EOE> token is appended.
    Returns False (after printing the input) when the string contains any
    character outside Hangul/digits/whitespace/.,?!
    """
    #English or special char....
    if re.search(r'[가-힣\d\s\.,\?!]+', string) is None or re.search(
            r'[가-힣\d\s\.,\?!]+', string).group() != string:
        print(string)
        return False
    # isolated 4-digit groups are read via Number_to_String (e.g. years)
    regex_DtoS = r'(?:^|[^\d])(\d{4})(?:$|[^\d])'
    string = re.sub(
        regex_DtoS,
        lambda x: re.sub(r'\d{4}', lambda y: Number_to_String(y.group()),
                         x.group()), string)
    # numbers followed by a counter word are read via Count_Number
    regex_CtoS1 = r"([+-]?\d[\d,]*)[\.]?\d*"
    regex_CtoS2 = r"(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"
    string = re.sub(
        regex_CtoS1 + regex_CtoS2,
        lambda x: re.sub(
            regex_CtoS1, lambda y: Count_Number(int(y.group())), x.group()),
        string)
    # remaining numbers are read via Read_Number
    # NOTE(review): int(x.group()) raises ValueError when the match contains
    # commas or a decimal part — confirm inputs are pre-normalized
    regex_NtoS = r"([+-]?\d[\d,]*)[\.]?\d*"
    string = re.sub(regex_NtoS, lambda x: Read_Number(int(x.group())), string)
    token_List = []
    token_List.append(0)  #<EOS>
    for char in string:
        if char == " ":
            token_List.append(2)
            continue
        elif char == ".":
            token_List.append(71)
            continue
        elif char == ",":
            token_List.append(72)
            continue
        elif char == "?":
            token_List.append(73)
            continue
        elif char == "!":
            token_List.append(74)
            continue
        elif hangul.is_hangul(char):
            onset, nucleus, coda = hangul.separate(char)
            onset += 3
            nucleus += 3 + 19
            coda += 3 + 19 + 21
            token_List.extend([onset, nucleus, coda])
        else:
            raise Exception("Not handled letter")
    token_List.append(1)  #<EOE>
    return token_List
def convert_hangul_to_index(string):
    """Map each Hangul syllable in *string* to one packed integer class index.

    The index packs the jamo triple as
    ``cho + joong * FirNum + jong * FirNum * SecNum`` (module-level constants).
    Non-Hangul characters are skipped entirely; if nothing remains, a single
    sentinel row ``[ClassNum - 1]`` is emitted so the result is never empty.

    :param string: input text (any characters; only Hangul syllables are used)
    :return: numpy int array of shape (k, 1), one packed index per syllable
    """
    # fix: the original bound the builtin name `list`; also iterate characters
    # directly instead of indexing via range(len(...))
    indices = []
    for ch in string:
        if not hangul.is_hangul(ch):  # skip punctuation, digits, latin, ...
            continue
        cho, joong, jong = hangul.separate(ch)
        indices.append([cho + joong * FirNum + jong * FirNum * SecNum])
    if not indices:
        # NOTE(review): ClassNum - 1 is presumably the "empty/unknown" class
        # — confirm against the model's label space
        indices.append([ClassNum - 1])
    return np.array(indices)
def get_onehot_vector(self, sent):
    """Encode a sentence as per-character one-hot vectors.

    Each character yields four concatenated 40-dim slots: Hangul
    onset/vowel/coda jamo indices, plus a slot covering digits, '-',
    space and a-z, with index 39 reserved for any other character.

    :param sent: input string (or a list whose first element is used)
    :return: numpy array of shape (len(sent), 160), or None on error
    """
    try:
        charset = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
                   ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                   'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
                   'v', 'w', 'x', 'y', 'z']
        alnum_pattern = re.compile("[a-z0-9- ]")
        if type(sent) not in [type('str'), type([])]:
            raise Exception("input must be str")
        if type(sent) == type([]):
            sent = sent[0]
        encoded = []
        for ch in sent:
            # one fresh zero vector per slot, per character
            cho_vec = np.zeros([40])
            jung_vec = np.zeros([40])
            jong_vec = np.zeros([40])
            misc_vec = np.zeros([40])
            if alnum_pattern.match(ch) is None and hangul.is_hangul(ch):
                jamo = hangul.separate(ch)
                # negative jamo indices are clamped to slot 0
                cho_vec[jamo[0] if jamo[0] > 0 else 0] = 1
                jung_vec[jamo[1] if jamo[1] > 0 else 0] = 1
                jong_vec[jamo[2] if jamo[2] > 0 else 0] = 1
            elif alnum_pattern.match(ch):
                misc_vec[charset.index(ch)] = 1
            else:
                misc_vec[39] = 1  # catch-all slot for unknown characters
            encoded.append(np.append(cho_vec, [jung_vec, jong_vec, misc_vec]))
        return np.array(encoded)
    except Exception as e:
        print("error on get_onehot_vector : {0}".format(e))
# komoran_processing: converts a Komoran POS-tagged sentence (a list of
# phrases, each phrase a sequence of (surface, tag) pairs accessed through
# Java-style getFirst()/getSecond()) into one normalized token string:
# nouns are concatenated, verbs/adjectives are restored to dictionary form
# ("~다"), irregular conjugations (ㄷ/ㅅ/ㅂ/르/ㅡ stems) are rebuilt with
# hangul.separate/hangul.build, dependent nouns ("수", "지", "때문") and
# negations are joined with "_" (e.g. "하다_수_있다"), and stop words are
# dropped before the final whitespace-normalized string is returned.
# NOTE(review): code left byte-identical — the branch ordering and the shared
# mutable state (formerTokenTag / tokenWord / tokenList / original_token) are
# too order-sensitive to restyle safely; this copy's line layout is
# whitespace-mangled in SOURCE (statements collapsed onto long lines).
def komoran_processing(komoran, complex_verb_set): tokenList = list() formerTokenTag = "" original_token = "" word_cnt = len(komoran) word_idx = 0 for phrase in komoran: phrase_cnt = len(phrase) idx = 0 word_idx += 1 tokenWord = "" formerTag = "" new_check = False while idx < phrase_cnt: token = phrase[idx].getFirst().replace(" ", "") #remove white space in proper nouns if " " not in token and len(token) > 9: idx += 1 new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue tag = phrase[idx].getSecond() ########## FIRST PHONEME if idx == 0 or (idx == 1 and formerTag == "XPN"): #XPN: 체언접두사 if formerTokenTag == "REMOVE_EC": tokenList[-1] = "%s " % tokenList[-1] # To combine the former token and the current token elif formerTokenTag == "MAG": tokenWord = tokenList[-1] del tokenList[-1] elif formerTokenTag == "EC": if tag not in {"VV", "VA", "VX"} or (tag in {"VV", "VA", "VX"} and token not in complex_verb_set): tokenList[-1] = original_token formerTokenTag = "" original_token = "" # prefix(체언접두사) if tag == "XPN": tokenWord = token formerTag = tag idx += 1 new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue else: # common noun, proper noun, root adverb and foreign language if tag in {"NNG", "NNP", "SL", "XR", "MAG"}: if tag == "SL": token = token.lower() if tag == "MAG": if token in {"안", "못", "잘못"}: formerTokenTag = "MAG" new_check = True else: idx += 1 new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue elif tag == "NNP": token = "%s " % token elif formerTokenTag == "NNB" and token == "밖": token = "밖에" tokenWord = tokenWord.rstrip("_") formerTokenTag = "JX" new_check = True else: pass # dependent nouns elif tag == "NNB" and token in {"수", "지", "때문"}: if token == "수" and formerTokenTag in {"VV", "VA", "VX"}: formerTokenTag = "NNB1" elif token == "수" and formerTokenTag not in {"VV", "VA", 
"VX"}: formerTokenTag = "" pass # 사람의 '수' 등의 형태소분석 오류 elif token == "지" and formerTokenTag in {"VV", "VA", "VX"}: formerTokenTag = "NNB2" elif token == "때문": formerTokenTag = "NNB3" else: formerTokenTag = "" new_check = False idx += 1 continue new_check = True try: if formerTokenTag.startswith("NNB"): token = "%s_%s" % (tokenList[-1], token) del tokenList[-1] else: pass except IndexError: # not dependent nouns formerTokenTag = "" new_check = False idx += 1 continue # verb, adjective and auxiliary predicate elif tag in ["VV", "VA", "VX"]: if formerTokenTag == "EC": token = "%s%s" % (tokenList[-1], "%s다" % token) del tokenList[-1] elif formerTokenTag == "JX": if token == "없": # 수밖에_없다 token = "_%s다" % token else: token = " %s다" % token elif formerTokenTag.startswith("NNB"): if formerTokenTag.endswith("1") and token in {"있", "없"}: # 하다_수_있다 tokenWord = tokenList[-1] token = "_%s다" % token del tokenList[-1] elif formerTokenTag.endswith("2") and token in {"모르", "않", "말"}: # 하다_지_모르다 tokenWord = tokenList[-1] token = "_%s다" % token del tokenList[-1] elif formerTokenTag.endswith("3"): # 사랑_때문_자라다 tokenWord = tokenList[-1] token = "_%s다" % token del tokenList[-1] else: token = "%s다" % token # only in case negative words elif token in ["않", "없", "못하", "말", "싫", "주"]: try: token = "%s%s" % (tokenList[-1], "_%s다" % token) del tokenList[-1] except IndexError: token = "_%s다" % token else: token = "%s다" % token formerTokenTag = tag new_check = True elif formerTokenTag == "NNB1" and tag == "JX": if token == "밖에": # 수 밖에 = 수밖에 token = "%s%s" % (tokenList[-1].rstrip("_"), token) del tokenList[-1] formerTokenTag = "JX" new_check = True elif token == "도": # 갈 수도 있다 = 가다_수_있다 new_check = True else: # just pass the rest idx += 1 formerTag = tag new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue tokenWord += token formerTag = tag # checking 'formerTokenTag' to be newly assigned new_check, formerTokenTag = 
ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) ########## FROM SECOND PHONEME else: if formerTokenTag == "REMOVE_EC": tokenWord = "%s " % tokenWord elif formerTokenTag.startswith("NNB") and tag not in {"VV", "VA", "VX"}: tokenWord = "%s " % tokenWord if formerTag in ["ETM", "ETN"]: # ETM: 관성형전성어미, ETN: 명사형전성어미 tokenWord = "%s " % tokenWord # common noun, proper noun, root and foreign language, suffix(noun) and dependent noun if tag in {"NNG", "NNP", "XR", "SL", "NNB", "XSN"}: if tag == "SL": token = token.lower() if tag == "NNP": if formerTag == "NNP": tokenWord = "%s%s " % (tokenWord, token) else: tokenWord = "%s %s " % (tokenWord, token) elif tag == "NNB": if token in {"수", "지", "때문"}: if token == "수": if formerTag in {"VV", "VA", "VX"}: tokenWord = "%s_%s" % (tokenWord.strip(), token) formerTokenTag = "NNB1" else: tokenWord = "%s %s" % (tokenWord.strip(), token) elif token == "지" and formerTag in {"VV", "VA", "VX"}: tokenWord = "%s_%s" % (tokenWord.strip(), token) formerTokenTag = "NNB2" elif token == "때문": tokenWord = "%s_%s" % (tokenWord.strip(), token) formerTokenTag = "NNB3" else: pass new_check = True else: pass else: tokenWord += token # 부정지정사 '아니' elif tag == "VCN": tokenWord = "%s_%s다" % (tokenWord, token) # 수 밖에 = 수밖에 elif (formerTokenTag == "NNB1" or formerTag == "NNB1") and tag == "JX": if token == "밖에": tokenWord = "%s%s" % (tokenWord.rstrip("_"), token) formerTokenTag = "JX" new_check = True elif token == "도": new_check = True # suffix(adjective and verb) elif tag in {"XSA", "XSV"}: token = "%s다" % token tokenWord += token # connective endings elif tag == "EC": original_token = tokenWord newToken = tokenWord.rstrip("다") try: jong = hangul.separate(newToken[-1]) #마지막 글자 분해 except IndexError: idx += 1 formerTag = tag new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue if token in {"아", "어"}: if formerTag == "VCP": idx += 1 formerTag = tag new_check, 
formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue # 받침없음 if jong[-1] == 0: if jong[1] == 20 and token == "어": # 달리+어=달려 newJong = hangul.build(jong[0], 6, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 18 and jong[0] == 5: # '르' 불규칙: 마르+아=말라 tmp = hangul.separate(newToken[-2]) newJong = hangul.build(tmp[0], tmp[1], 8) tmp2 = hangul.separate(token) newJong2 = hangul.build(5, tmp2[1], tmp2[2]) if len(newToken) == 2: # 마르, 오르, 바르.. newToken = "%s%s" % (newJong, newJong2) else: newToken = "%s%s%s" % (newToken[:-2], newJong, newJong2) tokenWord = newToken elif jong[1] == 18 and token == "어": # 'ㅡ' 불규칙: 쓰+어=써 newJong = hangul.build(jong[0], 4, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 13 and token == "어": # 세우+어=세워 newJong = hangul.build(jong[0], 14, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 8 and token == "아": # 따라오+아=따라와 newJong = hangul.build(jong[0], 9, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 11 and token == "어": # 되+어=되어 newToken = "%s%s" % (newToken, "어") tokenWord = newToken elif jong == (18,0,0) and token in {"아", "어"}: if len(newToken) == 1: tokenWord = "해" else: tokenWord = "%s해" % newToken[:-1] else: tokenWord = newToken # 펴, 자 # 받침있음 else: # '빨개야', '파래야' 등의 'ㅎ' 탈락은 형태소분석 자체가 잘 되지 않아 불규칙 적용하지 않음 # '묻다'는 '땅에 묻다'와 '물어보다'의 의미가 구분되지 않아 불규칙 적용하지 않음 if jong[-1] == 7 and \ (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")): # ㄷ받침 newJong = hangul.build(jong[0], jong[1], 8) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) elif jong[-1] == 19 and \ (newToken[-1] in ("긋", "낫", 
"붓", "잇", "젓", "짓")): # ㅅ받침 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) elif jong[-1] == 17 \ and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"): # ㅂ받침: 눕+어=누워 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) if token == "어": token = "워" elif token == "아": token = "와" tokenWord = "%s%s" % (newToken, token) formerTokenTag = "EC" new_check = True elif token in {"어야", "아야", "어다", "아다"}: ending = token[-1] if jong[-1] == 0: # 받침 없음 if jong[1] == 20 and token.startswith("어"): # 달려야, 마셔야 newJong = hangul.build(jong[0], 6, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong[1] == 18 and jong[0] == 5: # '르' 불규칙: 마르+아야=말라야 tmp = hangul.separate(newToken[-2]) newJong = hangul.build(tmp[0], tmp[1], 8) tmp2 = hangul.separate(token[0]) newJong2 = hangul.build(5, tmp2[1], tmp2[2]) if len(newToken) == 2: # 마르, 오르, 바르.. 
newToken = "%s%s%s" % (newJong, newJong2, ending) else: newToken = "%s%s%s%s" % (newToken[:-2], newJong, newJong2, ending) tokenWord = newToken elif jong[1] == 18 and token.startswith("어"): # 'ㅡ' 불규칙: 쓰+어야=써야 newJong = hangul.build(jong[0], 4, jong[-1]) if len(newToken) == 1: newToken = "%s%s" % (newJong, ending) else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong[1] == 13 and token.startswith("어"): # 세우+어야=세워야 newJong = hangul.build(jong[0], 14, jong[-1]) if len(newToken) == 1: newToken = "%s%s" % (newJong, ending) else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong[1] == 8 and token.startswith("아"): # 따라오+아야=따라와야 newJong = hangul.build(jong[0], 9, jong[-1]) if len(newToken) == 1: newToken = "%s%s" % (newJong, ending) else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong == (18,0,0) and (token.startswith("아") or token.startswith("어")): # 해야 if len(newToken) == 1: tokenWord = "해%s" % ending else: tokenWord = "%s해%s" % (newToken[:-1], ending) else: tokenWord = "%s%s" % (newToken, ending) # 펴야, 자야 # 받침 있음 elif jong[-1] != 0: if jong[-1] == 17 \ and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"): # ㅂ받침: 눕+어=누워 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) if token.startswith("어"): # 아름다워야 tokenWord = "%s워%s" % (newToken, ending) elif token.startswith("아"): # 고와야 tokenWord = "%s와%s" % (newToken, ending) elif jong[-1] == 19 and \ (newToken[-1] in ("긋", "낫", "붓", "잇", "젓", "짓")): # ㅅ받침 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = "%s%s" % (newToken, token) elif jong[-1] == 7 and \ (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")): # ㄷ받침 newJong = hangul.build(jong[0], jong[1], 8) if len(newToken) == 1: newToken = newJong else: newToken = 
"%s%s" % (newToken[:-1], newJong) tokenWord = "%s%s%s" % (newToken, token[0], ending) else: tokenWord = "%s%s" % (newToken, token) formerTokenTag = "EC" new_check = True elif token.endswith("지") and formerTag in {"VV", "VA", "VX"}: # 하다_지_모른다 tokenWord = "%s다_지_" % newToken formerTokenTag = "EC" new_check = True else: formerTokenTag = "REMOVE_EC" new_check = True # verb and adjective elif tag in ["VV", "VA", "VX"]: if formerTokenTag == "EC": if tag == "VX" and token == "가지": # 해가지고, 떠가지고 = 하다, 뜨다 tokenWord = original_token else: tokenWord = "%s%s" % (tokenWord, token+"다") elif formerTokenTag.startswith("NNB"): if formerTokenTag.endswith("1") and token in {"있", "없"}: # 하다_수_있다 tokenWord = "%s_%s다" % (tokenWord, token) elif formerTokenTag.endswith("2") and token in {"모르", "않", "말"}: # 하다_지_모르다 tokenWord = "%s_%s다" % (tokenWord, token) elif formerTokenTag.endswith("3"): # 사랑_때문_자라다 tokenWord = "%s_%s다" % (tokenWord, token) else: tokenWord += "%s다" % token elif formerTokenTag == "JX": if token == "없": tokenWord += "_%s다" % token else: tokenWord += " %s다" % token else: if tag in ("VX", "VA") and token in ["않", "없", "못하", "말", "주"]: tokenWord += "_%s다" % token else: # rest all tag including VX # insert white space if V-V or VA-VA case(grammar error) if formerTokenTag == tag: tokenWord += " %s다" % token else: tokenWord += "%s다" % token formerTokenTag = tag new_check = True # adverb elif tag == "MAG" and token == "못": tokenWord += "_%s" % token elif tag == "MAG" and token == "없이": tokenWord = "%s없다" % tokenWord # JKB: 부사격조사 elif tag == "JKB" and token == "같이": tokenWord = "%s같다" % tokenWord else: idx += 1 formerTag = tag new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue formerTag = tag new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) idx += 1 tokenList.append(tokenWord.strip()) # if it's the last word in 'Komoran' if formerTokenTag == "EC": tokenList[-1] 
= original_token finalToken = " ".join([a for a in tokenList if a not in {"", "하다", "암트다"}]) # stop word if "안 하다" in finalToken: finalToken = finalToken.replace("안 하다", "안하다") for neg in {"없다", "않다", "못하다", "안하다"}: if neg == finalToken: continue if " _%s" % neg in finalToken: finalToken = finalToken.replace(" _%s" % neg, " %s" % neg) if " %s" % neg in finalToken: finalToken = finalToken.replace(" %s" % neg, "_%s" % neg) if (" _%s" % neg) in finalToken: finalToken = finalToken.replace((" _%s" % neg), ("_%s" % neg)) finalToken = finalToken.strip("_").replace("__", "_").replace(" _", " ").replace("_ ", " ") return re.sub(r"\s+", " ", finalToken)
def test_separation():
    """Verify hangul.separate splits syllables into (cho, jung, jong) indices."""
    expected = [
        (u'가', (0, 0, 0)),
        (u'까', (1, 0, 0)),
        (u'갸', (0, 2, 0)),
        (u'각', (0, 0, 1)),
    ]
    for syllable, jamo in expected:
        assert hangul.separate(syllable) == jamo
def test_separation():
    """Check jamo decomposition of representative syllables."""
    syllables = [u"가", u"까", u"갸", u"각"]
    decompositions = [(0, 0, 0), (1, 0, 0), (0, 2, 0), (0, 0, 1)]
    for syl, parts in zip(syllables, decompositions):
        assert hangul.separate(syl) == parts
# komoran_processing: converts a Komoran POS-tagged sentence (a list of
# phrases, each phrase a sequence of (surface, tag) pairs accessed through
# Java-style getFirst()/getSecond()) into one normalized token string:
# nouns are concatenated, verbs/adjectives are restored to dictionary form
# ("~다"), irregular conjugations (ㄷ/ㅅ/ㅂ/르/ㅡ stems) are rebuilt with
# hangul.separate/hangul.build, dependent nouns ("수", "지", "때문") and
# negations are joined with "_", and stop words are dropped before the
# whitespace-normalized result is returned.
# NOTE(review): this appears to be a reformatted duplicate of the earlier
# komoran_processing definition in this file — consider removing one copy.
# Code left byte-identical: the branch ordering and shared mutable state
# (formerTokenTag / tokenWord / tokenList / original_token) are too
# order-sensitive to restyle safely; layout is whitespace-mangled in SOURCE.
def komoran_processing(komoran, complex_verb_set): tokenList = list() formerTokenTag = "" original_token = "" word_cnt = len(komoran) word_idx = 0 for phrase in komoran: phrase_cnt = len(phrase) idx = 0 word_idx += 1 tokenWord = "" formerTag = "" new_check = False while idx < phrase_cnt: token = phrase[idx].getFirst().replace( " ", "") #remove white space in proper nouns if " " not in token and len(token) > 9: idx += 1 new_check, formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue tag = phrase[idx].getSecond() ########## FIRST PHONEME if idx == 0 or (idx == 1 and formerTag == "XPN"): #XPN: 체언접두사 if formerTokenTag == "REMOVE_EC": tokenList[-1] = "%s " % tokenList[-1] # To combine the former token and the current token elif formerTokenTag == "MAG": tokenWord = tokenList[-1] del tokenList[-1] elif formerTokenTag == "EC": if tag not in {"VV", "VA", "VX" } or (tag in {"VV", "VA", "VX"} and token not in complex_verb_set): tokenList[-1] = original_token formerTokenTag = "" original_token = "" # prefix(체언접두사) if tag == "XPN": tokenWord = token formerTag = tag idx += 1 new_check, formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue else: # common noun, proper noun, root adverb and foreign language if tag in {"NNG", "NNP", "SL", "XR", "MAG"}: if tag == "SL": token = token.lower() if tag == "MAG": if token in {"안", "못", "잘못"}: formerTokenTag = "MAG" new_check = True else: idx += 1 new_check, formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue elif tag == "NNP": token = "%s " % token elif formerTokenTag == "NNB" and token == "밖": token = "밖에" tokenWord = tokenWord.rstrip("_") formerTokenTag = "JX" new_check = True else: pass # dependent nouns elif tag == "NNB" and token in {"수", "지", "때문"}: if token == "수" and formerTokenTag in { "VV", "VA", "VX" }: formerTokenTag = "NNB1" elif token == "수" and formerTokenTag not in { "VV", 
"VA", "VX" }: formerTokenTag = "" pass # 사람의 '수' 등의 형태소분석 오류 elif token == "지" and formerTokenTag in { "VV", "VA", "VX" }: formerTokenTag = "NNB2" elif token == "때문": formerTokenTag = "NNB3" else: formerTokenTag = "" new_check = False idx += 1 continue new_check = True try: if formerTokenTag.startswith("NNB"): token = "%s_%s" % (tokenList[-1], token) del tokenList[-1] else: pass except IndexError: # not dependent nouns formerTokenTag = "" new_check = False idx += 1 continue # verb, adjective and auxiliary predicate elif tag in ["VV", "VA", "VX"]: if formerTokenTag == "EC": token = "%s%s" % (tokenList[-1], "%s다" % token) del tokenList[-1] elif formerTokenTag == "JX": if token == "없": # 수밖에_없다 token = "_%s다" % token else: token = " %s다" % token elif formerTokenTag.startswith("NNB"): if formerTokenTag.endswith("1") and token in { "있", "없" }: # 하다_수_있다 tokenWord = tokenList[-1] token = "_%s다" % token del tokenList[-1] elif formerTokenTag.endswith("2") and token in { "모르", "않", "말" }: # 하다_지_모르다 tokenWord = tokenList[-1] token = "_%s다" % token del tokenList[-1] elif formerTokenTag.endswith("3"): # 사랑_때문_자라다 tokenWord = tokenList[-1] token = "_%s다" % token del tokenList[-1] else: token = "%s다" % token # only in case negative words elif token in ["않", "없", "못하", "말", "싫", "주"]: try: token = "%s%s" % (tokenList[-1], "_%s다" % token) del tokenList[-1] except IndexError: token = "_%s다" % token else: token = "%s다" % token formerTokenTag = tag new_check = True elif formerTokenTag == "NNB1" and tag == "JX": if token == "밖에": # 수 밖에 = 수밖에 token = "%s%s" % (tokenList[-1].rstrip("_"), token) del tokenList[-1] formerTokenTag = "JX" new_check = True elif token == "도": # 갈 수도 있다 = 가다_수_있다 new_check = True else: # just pass the rest idx += 1 formerTag = tag new_check, formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue tokenWord += token formerTag = tag # checking 'formerTokenTag' to be newly assigned new_check, formerTokenTag = 
ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) ########## FROM SECOND PHONEME else: if formerTokenTag == "REMOVE_EC": tokenWord = "%s " % tokenWord elif formerTokenTag.startswith("NNB") and tag not in { "VV", "VA", "VX" }: tokenWord = "%s " % tokenWord if formerTag in ["ETM", "ETN"]: # ETM: 관성형전성어미, ETN: 명사형전성어미 tokenWord = "%s " % tokenWord # common noun, proper noun, root and foreign language, suffix(noun) and dependent noun if tag in {"NNG", "NNP", "XR", "SL", "NNB", "XSN"}: if tag == "SL": token = token.lower() if tag == "NNP": if formerTag == "NNP": tokenWord = "%s%s " % (tokenWord, token) else: tokenWord = "%s %s " % (tokenWord, token) elif tag == "NNB": if token in {"수", "지", "때문"}: if token == "수": if formerTag in {"VV", "VA", "VX"}: tokenWord = "%s_%s" % (tokenWord.strip(), token) formerTokenTag = "NNB1" else: tokenWord = "%s %s" % (tokenWord.strip(), token) elif token == "지" and formerTag in { "VV", "VA", "VX" }: tokenWord = "%s_%s" % (tokenWord.strip(), token) formerTokenTag = "NNB2" elif token == "때문": tokenWord = "%s_%s" % (tokenWord.strip(), token) formerTokenTag = "NNB3" else: pass new_check = True else: pass else: tokenWord += token # 부정지정사 '아니' elif tag == "VCN": tokenWord = "%s_%s다" % (tokenWord, token) # 수 밖에 = 수밖에 elif (formerTokenTag == "NNB1" or formerTag == "NNB1") and tag == "JX": if token == "밖에": tokenWord = "%s%s" % (tokenWord.rstrip("_"), token) formerTokenTag = "JX" new_check = True elif token == "도": new_check = True # suffix(adjective and verb) elif tag in {"XSA", "XSV"}: token = "%s다" % token tokenWord += token # connective endings elif tag == "EC": original_token = tokenWord newToken = tokenWord.rstrip("다") try: jong = hangul.separate(newToken[-1]) #마지막 글자 분해 except IndexError: idx += 1 formerTag = tag new_check, formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue if token in {"아", "어"}: if formerTag == "VCP": idx += 1 formerTag = tag new_check, 
formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue # 받침없음 if jong[-1] == 0: if jong[1] == 20 and token == "어": # 달리+어=달려 newJong = hangul.build(jong[0], 6, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 18 and jong[ 0] == 5: # '르' 불규칙: 마르+아=말라 tmp = hangul.separate(newToken[-2]) newJong = hangul.build(tmp[0], tmp[1], 8) tmp2 = hangul.separate(token) newJong2 = hangul.build(5, tmp2[1], tmp2[2]) if len(newToken) == 2: # 마르, 오르, 바르.. newToken = "%s%s" % (newJong, newJong2) else: newToken = "%s%s%s" % (newToken[:-2], newJong, newJong2) tokenWord = newToken elif jong[ 1] == 18 and token == "어": # 'ㅡ' 불규칙: 쓰+어=써 newJong = hangul.build(jong[0], 4, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 13 and token == "어": # 세우+어=세워 newJong = hangul.build(jong[0], 14, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 8 and token == "아": # 따라오+아=따라와 newJong = hangul.build(jong[0], 9, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = newToken elif jong[1] == 11 and token == "어": # 되+어=되어 newToken = "%s%s" % (newToken, "어") tokenWord = newToken elif jong == (18, 0, 0) and token in {"아", "어"}: if len(newToken) == 1: tokenWord = "해" else: tokenWord = "%s해" % newToken[:-1] else: tokenWord = newToken # 펴, 자 # 받침있음 else: # '빨개야', '파래야' 등의 'ㅎ' 탈락은 형태소분석 자체가 잘 되지 않아 불규칙 적용하지 않음 # '묻다'는 '땅에 묻다'와 '물어보다'의 의미가 구분되지 않아 불규칙 적용하지 않음 if jong[-1] == 7 and \ (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")): # ㄷ받침 newJong = hangul.build(jong[0], jong[1], 8) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) elif jong[-1] == 19 and \ (newToken[-1] in ("긋", 
"낫", "붓", "잇", "젓", "짓")): # ㅅ받침 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) elif jong[-1] == 17 \ and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"): # ㅂ받침: 눕+어=누워 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) if token == "어": token = "워" elif token == "아": token = "와" tokenWord = "%s%s" % (newToken, token) formerTokenTag = "EC" new_check = True elif token in {"어야", "아야", "어다", "아다"}: ending = token[-1] if jong[-1] == 0: # 받침 없음 if jong[1] == 20 and token.startswith( "어"): # 달려야, 마셔야 newJong = hangul.build(jong[0], 6, jong[-1]) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong[1] == 18 and jong[ 0] == 5: # '르' 불규칙: 마르+아야=말라야 tmp = hangul.separate(newToken[-2]) newJong = hangul.build(tmp[0], tmp[1], 8) tmp2 = hangul.separate(token[0]) newJong2 = hangul.build(5, tmp2[1], tmp2[2]) if len(newToken) == 2: # 마르, 오르, 바르.. 
newToken = "%s%s%s" % (newJong, newJong2, ending) else: newToken = "%s%s%s%s" % (newToken[:-2], newJong, newJong2, ending) tokenWord = newToken elif jong[1] == 18 and token.startswith( "어"): # 'ㅡ' 불규칙: 쓰+어야=써야 newJong = hangul.build(jong[0], 4, jong[-1]) if len(newToken) == 1: newToken = "%s%s" % (newJong, ending) else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong[1] == 13 and token.startswith( "어"): # 세우+어야=세워야 newJong = hangul.build(jong[0], 14, jong[-1]) if len(newToken) == 1: newToken = "%s%s" % (newJong, ending) else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong[1] == 8 and token.startswith( "아"): # 따라오+아야=따라와야 newJong = hangul.build(jong[0], 9, jong[-1]) if len(newToken) == 1: newToken = "%s%s" % (newJong, ending) else: newToken = "%s%s%s" % (newToken[:-1], newJong, ending) tokenWord = newToken elif jong == (18, 0, 0) and (token.startswith("아") or token.startswith("어")): # 해야 if len(newToken) == 1: tokenWord = "해%s" % ending else: tokenWord = "%s해%s" % (newToken[:-1], ending) else: tokenWord = "%s%s" % (newToken, ending ) # 펴야, 자야 # 받침 있음 elif jong[-1] != 0: if jong[-1] == 17 \ and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"): # ㅂ받침: 눕+어=누워 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) if token.startswith("어"): # 아름다워야 tokenWord = "%s워%s" % (newToken, ending) elif token.startswith("아"): # 고와야 tokenWord = "%s와%s" % (newToken, ending) elif jong[-1] == 19 and \ (newToken[-1] in ("긋", "낫", "붓", "잇", "젓", "짓")): # ㅅ받침 newJong = hangul.build(jong[0], jong[1], 0) if len(newToken) == 1: newToken = newJong else: newToken = "%s%s" % (newToken[:-1], newJong) tokenWord = "%s%s" % (newToken, token) elif jong[-1] == 7 and \ (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")): # ㄷ받침 newJong = hangul.build(jong[0], jong[1], 8) if len(newToken) == 1: newToken = newJong else: newToken = 
"%s%s" % (newToken[:-1], newJong) tokenWord = "%s%s%s" % (newToken, token[0], ending) else: tokenWord = "%s%s" % (newToken, token) formerTokenTag = "EC" new_check = True elif token.endswith("지") and formerTag in { "VV", "VA", "VX" }: # 하다_지_모른다 tokenWord = "%s다_지_" % newToken formerTokenTag = "EC" new_check = True else: formerTokenTag = "REMOVE_EC" new_check = True # verb and adjective elif tag in ["VV", "VA", "VX"]: if formerTokenTag == "EC": if tag == "VX" and token == "가지": # 해가지고, 떠가지고 = 하다, 뜨다 tokenWord = original_token else: tokenWord = "%s%s" % (tokenWord, token + "다") elif formerTokenTag.startswith("NNB"): if formerTokenTag.endswith("1") and token in { "있", "없" }: # 하다_수_있다 tokenWord = "%s_%s다" % (tokenWord, token) elif formerTokenTag.endswith("2") and token in { "모르", "않", "말" }: # 하다_지_모르다 tokenWord = "%s_%s다" % (tokenWord, token) elif formerTokenTag.endswith("3"): # 사랑_때문_자라다 tokenWord = "%s_%s다" % (tokenWord, token) else: tokenWord += "%s다" % token elif formerTokenTag == "JX": if token == "없": tokenWord += "_%s다" % token else: tokenWord += " %s다" % token else: if tag in ("VX", "VA") and token in [ "않", "없", "못하", "말", "주" ]: tokenWord += "_%s다" % token else: # rest all tag including VX # insert white space if V-V or VA-VA case(grammar error) if formerTokenTag == tag: tokenWord += " %s다" % token else: tokenWord += "%s다" % token formerTokenTag = tag new_check = True # adverb elif tag == "MAG" and token == "못": tokenWord += "_%s" % token elif tag == "MAG" and token == "없이": tokenWord = "%s없다" % tokenWord # JKB: 부사격조사 elif tag == "JKB" and token == "같이": tokenWord = "%s같다" % tokenWord else: idx += 1 formerTag = tag new_check, formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) continue formerTag = tag new_check, formerTokenTag = ending_check( new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt) idx += 1 tokenList.append(tokenWord.strip()) # if it's the last word in 'Komoran' if formerTokenTag == "EC": 
tokenList[-1] = original_token finalToken = " ".join([a for a in tokenList if a not in {"", "하다", "암트다"}]) # stop word if "안 하다" in finalToken: finalToken = finalToken.replace("안 하다", "안하다") for neg in {"없다", "않다", "못하다", "안하다"}: if neg == finalToken: continue if " _%s" % neg in finalToken: finalToken = finalToken.replace(" _%s" % neg, " %s" % neg) if " %s" % neg in finalToken: finalToken = finalToken.replace(" %s" % neg, "_%s" % neg) if (" _%s" % neg) in finalToken: finalToken = finalToken.replace((" _%s" % neg), ("_%s" % neg)) finalToken = finalToken.strip("_").replace("__", "_").replace(" _", " ").replace( "_ ", " ") return re.sub(r"\s+", " ", finalToken)