def get_onehot_vector(sent):
    """Convert a sentence to a sequence of one-hot vectors.

    Each character maps to a 120-dim vector made of four 30-dim slots:
    [onset, nucleus, coda, digit/symbol].  Hangul syllables fill the first
    three slots via jamo decomposition; characters matching ``[0-9- ]``
    fill the fourth; anything else yields an all-zero row.

    :param sent: a str, or a list whose first element is the str to encode
    :return: np.ndarray of shape (len(sent), 120), or None on error
        (errors are printed, not raised — callers rely on the swallow)
    """
    try:
        return_vector = []
        embeddings = np.zeros([30])
        idx = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ' ']
        num_reg = re.compile("[0-9- ]")
        # isinstance instead of type(...) == type(...); message kept verbatim
        if not isinstance(sent, (str, list)):
            raise Exception("input must be str")
        if isinstance(sent, list):
            sent = sent[0]
        for char in sent:
            vector_a = np.copy(embeddings)
            vector_b = np.copy(embeddings)
            vector_c = np.copy(embeddings)
            vector_d = np.copy(embeddings)
            # evaluate the regex once per char instead of twice
            m = num_reg.match(char)
            if m is None and hangul.is_hangul(char):
                anl = hangul.separate(char)
                # clamp negative jamo indices to slot 0
                vector_a[anl[0] if anl[0] > 0 else 0] = 1
                vector_b[anl[1] if anl[1] > 0 else 0] = 1
                vector_c[anl[2] if anl[2] > 0 else 0] = 1
            elif m:
                vector_d[idx.index(char)] = 1
            # np.append without axis flattens: 30 + 3*30 = 120 per char
            return_vector.append(
                np.append(vector_a, [vector_b, vector_c, vector_d]))
        return np.array(return_vector)
    except Exception as e:
        print("error on get_onehot_vector : {0}".format(e))
def String_to_Token_List(string):
    """Tokenize a Korean sentence into a flat list of integer ids.

    Digits are first rewritten to Korean words (years, counters, plain
    numbers), then every character is emitted as either a punctuation id
    or three jamo ids.  Returns False (after printing the input) when the
    string contains characters outside hangul/digits/space/.,?!.
    """
    # English or special characters are not handled
    allowed = re.search(r'[가-힣\d\s\.,\?!]+', string)
    if allowed is None or allowed.group() != string:
        print(string)
        return False

    # Pass 1: read bare 4-digit groups as years
    year_pat = r'(?:^|[^\d])(\d{4})(?:$|[^\d])'
    string = re.sub(
        year_pat,
        lambda outer: re.sub(r'\d{4}',
                             lambda inner: Number_to_String(inner.group()),
                             outer.group()),
        string)

    # Pass 2: numbers followed by a counter word use native Korean numerals
    number_pat = r"([+-]?\d[\d,]*)[\.]?\d*"
    counter_pat = r"(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"
    string = re.sub(
        number_pat + counter_pat,
        lambda outer: re.sub(number_pat,
                             lambda inner: Count_Number(int(inner.group())),
                             outer.group()),
        string)

    # Pass 3: any remaining number is read out digit-wise
    string = re.sub(number_pat,
                    lambda outer: Read_Number(int(outer.group())),
                    string)

    punct_ids = {" ": 2, ".": 71, ",": 72, "?": 73, "!": 74}
    token_List = [0]  # <EOS>
    for ch in string:
        if ch in punct_ids:
            token_List.append(punct_ids[ch])
        elif hangul.is_hangul(ch):
            onset, nucleus, coda = hangul.separate(ch)
            # jamo ids live in disjoint ranges after the 3 reserved ids
            token_List.extend([onset + 3,
                               nucleus + 3 + 19,
                               coda + 3 + 19 + 21])
        else:
            raise Exception("Not handled letter")
    token_List.append(1)  # <EOE>
    return token_List
def get_jamos(character):
    """Return a 3-element jamo list for *character*.

    Hangul syllables are decomposed; other character classes collapse to a
    fixed placeholder triple: punctuation -> '.', digits -> '0',
    alphabetic -> 'a', everything else -> 'x'.
    """
    if hangul.is_hangul(character):
        return decompose_character(character, final_char=True)
    if character in string.punctuation:
        return ['.'] * 3
    if character.isdigit():
        return ['0'] * 3
    if character.isalpha():
        return ['a'] * 3
    return ['x'] * 3
def convert_hangul_to_index(string):
    """Map each hangul syllable of *string* to a composite jamo index.

    Non-hangul characters are skipped.  Each index is
    onset + nucleus*FirNum + coda*FirNum*SecNum, wrapped in a 1-element
    list so the result is an (n, 1) array.  If the string contains no
    hangul at all, a single sentinel row [ClassNum - 1] is returned.

    :param string: text to convert
    :return: np.ndarray of shape (n, 1)
    """
    # renamed from `list`, which shadowed the builtin
    indices = []
    for ch in string:
        if not hangul.is_hangul(ch):
            continue
        onset, nucleus, coda = hangul.separate(ch)
        idx = onset + nucleus * FirNum + coda * FirNum * SecNum
        indices.append([idx])
    if not indices:
        # sentinel class when no hangul was found
        indices.append([ClassNum - 1])
    return np.array(indices)
def convert_index_to_hangul(list):
    """Inverse of convert_hangul_to_index: decode composite jamo indices.

    Each index decodes to (onset, nucleus, coda) via successive division
    by FirNum/SecNum and is synthesized back into a syllable; values that
    do not synthesize into valid hangul are dropped.

    :param list: sequence of composite indices (name kept for callers,
        though it shadows the builtin)
    :return: decoded string
    """
    chars = []
    for code in list:
        coda = code // (FirNum * SecNum)
        rem = code - coda * FirNum * SecNum
        nucleus = rem // FirNum
        onset = rem - nucleus * FirNum
        hg = hangul.synthesize(onset, nucleus, coda)
        if hangul.is_hangul(hg):
            chars.append(hg)
    # join instead of quadratic `str = str + hg` accumulation
    return ''.join(chars)
def get_onehot_vector(self, sent):
    """Convert a sentence to a sequence of one-hot vectors.

    Each character maps to a 160-dim vector made of four 40-dim slots:
    [onset, nucleus, coda, digit/letter/symbol].  Hangul fills the first
    three slots; characters matching ``[a-z0-9- ]`` fill the fourth at
    their position in `idx`; any other character sets slot position 39
    as an "unknown" marker.

    :param sent: a str, or a list whose first element is the str to encode
    :return: np.ndarray of shape (len(sent), 160), or None on error
        (errors are printed, not raised — callers rely on the swallow)
    """
    try:
        return_vector = []
        embeddings = np.zeros([40])
        idx = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ' ',
               'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
               'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
               'y', 'z']
        num_reg = re.compile("[a-z0-9- ]")
        # isinstance instead of type(...) == type(...); message kept verbatim
        if not isinstance(sent, (str, list)):
            raise Exception("input must be str")
        if isinstance(sent, list):
            sent = sent[0]
        for char in sent:
            vector_a = np.copy(embeddings)
            vector_b = np.copy(embeddings)
            vector_c = np.copy(embeddings)
            vector_d = np.copy(embeddings)
            # evaluate the regex once per char instead of twice
            m = num_reg.match(char)
            if m is None and hangul.is_hangul(char):
                anl = hangul.separate(char)
                # clamp negative jamo indices to slot 0
                vector_a[anl[0] if anl[0] > 0 else 0] = 1
                vector_b[anl[1] if anl[1] > 0 else 0] = 1
                vector_c[anl[2] if anl[2] > 0 else 0] = 1
            elif m:
                vector_d[idx.index(char)] = 1
            else:
                vector_d[39] = 1  # unknown-character marker
            # np.append without axis flattens: 40 + 3*40 = 160 per char
            return_vector.append(
                np.append(vector_a, [vector_b, vector_c, vector_d]))
        return np.array(return_vector)
    except Exception as e:
        print("error on get_onehot_vector : {0}".format(e))
def test_is_hangul():
    """is_hangul is True only for a complete hangul syllable."""
    cases = [
        (u'한', True),
        ('A', False),
        ('1', False),
        (None, False),
    ]
    for ch, expected in cases:
        assert hangul.is_hangul(ch) == expected
def test_is_hangul():
    """Hangul syllables are detected; ASCII and None are not."""
    assert hangul.is_hangul(u"한")
    for non_hangul in ("A", "1", None):
        assert not hangul.is_hangul(non_hangul)
# 첫번째 글자는 무조건 한자 # 한글 뜻, 음, 획수, 영어뜻 포함 #d={'hanja':l[0],'meanings':[],'sounds':[], 'eng_meanings':[], 'strokes':int(l[i+1:-1])} # 한글 뜻, 음, 획수 포함 #d={'hanja':l[0],'meanings':[],'sounds':[], 'strokes':int(l[i+1:-1])} # 한글 뜻, 음만 포함 d = {'hanja': l[0], 'meanings': [], 'sounds': []} l = l[2:i].replace(';', ',').split(',') # 영어 뜻을 제대로 분리하려면 수정 필요 for i in range(len(l)): f = False for c in l[i]: if not (hangul.is_hangul(c) or c == ' '): f = True break if f: break kor = l[:i] eng = l[i:] for i in range(len(kor)): kor[i] = kor[i].strip() for i in range(len(eng)): eng[i] = eng[i].strip() for s in kor: tmp = s.split() try: if tmp[0] == '': raise Exception() d['meanings'].append(' '.join(tmp[:-1]))
# Build a name -> position lookup for the co-occurrence matrix.
for idx, i in enumerate(all_name):
    idx_dict[i] = int(idx)

matrix = make_matrix(raw2)
matrix = pd.DataFrame(matrix)
matrix.columns = all_name
matrix.index = all_name

## Data cleansing.
## Merge rows that were split into extra entries by typos:
## print any column whose name contains hangul so duplicates can be spotted.
for i in matrix.columns:
    tmp = 0
    for j in i:
        tmp += hangul.is_hangul(j)
    if tmp != 0:
        print(i, all_dict[i])

matrix = data_cleansing('심상정', matrix)
matrix = data_cleansing('민병두', matrix)

## Remove members who were stripped of their seats.
with open(r'C:\Github\Project_Social_Bigdata_Assignment\Data\remove_list.pickle', 'rb') as f:
    remove_list = pickle.load(f)
for i in remove_list:
    try:
        matrix = data_cleansing(i, matrix, delete=True)
    except Exception:
        # narrowed from a bare `except:` so Ctrl-C / SystemExit still work;
        # best-effort removal: names missing from the matrix are just printed
        print(i)