示例#1
0
def get_onehot_vector(sent):
    """
    Convert a sentence into one one-hot encoded row per character.

    Every row is the concatenation of four 30-slot segments (120 values):
    one segment for each of the three components returned by
    ``hangul.separate`` for a Hangul character, followed by a segment for
    digits, ``-`` and space. A character that is neither Hangul nor one of
    those symbols produces an all-zero row.

    :param sent: the text to encode, or a list whose first element is the
        text (the remaining elements are ignored)
    :return: ``numpy.ndarray`` holding one 120-value row per character, or
        ``None`` if anything fails -- errors are printed, never raised
    """
    try:
        segment_size = 30
        symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ' ']
        symbol_reg = re.compile("[0-9- ]")

        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of str and list.
        if not isinstance(sent, (str, list)):
            raise Exception("input must be str")

        if isinstance(sent, list):
            sent = sent[0]

        rows = []
        for char in sent:
            # One row per segment. Keeping the segments separate means an
            # out-of-range component index still raises instead of silently
            # landing in the neighbouring segment.
            segments = np.zeros([4, segment_size])

            if symbol_reg.match(char):
                segments[3, symbols.index(char)] = 1
            elif hangul.is_hangul(char):
                anl = hangul.separate(char)
                # Negative component indices are clamped to slot 0.
                segments[0, max(anl[0], 0)] = 1
                segments[1, max(anl[1], 0)] = 1
                segments[2, max(anl[2], 0)] = 1

            rows.append(segments.ravel())
        return np.array(rows)
    except Exception as e:
        # Best-effort contract: report the problem and fall through to an
        # implicit ``None`` result.
        print("error on get_onehot_vector : {0}".format(e))
示例#2
0
def String_to_Token_List(string):
    """
    Convert a Korean sentence into a list of integer tokens.

    The input may contain only Hangul syllables, digits, whitespace and the
    punctuation ``. , ? !``; for anything else the input is printed and
    ``False`` is returned. Digits are rewritten before tokenizing:
    stand-alone four-digit runs through ``Number_to_String``, numbers
    followed by a counter word through ``Count_Number``, and every remaining
    number through ``Read_Number``.

    Token layout: ``0`` opens and ``1`` closes the list, ``2`` is a space,
    ``71``-``74`` are ``. , ? !``, and each Hangul syllable contributes its
    three ``hangul.separate`` components offset by 3, 3 + 19 and
    3 + 19 + 21 respectively.

    Raises ``Exception("Not handled letter")`` for a character that passed
    validation but is neither Hangul nor one of the handled symbols.
    """
    # Reject anything outside the supported alphabet: the first match has to
    # span the whole input.
    allowed = re.search(r'[가-힣\d\s\.,\?!]+', string)
    if allowed is None or allowed.group() != string:
        print(string)
        return False

    def rewrite_four_digits(outer):
        # The outer match includes the neighbouring non-digit characters, so
        # only the digits themselves are replaced.
        return re.sub(r'\d{4}',
                      lambda inner: Number_to_String(inner.group()),
                      outer.group())

    regex_DtoS = r'(?:^|[^\d])(\d{4})(?:$|[^\d])'
    string = re.sub(regex_DtoS, rewrite_four_digits, string)

    regex_CtoS1 = r"([+-]?\d[\d,]*)[\.]?\d*"
    regex_CtoS2 = r"(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"

    def rewrite_counted_number(outer):
        # Replace only the numeric part; the counter word stays in place.
        # NOTE(review): int() raises ValueError when the matched number
        # contains ',' or '.' -- confirm whether such input is expected.
        return re.sub(regex_CtoS1,
                      lambda inner: Count_Number(int(inner.group())),
                      outer.group())

    string = re.sub(regex_CtoS1 + regex_CtoS2, rewrite_counted_number, string)

    def rewrite_plain_number(found):
        # NOTE(review): same int() caveat as above for ',' and '.'.
        return Read_Number(int(found.group()))

    regex_NtoS = r"([+-]?\d[\d,]*)[\.]?\d*"
    string = re.sub(regex_NtoS, rewrite_plain_number, string)

    symbol_tokens = {" ": 2, ".": 71, ",": 72, "?": 73, "!": 74}

    # Opening marker.
    tokens = [0]
    for letter in string:
        symbol_token = symbol_tokens.get(letter)
        if symbol_token is not None:
            tokens.append(symbol_token)
        elif hangul.is_hangul(letter):
            onset, nucleus, coda = hangul.separate(letter)
            # Each component is shifted into its own token range.
            tokens.extend([onset + 3, nucleus + 3 + 19, coda + 3 + 19 + 21])
        else:
            raise Exception("Not handled letter")

    # Closing marker.
    tokens.append(1)

    return tokens
示例#3
0
def get_jamos(character):
    """
    Return the jamo list for a single character.

    A Hangul character is decomposed with
    ``decompose_character(character, final_char=True)``. Every other
    character is mapped to a three-element placeholder list chosen by its
    class: ``'.'`` for ASCII punctuation, ``'0'`` for digits, ``'a'`` for
    other alphabetic characters and ``'x'`` for anything else.
    """
    if hangul.is_hangul(character):
        return decompose_character(character, final_char=True)

    if character in string.punctuation:
        placeholder = '.'
    elif character.isdigit():
        placeholder = '0'
    elif character.isalpha():
        placeholder = 'a'
    else:
        placeholder = 'x'
    return [placeholder] * 3
示例#4
0
def convert_hangul_to_index(string):
	"""
	Encode every Hangul character of ``string`` as a single integer index.

	The three components returned by ``hangul.separate`` are packed as
	``c0 + c1 * FirNum + c2 * FirNum * SecNum``. Non-Hangul characters are
	skipped. When no Hangul character is found, the result holds the single
	index ``ClassNum - 1``.

	:param string: the text to encode
	:return: ``numpy.ndarray`` with one single-element row per encoded
		character
	"""
	indices = []
	for char in string:
		if not hangul.is_hangul(char):
			continue
		parts = hangul.separate(char)
		packed = parts[0] + parts[1] * FirNum + parts[2] * FirNum * SecNum
		indices.append([packed])

	if not indices:
		# Nothing could be encoded: fall back to the last class index.
		indices.append([ClassNum - 1])

	return np.array(indices)
示例#5
0
def convert_index_to_hangul(list):
	"""
	Decode a sequence of packed indices back into a Hangul string.

	Each index is split into three components by undoing the packing
	``c0 + c1 * FirNum + c2 * FirNum * SecNum`` and passed to
	``hangul.synthesize``. Results that ``hangul.is_hangul`` rejects are
	dropped.

	:param list: sequence of packed indices (the name shadows the builtin
		but is kept for callers that pass it by keyword)
	:return: the decoded characters joined into one string; ``''`` for an
		empty sequence
	"""
	syllables = []
	for packed in list:
		third = packed // (FirNum * SecNum)
		remainder = packed - third * FirNum * SecNum
		second = remainder // FirNum
		first = remainder - second * FirNum

		syllable = hangul.synthesize(first, second, third)
		if hangul.is_hangul(syllable):
			syllables.append(syllable)

	return ''.join(syllables)
示例#6
0
    def get_onehot_vector(self, sent):
        """
        Convert a sentence into one one-hot encoded row per character.

        Every row is the concatenation of four 40-slot segments (160
        values): one segment for each of the three components returned by
        ``hangul.separate`` for a Hangul character, followed by a segment
        for digits, ``-``, space and lowercase ``a``-``z``. Any other
        character sets the last slot (39) of that fourth segment.

        :param sent: the text to encode, or a list whose first element is
            the text (the remaining elements are ignored)
        :return: ``numpy.ndarray`` holding one 160-value row per character,
            or ``None`` if anything fails -- errors are printed, never
            raised
        """
        try:
            segment_size = 40
            unknown_slot = 39
            symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ' ',
                       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
            symbol_reg = re.compile("[a-z0-9- ]")

            # isinstance (rather than an exact type comparison) also accepts
            # subclasses of str and list.
            if not isinstance(sent, (str, list)):
                raise Exception("input must be str")

            if isinstance(sent, list):
                sent = sent[0]

            rows = []
            for char in sent:
                # One row per segment. Keeping the segments separate means
                # an out-of-range component index still raises instead of
                # silently landing in the neighbouring segment.
                segments = np.zeros([4, segment_size])

                if symbol_reg.match(char):
                    segments[3, symbols.index(char)] = 1
                elif hangul.is_hangul(char):
                    anl = hangul.separate(char)
                    # Negative component indices are clamped to slot 0.
                    segments[0, max(anl[0], 0)] = 1
                    segments[1, max(anl[1], 0)] = 1
                    segments[2, max(anl[2], 0)] = 1
                else:
                    segments[3, unknown_slot] = 1

                rows.append(segments.ravel())
            return np.array(rows)
        except Exception as e:
            # Best-effort contract: report the problem and fall through to
            # an implicit ``None`` result.
            print("error on get_onehot_vector : {0}".format(e))
示例#7
0
def test_is_hangul():
    """Check ``hangul.is_hangul`` against a Hangul, a Latin, a digit and a ``None`` input."""
    cases = (
        (u'한', True),
        ('A', False),
        ('1', False),
        (None, False),
    )
    for value, expected in cases:
        assert hangul.is_hangul(value) == expected
示例#8
0
def test_is_hangul():
    """``hangul.is_hangul`` is truthy for a Hangul syllable and falsy otherwise."""
    assert hangul.is_hangul(u"한")
    # A Latin letter, a digit and None must all be rejected.
    for rejected in ("A", "1", None):
        assert not hangul.is_hangul(rejected)
示例#9
0
    # 첫번째 글자는 무조건 한자

    # 한글 뜻, 음, 획수, 영어뜻 포함
    #d={'hanja':l[0],'meanings':[],'sounds':[], 'eng_meanings':[], 'strokes':int(l[i+1:-1])}

    # 한글 뜻, 음, 획수 포함
    #d={'hanja':l[0],'meanings':[],'sounds':[], 'strokes':int(l[i+1:-1])}

    # 한글 뜻, 음만 포함
    d = {'hanja': l[0], 'meanings': [], 'sounds': []}

    l = l[2:i].replace(';', ',').split(',')  # 영어 뜻을 제대로 분리하려면 수정 필요
    for i in range(len(l)):
        f = False
        for c in l[i]:
            if not (hangul.is_hangul(c) or c == ' '):
                f = True
                break
        if f: break
    kor = l[:i]
    eng = l[i:]
    for i in range(len(kor)):
        kor[i] = kor[i].strip()
    for i in range(len(eng)):
        eng[i] = eng[i].strip()

    for s in kor:
        tmp = s.split()
        try:
            if tmp[0] == '': raise Exception()
            d['meanings'].append(' '.join(tmp[:-1]))
示例#10
0
# Map every name to its position in all_name.
for idx, i in enumerate(all_name):
    idx_dict[i] = idx


# Build the matrix from raw2 and label both axes with the names.
matrix = make_matrix(raw2)
matrix = pd.DataFrame(matrix)
matrix.columns = all_name
matrix.index = all_name

## Data cleansing.
## Merge entries that were split into separate records because of typos.

# Print every column label that contains at least one Hangul character,
# together with its all_dict entry.
for i in matrix.columns:
    tmp = 0
    for j in i:
        tmp += hangul.is_hangul(j)
    if tmp != 0:
        print(i,all_dict[i])

matrix = data_cleansing('심상정',matrix)
matrix = data_cleansing('민병두',matrix)

## Remove people who were stripped of their seat.
# NOTE: pickle.load can execute arbitrary code; only load this file when it
# comes from a trusted source.
with open(r'C:\Github\Project_Social_Bigdata_Assignment\Data\remove_list.pickle', 'rb') as f:
    remove_list = pickle.load(f)

for i in remove_list:
    try:
        matrix = data_cleansing(i,matrix, delete = True)
    except Exception:
        # Report the entry that could not be removed and carry on. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        print(i)
示例#11
0
def test_is_hangul():
    """Check ``hangul.is_hangul`` against a Hangul, a Latin, a digit and a ``None`` input."""
    expected_results = [(u'한', True), ('A', False), ('1', False), (None, False)]
    for candidate, expected in expected_results:
        assert hangul.is_hangul(candidate) == expected