def tokenize(text, as_id=False, symbol_type=1, debug=False):
    """Tokenize Hangul text into per-jamo symbols.

    Args:
        text: raw Hangul string.
        as_id: when True, return integer symbol ids instead of characters.
        symbol_type: which symbol table (1-4) to expand each jamo with.
        debug: when True, print the selected char-to-id table.

    Returns:
        List of symbol characters (or ids) terminated by EOS.

    Raises:
        ValueError: if symbol_type is not 1, 2, 3 or 4 (the original
        silently fell through and returned None).
    """
    # One (jamo-expansion table, char-to-id table) pair per symbol_type;
    # this replaces four copy-pasted if/elif branches.
    tables = {
        1: (load_symbols_1(), char_to_id_1),
        2: (load_symbols_2(), char_to_id_2),
        3: (load_symbols_3(), char_to_id_3),
        4: (load_symbols_4(), char_to_id_4),
    }
    if symbol_type not in tables:
        raise ValueError("symbol_type must be 1, 2, 3 or 4, got %r" % (symbol_type,))
    jamo_map, char_to_id = tables[symbol_type]

    text = normalize(text)
    pre_tokens = list(hangul_to_jamo(text))
    # Normalize Hangul Compatibility Jamo to the U+11xx lead-jamo form.
    pre_tokens = [hcj_to_jamo(t, "lead") if is_hcj(t) else t for t in pre_tokens]

    if debug:
        print(char_to_id)

    tokens = []
    for token in pre_tokens:
        tokens += list(jamo_map[token])

    if as_id:
        return [char_to_id[t] for t in tokens] + [char_to_id[EOS]]
    return tokens + [EOS]
def tokenize(text, as_id=False):
    """Split *text* into jamo tokens, terminated by EOS.

    When as_id is True each token is mapped through char_to_id.
    """
    jamo_tokens = list(hangul_to_jamo(normalize(text)))
    if as_id:
        return [char_to_id[t] for t in jamo_tokens] + [char_to_id[EOS]]
    return jamo_tokens + [EOS]
def text_to_sequence(text):
    """Convert *text* to a list of symbol ids ending with the '~' marker id.

    If the first character is not already a U+1100..U+1113 lead jamo, the
    whole string is decomposed into jamo first.
    """
    sequence = []
    # Guard: ord(text[0]) on an empty string raised IndexError in the original.
    if text and not 0x1100 <= ord(text[0]) <= 0x1113:
        text = ''.join(hangul_to_jamo(text))
    for s in text:
        sequence.append(_symbol_to_id[s])
    sequence.append(_symbol_to_id['~'])
    return sequence
def tokenize(text, as_id=False, symbol_type=1, debug=False):
    """Tokenize Hangul text into per-jamo symbols.

    hangul_to_jamo splits each syllable into lead/vowel/tail jamo, e.g.
    '존경하는' -> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ'].

    Args:
        text: raw Hangul string.
        as_id: return integer symbol ids instead of characters.
        symbol_type: which symbol table (1-4) to expand each jamo with.
        debug: print the selected char-to-id table.

    Returns:
        List of tokens (or ids) terminated by EOS.

    Raises:
        ValueError: for a symbol_type outside 1-4 (the original silently
        returned None).
    """
    # Table-driven dispatch replaces four copy-pasted if/elif branches.
    tables = {
        1: (load_symbols_1(), char_to_id_1),
        2: (load_symbols_2(), char_to_id_2),
        3: (load_symbols_3(), char_to_id_3),
        4: (load_symbols_4(), char_to_id_4),
    }
    if symbol_type not in tables:
        raise ValueError("symbol_type must be 1, 2, 3 or 4, got %r" % (symbol_type,))
    jamo_map, char_to_id = tables[symbol_type]

    if debug:
        print(char_to_id)

    text = normalize(text)
    pre_tokens = list(hangul_to_jamo(text))

    tokens = []
    for token in pre_tokens:
        tokens += list(jamo_map[token])

    if as_id:
        return [char_to_id[t] for t in tokens] + [char_to_id[EOS]]
    return tokens + [EOS]
def tokenize(text, as_id=False):
    """Decompose Hangul text into lead/vowel/tail jamo tokens plus EOS.

    Uses hangul_to_jamo from the jamo package, e.g.
    '존경하는' -> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ'].
    With as_id=True, tokens are converted to ids via char_to_id.
    """
    normalized = normalize(text)
    jamo_tokens = list(hangul_to_jamo(normalized))
    if not as_id:
        return jamo_tokens + [EOS]
    return [char_to_id[t] for t in jamo_tokens] + [char_to_id[EOS]]
def tokenize(text, as_id=False):
    """Split *text* into jamo characters (e.g. '안녕' -> ㅇ,ㅏ,ㄴ,ㄴ,ㅕ,ㅇ) plus EOS.

    With as_id=True each token is mapped to its integer id via char_to_id.
    """
    jamo_seq = list(hangul_to_jamo(normalize(text)))
    if as_id:
        ids = [char_to_id[c] for c in jamo_seq]
        ids.append(char_to_id[EOS])
        return ids
    return jamo_seq + [EOS]
def test_hangul_to_jamo(self):
    """hangul_to_jamo tests

    Arguments may be iterables or characters.

    hangul_to_jamo should split every Hangul character into U+11xx jamo
    for any given string. Anything else is unchanged.
    """
    # Inputs: single syllables, a two-syllable word, mixed Hangul/Latin,
    # and Hangul mixed with CJK ideographs.
    test_cases = ["자", "모", "한", "글", "서", "울", "평", "양", "한굴",
                  "Do you speak 한국어?", "자모=字母"]
    # Expected decompositions, expressed as U+11xx code points so the
    # intent (jamo, not compatibility jamo) is unambiguous.
    desired_jamo = [(chr(0x110c), chr(0x1161)),
                    (chr(0x1106), chr(0x1169)),
                    (chr(0x1112), chr(0x1161), chr(0x11ab)),
                    (chr(0x1100), chr(0x1173), chr(0x11af)),
                    (chr(0x1109), chr(0x1165)),
                    (chr(0x110b), chr(0x116e), chr(0x11af)),
                    (chr(0x1111), chr(0x1167), chr(0x11bc)),
                    (chr(0x110b), chr(0x1163), chr(0x11bc)),
                    (chr(0x1112), chr(0x1161), chr(0x11ab),
                     chr(0x1100), chr(0x116e), chr(0x11af)),
                    tuple(_ for _ in "Do you speak ") +
                    (chr(0x1112), chr(0x1161), chr(0x11ab),
                     chr(0x1100), chr(0x116e), chr(0x11a8),
                     chr(0x110b), chr(0x1165)) + ('?',),
                    (chr(0x110c), chr(0x1161), chr(0x1106), chr(0x1169),
                     "=", "字", "母")]
    for hangul, target in zip(test_cases, desired_jamo):
        trial = jamo.hangul_to_jamo(hangul)
        # NOTE(review): implicit concatenation of the two literals below
        # produces "...didn't returnan instance..." — a missing space in
        # the failure message.
        assert trial.__name__ == "<genexpr>",\
            ("hangul_to_jamo didn't return"
             "an instance of a generator.")
        trial = tuple(trial)
        # The failure message uses the detailed (lead, vowel, tail) form
        # only for single-character inputs; longer inputs get the generic
        # "Incorrectly converted" message.
        assert target == trial,\
            ("Converted {hangul} to {failure}, but expected "
             "({lead}, {vowel}, "
             "{tail}).").format(hangul=hangul,
                                lead=hex(ord(target[0])),
                                vowel=hex(ord(target[1])),
                                tail=hex(ord(target[2]))
                                if len(target) == 3 else "",
                                failure=tuple([hex(ord(_)) for _ in trial]))\
            if len(hangul) == 1 else\
            ("Incorrectly converted {hangul} to "
             "{failure}.".format(hangul=hangul,
                                 failure=[hex(ord(_)) for _ in trial]))
def to_jamo(data, use_counter=False):
    """Collect the jamo appearing across *data*.

    Returns a Counter of jamo frequencies when use_counter is True,
    otherwise the set of distinct jamo.
    """
    result = Counter() if use_counter else set()
    for item in tqdm(data):
        result.update(hangul_to_jamo(item))
    return result
def run_eval(args):
    """Synthesize every sentence in `sentences` to a wav file.

    Output files are named after the checkpoint base path, one per sentence.
    """
    synth = Synthesizer()
    synth.load(args.checkpoint)
    base_path = get_output_base_path(args.checkpoint)
    for i, sentence in enumerate(sentences):
        # Decompose to jamo before feeding the synthesizer.
        jamo_text = ''.join(hangul_to_jamo(sentence))
        path = '%s-%d.wav' % (base_path, i)
        print('Synthesizing: %s' % path)
        with open(path, 'wb') as f:
            f.write(synth.synthesize(jamo_text, base_path, i))
def decode(self, encoded_text):
    """Map each Hangul character of *encoded_text* back through decoder_dict.

    Non-Hangul characters pass through unchanged. After decoding, the
    marker characters are post-processed: 'x' is dropped, 'ŋ' becomes
    'ng' and 'ƒ' becomes 'wh'.
    """
    pieces = []
    for ch in encoded_text:
        if jamo.is_hangul_char(ch):
            # Readability fix: the original comprehension reused the name
            # `ch` for its inner loop variable, shadowing the outer one.
            pieces.append(''.join(self.decoder_dict[j]
                                  for j in jamo.hangul_to_jamo(ch)))
        else:
            pieces.append(ch)
    # join() instead of repeated string += (quadratic in the original).
    decoded_sent = ''.join(pieces)
    return decoded_sent.replace('x', '').replace('ŋ', 'ng').replace('ƒ', 'wh')
def text_to_sequence(text):
    """Strip punctuation, decompose to jamo if needed, and map to symbol ids.

    Returns a numpy array of ids terminated by the '~' separator id.
    """
    text = re.sub(r"[,./!@#$%^&*()?]", '', text)
    sequence = []
    # Guard: ord(text[0]) raised IndexError in the original when the input
    # was empty (or became empty after the punctuation strip).
    if text and not 0x1100 <= ord(text[0]) <= 0x1113:
        text = ''.join(hangul_to_jamo(text))
    for s in text:
        sequence.append(_symbol_to_id[s])
    sequence.append(_symbol_to_id['~'])  # '~' = sentence separator
    return np.asarray(sequence)
def test_hangul_to_jamo(self):
    """hangul_to_jamo tests

    Arguments may be iterables or characters.

    hangul_to_jamo should split every Hangul character into U+11xx jamo
    for any given string. Anything else is unchanged.
    """
    # Inputs: single syllables, a two-syllable word, mixed Hangul/Latin,
    # and Hangul mixed with CJK ideographs.
    test_cases = [
        "자", "모", "한", "글", "서", "울", "평", "양", "한굴",
        "Do you speak 한국어?", "자모=字母"
    ]
    # Expected decompositions as U+11xx code points (jamo, not
    # compatibility jamo).
    desired_jamo = [
        (chr(0x110c), chr(0x1161)),
        (chr(0x1106), chr(0x1169)),
        (chr(0x1112), chr(0x1161), chr(0x11ab)),
        (chr(0x1100), chr(0x1173), chr(0x11af)),
        (chr(0x1109), chr(0x1165)),
        (chr(0x110b), chr(0x116e), chr(0x11af)),
        (chr(0x1111), chr(0x1167), chr(0x11bc)),
        (chr(0x110b), chr(0x1163), chr(0x11bc)),
        (chr(0x1112), chr(0x1161), chr(0x11ab),
         chr(0x1100), chr(0x116e), chr(0x11af)),
        tuple(_ for _ in "Do you speak ") +
        (chr(0x1112), chr(0x1161), chr(0x11ab),
         chr(0x1100), chr(0x116e), chr(0x11a8),
         chr(0x110b), chr(0x1165)) + ('?', ),
        (chr(0x110c), chr(0x1161), chr(0x1106), chr(0x1169),
         "=", "字", "母")
    ]
    for hangul, target in zip(test_cases, desired_jamo):
        trial = jamo.hangul_to_jamo(hangul)
        # NOTE(review): implicit concatenation of the two literals below
        # produces "...didn't returnan instance..." — a missing space in
        # the failure message.
        assert trial.__name__ == "<genexpr>",\
            ("hangul_to_jamo didn't return"
             "an instance of a generator.")
        trial = tuple(trial)
        # Detailed (lead, vowel, tail) failure message only for
        # single-character inputs; longer inputs get the generic message.
        assert target == trial,\
            ("Converted {hangul} to {failure}, but expected "
             "({lead}, {vowel}, "
             "{tail}).").format(hangul=hangul,
                                lead=hex(ord(target[0])),
                                vowel=hex(ord(target[1])),
                                tail=hex(ord(target[2]))
                                if len(target) == 3 else "",
                                failure=tuple([hex(ord(_)) for _ in trial]))\
            if len(hangul) == 1 else\
            ("Incorrectly converted {hangul} to "
             "{failure}.".format(hangul=hangul,
                                 failure=[hex(ord(_)) for _ in trial]))
def encode(input_path, label_path):
    """Load one (feature, label) pair for training.

    Reads the feature array named by *input_path* from input_dir, applies
    low-frame-rate stacking and instance normalization, then reads the
    transcript named by *label_path* from label_dir and converts it to a
    SOS/EOS-delimited id array (jamo-level or token-level depending on
    args.token_style).

    NOTE(review): input_path/label_path appear to be TF string tensors
    (.numpy().decode) — confirm this runs inside tf.py_function.
    """
    # Renamed from `input` to avoid shadowing the builtin.
    features = np.load(
        os.path.join(input_dir,
                     input_path.numpy().decode('utf8'))).astype('float32')
    # lower frame rate
    features = build_lfr(features)
    # instance normalization
    features = (features - features.mean()) / features.std()
    with open(os.path.join(label_dir, label_path.numpy().decode('utf8')),
              'r', encoding='utf-8') as f_in:
        label = f_in.readline()
    if args.token_style == 'jamo':
        label = hangul_to_jamo(label)
        label = np.array([_symbol_to_id[SOS]] +
                         [_symbol_to_id[x] for x in label] +
                         [_symbol_to_id[EOS]]).astype('int32')
    else:
        label = np.array([token_index[SOS]] +
                         [token_index[x] for x in label] +
                         [token_index[EOS]]).astype('int32')
    return features, label
def korean_to_jamo(text):
    """Decompose a Hangul string into a flat string of jamo characters."""
    jamo_chars = hangul_to_jamo(text)
    return "".join(jamo_chars)
# Modern Hangul jamo code points: leads U+1100-U+1112, vowels U+1161-U+1175,
# tails U+11A8-U+11C2 (range() upper bounds are exclusive).
_JAMO_LEADS = "".join(chr(cp) for cp in range(0x1100, 0x1113))
_JAMO_VOWELS = "".join(chr(cp) for cp in range(0x1161, 0x1176))
_JAMO_TAILS = "".join(chr(cp) for cp in range(0x11A8, 0x11C3))

# NOTE(review): "VAILD" is a typo for "VALID", but the name is kept because
# other modules may import it. list() replaces the original comprehension,
# which also shadowed the `jamo` package name.
_VAILD_JAMO = list(_JAMO_LEADS + _JAMO_VOWELS + _JAMO_TAILS)

# Full symbol inventory: pad, special marker, punctuation, space, then jamo.
korean_symbol = [_pad] + [_special] + list(_punctuation) + [_space] + _VAILD_JAMO

if __name__ == '__main__':
    # Smoke test: print the table and encode a sample sentence.
    print(korean_symbol)
    print(len(korean_symbol))
    symbol_to_id = {s: i for i, s in enumerate(korean_symbol)}
    text = '안녕하세요 3 분반'
    from jamo import hangul_to_jamo
    h2j = "".join(hangul_to_jamo(text))
    print([symbol_to_id[ch] for ch in h2j])
    print(list(h2j))
self.model.enc_input: [np.asarray(seq, dtype=np.int32)], self.model.sequence_length: np.asarray([len(seq)], dtype=np.int32), self.model.dec_input: dec_input }) if i < 200: dec_input[:, i, :] = mel_out[5 * i - 1, :] pred.extend(mel_out[5 * (i - 1):5 * i, :]) np.save(os.path.join(args.save_dir, 'mel-{}'.format(idx)), pred, allow_pickle=False) input_seq = sequence_to_text(seq) alignment_dir = os.path.join(args.save_dir, 'align-{}.png'.format(idx)) plot_alignment(alignment, alignment_dir, input_seq) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--step', required=True) parser.add_argument('--save_dir', default='./output') args = parser.parse_args() os.makedirs(args.save_dir, exist_ok=True) synth = Synthesizer() synth.load(args.step) for i, text in enumerate(sentences): jamo = ''.join(list(hangul_to_jamo(text))) synth.synthesize(args, jamo, i)
def tokenize(text, as_id=True, symbol_type=0, debug=False):
    # Tokenizes Korean text either via a KoG2P phoneme rulebook
    # (symbol_type == 0, the default) or via jamo decomposition expanded
    # through one of four symbol tables (symbol_type 1-4).
    j2hj, j2hcj, j2sj, j2shcj = load_symbols_1(), load_symbols_2(
    ), load_symbols_3(), load_symbols_4()
    text = normalize(text)
    # pre_tokens = list(hangul_to_jamo(text))
    tokens = []
    # symbol_type=1
    if symbol_type:
        pre_tokens = list(hangul_to_jamo(text))
        # Convert any Hangul Compatibility Jamo to the U+11xx lead form.
        pre_tokens = [
            hcj_to_jamo(_, "lead") if is_hcj(_) else _ for _ in pre_tokens
        ]
    else:
        # Grapheme-to-phoneme conversion via the KoG2P rulebook; tokens are
        # space-separated phoneme symbols.
        pre_tokens = runKoG2PTest(text, './text/rulebook.txt').split(' ')
        # NOTE(review): reconstructed indentation — this early return is
        # assumed to belong to the symbol_type == 0 branch (otherwise the
        # symbol_type 1-4 branches below would be unreachable); confirm
        # against the original file.
        if as_id:
            return [char_to_id_1[pre] for pre in pre_tokens] + [char_to_id_1[EOS]]
        else:
            return [pre for pre in pre_tokens] + [EOS]
    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            tokens += list(j2hj[token])
        if as_id:
            return [char_to_id_1[token] for token in tokens] + [char_to_id_1[EOS]]
        else:
            return [token for token in tokens] + [EOS]
    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])
        if as_id:
            return [char_to_id_2[token] for token in tokens] + [char_to_id_2[EOS]]
        else:
            return [token for token in tokens] + [EOS]
    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])
        if as_id:
            return [char_to_id_3[token] for token in tokens] + [char_to_id_3[EOS]]
        else:
            return [token for token in tokens] + [EOS]
    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])
        if as_id:
            return [char_to_id_4[token] for token in tokens] + [char_to_id_4[EOS]]
        else:
            return [token for token in tokens] + [EOS]
def tokenize(text):
    """Normalize *text*, split it into jamo, and append the '~' terminator."""
    jamo_tokens = list(hangul_to_jamo(normalize(text)))
    jamo_tokens.append('~')
    return jamo_tokens
print(text) print(normalize(text)) print("=" * 30) # test_normalize("제 전화번호는 01012345678이에요.") # test_normalize("60 대 30으로") # test_normalize("2020년 월드컵에서는 한국74이 4강") # test_normalize("3개월 전에 골프를 치다가") # test_normalize("1025호실 환자") # test_normalize("2013년에는 작은 아파트에 대한") # test_normalize("국어 시험에서 80점을 받았어요.") # test_normalize('근처에 24시간 여는 슈퍼마켓 있나요?') # test_normalize('지금은 23시10분 입니다') test_normalize('아버지는 20살 때부터 버스를 모셨다.') # test_normalize("이 상자는 가로 30, 세로 50, 높이 20센티다.") # test_normalize("3, 6, 9 게임 아세요?") # test_normalize("남은 시간이 6개월이래요") # test_normalize("36 개월 할부") # test_normalize("114에 전화를 해서 번호를 알아보시지 그러세요?") # test_normalize("축구에서 한 팀은 11명으로 이루어진다.") # test_normalize("그 연극은 5월 1일부터 10월 31일까지 월요일을 제외하고 매일 공연됩니다.") # test_normalize("우리의 목표는 에너지 소비를 10% 줄이는 것입니다.") # test_normalize('5 시 36분 32초') # test_normalize('2 명 입니다') # test_normalize('3명 입니다') # test_normalize("mp3 파일을 홈페이지에서 다운로드 받으시기 바랍니다.") # test_normalize("오늘(13일) 3,600마리 강아지가") # test_normalize("33001명의 사람이 모였습니다") # test_normalize("60.3%") print(list(hangul_to_jamo(list(hangul_to_jamo('남은 시간이 "6개월이래요”')))))
def text_to_idx(self, text):
    """Convert *text* to symbol ids, silently skipping unknown jamo."""
    ids = []
    for token in hangul_to_jamo(text):
        if token in self.symbol_dic:
            ids.append(self.symbol_dic[token])
    return ids
if not is_count and kor.startswith("일") and len(kor) > 1: kor = kor[1:] if float_str is not None: kor += "쩜 " kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str) if num_str.startswith("+"): kor = "플러스 " + kor elif num_str.startswith("-"): kor = "마이너스 " + kor return kor + unit_str if __name__ == "__main__": def test_normalize(text): print(text) print(normalize(text)) print("=" * 30) test_normalize("어제 미술관 옆 동물원에 갔어요.") test_normalize("오늘(13일) 3,600마리 강아지가") test_normalize("60.3%") test_normalize('"저돌"(猪突) 입니다.') test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”') test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다") test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다") print(list(hangul_to_jamo(list(hangul_to_jamo('어제 미술관 옆 동물원에 갔어요.')))))
import json
import csv
from jamo import hangul_to_jamo
import argparse

from utils import load_json
from text.korean import normalize

if __name__ == "__main__":
    # Convert an alignment.json into a '|'-delimited transcript.txt with
    # columns: filename | raw text | normalized text | jamo | duration.
    parser = argparse.ArgumentParser()
    parser.add_argument('--alignment_path', required=True)
    parser.add_argument('--remove_prefix', required=True)
    config = parser.parse_args()

    data = load_json(config.alignment_path, encoding="utf8")
    out_txt = config.alignment_path.replace('alignment.json', 'transcript.txt')

    # Fix: the original never closed the file handle and omitted
    # newline=''/encoding, which csv needs for correct quoting and
    # non-ASCII (Hangul) output.
    with open(out_txt, "w", newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file, delimiter='|')
        for file in data:
            filename = file.replace(config.remove_prefix, '')
            text = data[file]
            norm = normalize(text)
            decomp = list(hangul_to_jamo(norm))
            writer.writerow([filename, text, norm, ''.join(decomp), "0.0"])
def _text_to_jaso(self, line: str) -> List[str]:
    """Decompose *line* into a list of individual jamo characters."""
    return list(jamo.hangul_to_jamo(line))
kor += "쩜 " kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str) if num_str.startswith("+"): kor = "플러스 " + kor elif num_str.startswith("-"): kor = "마이너스 " + kor return kor + unit_str if __name__ == "__main__": def test_normalize(text): print(text) print(normalize(text)) print("=" * 30) test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute") test_normalize("오늘(13일) 3,600마리 강아지가") test_normalize("60.3%") test_normalize('"저돌"(猪突) 입니다.') test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”') test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다") test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다") print( list( hangul_to_jamo( list( hangul_to_jamo( '비대위원장이 지난 1월 이런 말을 했습니다? “난 그냥 산돼지처럼 돌파하는 스타일이다”')))))
def tokenize(text, as_id=False, symbol_type=1, debug=False):
    # Tokenizes Hangul text into per-jamo symbols using one of four symbol
    # tables (symbol_type 1-4); returns tokens (or ids when as_id=True)
    # terminated by EOS. Falls through and returns None for any other
    # symbol_type.
    j2hj, j2hcj, j2sj, j2shcj = load_symbols_1(), load_symbols_2(), load_symbols_3(), load_symbols_4()
    text = normalize(text)
    pre_tokens = list(hangul_to_jamo(text))
    # Convert any Hangul Compatibility Jamo to the U+11xx lead-jamo form.
    pre_tokens = [hcj_to_jamo(_, "lead") if is_hcj(_) else _ for _ in pre_tokens]
    tokens = []
    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            # Sanitization chain: curly quotes, ellipsis and some stray
            # bytes are replaced by spaces before the table lookup.
            # NOTE(review): '\xe1\x84\x8b' is a three-character string
            # (U+00E1 U+0084 U+008B) — it looks like the UTF-8 byte
            # sequence of 'ᄋ' (U+110B) pasted as code points, i.e. a
            # mojibake workaround; it can never match a single jamo token.
            # Confirm intent before changing.
            #token = token.encode('utf-8','ignore')
            token = token.replace('\u201d',' ')
            token = token.replace('\u2026',' ')
            token = token.replace('\u2018',' ')
            token = token.replace('\u201c',' ')
            token = token.replace('\u2019',' ')
            token = token.replace('\xe1\x84\x8b',' ')
            token = token.replace('\xb7',' ')
            token = token.replace('\xa0',' ')
            tokens += list(j2hj[token])
        if as_id:
            return [char_to_id_1[token] for token in tokens] + [char_to_id_1[EOS]]
        else:
            return [token for token in tokens] + [EOS]
    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])
        if as_id:
            return [char_to_id_2[token] for token in tokens] + [char_to_id_2[EOS]]
        else:
            return [token for token in tokens] + [EOS]
    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])
        if as_id:
            return [char_to_id_3[token] for token in tokens] + [char_to_id_3[EOS]]
        else:
            return [token for token in tokens] + [EOS]
    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])
        if as_id:
            return [char_to_id_4[token] for token in tokens] + [char_to_id_4[EOS]]
        else:
            return [token for token in tokens] + [EOS]