def _load_dataset(self):
    # file_list = os.listdir('/home/Data/FoodDetection/data/text_recognition/Korean/public_crop')
    file_list = os.listdir(
        '/Data/FoodDetection/data/text_recognition/Korean/public_crop')
    dataset = []
    for file_name in file_list:
        # img = os.path.join('/home/Data/FoodDetection/data/text_recognition/Korean/public_crop/', file_name)
        img = os.path.join(
            '/Data/FoodDetection/data/text_recognition/Korean/public_crop/', file_name)
        # Skip vertical-text images (disabled):
        # h, w, c = np.asarray(img).shape
        # if h > w:
        #     continue
        label = file_name.replace('.jpg', '').replace(' ', '')
        continue_flag = False
        if self.mode == 'jamo':
            label_split = j2hcj(h2j(label))
            # Skip labels containing jamo outside the printable set (e.g. special characters)
            for char in label_split:
                if char not in jamo_printable:
                    continue_flag = True
            if continue_flag:
                continue
            top_tmp = []
            middle_tmp = []
            bottom_tmp = []
            for char in label:
                decomposed = j2hcj(h2j(char))
                # use `part`, not `label`, so the outer label is not overwritten
                for i, part in enumerate([top_tmp, middle_tmp, bottom_tmp]):
                    try:
                        part.append(decomposed[i])
                    except IndexError:
                        part.append(' ')
            # for img, top, middle, bottom in zip(img, top_tmp, middle_tmp, bottom_tmp):
            dataset.append([img, top_tmp, middle_tmp, bottom_tmp])
        elif self.mode == 'syllable':
            # label = list(label)
            for syllable in label:
                if syllable not in syllable_printable:
                    continue_flag = True
            if continue_flag:
                continue
            dataset.append([img, label])
    return dataset
def combine(self, verb, ending, rule):
    if not rule:
        return []
    stop, postfix, start = rule.split(",")
    stop = None if stop == "" else int(stop)
    start = None if start == "" else int(start)

    # STEP 1. Decompose verb
    verb = h2j(verb)  # h: hangul syllable, j: jamo

    # STEP 2. Slice 1
    verb = verb[:stop]

    # STEP 3. Merge 2 and postfix
    wordform = verb + postfix

    # STEP 4. Decompose ending
    ending = h2j(ending)
    ending = "".join(hcj_to_jamo(char, "tail") if is_hcj(char) else char
                     for char in ending)

    # STEP 5. Slice 4
    ending = ending[start:]

    # STEP 6. Merge 3 and 5
    wordform += "|" + ending

    # STEP 7. Compose 6
    wordform = self.compose(wordform)
    return wordform
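# Note on the rule format consumed by combine() above (a reading of the code, not
# documented behaviour): rule is "stop,postfix,start", where the verb's jamo are
# kept up to `stop`, `postfix` is appended, and the ending's jamo are kept from
# `start` on; empty fields mean "keep everything". For a hypothetical rule ",ㅆ,"
# the whole verb is kept, 'ㅆ' is inserted, and the whole ending follows before
# self.compose() rebuilds the syllables.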
def hangul_to_sequence(hangul_text):
    # load conversion dictionaries
    # clean numbers and dates
    hangul_text_ = date_to_hangul(hangul_text)
    hangul_text_ = number_to_hangul(hangul_text_)
    hangul_text_ = clean_text(hangul_text_)

    # add end-of-sentence symbol
    hangul_text_ = hangul_text_ + u"␃"  # ␃: EOS

    # get dictionary of chars
    hangul_to_ids = _symbol_to_id

    # process jamos
    text = [h2j(char) for char in hangul_text_]
    text = chain.from_iterable(text)
    hangul_text_ = [h2j(char) for char in text]
    hangul_text_ = chain.from_iterable(hangul_text_)

    sequence = []
    try:
        # convert jamos to ids using the dictionary
        for char in hangul_text_:
            if char in symbols:
                sequence.append(hangul_to_ids[char])
            else:
                try:
                    print(char)
                    sequence.append(hangul_to_ids[symbols[hangul_symbol_hcj.index(char)]])
                except Exception as e:
                    sequence.append(hangul_to_ids['.'])
    except KeyError as e:
        raise KeyError('KeyError (at key: {}) when processing: {}'.format(e, hangul_text))
    return sequence
async def on_message(message):
    if message.author.id in playing and message.author.id != client.user.id and message.channel.id == user[message.author.id]['channel']:
        async with message.channel.typing():
            await asyncio.sleep(random.randint(0, config['timeover'] * 300) / 1000)
            jamo_txt = str(jamo.j2hcj(jamo.h2j(user[message.author.id]['this'][-1])))
            if jamo_txt.startswith("ㄹ"):
                jamo_char = [user[message.author.id]['this'][-1],
                             hangulutils.join_jamos("ㄴ" + str(jamo_txt[1:]))]
            else:
                jamo_char = message.content[0]
            if user[message.author.id]['this'][-1] in jamo_char:
                if not message.content in user[message.author.id]['used']:
                    if message.content in word:
                        temp = []
                        jamo_char = []
                        try:
                            jamo_txt = str(jamo.j2hcj(jamo.h2j(message.content[-1])))
                            if jamo_txt.startswith("ㄹ"):
                                jamo_char = [message.content[-1],
                                             hangulutils.join_jamos("ㅇ" + str(jamo_txt[1:]))]
                                for i in range(len(word)):
                                    if word[i][0] in jamo_char:
                                        temp.append(word[i])
                            else:
                                for i in range(len(word)):
                                    if word[i].startswith(message.content[-1]):
                                        temp.append(word[i])
                            user[message.author.id]['used'].append(message.content)
                            # random.choice avoids the off-by-one IndexError of randint(0, len(temp))
                            user[message.author.id]['this'] = random.choice(temp)
                            if message.author.id in playing:
                                await message.channel.send("`" + message.author.display_name + "`\n**" + user[message.author.id]['this'] + "**")
                                user[message.author.id]['used'].append(user[message.author.id]['this'])
                                user[message.author.id]['count'] = user[message.author.id]['count'] + 1
                                await wait(user[message.author.id]['count'], message.author.id, message)
                        except Exception as ex:
                            if message.author.id in playing:
                                playing.remove(message.author.id)
                                if user[message.author.id]['count']:
                                    embed = discord.Embed(title='게임승리', description=f"{message.author.display_name}\n`{str(user[message.author.id]['count'])}`")
                                    await message.channel.send(embed=embed)
                else:
                    await message.channel.send("이미 사용한 단어자나요 :thinking:")
    if message.content.startswith(config['prefix'] + "끝말"):
        if not message.author.id in playing:
            playing.append(message.author.id)
            user[message.author.id] = {}
            user[message.author.id]['used'] = []
            # random.choice avoids the off-by-one of randint(0, len(word))
            user[message.author.id]['this'] = random.choice(word)
            await message.channel.send("`" + message.author.display_name + "`\n**" + user[message.author.id]['this'] + "**")
            user[message.author.id]['used'].append(user[message.author.id]['this'])
            user[message.author.id]['channel'] = message.channel.id
            user[message.author.id]['count'] = 0
            user[message.author.id]['status'] = 0
            await wait(user[message.author.id]['count'], message.author.id, message)
        else:
            await message.channel.send("이미 게임중이잖아요!\n뭐하는거시에오 ㅇ0ㅇㅠㅠㅠ")
def MypartFunction(request):
    startword = request.GET['startword']
    user_log = request.session.get('user')
    user_id = request.session.get('user_id')
    user = Tuser.objects.get(user_id=user_id)
    ureview2 = Treview.objects.filter(treviewid=user_id)
    print(ureview2)

    ureview = []
    for i in ureview2:
        # keep reviews whose tour name starts with the requested initial consonant
        if j2hcj(h2j(i.tourid.tourname))[0] == startword:
            print(j2hcj(h2j(i.tourid.tourname))[0])
            ureview.append(i)
        if startword == "*":
            # "*" collects names that do not start with a Hangul consonant
            if j2hcj(h2j(i.tourid.tourname))[0] not in [
                    'ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ',
                    'ㅌ', 'ㅍ', 'ㅎ', 'ㄲ', 'ㄸ', 'ㅆ', 'ㅉ', 'ㅃ']:
                print(j2hcj(h2j(i.tourid.tourname)))
                ureview.append(i)
    print(ureview)

    paginator = Paginator(ureview, 20)
    page = request.GET.get('page')
    try:
        data = paginator.page(page)
    except PageNotAnInteger:
        data = paginator.page(1)
    except EmptyPage:
        data = paginator.page(paginator.num_pages)
    print(data)

    # range used to render the individual page links
    allpage = range(paginator.num_pages + 1)
    # return render(request, 'board.html', {'data': data, 'allpage': allpage})

    urls = []
    for ur in ureview:
        urdic = {
            'tourid': ur.tourid.tourid,
            'tourname': ur.tourid.tourname,
            'area': ur.tourid.city + " " + ur.tourid.town,
            'rating': ur.rating
        }
        urls.append(urdic)

    context = {
        'data': data,
        'allpage': allpage,
        'w': startword,
        'user': user,
        'user_log': user_log,
        'user_id': user_id
    }
    return render(request, 'mypartreview.html', context)
def load_data(mode="train"):
    '''Loads data
    Args:
        mode: "train" or "synthesize".
    '''
    # Load vocabulary
    char2idx, idx2char = load_vocab()

    # load conversion dictionaries
    j2hcj, j2sj, j2shcj = load_j2hcj(), load_j2sj(), load_j2shcj()

    # Parse
    fpaths, text_lengths, texts = [], [], []
    transcript = os.path.join(hp.data, 'jss.v1.0.txt')
    lines = codecs.open(transcript, 'r', 'utf-8').readlines()
    if mode == "train":
        lines = lines[:-100]
    else:
        lines = lines[-100:]

    for line in lines:
        fname, text = line.strip().split("|")
        fpath = os.path.join(hp.data, fname)
        fpaths.append(fpath)

        text += "␃"  # ␃: EOS
        if hp.token_type == "char":  # syllable
            text = list(text)
        else:
            text = [h2j(char) for char in text]
            text = chain.from_iterable(text)
            if hp.token_type == "j":  # jamo
                text = [h2j(char) for char in text]
            elif hp.token_type == "sj":  # single jamo
                text = [j2sj.get(j, j) for j in text]
            elif hp.token_type == "hcj":  # hangul compatibility jamo
                text = [j2hcj.get(j, j) for j in text]
            elif hp.token_type == "shcj":  # single hangul compatibility jamo
                text = [j2shcj.get(j, j) for j in text]
            text = chain.from_iterable(text)

        text = [char2idx[char] for char in text if char in char2idx]
        text_lengths.append(len(text))
        if mode == "train":
            texts.append(np.array(text, np.int32).tostring())
        else:
            texts.append(text + [0] * (hp.max_N - len(text)))

    return fpaths, text_lengths, texts
def jamo_to_korean(text):
    text = h2j(text)

    idx = 0
    new_text = ""
    candidates = []

    while True:
        if idx >= len(text):
            new_text += _get_text_from_candidates(candidates)
            break

        char = text[idx]
        mode = get_mode(char)

        if mode == 0:
            new_text += _get_text_from_candidates(candidates)
            candidates = [char]
        elif mode == -1:
            new_text += _get_text_from_candidates(candidates)
            new_text += char
            candidates = []
        else:
            candidates.append(char)

        idx += 1
    return new_text
def con_menu(post):
    # Build nng_list holding every general noun (NNG) found in the post
    nng_list = []
    # nn = ['NNG', 'NNP', 'NNB', 'NP']
    for word in api.analyze(post):
        for morph in word.morphs:
            if morph.tag == 'NNG':
                nng_list.append(morph.lex)

    # Return 1 if three or more words from nng_list appear in the dessert menu
    # dictionary, 0 if the list is exhausted first
    count = 0
    nng_count = 0
    while True:
        if nng_count >= 3:
            return 1
        elif count >= len(nng_list):
            return 0
        nng_name = nng_list[count]
        nng_first = j2hcj(h2j(nng_name))[0]
        if nng_name in menu_dic[nng_first]:
            nng_count += 1
        count += 1
def create_phoneme_dictionary(source_path):
    grapheme_dict, phoneme_dict = {}, {}
    for lab_file in tqdm(glob(get_path(source_path, "**", "*.lab"))):
        sentence = read_file(lab_file)
        word_list = sentence.split(" ")
        grapheme_list = h2j(sentence).split(" ")
        phoneme_list = h2j(g2p(sentence)).split(" ")
        for idx, word in enumerate(word_list):
            if not word in grapheme_dict.keys():
                grapheme_dict[word] = " ".join(grapheme_list[idx])
            if not word in phoneme_dict.keys():
                phoneme_dict[word] = " ".join(phoneme_list[idx])
    return grapheme_dict, phoneme_dict
def con_verb(post):
    # Build verb_list holding every verb/adjective/auxiliary (VV, VA, VX) in the post
    verb_list = []
    need_mm = ['VV', 'VA', 'VX']
    for word in api.analyze(post):
        for morph in word.morphs:
            if morph.tag in need_mm:
                verb_list.append(morph.lex)

    # Return 1 if two or more words from verb_list appear in the verb
    # dictionary, 0 if the list is exhausted first
    count = 0
    verb_count = 0
    while True:
        if verb_count >= 2:
            return 1
        elif count >= len(verb_list):
            return 0
        verb = verb_list[count]
        verb_first = j2hcj(h2j(verb))[0]
        if verb in verb_dic[verb_first]:
            verb_count += 1
        count += 1
def plot(alignment, info, text):
    char_len, audio_len = alignment.shape  # e.g. 145, 200

    fig, ax = plt.subplots(figsize=(char_len / 5, 5))
    im = ax.imshow(
        alignment.T,
        aspect='auto',
        origin='lower',
        interpolation='none')

    xlabel = 'Encoder timestep'
    ylabel = 'Decoder timestep'

    if info is not None:
        xlabel += '\n{}'.format(info)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    if text:
        jamo_text = j2hcj(h2j(normalize(text)))
        pad = [PAD] * (char_len - len(jamo_text) - 1)
        plt.xticks(range(char_len),
                   [tok for tok in jamo_text] + [EOS] + pad)

    if text is not None:
        while True:
            if text[-1] in [EOS, PAD]:
                text = text[:-1]
            else:
                break
        plt.title(text)

    plt.tight_layout()
def page_text_finder(self, report_text):
    page_text = ''
    text = ''
    found = False
    company_name = self.file_nm.split('_')[3]
    company_num = self.file_nm.split('_')[4][1:]
    company_dict = {'LG상사': 'LG 상사'}

    # To resolve hangul encoding issue
    company_name = hangul.join_jamos(j2hcj(h2j(company_name)))
    if company_name in company_dict.keys():
        company_name = company_dict[company_name]

    for line in report_text.split('\n'):
        if "page_id" in line and '||Title|| ' + company_name in text and company_num in text:
            page_text = text
            found = True
            break
        elif "page_id" in line:
            text = ''
        else:
            text += line + '\n'

    return page_text, found, company_name, company_num
def get_prefix_list(word, prefix_length):
    prefix_list = list()
    word = word[:prefix_length]
    alphabets = j2hcj(h2j(word))
    for i in range(0, len(alphabets)):
        prefix_list.append(alphabets[:i + 1])
    return prefix_list
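# A small usage sketch for get_prefix_list above (assumes jamo's h2j/j2hcj are
# imported as in the snippet): each prefix grows by one compatibility jamo.
if __name__ == "__main__":
    print(get_prefix_list("검색", 1))  # ['ㄱ', 'ㄱㅓ', 'ㄱㅓㅁ']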
def find_complement(input_string):
    # (complement markers attached to '되다' are currently not detected as JKC)
    temp_string = input_string
    complementArr = []
    N_cnt = 0
    for i in range(len(temp_string)):
        if temp_string[i][1].find('JKC') != -1:  # complement case marker found in the morpheme analysis
            for j in range(0, i):  # scan from the start of the sentence up to the marker
                N_cnt = 0
                if (temp_string[j][1] == 'NNG' or temp_string[j][1] == 'NNP'
                        or temp_string[j][1] == 'NNB' or temp_string[j][1] == 'NP'):
                    N_cnt = j  # remember the noun closest to the complement marker
            for k in range(N_cnt, i + 1):  # from that noun up to the marker
                complementArr.append(temp_string[k])  # save
        if temp_string[i][1].find('JKS') != -1:
            do_jamo = j2hcj(h2j(temp_string[i + 1][0]))
            if (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅚ') or \
                    (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅙ'):
                for j in range(0, i):  # scan from the start of the sentence up to the marker
                    N_cnt = 0
                    if (temp_string[j][1] == 'NNG' or temp_string[j][1] == 'NNP'
                            or temp_string[j][1] == 'NNB' or temp_string[j][1] == 'NP'):
                        N_cnt = j  # remember the noun closest to the marker
                for k in range(N_cnt, i + 1):  # from that noun up to the marker
                    complementArr.append(temp_string[k])  # save
    # a sentence may contain several complements, so return them as a list
    return complementArr
def pack_samples(batch):
    # Return values
    b_as_char_tensor = []
    b_as_jamo_tensor = []

    for e in batch:
        e_char_seq = [
            torch.LongTensor([c2i[c] for c in tok]) for tok in e[0].split()
        ]
        e_jamo_seq = [
            torch.LongTensor([j2i[j] for j in jamo.j2hcj(jamo.h2j(tok))])
            for tok in e[0].split()
        ]
        b_as_char_tensor.append(e_char_seq)
        b_as_jamo_tensor.append(e_jamo_seq)

    b_lens = [len(t) for t in b_as_char_tensor]
    b_ch_padded = nn.utils.rnn.pad_sequence(sum(b_as_char_tensor, []),
                                            batch_first=True)
    b_jm_padded = nn.utils.rnn.pad_sequence(sum(b_as_jamo_tensor, []),
                                            batch_first=True)
    b_as_char_tensor = [
        b_ch_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
    ]
    b_as_jamo_tensor = [
        b_jm_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
    ]
    b_as_char_tensor = nn.utils.rnn.pad_sequence(b_as_char_tensor,
                                                 batch_first=True)
    b_as_jamo_tensor = nn.utils.rnn.pad_sequence(b_as_jamo_tensor,
                                                 batch_first=True)

    # Char and jamo batches must agree on batch size and max token count
    assert b_as_char_tensor.shape[0] == b_as_jamo_tensor.shape[0]
    assert b_as_char_tensor.shape[1] == b_as_jamo_tensor.shape[1]

    if batch[0][1] is not None:
        b_scores = torch.FloatTensor([float(e[1]) for e in batch])
    else:
        b_scores = None

    if len(cuda_device) > 0:
        b_as_char_tensor = b_as_char_tensor.to(f"cuda:{cuda_device[0]}")
        b_as_jamo_tensor = b_as_jamo_tensor.to(f"cuda:{cuda_device[0]}")
        if b_scores is not None:
            b_scores = b_scores.to(f"cuda:{cuda_device[0]}")

    b_lens = torch.LongTensor(b_lens)
    return b_as_char_tensor, b_as_jamo_tensor, b_lens, b_scores
def save_to_txt(file_nm, file_text):
    root_dir = '/Users/daniel/Desktop/test_2/after_inspec_txt/'
    path = root_dir + file_nm
    path = hangul.join_jamos(j2hcj(h2j(path)))
    print(file_nm)
    # encoding pinned so the Korean text does not depend on the platform default
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(file_text)
def string2jamo(string, letter=False):
    """Convert Korean string into Hangul Jamo sequence

    Args:
        letter : If true, return in Hangul compatibility Jamo.
    """
    jamos = h2j(string)
    if letter:
        return ''.join([conv_hcj(c) for c in jamos])
    return jamos
def get_jongsung_TF(sample_word):
    sample_text_list = list(sample_word)
    last_word = sample_text_list[-1]
    last_word_jamo_list = list(j2hcj(h2j(last_word)))
    last_jamo = last_word_jamo_list[-1]

    jongsung_TF = "T"
    if last_jamo in ['ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ',
                     'ㅘ', 'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ', 'ㅔ', 'ㅟ', 'ㅖ', 'ㅒ']:
        jongsung_TF = "F"

    return jongsung_TF
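# A minimal usage sketch for get_jongsung_TF above (assumes j2hcj/h2j from the
# jamo package are imported as in the snippet): it reports whether the final
# syllable carries a final consonant (jongsung).
if __name__ == "__main__":
    print(get_jongsung_TF("사과"))  # "F": '과' ends in the vowel 'ㅘ'
    print(get_jongsung_TF("책"))    # "T": '책' ends in the consonant 'ㄱ'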
def inflect(verb, ending, rule):
    if not rule:
        return []

    verb = h2j(verb)
    ending = h2j(ending)
    ending = "".join(
        hcj_to_jamo(char, "tail") if is_hcj(char) else char for char in ending)

    rules = rule[1:-1].split("/")

    forms = []
    for rule in rules:
        end, insertion, start = rule.split(",")
        end = int(end) if not end == "" else 100
        start = int(start) if not start == "" else 0

        form = verb[:end] + insertion + ending[start:]
        form = j2syl(form)
        forms.append(form)

    return forms
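# Note on the rule format consumed by inflect() above (a reading of the code, not
# documented behaviour): the rule string is wrapped in a pair of delimiters and
# may hold several "/"-separated alternatives, each "end,insertion,start". An
# empty `end` keeps the whole verb (treated as 100) and an empty `start` keeps
# the whole ending; j2syl is assumed to recompose the jamo string into syllables.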
def get_jongsung_TF(sentence):
    sentence = list(sentence)
    last_word = sentence[-1]
    last_word = list(j2hcj(h2j(last_word)))

    jongsung = "T"
    if last_word[-1] in ('ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ',
                         'ㅘ', 'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ', 'ㅔ', 'ㅟ', 'ㅖ', 'ㅒ',
                         '2', '4', '5', '9'):
        jongsung = "F"
    return jongsung
def count_con_vow_num_spe(sentence):
    sentence = j2hcj(h2j(sentence))

    # Initial consonants (choseong)
    CHOSUNG_LIST = [
        'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ',
        'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    ]
    # Medial vowels (jungseong)
    JUNGSUNG_LIST = [
        'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ',
        'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
    ]
    # Final consonants (jongseong)
    JONGSUNG_LIST = [
        'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
        'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ',
        'ㅌ', 'ㅍ', 'ㅎ'
    ]
    # Digits
    NUMBER_LIST = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    # Special characters
    SPECIAL_LIST = [
        '~', '@', '#', '$', '%', '&', '*', '(', ')', '_', '-', '+', '=',
        '+', '-', '`', ';', "'", ':', '>', '<', '/'
    ]

    count_consonant = []
    count_vowel = []
    count_number = []
    count_special = []
    for word in sentence:
        if word in CHOSUNG_LIST or word in JONGSUNG_LIST:
            count_consonant.append(word)
        elif word in JUNGSUNG_LIST:
            count_vowel.append(word)
        elif word in NUMBER_LIST:
            count_number.append(word)
        elif word in SPECIAL_LIST:
            count_special.append(word)

    # Check whether the string ends with a digit
    end_with_number_flag = 0
    if sentence[len(sentence) - 1] in NUMBER_LIST:
        end_with_number_flag = 1

    count_consonant = len(count_consonant)
    count_vowel = len(count_vowel)
    count_number = end_with_number_flag
    count_special = len(count_special)
    return count_consonant, count_vowel, count_number, count_special
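# A brief usage sketch for count_con_vow_num_spe above (assumes the jamo helpers
# are imported as in the snippet). Note the third return value is the
# ends-with-a-digit flag, not the digit count.
if __name__ == "__main__":
    print(count_con_vow_num_spe("가나1"))  # (2, 2, 1, 0)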
def write_multispeaker_emotion_metadata(source_path, savepath, speaker_dict):
    """
    save-format
    filename | transcript | transcript_jamo | transcript_phoneme | speaker_label | emotion_label
    => LJ-Speech-styled metadata format
    """
    contents = ""
    for lab_file in tqdm(glob(get_path(source_path, "**", "*.lab"))):
        filename = lab_file.split("/")[-1].replace("lab", "wav")
        transcript = read_file(lab_file)
        transcript_jamo = h2j(transcript)
        transcript_phoneme = h2j(g2p(transcript))
        speaker_label = speaker_dict[filename[:3]]
        emotion_label = "{:05d}".format(int(lab_file.replace(".lab", "")[-5:]) - 1)[-3]
        contents += "{}|{}|{}|{}|{}|{}\n".format(filename, transcript,
                                                 transcript_jamo,
                                                 transcript_phoneme,
                                                 speaker_label, emotion_label)

    with open(savepath, "w", encoding='utf-8') as f:
        f.write(contents)
def plot_alignment(alignment, path, info=None, text=None, isKorean=True):
    if text:
        tmp_alignment = alignment[:len(h2j(text)) + 2]
        plot(tmp_alignment, info, text, isKorean)
        plt.savefig(path, format='png')
    else:
        plot(alignment, info, text, isKorean)
        plt.savefig(path, format='png')
    print(" [*] Plot saved: {}".format(path))
def create_batch_inputs_from_texts(texts):
    sequences = [text_to_sequence(text) for text in texts]

    inputs = _prepare_inputs(sequences)
    input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32)

    for idx, (seq, text) in enumerate(zip(inputs, texts)):
        recovered_text = sequence_to_text(seq, skip_eos_and_pad=True)
        if recovered_text != h2j(text):
            log(" [{}] {}".format(idx, text))
            log(" [{}] {}".format(idx, recovered_text))
            log("=" * 30)

    return inputs, input_lengths
def splitOffFinalJamo(c):
    assert len(c) == 1
    assert isHangul(c)

    # important to check if there even is a tail!
    # otherwise, the following call won't work
    if jamoTail(c) > 0:
        # get the compatibility Jamo
        finalJamo = j2hcj(h2j(c)[-1])
        lead, vowel = jamoLead(c), jamoVowel(c)
        return assembleHangul(lead, vowel, 0) + finalJamo
    else:
        return c  # null final: nothing to split off
def decomposition(sentence):
    sentence = j2hcj(h2j(sentence))
    index = []
    for item in sentence:
        if (not isHangul(item) and item.isalpha()):
            index.insert(-1, sentence.find(item))
            break
    if (len(index)):
        part1 = list(sentence[:index[0] - 1])
        part2 = sentence[index[0]:].split()
        return ''.join((part1 + part2))
    else:
        return sentence
def plot_alignment(
        alignment, path, info=None, text=None, isKorean=True):
    if text:
        # e.g. text = '대체 투입되었던 구급대원이'
        # decompose the text into jamo and measure its length <--- strips the padding
        tmp_alignment = alignment[:len(h2j(text)) + 2]
        plot(tmp_alignment, info, text, isKorean)
        plt.savefig(path, format='png')
    else:
        plot(alignment, info, text, isKorean)
        plt.savefig(path, format='png')
    print(" [*] Plot saved: {}".format(path))
def read_kss_meta(path):
    # Parse
    char2idx, _ = load_vocab_tool('ko')
    meta = pd.read_table(path, sep='|', header=None)
    meta.columns = ['fpath', 'ori', 'expanded', 'decomposed', 'duration', 'en']
    fpaths, texts = [], []
    meta.expanded = 'P' + meta.expanded + 'E'
    for fpath, text in zip(meta.fpath.values, meta.expanded.values):
        t = np.array([char2idx[ch] for ch in jamo.h2j(text)])
        f = os.path.join(os.path.basename(fpath).replace('wav', 'npy'))
        texts.append(t)
        fpaths.append(f)
    return fpaths, texts, texts
def find_tense(sentence):
    # Table that stores sentences together with their tense:
    #   row 0: past    | sentence | ...
    #   row 1: present | sentence | ...
    #   row 2: future  | sentence | ...
    tense_table = [['past', ], ['present', ], ['future', ]]

    special_future = 0       # tracks the '것' + '이' future pattern
    is_present_flag = True   # flag used to decide present tense

    for i in range(len(sentence)):
        # future tense case 1: '것' + '이'
        if sentence[i][1].find('NNB') != -1 and sentence[i][0].find('것') != -1:
            do_jamo = j2hcj(h2j(sentence[i - 1][0]))  # decompose with jamo (할 -> ㅎㅏㄹ)
            if len(do_jamo) > 2 and do_jamo[2] == 'ㄹ':
                # a final consonant 'ㄹ' before '것이' marks the future form
                special_future = special_future + 1  # NNB here is '것', so increment
        if sentence[i][1].find('VCP') != -1 and sentence[i][0].find('이') != -1:
            special_future = special_future + 1  # VCP here is '이', so increment
        if special_future == 2:
            # both '것' and '이' are present, so treat it as future tense
            tense_table[2].append(sentence)
            is_present_flag = False
            break
        # skip honorific pre-final endings (시, 십, 세, 심, 실)
        if sentence[i][1].find('EP') != -1 \
                and not sentence[i][0].find('시') != -1 \
                and not sentence[i][0].find('십') != -1 \
                and not sentence[i][0].find('세') != -1 \
                and not sentence[i][0].find('실') != -1 \
                and not sentence[i][0].find('심') != -1:
            # future tense case 2: '겠'
            if sentence[i][0].find('겠') != -1:
                tense_table[2].append(sentence)
                is_present_flag = False
            # past tense
            else:
                tense_table[0].append(sentence)
                is_present_flag = False
            break

    # present tense
    if is_present_flag == True:
        tense_table[1].append(sentence)

    return tense_table
def create_batch_inputs_from_texts(texts):
    # run every input text through text_to_sequence (defined in text/__init__.py)
    sequences = [text_to_sequence(text) for text in texts]

    inputs = _prepare_inputs(sequences)
    # input_lengths holds the length of each element of inputs
    input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32)

    for idx, (seq, text) in enumerate(zip(inputs, texts)):
        recovered_text = sequence_to_text(seq, skip_eos_and_pad=True)
        if recovered_text != h2j(text):
            log(" [{}] {}".format(idx, text))
            log(" [{}] {}".format(idx, recovered_text))
            log("=" * 30)

    return inputs, input_lengths
def dividehangul(string):
    realletter = 0
    realtail = 0
    headcounts = defaultdict(int)
    vowelcounts = defaultdict(int)
    tailcounts = defaultdict(int)
    headfound = set()
    vowelfound = set()
    tailfound = set()

    for letter in string:
        parts = jamo.j2hcj(jamo.h2j(letter))
        if len(parts) > 2:
            head = parts[0]
            vowel = parts[1]
            tail = parts[2]
            realletter += 1  # realletter equals realvowel
            realtail += 1    # find list of jamo
            headfound.add(head)
            vowelfound.add(vowel)
            tailfound.add(tail)
            headcounts[head] += 1
            vowelcounts[vowel] += 1
            tailcounts[tail] += 1
        elif len(parts) > 1:
            head = parts[0]
            vowel = parts[1]
            realletter += 1
            headfound.add(head)
            vowelfound.add(vowel)
            headcounts[head] += 1
            vowelcounts[vowel] += 1

    headp = {}
    vowelp = {}
    tailp = {}
    with codecs.open('headjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            headp[x] = headcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('voweljamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            vowelp[x] = vowelcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('tailjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            tailp[x] = tailcounts[x] / realtail if realtail != 0 else 0

    return (headp, vowelp, tailp)
def test_h2j(self):
    """h2j tests
    Arguments may be iterables or characters.

    h2j should split every Hangul character into U+11xx jamo for any given
    string. Anything else is unchanged.
    """
    tests = ["한굴", "자모=字母"]
    targets = ["한굴", "자모=字母"]
    tests_idempotent = ["", "test123~", "ㄱㄲㄴㄷㆆㅿ"]
    targets_idempotent = tests_idempotent

    all_tests = itertools.chain(zip(tests, targets),
                                zip(tests_idempotent, targets_idempotent))

    for test, target in all_tests:
        trial = jamo.h2j(test)
        assert trial == target,\
            ("Converted {test} to {trial}, but "
             "expected {target}.").format(test=test,
                                          trial=trial,
                                          target=target)
import sys
from collections import Counter

from jamo import h2j, j2hcj

string = sys.stdin.readline().strip()

divided = []
for x in j2hcj(h2j(string)):
    divided.append(x)

counts = Counter()
for letter in divided:
    counts[letter] += 1

print(counts)