def _load_dataset(self):
    """Build the dataset list from the public_crop image directory.

    The label is the file name (without '.jpg' or spaces). In 'jamo' mode
    each entry is [img_path, top, middle, bottom] (per-syllable jamo lists);
    in 'syllable' mode each entry is [img_path, label]. Labels containing
    characters outside the printable sets are skipped.
    """
    root = '/Data/FoodDetection/data/text_recognition/Korean/public_crop'
    dataset = []
    for file_name in os.listdir(root):
        img = os.path.join(root, file_name)
        # Label comes from the file name, stripped of extension and spaces.
        label = file_name.replace('.jpg', '').replace(' ', '')
        if self.mode == 'jamo':
            decomposed_label = j2hcj(h2j(label))
            # Skip labels containing non-printable jamo (e.g. stray ㅗ).
            if any(ch not in jamo_printable for ch in decomposed_label):
                continue
            top_tmp, middle_tmp, bottom_tmp = [], [], []
            for char in label:
                decomposed = j2hcj(h2j(char))
                # `part` (not `label`) avoids shadowing the loop target.
                for i, part in enumerate([top_tmp, middle_tmp, bottom_tmp]):
                    try:
                        part.append(decomposed[i])
                    except IndexError:
                        # Syllable has no jongsung (or jungsung): pad with space.
                        part.append(' ')
            dataset.append([img, top_tmp, middle_tmp, bottom_tmp])
        elif self.mode == 'syllable':
            if any(s not in syllable_printable for s in label):
                continue
            dataset.append([img, label])
    return dataset
async def on_message(message):
    # Discord handler for a Korean word-chain (끝말잇기) game.
    #
    # Assumed module globals (not visible in this chunk — TODO confirm):
    # `playing` (ids of users in a game), `user` (per-user game state),
    # `word` (the word list), `client`, `config`, coroutine `wait`.
    if message.author.id in playing and message.author.id != client.user.id and message.channel.id == user[message.author.id]['channel']:
        # Show a typing indicator while the bot "thinks" for a random delay.
        async with message.channel.typing():
            await asyncio.sleep(random.randint(0, config['timeover']*300) / 1000)
            jamo_txt = str(jamo.j2hcj(jamo.h2j(user[message.author.id]['this'][-1])))
            if jamo_txt.startswith("ㄹ"):
                # 두음법칙: a ㄹ-initial final syllable may also be answered
                # with its ㄴ-initial variant.
                jamo_char = [user[message.author.id]['this'][-1], hangulutils.join_jamos("ㄴ"+str(jamo_txt[1:]))]
            else:
                jamo_char = message.content[0]
            # The player's word must start with the bot's last syllable.
            if user[message.author.id]['this'][-1] in jamo_char:
                if not message.content in user[message.author.id]['used']:
                    if message.content in word:
                        temp = []
                        jamo_char = []
                        try:
                            jamo_txt = str(jamo.j2hcj(jamo.h2j(message.content[-1])))
                            if jamo_txt.startswith("ㄹ"):
                                # Accept both the ㄹ-initial syllable and its
                                # ㅇ-initial variant as candidate next words.
                                jamo_char = [message.content[-1], hangulutils.join_jamos("ㅇ"+str(jamo_txt[1:]))]
                                for i in range(len(word)):
                                    if word[i][0] in jamo_char:
                                        temp.append(word[i])
                            else:
                                for i in range(len(word)):
                                    if word[i].startswith(message.content[-1]):
                                        temp.append(word[i])
                            user[message.author.id]['used'].append(message.content)
                            # NOTE(review): randint(0, len(temp)) is INCLUSIVE, so
                            # this can raise IndexError (and always does when temp
                            # is empty); the except below turns any exception into
                            # a player win — confirm this is the intended "bot has
                            # no word" losing condition before changing it.
                            user[message.author.id]['this'] = temp[random.randint(0, len(temp))]
                            if message.author.id in playing:
                                await message.channel.send("`"+message.author.display_name+"`\n**"+user[message.author.id]['this']+"**")
                                user[message.author.id]['used'].append(user[message.author.id]['this'])
                                user[message.author.id]['count'] = user[message.author.id]['count'] + 1
                                await wait(user[message.author.id]['count'], message.author.id, message)
                        except Exception as ex:
                            # Bot failed to answer: end the game and report the score.
                            if message.author.id in playing:
                                playing.remove(message.author.id)
                                if user[message.author.id]['count']:
                                    embed = discord.Embed(title='게임승리', description=f"{message.author.display_name}\n`{str(user[message.author.id]['count'])}`")
                                    await message.channel.send(embed=embed)
                else:
                    await message.channel.send("이미 사용한 단어자나요 :thinking:")
    # Command: "<prefix>끝말" starts a new game for this user.
    if message.content.startswith(config['prefix']+"끝말"):
        if not message.author.id in playing:
            playing.append(message.author.id)
            user[message.author.id] = {}
            user[message.author.id]['used'] = []
            user[message.author.id]['this'] = []
            user[message.author.id]['this'] = ""
            # NOTE(review): inclusive randint can index one past the end of `word`.
            user[message.author.id]['this'] = word[random.randint(0, len(word))]
            await message.channel.send("`"+message.author.display_name+"`\n**"+user[message.author.id]['this']+"**")
            user[message.author.id]['used'].append(user[message.author.id]['this'])
            user[message.author.id]['channel'] = message.channel.id
            user[message.author.id]['count'] = 0
            user[message.author.id]['status'] = 0
            await wait(user[message.author.id]['count'], message.author.id, message)
        else:
            await message.channel.send("이미 게임중이잖아요!\n뭐하는거시에오 ㅇ0ㅇㅠㅠㅠ")
def MypartFunction(request):
    """Django view: list the session user's reviews whose tour name starts
    with GET param `startword`; '*' selects names NOT starting with a Korean
    consonant. Results are paginated 20 per page and rendered with
    mypartreview.html.
    """
    startword = request.GET['startword']
    user_log = request.session.get('user')
    user_id = request.session.get('user_id')
    user = Tuser.objects.get(user_id=user_id)
    ureview2 = Treview.objects.filter(treviewid=user_id)
    print(ureview2)  # debug output
    ureview = []
    for i in ureview2:
        # print(j2hcj(h2j(i.tourid.tourname))[0], startword)
        # First jamo (initial consonant) of the tour name must match startword.
        if j2hcj(h2j(i.tourid.tourname))[0] == startword:
            print(j2hcj(h2j(i.tourid.tourname))[0])
            ureview.append(i)
        # '*' bucket: tour names that do NOT start with a Korean consonant.
        if startword == "*":
            if j2hcj(h2j(i.tourid.tourname))[0] not in [
                    'ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅇ', 'ㅈ', 'ㅊ',
                    'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ', 'ㄲ', 'ㄸ', 'ㅆ', 'ㅉ', 'ㅃ'
            ]:
                print(j2hcj(h2j(i.tourid.tourname)))
                ureview.append(i)
    print(ureview)
    # print(ureview)
    paginator = Paginator(ureview, 20)
    page = request.GET.get('page')
    try:
        data = paginator.page(page)
    except PageNotAnInteger:
        data = paginator.page(1)
    except EmptyPage:
        data = paginator.page(paginator.num_pages)
    # num_pages = 0
    print(data)
    ## Range used by the template for per-page links (hence the +1).
    allpage = range(paginator.num_pages + 1)
    # return render(request, 'board.html', {'data':data, 'allpage':allpage})
    # NOTE(review): `urls` is built below but never added to `context` —
    # confirm whether this is dead code or a missing context entry.
    urls = []
    for ur in ureview:
        urdic = {
            'tourid': ur.tourid.tourid,
            'tourname': ur.tourid.tourname,
            'area': ur.tourid.city + " " + ur.tourid.town,
            'rating': ur.rating
        }
        urls.append(urdic)
    context = {
        'data': data,
        'allpage': allpage,
        'w': startword,
        'user': user,
        'user_log': user_log,
        'user_id': user_id
    }
    return render(request, 'mypartreview.html', context)
def get_prefix_list(word, prefix_length):
    """Return all cumulative jamo prefixes of the first `prefix_length`
    syllables of `word`: 'ㄱ', 'ㄱㅏ', 'ㄱㅏㄴ', ...
    """
    alphabets = j2hcj(h2j(word[:prefix_length]))
    return [alphabets[:end] for end in range(1, len(alphabets) + 1)]
def con_verb(post):
    """Return 1 if `post` contains at least 2 verbs found in `verb_dic`,
    else 0.

    Verbs are morphemes tagged VV/VA/VX by `api.analyze`; each is looked up
    in `verb_dic` bucketed by its first jamo.
    """
    # Collect all verb-like morphemes from the analyzed post.
    need_mm = ['VV', 'VA', 'VX']
    verb_list = [
        morph.lex
        for word in api.analyze(post)
        for morph in word.morphs
        if morph.tag in need_mm
    ]
    # Original used `while True` with a manual counter and an unreachable
    # `break` after `return`; a plain for loop is equivalent.
    verb_count = 0
    for verb in verb_list:
        verb_first = j2hcj(h2j(verb))[0]
        if verb in verb_dic[verb_first]:
            verb_count += 1
            if verb_count >= 2:
                return 1
    return 0
def find_complement(input_string):
    """Collect complement (보어) spans from a POS-tagged sentence.

    input_string: sequence of (surface, tag) pairs from a morph analyzer.
    Returns a list of (surface, tag) pairs running from the noun nearest a
    complement particle (JKC), or a JKS particle followed by 되/돼, up to and
    including that particle. ('되다' is currently NOT detected via JKC.)
    """
    temp_string = input_string
    complementArr = []
    N_cnt = 0
    for i in range(len(temp_string)):
        if temp_string[i][1].find('JKC') != -1:  # complement particle in the tagged result
            for j in range(0, i):  # scan from sentence start to the particle
                N_cnt = 0  # NOTE(review): reset every iteration — only the last noun index survives
                if (temp_string[j][1] == 'NNG' or temp_string[j][1] == 'NNP'
                        or temp_string[j][1] == 'NNB'
                        or temp_string[j][1] == 'NP'):
                    N_cnt = j  # noun closest to the complement particle
            for k in range(N_cnt, i + 1):  # from that noun through the particle
                complementArr.append(temp_string[k])  # store the span
        if temp_string[i][1].find('JKS') != -1:
            # NOTE(review): temp_string[i + 1] raises IndexError when JKS is the
            # final morpheme — confirm the analyzer guarantees a following token.
            do_jamo = j2hcj(h2j(temp_string[i + 1][0]))
            # JKS followed by 되/돼 is treated as a complement construction.
            if (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅚ') or \
                    (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅙ'):
                for j in range(0, i):  # scan from sentence start to the particle
                    N_cnt = 0
                    if (temp_string[j][1] == 'NNG' or temp_string[j][1] == 'NNP'
                            or temp_string[j][1] == 'NNB'
                            or temp_string[j][1] == 'NP'):
                        N_cnt = j  # noun closest to the particle
                for k in range(N_cnt, i + 1):  # from that noun through the particle
                    complementArr.append(temp_string[k])  # store the span
    return complementArr  # multiple complements per sentence are possible, hence a list
def con_menu(post):
    """Return 1 if `post` contains at least 3 general nouns (NNG) found in
    the dessert-menu dictionary `menu_dic`, else 0.

    Nouns are looked up in `menu_dic` bucketed by their first jamo.
    """
    # Collect all NNG morphemes from the analyzed post.
    nng_list = [
        morph.lex
        for word in api.analyze(post)
        for morph in word.morphs
        if morph.tag == 'NNG'
    ]
    # Original used `while True` with a manual counter and an unreachable
    # `break` after `return`; a plain for loop is equivalent.
    nng_count = 0
    for nng_name in nng_list:
        nng_first = j2hcj(h2j(nng_name))[0]
        if nng_name in menu_dic[nng_first]:
            nng_count += 1
            if nng_count >= 3:
                return 1
    return 0
def page_text_finder(self, report_text):
    """Scan `report_text` page by page for the page whose text contains
    '||Title|| <company name>' and the company number (both derived from
    self.file_nm). Returns (page_text, found, company_name, company_num).
    """
    name_parts = self.file_nm.split('_')
    company_name = name_parts[3]
    company_num = name_parts[4][1:]
    company_dict = {'LG상사': 'LG 상사'}
    # Re-join jamo to normalize the hangul encoding of the file name.
    company_name = hangul.join_jamos(j2hcj(h2j(company_name)))
    if company_name in company_dict:
        company_name = company_dict[company_name]
    page_text = ''
    text = ''
    found = False
    for line in report_text.split('\n'):
        page_break = "page_id" in line
        if page_break and '||Title|| ' + company_name in text and company_num in text:
            page_text = text
            found = True
            break
        if page_break:
            text = ''  # start accumulating the next page
        else:
            text += line + '\n'
    return page_text, found, company_name, company_num
def plot(alignment, info, text): char_len, audio_len = alignment.shape # 145, 200 fig, ax = plt.subplots(figsize=(char_len/5, 5)) im = ax.imshow( alignment.T, aspect='auto', origin='lower', interpolation='none') xlabel = 'Encoder timestep' ylabel = 'Decoder timestep' if info is not None: xlabel += '\n{}'.format(info) plt.xlabel(xlabel) plt.ylabel(ylabel) if text: jamo_text = j2hcj(h2j(normalize(text))) pad = [PAD] * (char_len - len(jamo_text) - 1) plt.xticks(range(char_len), [tok for tok in jamo_text] + [EOS] + pad) if text is not None: while True: if text[-1] in [EOS, PAD]: text = text[:-1] else: break plt.title(text) plt.tight_layout()
def pack_samples(batch):
    """Collate a batch of (sentence, score) pairs into padded tensors.

    Each sentence is tokenized on whitespace; every token is encoded both as
    a character-id tensor (via c2i) and a jamo-id tensor (via j2i). Tokens
    are padded to the batch-wide max token length, then sentences are padded
    to the max token count.

    Returns (char_tensor, jamo_tensor, lengths, scores); scores is None when
    the batch carries no labels.
    """
    b_as_char_tensor = []
    b_as_jamo_tensor = []
    for e in batch:
        e_char_seq = [
            torch.LongTensor([c2i[c] for c in tok]) for tok in e[0].split()
        ]
        e_jamo_seq = [
            torch.LongTensor([j2i[j] for j in jamo.j2hcj(jamo.h2j(tok))])
            for tok in e[0].split()
        ]
        b_as_char_tensor.append(e_char_seq)
        b_as_jamo_tensor.append(e_jamo_seq)
    b_lens = [len(t) for t in b_as_char_tensor]
    # Pad all tokens (flattened across the batch) to a common length.
    b_ch_padded = nn.utils.rnn.pad_sequence(sum(b_as_char_tensor, []),
                                            batch_first=True)
    b_jm_padded = nn.utils.rnn.pad_sequence(sum(b_as_jamo_tensor, []),
                                            batch_first=True)
    # Re-split the flat padded tensors back into per-sentence chunks.
    b_as_char_tensor = [
        b_ch_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
    ]
    b_as_jamo_tensor = [
        b_jm_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
    ]
    b_as_char_tensor = nn.utils.rnn.pad_sequence(b_as_char_tensor,
                                                 batch_first=True)
    b_as_jamo_tensor = nn.utils.rnn.pad_sequence(b_as_jamo_tensor,
                                                 batch_first=True)
    # FIX: the original asserts compared each tensor's shape with itself
    # (always true); the intent is clearly to check char vs jamo agreement.
    assert b_as_char_tensor.shape[0] == b_as_jamo_tensor.shape[
        0]  # Same batch size
    assert b_as_char_tensor.shape[1] == b_as_jamo_tensor.shape[
        1]  # Same max token count
    if batch[0][1] is not None:
        b_scores = torch.FloatTensor([float(e[1]) for e in batch])
    else:
        b_scores = None
    if len(cuda_device) > 0:
        b_as_char_tensor = b_as_char_tensor.to(f"cuda:{cuda_device[0]}")
        b_as_jamo_tensor = b_as_jamo_tensor.to(f"cuda:{cuda_device[0]}")
        if b_scores is not None:
            b_scores = b_scores.to(f"cuda:{cuda_device[0]}")
    b_lens = torch.LongTensor(b_lens)
    return b_as_char_tensor, b_as_jamo_tensor, b_lens, b_scores
def save_to_txt(file_nm, file_text):
    """Write `file_text` to <root_dir>/<file_nm>, normalizing the hangul
    encoding of the path via jamo re-join.

    UTF-8 is forced on write; the original relied on the platform's default
    locale encoding, which breaks Korean content on non-UTF-8 systems.
    """
    root_dir = '/Users/daniel/Desktop/test_2/after_inspec_txt/'
    path = hangul.join_jamos(j2hcj(h2j(root_dir + file_nm)))
    print(file_nm)
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(file_text)
def normalizeToCompatJamo(s):
    """Map every non-compatibility jamo in `s` to its compatibility (HCJ)
    form, leaving all other characters untouched. Length is preserved.
    """
    out = ''.join(
        j2hcj(c) if isNonCompatibilityJamo(c) else c for c in s)
    assert len(s) == len(out)
    return out
def get_jongsung_TF(sample_word):
    """Return "T" when the last syllable of `sample_word` ends in a final
    consonant (jongsung), "F" when its last jamo is a vowel.
    """
    vowels = ['ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ',
              'ㅘ', 'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ', 'ㅔ', 'ㅟ', 'ㅖ', 'ㅒ']
    # Last jamo of the last character; a vowel there means no jongsung.
    final_jamo = j2hcj(h2j(sample_word[-1]))[-1]
    return "F" if final_jamo in vowels else "T"
def get_jongsung_TF(sentence):
    """Return "T" when the final character of `sentence` carries a jongsung,
    "F" when it ends in a vowel or in one of the digits 2/4/5/9 (whose
    Korean readings end in a vowel).
    """
    no_jongsung = {'ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ',
                   'ㅘ', 'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ', 'ㅔ', 'ㅟ', 'ㅖ',
                   'ㅒ', '2', '4', '5', '9'}
    last_jamo = j2hcj(h2j(sentence[-1]))[-1]
    return "F" if last_jamo in no_jongsung else "T"
def count_con_vow_num_spe(sentence):
    """Decompose `sentence` into jamo and count character classes.

    Returns (consonant_count, vowel_count, ends_with_digit_flag,
    special_count). Note the third value is a 0/1 flag (does the decomposed
    text end with a digit?), not a count — kept for caller compatibility.

    Fixes vs original: no IndexError on empty input, set lookups instead of
    O(n) list membership, duplicate '+'/'-' entries removed.
    """
    sentence = j2hcj(h2j(sentence))
    # Consonants = choseong ∪ jongseong jamo (disjoint from the other sets).
    consonants = set('ㄱㄲㄳㄴㄵㄶㄷㄸㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅃㅄㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')
    vowels = set('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')
    numbers = set('0123456789')
    specials = {'~', '@', '#', '$', '%', '&', '*', '(', ')', '_', '-', '+',
                '=', '`', ';', "'", ':', '>', '<', '/'}
    count_consonant = sum(1 for ch in sentence if ch in consonants)
    count_vowel = sum(1 for ch in sentence if ch in vowels)
    count_special = sum(1 for ch in sentence if ch in specials)
    # 1 when the decomposed text ends with a digit; empty input → 0
    # (the original indexed sentence[len(sentence) - 1] unconditionally).
    end_with_number_flag = 1 if sentence and sentence[-1] in numbers else 0
    return count_consonant, count_vowel, end_with_number_flag, count_special
def splitOffFinalJamo(c):
    """Detach the final consonant of syllable `c`: return the syllable with
    a null tail followed by the tail as a compatibility jamo. Syllables
    without a tail are returned unchanged.
    """
    assert len(c) == 1
    assert isHangul(c)
    # No tail (jongsung): nothing to split off.
    if jamoTail(c) == 0:
        return c
    # The tail as a compatibility jamo character.
    tail_hcj = j2hcj(h2j(c)[-1])
    # Rebuild the syllable with a null final, then append the detached jamo.
    return assembleHangul(jamoLead(c), jamoVowel(c), 0) + tail_hcj
def decomposition(sentence):
    """Decompose `sentence` into compatibility jamo; when a non-Hangul
    alphabetic character appears, drop the jamo just before its first
    occurrence and collapse whitespace from that point on.

    NOTE(review): `index.insert(-1, ...)` on an empty list inserts at
    position 0 (equivalent to append here), and `index[0] - 1` looks like a
    possible off-by-one — confirm the intended slice boundary.
    """
    sentence = j2hcj(h2j(sentence))
    index = []
    for item in sentence:
        # First character that is alphabetic but not Hangul.
        if (not isHangul(item) and item.isalpha()):
            index.insert(-1, sentence.find(item))
            break
    if (len(index)):
        part1 = list(sentence[:index[0] - 1])
        part2 = sentence[index[0]:].split()
        return ''.join((part1 + part2))
    else:
        return sentence
def find_tense(sentence): tense_table = [[ 'past', ], [ 'present', ], [ 'future', ]] # 문자열과 시제를 함께 저장할 테이블 # ____________________________ # | past(0행) | 문장 | ... # | __________________________ # | present(1행)| 문장 | ... # | __________________________ # | future(2행) | 문장 | ... # | __________________________ special_future = 0 # '것','이'를 처리하기 위한 변수 is_present_flag = True # 현재시제 판단 위한 변수 for i in range(len(sentence)): # 미래시제 1: '것''이' if sentence[i][1].find('NNB') != -1 and sentence[i][0].find('것') != -1: do_jamo = j2hcj(h2j(sentence[i - 1][0])) # jamo를 이용해 분리(할->ㅎㅏㄹ) if len(do_jamo ) > 2 and do_jamo[2] == 'ㄹ': # 종성이 있고, -ㄹ 것이 가 미래형으로 구분 special_future = special_future + 1 # NNB 는 '것'이므로 ++함 if sentence[i][1].find('VCP') != -1 and sentence[i][0].find('이') != -1: special_future = special_future + 1 # VCP 는 '이'이므로 ++함 if special_future == 2: # '것'과 '이'가 모두 존재하면 미래 시제로 판단 tense_table[2].append(sentence) is_present_flag = False break # 높임 표현(시, 십, 세, 심, 실)의 경우 처리 if sentence[i][1].find('EP') != -1 \ and not sentence[i][0].find('시') != -1 \ and not sentence[i][0].find('십') != -1 \ and not sentence[i][0].find('세') != -1 \ and not sentence[i][0].find('실') != -1 \ and not sentence[i][0].find('심') != -1: # 미래시제 2: '겠' if sentence[i][0].find('겠') != -1: tense_table[2].append(sentence) is_present_flag = False # 과거시제 else: tense_table[0].append(sentence) is_present_flag = False break # 현재시제 if is_present_flag == True: tense_table[1].append(sentence) return tense_table
def AE_irregularOperation_1(c):
    """Split an ㅐ-vowel syllable into two: 애 -> 아+아, 앴 -> 아+았.

    The lead consonant keeps vowel ㅏ (index 1) with a null tail; the second
    syllable is ㅇ (null lead, index 12) + ㅏ + the original tail. Other
    syllables are returned unchanged.
    """
    assert len(c) == 1
    assert isHangul(c)
    # Only applies when the vowel is ㅐ (index 2).
    if jamoVowel(c) == 2:
        # Original computed an unused `finalJamo` via j2hcj(h2j(c)) — removed.
        lead, tail = jamoLead(c), jamoTail(c)
        # 12 is the null (ㅇ) lead consonant index.
        return assembleHangul(lead, 1, 0) + assembleHangul(12, 1, tail)
    else:
        return c  # nothing to split off
def dividehangul(string):
    """Compute per-jamo relative frequencies of a Hangul string.

    Each syllable is decomposed into (lead, vowel[, tail]) compatibility
    jamo. Returns (headp, vowelp, tailp): dicts mapping each jamo listed in
    headjamo.txt / voweljamo.txt / tailjamo.txt (read from the working
    directory) to its frequency among decomposed syllables.
    """
    realletter = 0  # number of syllables with at least lead + vowel
    realtail = 0    # number of syllables that also have a tail (jongsung)
    headcounts = defaultdict(int)
    vowelcounts = defaultdict(int)
    tailcounts = defaultdict(int)
    headfound = set()
    vowelfound = set()
    tailfound = set()
    for letter in string:
        parts = jamo.j2hcj(jamo.h2j(letter))
        if len(parts) > 2:
            # Full syllable: lead + vowel + tail.
            head = parts[0]
            vowel = parts[1]
            tail = parts[2]
            realletter += 1  # realletter equals realvowel
            realtail += 1  # find list of jamo
            headfound.add(head)
            vowelfound.add(vowel)
            tailfound.add(tail)
            headcounts[head] += 1
            vowelcounts[vowel] += 1
            tailcounts[tail] += 1
        elif len(parts) > 1:
            # Open syllable: lead + vowel only.
            head = parts[0]
            vowel = parts[1]
            realletter += 1
            headfound.add(head)
            vowelfound.add(vowel)
            headcounts[head] += 1
            vowelcounts[vowel] += 1
    headp = {}
    vowelp = {}
    tailp = {}
    # Normalize counts over the jamo inventories listed in the text files;
    # heads/vowels divide by syllable count, tails by tailed-syllable count.
    with codecs.open('headjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            headp[x] = headcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('voweljamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            vowelp[x] = vowelcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('tailjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            tailp[x] = tailcounts[x] / realtail if realtail != 0 else 0
    return (headp, vowelp, tailp)
def EU_irregularOperation(c):
    """Split an ㅏ/ㅓ-vowel syllable into an ㅡ syllable plus the original:
    아 -> 으+아, 았 -> 으+았, 어 -> 으+어, 었 -> 으+었.

    The lead keeps vowel ㅡ (index 19) with a null tail; the second syllable
    is ㅇ (null lead, index 12) + original vowel + original tail. Other
    syllables are returned unchanged.
    """
    assert len(c) == 1
    assert isHangul(c)
    # Only applies when the vowel is ㅏ (1) or ㅓ (5).
    if jamoVowel(c) == 1 or jamoVowel(c) == 5:
        # Original computed an unused `finalJamo` via j2hcj(h2j(c)) — removed.
        lead, vowel, tail = jamoLead(c), jamoVowel(c), jamoTail(c)
        # 12 is the null (ㅇ) lead consonant index.
        return assembleHangul(lead, 19, 0) + assembleHangul(12, vowel, tail)
    else:
        return c  # nothing to split off
def _load_dataset(self):
    """Load synthetic Korean text-recognition samples listed in gt.txt.

    Samples self.need_samples lines uniformly without replacement (all lines
    when it is None). Each gt.txt line is '<img_path> <label>'. In 'jamo'
    mode entries are [img, top, middle, bottom]; in 'syllable' mode they are
    [img, label]. Malformed lines are reported and skipped.
    """
    kor_path = '/Data/FoodDetection/data/text_recognition/Korean/synthetic_data/data'
    kor_images_labels = []
    with open(os.path.join(kor_path, 'gt.txt'), 'r') as f:
        files = f.readlines()
    if self.need_samples is None:  # was `== None`
        self.need_samples = len(files)
    print(f'{self.need_samples} files will be loaded')
    random_ids = np.random.choice(range(len(files)),
                                  size=self.need_samples,
                                  replace=False)
    for idx, file in enumerate(tqdm(np.asarray(files)[random_ids])):
        try:
            # NOTE: labels containing a space make this raise and skip the
            # line (caught below) — confirm that is acceptable.
            img_path, label = file.split(' ')
            img = os.path.join(kor_path, f'{img_path}.jpg')
            label = label.strip('\n')
            if self.mode == 'jamo':
                top_tmp, middle_tmp, bottom_tmp = [], [], []
                for char in label:
                    decomposed = j2hcj(h2j(char))
                    # `part` (not `label`) avoids shadowing the outer label.
                    for i, part in enumerate(
                            [top_tmp, middle_tmp, bottom_tmp]):
                        try:
                            part.append(decomposed[i])
                        except IndexError:
                            # Missing jongsung/jungsung: pad with a space.
                            part.append(' ')
                kor_images_labels.append(
                    [img, top_tmp, middle_tmp, bottom_tmp])
            elif self.mode == 'syllable':
                kor_images_labels.append([img, label])
        except Exception as e:
            # Best-effort loading: report and move on.
            print(e)
            continue
    return kor_images_labels
def find_s(sentence): s_table = [] # 주어들만 저장할 테이블 for k in range(len(sentence)): # 테이블에 저장된 한 문장 길이 동안 if ((sentence[k][0] == '가' and sentence[k][1] == 'JKS') or (sentence[k][0] == '이' and sentence[k][1] == 'JKS')): do_jamo = j2hcj(h2j(sentence[k + 1][0])) # 뒤에 '되', '돼'가 오면 보어로 처리해야함 if (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅚ') or \ (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅙ'): break # 가,이 중 주격 조사인 것들에 한해 cnt = 0 for m in range(0, k): # 주격 조사 앞에 있는 것들중 if (sentence[m][1] == 'NNG' or sentence[m][1] == 'NNP' or sentence[m][1] == 'NNB' or sentence[m][1] == 'NP'): # 명사에 해당 되는 것들 중에 cnt = m # 가장 주격 조사에 가까운 것을 s_table.append(sentence[cnt]) # 주어라고 저장 s_table.append(sentence[k]) # 주어 뒤에 조사(확인용) if ((sentence[k][0] == '은' and sentence[k][1] == 'JX') or (sentence[k][0] == '는' and sentence[k][1] == 'JX')): # 은, 는 중 보조사 인것들에 한해 jks_cnt = -1 # 주격조사count변수 jx_cnt = -1 for x in range(len(sentence)): # 테이블의 i번째 문장 길이동안 if (sentence[x][1] == 'JKS'): # jsk(주격 조사가 있으면) jks_cnt += 1 # count변수++ for jx in range(0, k): if ((sentence[jx][0] == '은' and sentence[jx][1] == 'JX') or (sentence[jx][0] == '는' and sentence[jx][1] == 'JX')): jx_cnt += 1 if (jks_cnt < 0 and jx_cnt < 0): # 만약 주격 조사가 없으면 N_cnt = 0 for z in range(0, k): # 은, 는 앞에 있는 것들중 if (sentence[z][1] == 'NNG' or sentence[z][1] == 'NNP' or sentence[z][1] == 'NNB' or sentence[z][1] == 'NP'): # 명사에 해당 되는 것들 중에 N_cnt = z # 가장 주격 조사에 가까운 것을 s_table.append(sentence[N_cnt]) # 주어라고 저장 s_table.append(sentence[k]) # 주어 뒤에 조사(확인용) return s_table
def convert_pdf_to_txt(self, pdf_file):
    """Extract the relevant page text of a report PDF.

    Args:
        pdf_file: file name (within self.report_pdf_dir) of the PDF.

    Returns:
        (report_text, company_nm, company_num): the first page whose text
        matches the company name/number derived from the file name (see
        page_text_finder), or report_text=None when no page matches.

    NOTE(review): if the PDF yields no pages, `company_nm`/`company_num`
    are never assigned and the return raises NameError — confirm inputs
    always contain at least one page.
    """
    output_string = StringIO()
    self.file_nm = pdf_file.split(".")[0]
    file_ex = pdf_file.split(".")[1]
    self.pdf_path = self.report_pdf_dir + pdf_file
    # Re-join jamo to normalize the hangul encoding of the path.
    self.pdf_path = hangul.join_jamos(j2hcj(h2j(self.pdf_path)))
    # Layout parameters tuned for financial-report tables.
    laparams = LAParams(line_overlap=.5,
                        char_margin=1.35,
                        line_margin=1.0,
                        word_margin=0.01,
                        boxes_flow=.5,
                        detect_vertical=False,
                        all_texts=False)
    rsrcmgr = PDFResourceManager()
    device = FinanceConverter(rsrcmgr, output_string, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Extract text page by page until the target page is found.
    found = False
    with open(self.pdf_path, 'rb') as in_file:
        for page_num, page in enumerate(
                PDFPage.get_pages(in_file, check_extractable=True)):
            interpreter.process_page(page)
            # output_string accumulates: getvalue() holds all pages so far.
            page_text = output_string.getvalue()
            report_text, found, company_nm, company_num = self.page_text_finder(
                page_text)
            if found:
                break
    if not found:
        report_text = None
    return report_text, company_nm, company_num
def test_j2hcj(self):
    """j2hcj tests

    Arguments may be iterables or single characters.

    j2hcj should convert every U+11xx jamo character into U+31xx HCJ in a
    given input. Anything else is unchanged.
    """
    test_strings = ["", "test123", "ᄀᄁᄂᄃᇹᇫ"]
    target_strings = ["", "test123", "ㄱㄲㄴㄷㆆㅿ"]
    # The original wrapped the single zip in itertools.chain — a no-op.
    for test, target in zip(test_strings, target_strings):
        trial = jamo.j2hcj(test)
        assert trial == target,\
            ("Matched {test} to {trial}, but "
             "expected {target}.").format(test=''.join(test),
                                          trial=trial,
                                          target=target)
def test_j2hcj(self):
    """j2hcj tests

    Arguments may be iterables or single characters.

    j2hcj should convert every U+11xx jamo character into U+31xx HCJ in a
    given input. Anything else is unchanged.
    """
    cases = itertools.chain(
        zip(["", "test123", "ᄀᄁᄂᄃᇹᇫ"],
            ["", "test123", "ㄱㄲㄴㄷㆆㅿ"]))
    for test, target in cases:
        trial = jamo.j2hcj(test)
        assert trial == target,\
            ("Matched {test} to {trial}, but "
             "expected {target}.").format(test=''.join(test),
                                          trial=trial,
                                          target=target)
async def on_message(message):
    """Discord handler with two toy commands.

    'ㅃ...' — echo the rest of the message with every Hangul syllable's
    initial consonant replaced by ㅃ.
    '!호에...' — echo the rest encoded as 호-binary (1 -> ㅇ, 0 -> ㅔ),
    recomposed into syllables.
    """
    content = message.content
    if content.startswith("ㅃ"):
        converted = ""
        for ch in content[1:]:
            code = ord(ch)
            # Characters outside the Hangul syllable range pass through.
            if code < 44032 or code > 55204:
                converted += ch
            else:
                parts = j2hcj(h2j(ch))
                # Replace the initial consonant with ㅃ and recompose.
                converted += join_jamos("ㅃ" + parts[1:])
        await message.channel.send(f"{message.author.name}:{converted}")
    if content.startswith("!호에"):
        result = "호"
        for ch in content[3:]:
            bits = bin(ord(ch))[2:]
            # Map binary digits onto jamo: 1 -> ㅇ, 0 -> ㅔ.
            result += bits.replace("1", "ㅇ").replace("0", "ㅔ")
        result = join_jamos(result)
        await message.channel.send(f"{message.author.name}:{result}")
def plot(alignment, info, text, isKorean=True): char_len, audio_len = alignment.shape # 145, 200 fig, ax = plt.subplots(figsize=(char_len / 5, 5)) im = ax.imshow(alignment.T, aspect='auto', origin='lower', interpolation='none') xlabel = 'Encoder timestep' ylabel = 'Decoder timestep' if info is not None: xlabel += '\n{}'.format(info) plt.xlabel(xlabel) plt.ylabel(ylabel) # plt.legend('19000step',fontsize=15, loc='upper left') if text: if isKorean: jamo_text = j2hcj(h2j(normalize(text))) else: jamo_text = text pad = [PAD] * (char_len - len(jamo_text) - 1) A = [tok for tok in jamo_text] + [EOS] + pad A = [x if x != ' ' else '' for x in A] # 공백이 있으면 그 뒤가 출력되지 않는 문제... plt.xticks(range(char_len), A) if text is not None: while True: if text[-1] in [EOS, PAD]: text = text[:-1] else: break plt.title('90000 step inna \n' + text) #plt.title('90000 step kss \n' + text) plt.tight_layout()
def insert_dot(text, dot):
    """Append `dot` to every whitespace-separated token whose jamo
    decomposition ends in '다' (a preceding non-space jamo + ㄷㅏ).

    Returns the list of tokens (modified or not), preserving order.

    Cleanup vs original: removed the dead counters _1.._6, four
    commented-out pattern branches, and the redundant
    `token.replace(token, token + dot)` (identical to `token + dot`).
    """
    # Matches ...Xㄷㅏ at the end of the decomposed token (sentence-final 다).
    pattern = re.compile(r'\S(ㄷㅏ)$')
    text_list = []
    for token in text.split(' '):
        decomposed = j2hcj(h2j(token))
        if pattern.findall(decomposed):
            text_list.append(token + dot)
        else:
            text_list.append(token)
    return text_list
def sori(text):
    """Apply Korean phonological (소리/pronunciation) rules to `text`.

    Spaces are dropped; each remaining character is decomposed into
    compatibility jamo. Three passes follow: (1) expand compound/tense
    finals, (2) repeatedly (10 iterations) apply assimilation rules between
    adjacent syllables, (3) simplify remaining finals to the 7 permissible
    final sounds. Returns the list of per-syllable jamo strings.

    NOTE(review): several rules overlap — e.g. `back[0]=='ㄹ' and
    forth[-1]=='ㄱ'` appears twice with different effects, and the final
    pass repeats the ㄹㅁ/ㄹㅂ branches — confirm which duplicates are
    intentional before refactoring.
    """
    text_list = np.array(list(text))
    text_list = text_list[np.where(text_list!=' ')]
    decompose = pd.Series(text_list).apply(lambda x: j2hcj(h2j(x))).tolist()
    # Final-sound (끝소리) rule tables.
    end_sound = ['ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅇ']  # permissible finals
    # Compound/tense finals expanded into two plain jamo.
    convert_end_sound = {'ㄲ': 'ㄱㄱ', 'ㄳ': 'ㄱㅅ', 'ㄶ':'ㄴㅎ', 'ㄵ': 'ㄴㅈ', 'ㄺ': 'ㄹㄱ', 'ㄻ': 'ㄹㅁ', 'ㄼ': 'ㄹㅂ', 'ㄽ': 'ㄹㅅ', 'ㄾ': 'ㄹㅌ', 'ㄿ': 'ㄹㅍ', 'ㅀ': 'ㄹㅎ', 'ㅄ': 'ㅂㅅ', 'ㅆ': 'ㅅㅅ'}
    # Non-permissible finals mapped to their neutralized pronunciation.
    end_simplize = {'ㅅ': 'ㄷ', 'ㅈ': 'ㄷ', 'ㅊ': 'ㄷ', 'ㅋ': 'ㄱ', 'ㅌ': 'ㄷ', 'ㅍ': 'ㅂ', 'ㅎ': 'ㄷ', 'ㅅㅅ':'ㄷ'}
    # Pass 1: expand compound finals so pass 2 can see both consonants.
    for idx, word in enumerate(decompose):
        if len(word)==3 and word[-1] in convert_end_sound.keys():
            decompose[idx] = word[:-1] + convert_end_sound[word[-1]]
    # Pass 2: pairwise assimilation between each syllable (forth) and the
    # next one (back), iterated 10 times so chained rules can propagate.
    for again in range(10):
        for idx in range(len(decompose)-1):
            f_idx = idx
            b_idx = f_idx + 1
            forth, back = decompose[f_idx], decompose[b_idx]
            # ㅎ-final before ㅇ: the ㅎ is silent.
            if (back[0]=='ㅇ' and forth[-2:]=='ㄹㅎ') or (back[0]=='ㅇ' and forth[-2:]=='ㄴㅎ'):
                decompose[f_idx] = forth[:-1]
            # Liaison: final consonant moves onto the following null onset.
            if back[0]=='ㅇ' and forth[-1] in end_sound and forth[-1] != 'ㅇ':  # jong_sung -> end_sound fix
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = forth[-1] + back[1:]
            # Aspiration: ㄱ + ㅎ -> ㅋ.
            if back[0]=='ㅎ' and forth[-1] == 'ㄱ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅋ' + back[1:]
            # Tensification of ㅈ after obstruent finals.
            if (back[0]=='ㅈ' and forth[-1] == 'ㄱ') or (back[0]=='ㅈ' and forth[-1]=='ㅂ') or (back[0]=='ㅈ' and forth[-1]=='ㅍ') or (back[0]=='ㅈ' and forth[-1]=='ㄷ'):
                decompose[b_idx] = 'ㅉ' + back[1:]
            if back[0]=='ㅈ' and forth[-2:]=='ㄹㅌ':
                decompose[b_idx] = 'ㅉ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㅈ' and forth[-2:]=='ㅅㅅ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅉ' + back[1:]
            # Tensification of ㄷ.
            if (back[0]=='ㄷ' and forth[-1]=='ㅅ') or (back[0]=='ㄷ' and forth[-1]=='ㄷ'):
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㄸ' + back[1:]
            if back[0]=='ㄲ' and forth[-2:]=='ㅅㅅ':
                decompose[f_idx] = forth[:-1]
            # Aspiration: ㅎ + ㄱ -> ㅋ.
            if back[0]=='ㄱ' and forth[-1]=='ㅎ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅋ' + back[1:]
            # Tensification of ㄱ.
            if (back[0] == 'ㄱ' and forth[-1] == 'ㅅ') or (back[0]=='ㄱ' and forth[-1] == 'ㄱ') or (back[0]=='ㄱ' and forth[-1]=='ㅍ'):
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㄲ' + back[1:]
            if back[0]=='ㄱ' and forth[-1]=='ㅂ':
                decompose[b_idx] = 'ㄲ' + back[1:]
            # Tensification of ㅅ.
            if (back[0] == 'ㅅ' and forth[-1] == 'ㅂ') or (back[0]=='ㅅ' and forth[-1]=='ㅅ') or (back[0]=='ㅅ' and forth[-1]=='ㄱ') or (back[0]=='ㅅ' and forth[-1]=='ㄹ') or (back[0] == 'ㅅ' and forth[-1] == 'ㅍ'):
                decompose[b_idx] = 'ㅆ' + back[1:]
            # Aspiration with ㅎ: -> ㅌ / ㅊ / ㅍ.
            if back[0]=='ㄷ' and forth[-1]=='ㅎ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅌ' + back[1:]
            if back[0]=='ㅎ' and forth[-1]=='ㅅ':
                decompose[b_idx] = 'ㅌ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㅎ' and forth[-1]=='ㄷ':
                decompose[b_idx] = 'ㅊ' + back[1:]
            if (back[0]=='ㄱ' and forth[-1]=='ㅅ') or (back[0]=='ㄱ' and forth[-1]=='ㄷ'):
                decompose[b_idx] = 'ㄲ' + back[1:]
            if back[0]=='ㅎ' and forth[-1]=='ㅂ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅍ' + back[1:]
            if back[0]=='ㅅ' and forth[-2:]=='ㄴㅈ':
                decompose[b_idx] = 'ㅆ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if (back[0]=='ㅈ' and forth[-2:]=='ㄴㅎ') or (back[0]=='ㅈ' and forth[-1]=='ㅎ'):
                decompose[b_idx] = 'ㅊ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㄷ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㄸ' + back[1:]
            if back[0]=='ㅂ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㅃ' + back[1:]
            if back[0]=='ㄷ' and forth[-2:]=='ㄹㅁ':
                decompose[b_idx] = 'ㄸ' + back[1:]
                decompose[f_idx] = forth[:-2] + 'ㅁ'
            if back[0]=='ㄷ' and forth[-2:]=='ㄹㅌ':
                decompose[b_idx] = 'ㄸ' + back[1:]
                decompose[f_idx] = forth[:-2] + 'ㄹ'
            # Nasalization / liquidization of ㄹ contexts.
            if back[0]=='ㄹ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㄴ' + back[1:]
                decompose[f_idx] = forth[:-1] + 'ㅇ'
            if back[0]=='ㄹ' and forth[-1]=='ㄴ':
                decompose[f_idx] = forth[:-1] + 'ㄹ'
            if back[0]=='ㄹ' and forth[-1]=='ㅇ':
                decompose[b_idx] = 'ㄴ' + back[1:]
            if back[0]=='ㅁ' and forth[-1]=='ㄱ':
                decompose[f_idx] = forth[:-1] + 'ㅇ'
            if back[0]=='ㄴ' and forth[-1]=='ㄹ':
                decompose[b_idx] = 'ㄹ' + back[1:]
            if back[0]=='ㅅ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㅆ' + back[1:]
            # NOTE(review): duplicates the ㄹ/ㄱ rule above with a different
            # effect — by this point forth may already have been modified.
            if back[0]=='ㄹ' and forth[-1]=='ㄱ':
                decompose[f_idx] = forth[:-1] + 'ㅇ'
    # Pass 3: neutralize any remaining non-permissible finals.
    for idx, word in enumerate(decompose):
        if len(word)==3 and word[-1] not in end_sound:
            decompose[idx] = word[:-1] + end_simplize[word[-1]]
        elif word[-2:]=='ㅅㅅ':
            decompose[idx] = word[:-2] + 'ㄷ'
        elif word[-2:]=='ㅂㅅ':
            decompose[idx] = word[:-2] + 'ㅂ'
        elif word[-2:]=='ㄴㅎ':
            decompose[idx] = word[:-2] + 'ㄴ'
        elif word[-2:]=='ㄱㅅ':
            decompose[idx] = word[:-2] + 'ㄱ'
        elif word[-2:]=='ㄹㅁ':
            decompose[idx] = word[:-2] + 'ㅁ'
        elif word[-2:]=='ㄹㅂ':
            decompose[idx] = word[:-2] + 'ㅂ'
        elif word[-2:]=='ㄱㄱ':
            decompose[idx] = word[:-2] + 'ㄱ'
        elif word[-2:]=='ㄴㅈ':
            decompose[idx] = word[:-2] + 'ㄴ'
        elif word[-2:]=='ㄹㄱ':
            decompose[idx] = word[:-2] + 'ㄱ'
        elif word[-2:]=='ㄹㅁ':  # NOTE(review): unreachable duplicate of the branch above
            decompose[idx] = word[:-2] + 'ㅁ'
        elif word[-2:]=='ㄹㅂ':  # NOTE(review): unreachable duplicate of the branch above
            decompose[idx] = word[:-2] + 'ㅂ'
        elif word[-2:]=='ㄹㅅ':
            decompose[idx] = word[:-2] + 'ㄷ'
        elif word[-2:]=='ㄹㅌ':
            decompose[idx] = word[:-2] + 'ㄷ'
        elif word[-2:]=='ㄹㅍ':
            decompose[idx] = word[:-2] + 'ㅂ'
        elif word[-2:]=='ㄹㅎ':
            decompose[idx] = word[:-2] + 'ㄹ'
    return decompose
def decompose(s):
    """Decompose a Hangul string into compatibility jamo characters."""
    decomposed = jamo.h2j(s)
    return jamo.j2hcj(decomposed)
def tokenize(self, sentence):
    """Tokenize `sentence` into a list of compatibility jamo characters."""
    # list() replaces the original identity comprehension [j for j in ...].
    return list(j2hcj(h2j(sentence)))
"""Read one line from stdin and print the frequency of each jamo in it."""
import sys

from collections import Counter

from jamo import h2j, j2hcj

string = sys.stdin.readline().strip()
# Decompose to compatibility jamo; Counter replaces the original manual
# list-building loop and per-letter increment loop.
divided = list(j2hcj(h2j(string)))
counts = Counter(divided)
print(counts)