def test_translate_combination_text_mode():
    mode = "combination-text"
    assert hanja.translate(u"韓國語", mode=mode) == u"韓國語(한국어)"
    assert hanja.translate(u"利用해", mode=mode) == u"利用(이용)해"
    assert (
        hanja.translate(u"大韓民國은 民主共和國이다.", mode=mode)
        == u"大韓民國(대한민국)은 民主共和國(민주공화국)이다."
    )
def test_translate_combination_html_mode():
    mode = "combination-html"
    assert (
        hanja.translate(u"韓國語", mode=mode)
        == u'<span class="hanja">韓國語</span><span class="hangul">(한국어)</span>'
    )
    assert (
        hanja.translate(u"利用해", mode=mode)
        == u'<span class="hanja">利用</span><span class="hangul">(이용)</span>해'
    )
    assert (
        hanja.translate(u"大韓民國은 民主共和國이다.", mode=mode)
        == u'<span class="hanja">大韓民國</span><span class="hangul">(대한민국)'
        u'</span>은 <span class="hanja">民主共和國</span><span class="hangul">'
        u"(민주공화국)</span>이다."
    )
def readHanja(line):
    if re.search('[一-龥豈-龎]+', line):
        if re.search('[가-힣]+[一-龥豈-龎]+', line):
            line = re.sub('[一-龥豈-龎]+', '', line)
        else:
            line = hanja.translate(line, 'substitution')
    return line
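# A quick sanity check of readHanja's two branches (illustrative inputs;
# expected outputs assume hanja's default reading table): a hanja run that
# directly follows hangul is treated as a gloss and dropped, otherwise the
# line is substituted.
def test_readHanja():
    assert readHanja('한국韓國 만세') == '한국 만세'  # gloss dropped
    assert readHanja('韓國 만세') == '한국 만세'      # hanja substituted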
def korean2symbols(self, text):
    table = {}
    symbol = "A"
    for player in filter(
            lambda e: "organization" in e and e["organization"] == "韓国棋院",
            self.PLAYERS):
        if "name" in player:
            match = re.match(r'(.*)\((.+)\)', player["name"])
            hangul = match.group(1)
            hanja = match.group(2)
        else:
            hanja = player["mamumamuName"]
            hangul = H.translate(hanja, mode='substitution')
        if hangul in text:
            text = text.replace(hangul,
                                self.DELIMITER + symbol + self.DELIMITER)
            table[symbol] = hanja
            symbol = chr(ord(symbol) + 1)
    for ko, ja in sorted(self.KOREAN_DICTIONARY.items(),
                         key=lambda e: -len(e[0])):
        if ko in text:
            text = text.replace(ko, self.DELIMITER + symbol + self.DELIMITER)
            table[symbol] = ja
            symbol = chr(ord(symbol) + 1)
    return (text, table)
def main(argv):
    file_list = glob(FLAGS.input_dir + '/*.txt')
    file_list = sorted(file_list)
    logging.info(
        f'Preprocessing {len(file_list)} txt files to {FLAGS.output_path}')
    with open(FLAGS.output_path, 'w') as output_file:
        text_list = []
        output_file.write(
            'date, text, major_direction, voting, minor_direction \n')
        for file in tqdm(file_list, desc='preprocessing'):
            with open(file, 'r') as f:
                # filename ends in '-YYYY-MM-DD.txt'
                date = file[-14:-4].replace('-', '')
                text = f.read().replace('\n', ' ')
                # hanja translate
                text = text.strip()
                text = ''.join(
                    [hanja.translate(c, 'substitution') for c in text])
                # remove special characters
                text = re.sub(pattern=r'[^\w\s]', repl='', string=text)
                text = re.sub(pattern=r'\s{1,}', repl=' ', string=text)
                text_list.append('"' + date + '", "' + text + '"')
        # sort by date; sorted() returns a new list, so the result must be
        # assigned back (the key covers the opening quote plus the date)
        text_list = sorted(text_list, key=lambda x: x[:9], reverse=True)
        for text in tqdm(text_list, desc='writing output'):
            output_file.write(text + ' \n')
def test_translate():
    assert hanja.translate(u'韓國語', mode='substitution') == u'한국어'
    assert hanja.translate(u'한국어', mode='substitution') == u'한국어'
    assert hanja.translate(u'利用해', mode='substitution') == u'이용해'
    assert hanja.translate(u'連結된', mode='substitution') == u'연결된'
    assert hanja.translate(u'1800年에', mode='substitution') == u'1800년에'
    assert hanja.translate(u'그레고리曆', mode='substitution') == u'그레고리력'
    assert hanja.translate(u'系列', mode='substitution') == u'계열'
def process(self, article):
    # remove bylines
    article = re.sub(r'\. *\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)
    article = re.sub(r'\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)
    # remove bracketed content (each pattern excludes its own closer)
    article = re.sub(r'\([^)]+\)', ' ', article)
    article = re.sub(r'\[[^\]]+\]', ' ', article)
    article = re.sub(r'<[^>]+>', ' ', article)
    article = re.sub(r'【[^】]+】', ' ', article)
    # replace hanja with hangul; translate() returns a new string,
    # so the result must be assigned back
    article = hanja.translate(article, 'substitution')
    # remove special characters except necessary punctuation
    article = re.sub(
        r'[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣0-9\%\-\_\.\,\?\!\/\"\'ㆍ·。、“”‘’『』《》〈〉「」\~○×□…\ ]',
        ' ', article)
    # initialize Korean language analyzers
    splitter = SentenceSplitter(API.HNN)
    tagger = Tagger(API.HNN)
    # split text into sentences
    sentences = splitter(article)
    # regularize sentences and split into POS
    article_regularized = ''
    for sent in sentences:
        sent = tagger.tagSentence(sent)
        sent_regularized = []
        for word in sent[0].words:
            sent_regularized.append(
                ' '.join([m.surface for m in word.morphemes]))
        article_regularized += '\n' + ' '.join(sent_regularized)
    # regularize whitespace
    article_regularized = re.sub(r' +', ' ', article_regularized)
    command = ["java", "edu.stanford.nlp.process.PTBTokenizer",
               "-preserveLines", "-lowerCase"]
    echo = subprocess.Popen(["echo", "'{}'".format(article_regularized)],
                            stdout=subprocess.PIPE)
    result = subprocess.check_output(command, stdin=echo.stdout)
    echo.wait()
    return result.decode("utf-8")
def test_translate_substitution_mode():
    mode = "substitution"
    assert hanja.translate(u"韓國語", mode=mode) == u"한국어"
    assert hanja.translate(u"한국어", mode=mode) == u"한국어"
    assert hanja.translate(u"利用해", mode=mode) == u"이용해"
    assert hanja.translate(u"連結된", mode=mode) == u"연결된"
    assert hanja.translate(u"1800年에", mode=mode) == u"1800년에"
    assert hanja.translate(u"그레고리曆", mode=mode) == u"그레고리력"
    assert hanja.translate(u"系列", mode=mode) == u"계열"
def sentranslit(sentence, if_num=True, if_sym=True, if_han=True,
                if_eng=True, if_puncs=True, if_else=True):
    if if_han:
        # substitute hanja first, for the word-initial rule
        sentence = hanja.translate(sentence, 'substitution')
    if not hgtk.checker.is_hangul(sentence):  # only if non-hangul terms remain
        s, particles, metadata = align_particles(sentence)
        chunks = info_to_word(metadata)
        chunks_4num = info_to_word(metadata)
        # chunks -> mod_chunks
        mod_chunks = trans_eojeol(chunks, chunks_4num, metadata, if_num,
                                  if_sym, if_han, if_eng, if_puncs, if_else)
        # mod_chunks -> final form
        mod_chunks = check_josa(mod_chunks, chunks_4num, metadata)
        return (' ').join([''.join(z) for z in mod_chunks])
    else:
        return sentence
def cleansing_chinese(sentence: str = None) -> str:
    """Preprocess a sentence by converting Chinese characters.

    :param sentence: sentence to preprocess
    :return: preprocessed sentence
    """
    # A Chinese-character run wrapped in parentheses is usually a gloss
    # of the preceding hangul, so drop it entirely.
    sentence = re.sub(
        r"\([\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]+\)",
        "", sentence)
    # Substitute any remaining Chinese characters with hangul.
    if re.search(
            r"[\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]",
            sentence) is not None:
        sentence = hanja.translate(sentence, 'substitution')
    return sentence
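# Illustrative behavior of the two branches above (expected outputs assume
# hanja's bundled table; the test name is ours): parenthesized hanja is
# removed as a gloss, bare hanja is converted to its hangul reading.
def test_cleansing_chinese():
    assert cleansing_chinese("대한민국(大韓民國) 만세") == "대한민국 만세"
    assert cleansing_chinese("大韓民國 만세") == "대한민국 만세"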
def cleaning_strings(input_text):
    # Paragraph separation is not needed: drop the <p> markers and insert
    # newlines at paragraph ends.
    input_text = input_text.replace('<p>', ' ').replace('</p>', '\n')
    # Normalize circled/bracketed digits and Roman numerals.
    input_text = input_text.translate(
        str.maketrans('①②③④⑤⑥⑦⑴⑵⑶⑷⑸ⅠⅡⅢ', '123456712345123'))
    # Normalize Unicode punctuation.
    input_text = input_text.translate(
        str.maketrans('―“”‘’〉∼\u3000', '-""\'\'>~ '))
    # Remove decorative symbols.
    input_text = input_text.translate(
        {ord(i): None for i in '↑→↓⇒∇■□▲△▶▷▼◆◇○◎●★☆☞♥♪【】'})
    # Remove e-mail addresses.
    EMAIL_PATTERN = re.compile(
        r'(([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)(\.[a-zA-Z]{2,4}))')
    input_text = re.sub(EMAIL_PATTERN, ' ', input_text)
    # Remove URLs.
    URL_PATTERN = re.compile(
        r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    input_text = re.sub(URL_PATTERN, ' ', input_text)
    input_text = re.sub(r'\(.+?\)', '', input_text)  # drop parenthesized content
    input_text = hanja.translate(input_text, 'substitution')  # hanja -> hangul
    # Put spaces around punctuation marks.
    input_text = re.sub(r'([\'\",.\(\)\[\]\{\}<\>\:\;\/\?\!\~\…\·\=\+\-\_])',
                        r' \g<1> ', input_text)
    while True:
        # Insert a space after a period/comma that has none (e.g.
        # "앞텍스트.뒷텍스트"); loop until no more substitutions apply.
        temp_text = re.sub(r'(.+)([.|,])([가-힣]+)', r'\g<1>\g<2> \g<3>',
                           input_text)
        if input_text == temp_text:
            input_text = temp_text
            break
        else:
            input_text = temp_text[:]
    input_text = re.sub(r'[0-9]+', 'NUM', input_text)  # mask every number as NUM
    input_text = re.sub(r'[ ]{2,}', ' ', input_text)  # collapse repeated spaces
    output_text = input_text.strip()
    return output_text
def preproc_ko_basic(comment):
    comment = hanja.translate(comment, 'substitution')  # 大韓民國은 --> 대한민국은
    comment = comment.replace('ʼ', "'").upper()
    comment = re.sub(r'\s', ' ', comment)
    comment = re.sub(r'[\~\-]+', '', comment)
    comment = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣A-Z0-9\'\:\,\!\?\.\s\(\)]', ' ',
                     comment).strip()
    comment = re.sub(r'\s[ㄱ-ㅎㅏ-ㅡ](\s|,)', ' ', comment)
    comment = re.sub(r'(\d+\:\d+)([가-힣A-Z])', r'\1 \2', comment)
    comment = re.sub(r'[ㄱㅋㄲㄷㅌㅎ그크킄큐키킥킼하핳흐흫히힣]{2,}', ' ', comment)
    comment = re.sub(r'(?<=[ㄱ-ㅎ][ㅏ-ㅣ])[ㅏ-ㅣ]+', ' ', comment)
    comment = re.sub(r'(?<=[가-힣])[ㅏ-ㅣ]+', ' ', comment)
    comment = re.sub(r'[ㅏ-ㅣ]{2,}', ' ', comment)
    comment = re.sub(r'[퓨뮤ㅍㅠㅜ\.]{2,}', ' ', comment)
    comment = re.sub(r'[아악앜앟]+', '아', comment)
    comment = re.sub(r'[어억엌엏]+', '어', comment)
    comment = re.sub(r'[워웡웤]+', '워', comment)
    comment = re.sub(r'[\!\?\.\,]+', ' ', comment)
    comment = re.sub(r'넘\s', r' 너무 ', comment)
    comment = re.sub(r'절대', r' 절대 ', comment)
    comment = re.sub(r'\s+', ' ', comment)
    return comment.strip()
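# Rough sketch of what preproc_ko_basic does to a typical comment
# (illustrative; the exact output depends on the regex order above):
#
#   preproc_ko_basic('大韓民國은 참 좋다!!! ㅋㅋㅋㅋ')  ->  '대한민국은 참 좋다'
#
# i.e. hanja is read as hangul, laughter jamo runs and trailing punctuation
# are stripped, and whitespace is collapsed.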
def preprossessing(self, text_list, embedding_dim, maxlen=par.max_length,
                   model=None, mode='default'):
    EMBEDDING_DIM = embedding_dim
    if model is not None:
        self.model = model
    embedding_matrix = np.zeros([len(self.word2id) + 1, EMBEDDING_DIM])
    for k in self.word2id:
        embedding_matrix[self.word2id[k]] = self.model.wv[k]
    index_list = []
    for text in text_list:
        tmp_list = []
        text = hanja.translate(text, mode='substitution')
        if mode == 'default':
            posses = self.twitter.pos(text, stem=True, norm=True)
            for pos in posses:
                if pos[1] not in ['Eomi', 'Punctuation', 'Hashtag', 'URL',
                                  'PreEomi', 'Josa', 'Foreign'] and \
                        pos[0] in self.word2id.keys():
                    tmp_list.append(self.word2id[pos[0]])
        else:
            posses = self.twitter.nouns(text)
            for pos in posses:
                if pos in self.word2id.keys():
                    tmp_list.append(self.word2id[pos])
        index_list.append(tmp_list)
    index_list = pad_sequences(index_list, maxlen=maxlen)
    return embedding_matrix[index_list]
def main():
    # Load the data without URL tags.
    for file_name in file_list:
        news_file_name = 'news_data_{FILE_NAME}.pkl'.format(FILE_NAME=file_name)
        with open(news_file_name, 'rb') as infile:
            news_dset = cPickle.load(infile).loc[:, ['title', 'body', 'tag']]

        # Title/body cleanup.
        first_fun = lambda x: re.sub(r'\[.*?\]|\(.*?\)|【.*?】', '', str(x))  # drop bracketed text (assumes no nested brackets)
        second_fun = lambda x: hanja.translate(str(x), 'substitution')  # remove hanja
        third_fun = lambda x: re.sub(r'[가-힣]*\s+(기자|특파원)', '', str(x))  # drop reporter bylines
        fourth_fun = lambda x: re.sub(r'[^가-힣]|\s+', '', str(x))  # drop everything that is not hangul
        news_dset.loc[:, ['title', 'body']] = news_dset.loc[:, ['title', 'body']] \
            .applymap(first_fun).applymap(second_fun) \
            .applymap(third_fun).applymap(fourth_fun)

        # Tag cleanup: drop rows that contain unwanted tags.
        removing_list = []
        for i in range(news_dset.shape[0]):
            for j in news_dset.at[i, 'tag']:
                if j in ['정치', '경제', '사회', '세계', 'IT']:
                    continue
                else:
                    removing_list.append(i)
                    break
        news_dset = news_dset.drop(removing_list)
        first_fun = lambda x: list(map(lambda k: '과학' if str(k) == 'IT' else k, list(x)))  # map the IT tag to 과학
        second_fun = lambda x: set(x)  # tags as a set
        news_dset['tag'] = news_dset['tag'].map(first_fun).map(second_fun)

        # Save the processed file.
        puri_news_file_name = 'puri_news_data_{FILE_NAME}.pkl'.format(FILE_NAME=file_name)
        puri_news_file_stat_name = 'puri_news_data_ex_{FILE_NAME}.txt'.format(FILE_NAME=file_name)
        with open(puri_news_file_name, 'wb') as outfile:
            cPickle.dump(news_dset, outfile, -1)
        with open(puri_news_file_stat_name, 'w') as outfile:
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                outfile.write('data length: ' + str(news_dset.shape[0]) +
                              '\n' + '_' * 40 + '\n')
                outfile.write(str(news_dset.head(20)))
def convert(self, sentence, parsed_sentence):
    sentence_index = 0
    converted_sentence = []
    for parsed_word_index, parsed_word in enumerate(parsed_sentence):
        tags = tuple(parsed_word[1].split('+'))
        is_first_articulation = sentence[sentence_index] == ' '
        if is_first_articulation:
            print(sentence_index)
            sentence_index += 1
        # word is foreign language
        if 'SL' in tags:
            iso_639_1_language_code = langdetect.detect(parsed_word[0])
            iso_639_2_t_language_code = self.__convert_ISO_639_1_to_ISO_637_2_T(
                iso_639_1_language_code)
            hangul_word = self.__convert_to_hangul(
                iso_639_2_t_language_code.encode('utf-8'),
                parsed_word[0].encode('utf-8'),
            ).decode('utf-8')
            for j in range(len(hangul_word)):
                converted_sentence.append(
                    (Hangul(hangul_word[j], j == 0), ('NNG',)))
            # advance by the surface form, not the (word, tags) tuple
            sentence_index += len(parsed_word[0])
        # word is hanja
        # TODO: distinguish Korean, Japanese and Chinese kanji
        elif 'SH' in tags:
            # translate the surface form, not the (word, tags) tuple
            hangul_word = hanja.translate(parsed_word[0], 'substitution')
            for j in hangul_word:
                converted_sentence.append(
                    (Hangul(j, is_first_articulation), ('NNG',)))
        # TODO: letter is number
        # TODO: consider ',' which is used to split numbers
        elif 'SN' in tags:
            is_hangul_number = \
                parsed_word_index + 1 < len(converted_sentence) and \
                'NNBC' in converted_sentence[parsed_word_index + 1][1]
            converted_numbers = self.__convert_number_to_hangul(
                parsed_word[0],
                is_hangul_number,
            )
            for j in converted_numbers:
                converted_sentence.append(
                    (Hangul(j, is_first_articulation), ('NNG',)))
                is_first_articulation = False
            sentence_index += len(parsed_word[0])
        # word is a special character
        elif len({'SF', 'SE', 'SC', 'SY'}.intersection(tags)):
            sentence_index += 1
        # TODO: when the tag is SSO or SSC, add a wait sign so that
        # the listener can tell the content is parenthetical
        # word is hangul
        else:
            for j in parsed_word[0]:
                # j is a single jamo letter
                if j in HangulNames.keys():
                    # convert to the jamo's name
                    letter = HangulNames[j]
                    for k in letter:
                        converted_sentence.append(
                            (Hangul(k, is_first_articulation), ('NNG',)))
                        is_first_articulation = False
                    sentence_index += 1
                else:
                    converted_sentence.append(
                        (Hangul(j, is_first_articulation), tags))
                    print(j, sentence_index)
                    sentence_index += 1
                    is_first_articulation = False
    return converted_sentence
def romanizeText(transliter, text):
    text = text.strip()
    if text != '':
        hangul_text = hanja.translate(text, 'substitution')
        return transliter.translit(hangul_text)
    return text
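# A minimal usage sketch, assuming `transliter` comes from the
# hangul-romanize package (whose Transliter exposes the translit() method
# used above):
#
#   from hangul_romanize import Transliter
#   from hangul_romanize.rule import academic
#
#   transliter = Transliter(academic)
#   romanizeText(transliter, u'韓國語')  # roughly 'hangugeo'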
def text_cleaning(text):
    text = hanja.translate(text, 'substitution')
    # keep only complete hangul syllables (가-힣), digits and Latin letters
    text = re.sub('[^가-힣0-9a-zA-Z\\s]', '', text)
    text = text.replace(u"카드뉴스", '').replace(u"조선일보", '')
    return text
target.send_keys(Keys.CONTROL + "\n")
driver.switch_to.window(driver.window_handles[1])
address = driver.current_url
u.write(address + '\n')
ti = str(i) + ".txt"
tmp = open(ti, 'w')
try:
    d = driver.find_element_by_class_name("poem").text
except:
    try:
        d = driver.find_element_by_class_name("mw-parser-output").text
    except:
        continue
rr = hanja.translate(d, 'substitution')
r = hangul.sub('\n', rr)
tmp.write(r)
tmp.close()
driver.close()
driver.switch_to.window(driver.window_handles[0])
i += 1
u.close()
def trans_hanja(term):  # complementary check
    return hanja.translate(term, 'substitution')
def normalize(in_file, out_file):
    fin = open(in_file, 'rb')
    file_data = fin.read()
    try:
        file_data = file_data.decode('utf-8')
    except:
        file_data = file_data.decode('cp949')

    # For novels: split sentences on periods.
    # file_text = file_data.replace('\n', '')
    # file_text = file_text.split('.')

    # General case: split sentences on newlines.
    file_text = file_data.split('\n')

    try:
        fout = open(out_file, 'a', encoding='utf-8')
    except:
        fout = open(out_file, 'w', encoding='utf-8')

    word_cnt = 0
    line_cnt = 0
    only_word = {}
    for line in file_text:
        # Skip a line when English makes up more than 50% of it.
        exp = re.compile(r'[a-zA-Z]+\s*')
        e_words = exp.findall(line.strip())
        e_len = 0
        if e_words:
            for e_word in exp.findall(line):
                e_len += len(e_word)
            if (e_len / len(line.strip())) > 0.5:
                continue

        # Drop unnecessary sentences and eojeols.
        line = text_except(line)

        # Strip the numbering prefixed to 고파스 comments.
        line = re.sub(u'^[0-9][0-9]/', u'', line)
        line = re.sub(u'^[0-9]/', u'', line)
        line = re.sub(u'ㅋ+', u'', line)

        # Split on sentence-final punctuation before symbols are removed;
        # fragments of two characters or fewer (presumably interjections)
        # are skipped, and periods inside quotes are not split on.
        if bool(re.search(u'.*(\.|\?|\!) ', line)) == True and len(
                re.search(u'.*(\.|\?|\!) ', line).group()) > 4:
            line = re.sub(u'(\.|\?|\!) ', u'\n', line)

        # If a single sentence exceeds 10,000 characters, it should be
        # split in two.
        # while len(line) > 10000:  (not implemented)
        for oneline in line.split('\n'):
            # Skip digit-only lines.
            if oneline.isdigit() == True:
                continue
            # Skip lines starting with // (comments).
            if oneline[:2] == '//':
                continue

            ################## removal list ###################
            # 1. parenthesized side notes
            # 2. e-mail addresses
            # 3. URLs
            # 4. phone numbers
            # 5. Instagram/Twitter DMs
            ###################################################
            oneline = re.sub(u'\(([0-9a-zA-Z가-힣一-龥豈-龎/]+\s*)+\)', u'',
                             oneline)  # parenthesized side notes
            oneline = re.sub(u'[A-Za-z0-9-_.]+@[A-Za-z]+(\.[A-Za-z]+)+', u'',
                             oneline)  # e-mail
            oneline = re.sub(
                u'(http)*s*(://)*[A-Za-z가-힣]+(\.[A-Za-z]+)+(/[?&.=!A-Za-z]+)*',
                u'', oneline)  # URL
            oneline = re.sub(u'\({0,1}[0-9]+\){0,1}([-.~ ][0-9]+)+', u'',
                             oneline)  # phone number
            oneline = re.sub(
                u'((@[A-Za-z]+[A-Za-z-_.]+)|[A-Za-z]+[A-Za-z-_.]+@)', u'',
                oneline)  # Instagram/Twitter DMs

            oneline = readUnit(oneline)  # read units (before symbol removal, because of %)
            oneline = readNumber(oneline)  # read numbers

            # Keep only digits, English, hangul and hanja.
            oneline = re.sub(u'[\'\"‘’“”]', u'', oneline)  # drop quotes
            oneline = re.sub(u'[^0-9a-zA-Z가-힣一-龥豈-龎]', u' ', oneline)  # drop symbols
            oneline = longword_except(oneline)

            # Clean up whitespace.
            oneline = re.sub(u' +', u' ', oneline)
            oneline = re.sub(u'^ ', u'', oneline)
            oneline = re.sub(u' $', u'', oneline)

            if bool(re.search('[一-龥豈-龎]', oneline)) == True:
                oneline = hanja.translate(oneline, 'substitution')  # read hanja
            oneline = readAlphabet(oneline, 'ita')  # read English letters
            oneline = re.sub(u'\.', u'', oneline)

            # Clean up whitespace again.
            oneline = re.sub(u' +', u' ', oneline)
            oneline = re.sub(u'^ ', u'', oneline)
            oneline = re.sub(u' $', u'', oneline)

            if len(oneline) > 2000:
                print(oneline)
                continue
            if oneline != '' and oneline != ' ':
                word_cnt += len(oneline.split(' '))
                for my_word in oneline.split(' '):
                    only_word[my_word] = 0
                line_cnt += 1
                fout.write(oneline)
                fout.write('\n')
    fin.close()
    fout.close()
    print('word ' + str(word_cnt))
    print('line ' + str(line_cnt))
    print('only_word ' + str(len(only_word.keys())))
class EasternTerm(Term):
    read: str

    def romanize(self, locale: Locale) -> Markup:
        return romanize(self.read, locale)

    normalizers: ClassVar[Mapping[Locale, OpenCC]] = {
        Locale.parse('ja'): OpenCC('jp2t'),
        Locale.parse('zh_CN'): OpenCC('s2t'),
        # Locale.parse('zh_HK'): OpenCC('hk2t'),
        # Locale.parse('zh_TW'): OpenCC('tw2t'),
    }

    readers: ClassVar[Mapping[Locale, Callable[
        [str, str, Sequence[str]],
        Iterable[Tuple[str, Union[str, Markup]]]]]] = {
        Locale.parse('ja'): lambda t, n, _: (
            (t[sum(len(x['orig']) for x in r[:i]):][:len(e['orig'])],
             e['hira'])
            for r in [kks.convert(n)]
            for i, e in enumerate(r)),
        Locale.parse('ko'): lambda t, n, p: zip(
            t,
            # To prevent a non-spaced term from the "initial sound law"
            # (which is adopted by South Korean orthography;
            # <https://en.wikipedia.org/wiki/Dueum_beopchik>),
            # prepend previous terms to the input, and strip them
            # from the output:
            translate(''.join(p) + n, 'substitution')[sum(map(len, p)):]),
        Locale.parse('zh_CN'): lambda t, n, _: zip(
            t, pinyin_jyutping_sentence.pinyin(n, False, True).split()),
        Locale.parse('zh_HK'): lambda t, n, _: zip(
            t, pinyin_jyutping_sentence.jyutping(n, True, True).split()),
        Locale.parse('zh_TW'): lambda t, n, _: zip(
            t, pinyin_jyutping_sentence.pinyin(n, False, True).split()),
    }

    def normalize(self, locale: Locale) -> str:
        try:
            normalizer = self.normalizers[locale]
        except KeyError:
            return self.term
        else:
            return normalizer.convert(self.term)

    def read_as(self, from_: Locale, to: Locale,
                previous_terms: Sequence[Term], word_id: str,
                translation: Translation,
                table: Table) -> Iterable[Tuple[str, Union[str, Markup]]]:
        if from_ == to:
            return zip(self.term, self.read.split())
        same_cls = type(self)
        target_words: Sequence[Word] = translation.get(to, [])
        for target_word in target_words:
            if target_word.id == word_id:
                for target_term in target_word:
                    if target_term.correspond == self.correspond and \
                            isinstance(target_term, same_cls):
                        return zip(self.term, target_term.read.split())
        terms_table: Mapping[str, Term] = table.terms_table.get(to, {})
        term_id = self.normalize(from_)
        correspond = terms_table.get(term_id)
        if isinstance(correspond, same_cls):
            return zip(self.term, correspond.read.split())
        reader = self.readers.get(to)
        term = self.normalize(from_)
        if callable(reader):
            previous = [t.normalize(from_) for t in previous_terms]
            return reader(self.term, term, previous)
        return self.read_as(from_, from_, previous_terms, word_id,
                            translation, table)
tcnt = 1
cnt += 1
lines = f1.readline()
if not lines:
    break
word = lines.split()
cate = word[0]
cate2 = word[1]
time = "20" + word[2]
realtime = datetime(int(time[0:4]), int(time[4:6]), int(time[6:]))
pivot = datetime(2000, 1, 1)
timef.write(str((realtime - pivot).days))
timef.write('\n')
url = word[3]
a = Article(url, language='ko')
a.download()
a.parse()
if not a.text:
    print("\thoho: ", cate, cate2)
    continue
f = open('./text/news/input' + str(cate) + '-' + str(cate2) + '.txt', 'w')
title = hanja.translate(a.title, 'substitution')
f.write(title)
f.write(".\n")
text = hanja.translate(a.text, 'substitution')
f.write(text)
def test_translate_with_invalid_mode():
    with pytest.raises(ValueError):
        hanja.translate("Some text", mode="invalid")
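# Taken together, the tests above exercise the three translate() modes;
# a compact side-by-side summary (expected values copied from the
# assertions above, the test name is ours):
def test_translate_modes_side_by_side():
    assert hanja.translate(u"韓國語", mode="substitution") == u"한국어"
    assert hanja.translate(u"韓國語", mode="combination-text") == u"韓國語(한국어)"
    assert (
        hanja.translate(u"韓國語", mode="combination-html")
        == u'<span class="hanja">韓國語</span><span class="hangul">(한국어)</span>'
    )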
print('Tokenization start')

# Merge document contents (title + author + body).
mergeString = []
for idx in range(0, maxPage * numOfCnt):
    title = source['title'][idx]
    body = source['body'][idx]
    writer = source['writer'][idx]

    # Exception handling for None values.
    if title is None:
        title = ' '
    if body is None:
        body = ' '
    if writer is None:
        writer = ' '

    title = hanja.translate(title, "substitution")
    result = title + ' ' + writer + ' ' + body
    mergeString.append(result)


def isHangul(text):
    # Check the Python version.
    pyVer3 = sys.version_info >= (3, 0)
    if pyVer3:  # for version 3 or later
        encText = text
    else:  # for version 2.x
        if type(text) is not unicode:
            encText = text.decode('utf-8')
        else:
            encText = text