Example #1
File: test_basic.py Project: suminb/hanja
def test_translate_combination_text_mode():
    mode = "combination-text"
    assert hanja.translate(u"韓國語", mode=mode) == u"韓國語(한국어)"
    assert hanja.translate(u"利用해", mode=mode) == u"利用(이용)해"
    assert (
        hanja.translate(u"大韓民國은 民主共和國이다.", mode=mode) == u"大韓民國(대한민국)은 民主共和國(민주공화국)이다."
    )
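
Note: a minimal sketch of calling these modes directly (assuming the hanja package is installed; the expected strings come from the assertions above and the substitution tests below):

import hanja

# "combination-text" keeps the hanja and appends the hangul reading in parentheses
print(hanja.translate(u"大韓民國은 民主共和國이다.", mode="combination-text"))
# "substitution" replaces each hanja run with its hangul reading
print(hanja.translate(u"大韓民國은 民主共和國이다.", mode="substitution"))
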
Example #2
File: test_basic.py Project: suminb/hanja
def test_translate_combination_html_mode(mode):
    assert (
        hanja.translate(u"韓國語", mode=mode)
        == u'<span class="hanja">韓國語</span><span class="hangul">(한국어)</span>'
    )
    assert (
        hanja.translate(u"利用해", mode=mode)
        == u'<span class="hanja">利用</span><span class="hangul">(이용)</span>해'
    )
    assert (
        hanja.translate(u"大韓民國은 民主共和國이다.", mode=mode)
        == u'<span class="hanja">大韓民國</span><span class="hangul">(대한민국)'
        u'</span>은 <span class="hanja">民主共和國</span><span class="hangul">'
        u"(민주공화국)</span>이다."
    )
Example #3
def readHanja(line):
    if re.search('[一-龥豈-龎]+', line):
        if re.search('[가-힣]+[一-龥豈-龎]+', line):
            line = re.sub('[一-龥豈-龎]+', '', line)
        else:
            line = hanja.translate(line, 'substitution')
    return line
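
Note: a usage sketch for this helper (assuming re and hanja are imported at module level, as in the original file; the exact readings depend on the hanja table):

import re
import hanja

print(readHanja('서울特別市'))   # hangul directly followed by hanja: the hanja run is stripped -> '서울'
print(readHanja('特別市 서울'))  # otherwise the hanja is substituted -> '특별시 서울'
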
Example #4
    def korean2symbols(self, text):
        table = {}
        symbol = "A"
        for player in filter(
                lambda e: "organization" in e and e["organization"] == "韓国棋院",
                self.PLAYERS):
            if "name" in player:
                match = re.match(r'(.*)\((.+)\)', player["name"])
                hangul = match.group(1)
                hanja = match.group(2)
            else:
                hanja = player["mamumamuName"]
                hangul = H.translate(hanja, mode='substitution')
            if hangul in text:
                text = text.replace(hangul,
                                    self.DELIMITER + symbol + self.DELIMITER)
                table[symbol] = hanja
                symbol = chr(ord(symbol) + 1)
        for ko, ja in sorted(self.KOREAN_DICTIONARY.items(),
                             key=lambda e: -len(e[0])):
            if ko in text:
                text = text.replace(ko,
                                    self.DELIMITER + symbol + self.DELIMITER)
                table[symbol] = ja
                symbol = chr(ord(symbol) + 1)

        return (text, table)
Example #5
def main(argv):
    file_list = glob(FLAGS.input_dir + '/*.txt')
    file_list = sorted(file_list)

    logging.info(
        f'Preprocessing {len(file_list)} txt files to {FLAGS.output_path}')
    with open(FLAGS.output_path, 'w') as output_file:
        text_list = []
        output_file.write(
            'date, text, major_direction, voting, minor_direction \n')
        for file in tqdm(file_list, desc='preprocessing'):
            with open(file, 'r') as f:
                date = file[-14:-4].replace('-', '')
                text = f.read().replace('\n', ' ')

                # hanja translate
                text = text.strip()
                text = ''.join(
                    [hanja.translate(c, 'substitution') for c in text])

                # remove special characters
                text = re.sub(pattern='[^\w\s]', repl='', string=text)
                text = re.sub(pattern='\s{1,}', repl=' ', string=text)

                text_list.append('"' + date + '", "' + text + '"')

        # sort by date
        text_list = sorted(text_list, key=lambda x: x[:9], reverse=True)  # assign the result back; a bare sorted() call has no effect
        for text in tqdm(text_list, desc='writing output'):
            output_file.write(text + ' \n')
Example #6
def test_translate():
    assert hanja.translate(u'韓國語', mode='substitution') == u'한국어'
    assert hanja.translate(u'한국어', mode='substitution') == u'한국어'
    assert hanja.translate(u'利用해', mode='substitution') == u'이용해'
    assert hanja.translate(u'連結된', mode='substitution') == u'연결된'
    assert hanja.translate(u'1800年에', mode='substitution') == u'1800년에'
    assert hanja.translate(u'그레고리曆', mode='substitution') == u'그레고리력'
    assert hanja.translate(u'系列', mode='substitution') == u'계열'
Example #7
    def process(self, article):
        # remove bylines
        article = re.sub(r'\. *\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)
        article = re.sub(r'\S+ +\S+ +\w+@(\w+\.)+\w+', '.', article)

        # remove parentheses
        article = re.sub(r'\([^)]+\)', ' ', article)
        article = re.sub(r'\[[^)]+\]', ' ', article)
        article = re.sub(r'\<[^)]+\>', ' ', article)
        article = re.sub(r'\【[^)]+\】', ' ', article)

        # replace hanja with hangul (assign the result; translate() returns a new string)
        article = hanja.translate(article, 'substitution')

        # remove special characters except necessary punctuations
        article = re.sub(r'[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣0-9\%\-\_\.\,\?\!\/\"\'ㆍ·。、“”‘’『』《》〈〉「」\~○×□…\ ]', ' ', article)

        # initialize korean language analyzers
        splitter = SentenceSplitter(API.HNN)
        tagger = Tagger(API.HNN)

        # split text into sentences
        sentences = splitter(article)

        # regularize sentences and split into POS
        article_regularized = ''
        for sent in sentences:
            sent = tagger.tagSentence(sent)
            sent_regularized = []
            for word in sent[0].words:
                sent_regularized.append(' '.join([m.surface for m in word.morphemes]))
            article_regularized += '\n' + ' '.join(sent_regularized)

        # regularize whitespaces
        article_regularized = re.sub(r' +', ' ', article_regularized)
        command = ["java", "edu.stanford.nlp.process.PTBTokenizer", "-preserveLines", "-lowerCase"]

        result = ''
        echo = subprocess.Popen(["echo", "'{}'".format(article_regularized)], stdout=subprocess.PIPE)
        result = subprocess.check_output(command, stdin=echo.stdout)
        echo.wait()

        return result.decode("utf-8")
Example #8
File: test_basic.py Project: suminb/hanja
def test_translate_substitution_mode():
    mode = "substitution"
    assert hanja.translate(u"韓國語", mode=mode) == u"한국어"
    assert hanja.translate(u"한국어", mode=mode) == u"한국어"
    assert hanja.translate(u"利用해", mode=mode) == u"이용해"
    assert hanja.translate(u"連結된", mode=mode) == u"연결된"
    assert hanja.translate(u"1800年에", mode=mode) == u"1800년에"
    assert hanja.translate(u"그레고리曆", mode=mode) == u"그레고리력"
    assert hanja.translate(u"系列", mode=mode) == u"계열"
Example #9
def sentranslit(sentence,if_num=True,if_sym=True,if_han=True,if_eng=True,if_puncs=True,if_else=True):
    if if_han:
        sentence = hanja.translate(sentence,'substitution') ## For word-initial rule
    if not hgtk.checker.is_hangul(sentence): ## Only if contains non-Hangul terms
        s, particles, metadata  = align_particles(sentence)
        chunks = info_to_word(metadata)
        chunks_4num = info_to_word(metadata)
        mod_chunks = trans_eojeol(chunks,chunks_4num,metadata,if_num,if_sym,if_han,if_eng,if_puncs,if_else)   ## Chunks > Mod_chunks
        mod_chunks  = check_josa(mod_chunks,chunks_4num,metadata) ## Mod_chunks > Mod_final
        return (' ').join([''.join(z) for z in mod_chunks])
    else:
        return sentence
Example #10
def cleansing_chinese(sentence: str = None) -> str:
    """
    한자를 변환하는 전처리를 하는 함수
    :param sentence: 전처리 대상 문장
    :return: 전처리 완료된 문장
    """
    # chinese character를 앞뒤로 괄호가 감싸고 있을 경우, 대부분 한글 번역임
    sentence = re.sub(
        "\([\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]+\)",
        "", sentence)
    # if other hanja remain, substitute them with hangul
    if re.search(
            "[\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]",
            sentence) is not None:
        sentence = hanja.translate(sentence, 'substitution')

    return sentence
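
Note: a quick check of the two branches above (hypothetical inputs; the substituted reading follows the hanja table used by hanja.translate):

import re
import hanja

print(cleansing_chinese("대한민국(大韓民國)은 민주공화국(民主共和國)이다."))
# parenthesized hanja is removed    -> "대한민국은 민주공화국이다."
print(cleansing_chinese("大韓民國은 民主共和國이다."))
# remaining hanja is substituted    -> "대한민국은 민주공화국이다."
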
Example #11
def cleaning_strings(input_text):

    input_text = input_text.replace('<p>', ' ').replace(
        '</p>', '\n')  # paragraph separation is not needed: drop <p>, turn </p> into a newline
    input_text = input_text.translate(
        str.maketrans('①②③④⑤⑥⑦⑴⑵⑶⑷⑸ⅠⅡⅢ', '123456712345123'))  # normalize circled/parenthesized numerals
    input_text = input_text.translate(
        str.maketrans('―“”‘’〉∼\u3000', '-""\'\'>~ '))  # normalize unicode punctuation
    input_text = input_text.translate(
        {ord(i): None
         for i in '↑→↓⇒∇■□▲△▶▷▼◆◇○◎●★☆☞♥♪【】'})  # strip decorative symbols

    # remove email addresses
    EMAIL_PATTERN = re.compile(
        r'(([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)(\.[a-zA-Z]{2,4}))')
    input_text = re.sub(EMAIL_PATTERN, ' ', input_text)

    # remove URLs
    URL_PATTERN = re.compile(
        "(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    input_text = re.sub(URL_PATTERN, ' ', input_text)

    input_text = re.sub('\(.+?\)', '', input_text)  # delete content inside parentheses
    input_text = hanja.translate(input_text, 'substitution')  # substitute hanja -> hangul
    input_text = re.sub('([\'\",.\(\)\[\]\{\}<\>\:\;\/\?\!\~\…\·\=\+\-\_])',
                        ' \g<1> ', input_text)  # put spaces around punctuation marks

    while True:
        temp_text = re.sub(
            '(.+)([.|,])([가-힣]+)', '\g<1>\g<2> \g<3>',
            input_text)  # insert a space after a period/comma not followed by one (e.g. 앞텍스트.뒷텍스트)
        if input_text == temp_text:  # repeat until no further change
            input_text = temp_text
            break
        else:
            input_text = temp_text[:]

    input_text = re.sub('[0-9]+', 'NUM', input_text)  # mask every number as NUM
    input_text = re.sub('[ ]{2,}', ' ', input_text)  # collapse runs of two or more spaces

    output_text = input_text.strip()

    return output_text
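
Note: a rough usage sketch (hypothetical input; the output is approximate and depends on the hanja table):

import re
import hanja

print(cleaning_strings("<p>世宗大王은 1443年 훈민정음을 창제했다.</p>"))
# -> roughly "세종대왕은 NUM년 훈민정음을 창제했다 ." (hanja substituted, digits masked as NUM)
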
Example #12
def preproc_ko_basic(comment):
    comment = hanja.translate(comment, 'substitution')  # 大韓民國은 --> 대한민국은
    comment = comment.replace('ʼ', "'").upper()
    comment = re.sub(r'\s', ' ', comment)
    comment = re.sub(r'[\~\-]+', '', comment)
    comment = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣A-Z0-9\'\:\,\!\?\.\s\(\)]', ' ',
                     comment).strip()
    comment = re.sub(r'\s[ㄱ-ㅎㅏ-ㅡ](\s|,)', ' ', comment)
    comment = re.sub(r'(\d+\:\d+)([가-힣A-Z])', r'\1 \2', comment)
    comment = re.sub(r'[ㄱㅋㄲㄷㅌㅎ그크킄큐키킥킼하핳흐흫히힣]{2,}', ' ', comment)
    comment = re.sub(r'(?<=[ㄱ-ㅎ][ㅏ-ㅣ])[ㅏ-ㅣ]+', ' ', comment)
    comment = re.sub(r'(?<=[가-힣])[ㅏ-ㅣ]+', ' ', comment)
    comment = re.sub(r'[ㅏ-ㅣ]{2,}', ' ', comment)
    comment = re.sub(r'[퓨뮤ㅍㅠㅜ\.]{2,}', ' ', comment)
    comment = re.sub(r'[아악앜앟]+', '아', comment)
    comment = re.sub(r'[어억엌엏]+', '어', comment)
    comment = re.sub(r'[워웡웤]+', '워', comment)
    comment = re.sub(r'[\!\?\.\,]+', ' ', comment)
    comment = re.sub(r'넘\s', r' 너무 ', comment)
    comment = re.sub(r'절대', r' 절대 ', comment)
    comment = re.sub(r'\s+', ' ', comment)
    return comment.strip()
Example #13
    def preprossessing(self,
                       text_list,
                       embedding_dim,
                       maxlen=par.max_length,
                       model=None,
                       mode='default'):
        EMBEDDING_DIM = embedding_dim
        # MAX_SEQ_LENGTH = 50
        if model is not None:
            self.model = model
        embedding_matrix = np.zeros([len(self.word2id) + 1, EMBEDDING_DIM])
        for k in self.word2id:
            embedding_matrix[self.word2id[k]] = self.model.wv[k]

        index_list = []
        for text in text_list:
            tmp_list = []
            # nouns = twitter.nouns(text)
            text = hanja.translate(text, mode='substitution')
            if mode == 'default':
                posses = self.twitter.pos(text, stem=True, norm=True)

                for pos in posses:
                    if pos[1] not in ['Eomi', 'Punctuation', 'Hashtag', 'URL', 'PreEomi', 'Josa', 'Foreign'] and\
                            pos[0] in self.word2id.keys():
                        tmp_list.append(self.word2id[pos[0]])
            else:
                posses = self.twitter.nouns(text)
                for pos in posses:
                    if pos in self.word2id.keys():
                        tmp_list.append(self.word2id[pos])
            # print(nouns)
            # print(embedding_matrix[tmp_list])
            index_list.append(tmp_list)
        index_list = pad_sequences(index_list, maxlen=maxlen)
        # print(np.shape(embedding_matrix[index_list]))
        return embedding_matrix[index_list]
Example #14
def main():
    # load the data without url tags
    for file_name in file_list:
        news_file_name='news_data_{FILE_NAME}.pkl'.format(FILE_NAME=file_name)
        with open(news_file_name,'rb') as infile:
            news_dset=cPickle.load(infile).loc[:,['title','body','tag']]
        # title/body cleanup
        first_fun=lambda x: re.sub(r'\[.*?\]|\(.*?\)|【.*?】','',str(x))  # remove bracketed text (assumes no nesting)
        second_fun=lambda x: hanja.translate(str(x),'substitution')         # replace hanja with hangul
        third_fun=lambda x: re.sub(r'[가-힣]*\s+(기자|특파원)','',str(x))  # remove reporter/correspondent names
        fourth_fun=lambda x: re.sub(r'[^가-힣]|\s+','',str(x))                  # remove everything else that is not hangul
        news_dset.loc[:,['title','body']]=news_dset.loc[:,['title','body']].applymap(first_fun).applymap(second_fun).applymap(third_fun).applymap(fourth_fun)

        # tag cleanup
        # drop rows that contain irrelevant tags
        removing_list=[]
        for i in range(news_dset.shape[0]):
            for j in news_dset.at[i,'tag']:
                if j in ['정치','경제','사회','세계','IT']:
                    continue
                else:
                    removing_list.append(i)
                    break
        news_dset=news_dset.drop(removing_list)
        
        first_fun=lambda x: list(map(lambda k: '과학' if str(k)=='IT' else k,list(x)))  # convert the IT tag to the 과학 (science) tag
        second_fun=lambda x: set(x)                                                     # deduplicate tags as a set
        news_dset['tag']=news_dset['tag'].map(first_fun).map(second_fun)
        # save the processed files
        puri_news_file_name='puri_news_data_{FILE_NAME}.pkl'.format(FILE_NAME=file_name)
        puri_news_file_stat_name='puri_news_data_ex_{FILE_NAME}.txt'.format(FILE_NAME=file_name)
        with open(puri_news_file_name,'wb') as outfile:
            cPickle.dump(news_dset, outfile,-1)
        with open(puri_news_file_stat_name,'w') as outfile:
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                outfile.write('data length: '+str(news_dset.shape[0])+'\n'+'_'*40+'\n')
                outfile.write(str(news_dset.head(20)))
Example #15
    def convert(self, sentence, parsed_sentence):
        sentence_index = 0
        converted_sentence = []

        for parsed_word_index, parsed_word in enumerate(parsed_sentence):
            tags = tuple(parsed_word[1].split('+'))

            is_first_articulation = sentence[sentence_index] == ' '

            if is_first_articulation:
                print(sentence_index)
                sentence_index += 1

            # letter is foreign language
            if 'SL' in tags:
                iso_693_1_language_code = langdetect.detect(parsed_word[0])
                iso_637_2_t_language_code = self.__convert_ISO_639_1_to_ISO_637_2_T(
                    iso_693_1_language_code)

                hangul_word = \
                    self.__convert_to_hangul( \
                        iso_637_2_t_language_code.encode('utf-8'), \
                        parsed_word[0].encode('utf-8') \
                    ).decode('utf-8')

                for j in range(len(hangul_word)):
                    converted_sentence.append((Hangul(hangul_word[j],
                                                      j == 0), ('NNG', )))

                sentence_index += len(parsed_word)

            # letter is hanja
            # TODO: distinguish korean, japanese and chinese kanji
            elif 'SH' in tags:
                hangul_word = hanja.translate(parsed_word[0], 'substitution')  # translate the surface form, not the (word, tags) tuple
                for j in hangul_word:
                    converted_sentence.append(
                        (Hangul(j, is_first_articulation), ('NNG', )))

            # TODO: letter is number
            # TODO: consider ',' which is used to split numbers
            elif 'SN' in tags:
                is_hangul_number \
                    = parsed_word_index + 1 < len(converted_sentence) and \
                      'NNBC' in converted_sentence[parsed_word_index + 1][1]


                converted_numbers \
                    = self.__convert_number_to_hangul( \
                        parsed_word[0], \
                        is_hangul_number \
                    )

                for j in converted_numbers:
                    converted_sentence.append(
                        (Hangul(j, is_first_articulation), ('NNG', )))
                    is_first_articulation = False

                sentence_index += len(parsed_word[0])

            # letter is a special character
            elif len({'SF', 'SE', 'SC', 'SY'}.intersection(tags)):
                sentence_index += 1

            # TODO: when letter's type is SSO or SSC, add wait sign so that
            #       listener can distinguish if the content is for describing

            # letter is hangul
            else:
                for j in parsed_word[0]:
                    # print(j, sentence_index, sentence)

                    # j is single letter hangul
                    if j in HangulNames.keys():
                        # convert to hangul's name
                        letter = HangulNames[j]

                        for k in letter:
                            converted_sentence.append(
                                (Hangul(k, is_first_articulation), ('NNG', )))
                            is_first_articulation = False

                        sentence_index += 1

                    else:
                        converted_sentence.append(
                            (Hangul(j, is_first_articulation), tags))
                        print(j, sentence_index)

                        sentence_index += 1

                    is_first_articulation = False
        return converted_sentence
Example #16
def romanizeText(transliter, text):
    text = text.strip()
    if text != '':
        hangul_text = hanja.translate(text, 'substitution')
        return transliter.translit(hangul_text)
    return text
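
Note: the transliter argument is not defined in this snippet. One plausible setup, assuming the hangul-romanize package (an assumption, not stated in the source), would be:

import hanja
from hangul_romanize import Transliter
from hangul_romanize.rule import academic

transliter = Transliter(academic)
print(romanizeText(transliter, u"韓國語"))  # hanja -> hangul -> roughly "hangugeo"
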
Example #17
def text_cleaning(text):
    text = hanja.translate(text, 'substitution')
    text = re.sub('[^가-힝0-9a-zA-Z\\s]', '', text)
    text = text.replace(u"카드뉴스", '').replace(u"조선일보", '')
    return text
Example #18
        target.send_keys(Keys.CONTROL + "\n")

        driver.switch_to.window(driver.window_handles[1])
        address = driver.current_url
        u.write(address + '\n')

        ti = str(i) + ".txt"
        tmp = open(ti, 'w')
        try:
            d = driver.find_element_by_class_name("poem").text
        except:
            try:
                d = driver.find_element_by_class_name("mw-parser-output").text
            except:
                continue
        rr = hanja.translate(d, 'substitution')

        r = hangul.sub('\n', rr)

        tmp.write(r)

        tmp.close()

        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        i += 1

u.close()
Example #19
def trans_hanja(term): ## Complementary check
    return hanja.translate(term,'substitution')
Example #20
def normalize(in_file, out_file):
    fin = open(in_file, 'rb')
    file_data = fin.read()
    try:
        file_data = file_data.decode('utf-8')
    except:
        file_data = file_data.decode('cp949')

    # for novels: split sentences on periods
    # file_text = file_data.replace('\n', '')
    # file_text = file_text.split('.')

    # general case: split on newlines
    file_text = file_data.split('\n')

    try:
        fout = open(out_file, 'a', encoding='utf-8')
    except:
        fout = open(out_file, 'w', encoding='utf-8')

    word_cnt = 0
    line_cnt = 0
    only_word = {}
    for line in file_text:
        # skip the sentence if English makes up more than 50% of it
        exp = re.compile('[a-zA-Z]+\s*')
        e_words = exp.findall(line.strip())
        e_len = 0

        if e_words:
            for e_word in exp.findall(line):
                e_len += len(e_word)

            if (e_len / len(line.strip())) > 0.5:
                continue

        # drop unnecessary sentences and word segments
        line = text_except(line)

        # strip the numbering prepended to 고파스 forum comments
        line = re.sub(u'^[0-9][0-9]/', u'', line)
        line = re.sub(u'^[0-9]/', u'', line)
        line = re.sub(u'ㅋ+', u'', line)

        # split into sentences (on . ? !) before removing symbols, skipping segments of
        # two characters or fewer (presumed interjections); periods inside quotes are not split
        if bool(re.search(u'.*(\.|\?|\!) ', line)) == True and len(
                re.search(u'.*(\.|\?|\!) ', line).group()) > 4:
            line = re.sub(u'(\.|\?|\!) ', u'\n', line)

        # if a single sentence exceeds 10000 characters, split it in two
        # while len(line) > 10000:  (not implemented)

        for oneline in line.split('\n'):
            # skip lines that are only digits
            if oneline.isdigit() == True:
                continue

            # skip lines starting with // (comments)
            if oneline[:2] == '//':
                continue

            ################## exclusion list ##################
            # 1. parenthesized asides
            # 2. email addresses
            # 3. web addresses
            # 4. phone numbers
            # 5. Instagram / Twitter handles
            ##################################################
            oneline = re.sub(u'\(([0-9a-zA-Z가-힣一-龥豈-龎/]+\s*)+\)', u'',
                             oneline)  # remove parenthesized asides
            oneline = re.sub(u'[A-Za-z0-9-_.]+@[A-Za-z]+(\.[A-Za-z]+)+', u'',
                             oneline)  # email addresses
            oneline = re.sub(
                u'(http)*s*(://)*[A-Za-z가-힣]+(\.[A-Za-z]+)+(/[?&.=!A-Za-z]+)*',
                u'', oneline)  # web addresses
            oneline = re.sub(u'\({0,1}[0-9]+\){0,1}([-.~ ][0-9]+)+', u'',
                             oneline)  # phone numbers
            oneline = re.sub(
                u'((@[A-Za-z]+[A-Za-z-_.]+)|[A-Za-z]+[A-Za-z-_.]+@)', u'',
                oneline)  # Instagram / Twitter handles

            oneline = readUnit(oneline)  # read units (before symbol removal, because of %)
            oneline = readNumber(oneline)  # read numbers

            # drop anything that is not a digit, English, hangul, or hanja
            oneline = re.sub(u'[\'\"‘’“”]', u'', oneline)  # remove quotation marks
            oneline = re.sub(u'[^0-9a-zA-Z가-힣一-龥豈-龎]', u' ',
                             oneline)  # remove special symbols
            oneline = longword_except(oneline)

            # normalize whitespace
            oneline = re.sub(u' +', u' ', oneline)
            oneline = re.sub(u'^ ', u'', oneline)
            oneline = re.sub(u' $', u'', oneline)

            if bool(re.search('[一-龥豈-龎]', oneline)) == True:
                oneline = hanja.translate(oneline, 'substitution')  # read hanja as hangul
            oneline = readAlphabet(oneline, 'ita')  # read English letters

            oneline = re.sub(u'\.', u'', oneline)

            # normalize whitespace
            oneline = re.sub(u' +', u' ', oneline)
            oneline = re.sub(u'^ ', u'', oneline)
            oneline = re.sub(u' $', u'', oneline)

            if len(oneline) > 2000:
                print(oneline)
                continue
            if oneline != '' and oneline != ' ':
                word_cnt += len(oneline.split(' '))
                for my_word in oneline.split(' '):
                    only_word[my_word] = 0
                line_cnt += 1
                fout.write(oneline)
                fout.write('\n')

    fin.close()
    fout.close()

    print('word ' + str(word_cnt))
    print('line ' + str(line_cnt))
    print('only_word ' + str(len(only_word.keys())))
Example #21
class EasternTerm(Term):
    read: str

    def romanize(self, locale: Locale) -> Markup:
        return romanize(self.read, locale)

    normalizers: ClassVar[Mapping[Locale, OpenCC]] = {
        Locale.parse('ja'): OpenCC('jp2t'),
        Locale.parse('zh_CN'): OpenCC('s2t'),
        # Locale.parse('zh_HK'): OpenCC('hk2t'),
        # Locale.parse('zh_TW'): OpenCC('tw2t'),
    }

    readers: ClassVar[Mapping[Locale, Callable[
        [str, str, Sequence[str]],
        Iterable[Tuple[str, Union[str, Markup]]]]]] = {
            Locale.parse('ja'):
            lambda t, n, _:
            ((t[sum(len(x['orig'])
                    for x in r[:i]):][:len(e['orig'])], e['hira'])
             for r in [kks.convert(n)] for i, e in enumerate(r)),
            Locale.parse('ko'):
            lambda t, n, p: zip(
                t,
                # To prevent a non-spaced term from the "initial sound law"
                # (which is adopted by South Korean orthography;
                # <https://en.wikipedia.org/wiki/Dueum_beopchik>),
                # prepend previous terms to the input, and strip them
                # from the output:
                translate(''.join(p) + n, 'substitution')[sum(map(len, p)):]),
            Locale.parse('zh_CN'):
            lambda t, n, _: zip(
                t,
                pinyin_jyutping_sentence.pinyin(n, False, True).split()),
            Locale.parse('zh_HK'):
            lambda t, n, _: zip(
                t,
                pinyin_jyutping_sentence.jyutping(n, True, True).split()),
            Locale.parse('zh_TW'):
            lambda t, n, _: zip(
                t,
                pinyin_jyutping_sentence.pinyin(n, False, True).split()),
        }

    def normalize(self, locale: Locale) -> str:
        try:
            normalizer = self.normalizers[locale]
        except KeyError:
            return self.term
        else:
            return normalizer.convert(self.term)

    def read_as(self, from_: Locale, to: Locale,
                previous_terms: Sequence[Term], word_id: str,
                translation: Translation,
                table: Table) -> Iterable[Tuple[str, Union[str, Markup]]]:
        if from_ == to:
            return zip(self.term, self.read.split())
        same_cls = type(self)
        target_words: Sequence[Word] = translation.get(to, [])
        for target_word in target_words:
            if target_word.id == word_id:
                for target_term in target_word:
                    if target_term.correspond == self.correspond and \
                       isinstance(target_term, same_cls):
                        return zip(self.term, target_term.read.split())
        terms_table: Mapping[str, Term] = table.terms_table.get(to, {})
        term_id = self.normalize(from_)
        correspond = terms_table.get(term_id)
        if isinstance(correspond, same_cls):
            return zip(self.term, correspond.read.split())
        reader = self.readers.get(to)
        term = self.normalize(from_)
        if callable(reader):
            previous = [t.normalize(from_) for t in previous_terms]
            return reader(self.term, term, previous)
        return self.read_as(from_, from_, previous_terms, word_id, translation,
                            table)
Example #22
File: newscrawl.py Project: jjayd/project
            tcnt = 1
            cnt += 1
    lines = f1.readline()
    if not lines:
        break
    word = lines.split()
    cate = word[0]
    cate2 = word[1]
    time = "20" + word[2]
    realtime = datetime(int(time[0:4]), int(time[4:6]), int(time[6:]))
    pivot = datetime(2000, 1, 1)
    timef.write(str((realtime - pivot).days))
    timef.write('\n')

    # timelist.append(realtime)
    url = word[3]
    a = Article(url, language='ko')
    a.download()
    a.parse()
    if not a.text:
        print("\thoho: ", cate, cate2)
        continue
    f = open('./text/news/input' + str(cate) + '-' + str(cate2) + '.txt', 'w')
    #print(a.title)
    #print(a.text)
    title = hanja.translate(a.title, 'substitution')
    f.write(title)
    f.write(".\n")
    text = hanja.translate(a.text, 'substitution')
    f.write(text)
Example #23
File: test_basic.py Project: suminb/hanja
def test_translate_with_invalid_mode():
    with pytest.raises(ValueError):
        hanja.translate("Some text", mode="invalid")
Example #24
print('Tokenization start')
# merge document contents (title + author + body)
mergeString = []
for idx in range(0, maxPage * numOfCnt):
    title = source['title'][idx]
    body = source['body'][idx]
    writer = source['writer'][idx]
    # exception handling for NonType Processing
    if title is None:
        title = ' '
    if body is None:
        body = ' '
    if writer is None:
        writer = ' '

    text = hanja.translate(title, "substitution")
    title = text
    result = title + ' ' + writer + ' ' + body
    mergeString.append(result)


def isHangul(text):
    # Check the Python Version
    pyVer3 = sys.version_info >= (3, 0)

    if pyVer3:  # for Ver 3 or later
        encText = text
    else:  # for Ver 2.x
        if type(text) is not unicode:
            encText = text.decode('utf-8')
        else: