Пример #1
0
def main():
    args = parse()
    pwd_path = os.path.abspath(os.path.dirname(__file__))
    file_path = os.path.join(pwd_path, "/" + args.input_file)

    if not os.path.exists(file_path):
        default_logger = get_logger(__file__)
        default_logger.debug("file not exists:", file_path)

    file_in = codecs.open(args.input_file, 'rb', encoding = 'utf-8').readlines()
    file_ou = codecs.open(args.output_file, 'w', encoding = 'utf-8')

    if args.effect:
        PUNCTUATION_LIST = "。,,、?:;{}[]【】“‘’”《》/!%……()<>@#$~^¥%&*\"\'=+-"
        for line in tqdm(file_in):
            line = line.strip()
            if not is_chinese(line[0]) and line[0] not in {'“', '‘', '{', '[', '【', '(', '<', '《'}:
                continue
            if line[-1] not in {'。', '?', '”', '!', '……', '’', ')'}:
                if is_chinese(line[-1]):
                    line += '。'
                else:
                    continue
            if len(line) < 5:
                continue
            if False not in [(char in PUNCTUATION_LIST or is_chinese(char)) for char in line]:
                line = traditional2simplified(line)
                file_ou.write(line + '\n')
        file_ou.close()
    else:
        for line in tqdm(file_in):
            line = line.strip()
            line = traditional2simplified(line)
            file_ou.write(line + '\n')
        file_ou.close()
Пример #2
0
def load_same_pinyin(path, sep='\t'):
    """
    加载同音字
    :param path:
    :return:
    """
    result = dict()
    if not os.path.exists(path):
        default_logger.debug("file not exists:", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.split(sep)
            if parts and len(parts) > 2:
                key_char = parts[0]
                # same_pron_same_tone = set(list(parts[1]))
                # same_pron_diff_tone = set(list(parts[2]))
                # value = same_pron_same_tone.union(same_pron_diff_tone)
                value = set(list("".join(parts)))
                if len(key_char) > 1 or not value:
                    continue
                result[key_char] = value

    # these pairs would be dealed with rule
    result['他'] -= {'她', '它'}
    result['她'] -= {'他', '它'}
    result['它'] -= {'她', '他'}
    result['影'] -= {'音'}
    result['车'] = result['扯']

    return result
Пример #3
0
def load_same_stroke(path, sep=','):
    """
    加载形似字
    :param path:
    :param sep:
    :return:
    """
    result = dict()
    if not os.path.exists(path):
        default_logger.debug("file not exists:", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.strip().split(sep)
            if parts and len(parts) > 1:
                for i, c in enumerate(parts):
                    result[c] = set(list(parts[:i] + parts[i + 1:]))
    return result
Пример #4
0
def load_same_stroke(path, sep=','):
    """
    加载形似字
    :param path:
    :param sep:
    :return:
    """
    result = dict()
    if not os.path.exists(path):
        default_logger.debug("file not exists:", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.strip().split(sep)
            if parts and len(parts) > 1:
                for i, c in enumerate(parts):
                    result[c] = set(list(parts[:i] + parts[i + 1:]))
    return result
Пример #5
0
def load_same_pinyin(path, sep='\t'):
    """
    加载同音字
    :param path:
    :return:
    """
    result = dict()
    if not os.path.exists(path):
        default_logger.debug("file not exists:", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.split(sep)
            if parts and len(parts) > 2:
                key_char = parts[0]
                same_pron_same_tone = set(list(parts[1]))
                same_pron_diff_tone = set(list(parts[2]))
                value = same_pron_same_tone.union(same_pron_diff_tone)
                if len(key_char) > 1 or not value:
                    continue
                result[key_char] = value
    return result
Пример #6
0
def load_same_pinyin(path, sep='\t'):
    """
    加载同音字
    :param path:
    :return:
    """
    result = dict()
    if not os.path.exists(path):
        default_logger.debug("file not exists:", path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.split(sep)
            if parts and len(parts) > 2:
                key_char = parts[0]
                same_pron_same_tone = set(list(parts[1]))
                same_pron_diff_tone = set(list(parts[2]))
                value = same_pron_same_tone.union(same_pron_diff_tone)
                if len(key_char) > 1 or not value:
                    continue
                result[key_char] = value
    return result
Пример #7
0
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief:
import enchant
from pypinyin import lazy_pinyin

from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional
from pycorrector.utils.text_utils import tokenize, get_homophones_by_char, get_homophones_by_pinyin

traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # 不带音调

print(tokenize('小姑娘蹦蹦跳跳的去了她外公家'))

# 判断拼音还是英文
en_dict = enchant.Dict("en_US")
print(en_dict.check("hello"))
print(en_dict.check("hello boy what is your name"))
strs = "hello boy what is your name"
flag = False
for word in strs:
    if en_dict.check(word):
        flag = True
    else: