Exemplo n.º 1
0
 def text_preprocess(self, sentence):
     """Normalize the Chinese script of ``sentence`` and optionally strip punctuation.

     The script conversion is selected by ``FLAGS.simple``:

     * ``'true'``  -- convert to Simplified Chinese (``zh-hans``).
     * ``'false'`` -- convert to Traditional Chinese (``zh-hant``).
     * ``'mix'``   -- return the Simplified result immediately followed by
       the Traditional result, concatenated into one string.
     * any other value -- leave the script untouched.

     When ``FLAGS.filter_punct`` is truthy, spans wrapped in 【…】, 《…》 or
     #…# (including their content) and the ASCII/CJK punctuation characters
     listed in the pattern below are removed from the result.

     :param sentence: sentence to process, either UTF-8 encoded bytes or
         already-decoded text.
     :return: the processed sentence as decoded text.
     """
     # Raw string so the backslashes reach the regex engine unchanged and no
     # invalid string escape (previously ``\,``) is present in the literal.
     punctu_reg = r'''\【.*?】+|\《.*?》+|\#.*?#+|[.!/_,$&%^*()<>+""'?@|:~{}#]+|[——!\\,。=?、:“”‘’¥……()《》【】]'''

     # Decode once, and only when given bytes: already-decoded text must
     # pass through as-is instead of failing on a second decode.
     if isinstance(sentence, bytes):
         sentence = sentence.decode('utf8')

     if FLAGS.simple == 'true':
         sentence = Converter('zh-hans').convert(sentence)
     elif FLAGS.simple == 'false':
         sentence = Converter('zh-hant').convert(sentence)
     elif FLAGS.simple == 'mix':
         sentence = (Converter('zh-hans').convert(sentence)
                     + Converter('zh-hant').convert(sentence))

     # Filter punctuation.
     if FLAGS.filter_punct:
         return re.sub(punctu_reg, '', sentence)
     return sentence
Exemplo n.º 2
0
def cn2zh(line):
    """Convert ``line`` from Simplified to Traditional Chinese.

    :param line: text to convert, either already-decoded text or UTF-8
        encoded bytes (bytes are decoded before conversion).
    :return: the value returned by ``Converter('zh-hant').convert`` for the
        decoded text.
    """
    from langconv import Converter

    # Simplified -> Traditional. Only bytes need decoding; encoding text to
    # UTF-8 just to decode it again is a no-op round trip and breaks on
    # bytes input.
    if isinstance(line, bytes):
        line = line.decode('utf-8')
    return Converter('zh-hant').convert(line)