def text_preprocess(self, sentence):
    """Convert the Chinese script of ``sentence`` and optionally strip punctuation.

    The conversion is selected by ``FLAGS.simple``:

    * ``'true'``  -- convert to Simplified Chinese (``zh-hans``).
    * ``'false'`` -- convert to Traditional Chinese (``zh-hant``).
    * ``'mix'``   -- return the Simplified form immediately followed by the
      Traditional form (the two conversions are concatenated).
    * any other value -- leave the text unconverted.

    When ``FLAGS.filter_punct`` is truthy, every match of the punctuation
    pattern is removed from the result. Besides individual ASCII and CJK
    punctuation marks, the pattern also removes whole 【...】, 《...》 and
    #...# spans together with their contents.

    :param sentence: text to process; UTF-8 encoded bytes are decoded first,
        already-decoded text is used as is.
    :return: the converted text, with punctuation removed when filtering is
        enabled.
    """
    # Raw string: same pattern value as before, without relying on the
    # invalid string escape "\," that newer interpreters warn about.
    punctu_reg = r'''\【.*?】+|\《.*?》+|\#.*?#+|[.!/_,$&%^*()<>+""'?@|:~{}#]+|[——!\\,。=?、:“”‘’¥……()《》【】]'''

    # Decode once, and only when given bytes; text has no usable ``decode``.
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf8')

    mode = FLAGS.simple
    if mode == 'true':
        sentence = Converter('zh-hans').convert(sentence)
    elif mode == 'false':
        sentence = Converter('zh-hant').convert(sentence)
    elif mode == 'mix':
        # Simplified form first, Traditional form appended directly after it.
        sentence = (Converter('zh-hans').convert(sentence)
                    + Converter('zh-hant').convert(sentence))

    # Filter punctuation.
    if FLAGS.filter_punct:
        return re.sub(punctu_reg, '', sentence)
    return sentence
def cn2zh(line):
    """Convert ``line`` to Traditional Chinese (``zh-hant``).

    :param line: text to convert; UTF-8 encoded bytes are decoded first,
        already-decoded text is used as is.
    :return: the result of ``Converter('zh-hant').convert`` for the text.
    """
    # Kept as a function-local import, as in the original.
    from langconv import Converter

    # Simplified -> Traditional. Decode only when given bytes; an
    # encode/decode round trip on already-decoded text is a no-op.
    if isinstance(line, bytes):
        line = line.decode('utf-8')
    return Converter('zh-hant').convert(line)