def ws(filename, convert2zh=False):
    """Segment and POS-tag a tab-separated text file, writing ``<file>.seg.sc``.

    The input is read from ``REPO_DIR/filename`` (UTF-8). For every line, the
    text after the first tab is taken, curly double quotes are removed, the
    result is passed through ``clean`` and split into runs of CJK ideographs /
    ASCII letters / digits. Each run is analysed with ``analyzer.analyze`` and
    every resulting term (rendered as ``word/pos``) is emitted as a lower-cased
    ``word_pos`` token. Runs are separated by a ``,_,`` token and every output
    line is terminated by a ``。_。`` token.

    :param filename: Name of the input file, relative to ``REPO_DIR``.
    :param convert2zh: When true, each word is passed through
        ``HanLP.convertToTraditionalChinese`` before being written.
    :raises IndexError: If an input line contains no tab character.
    """
    if not os.path.exists(REPO_DIR):
        os.makedirs(REPO_DIR)
    source_path = os.path.join(REPO_DIR, filename)
    regex = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]+')
    # Both handles live in one ``with`` so the output file is flushed and
    # closed even when processing a line raises. The output is still opened
    # first, as before.
    with codecs.open(source_path + '.seg.sc', 'w', encoding='utf-8') as fw, \
            codecs.open(source_path, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.split('\t', 1)[1].strip().replace('“', '').replace('”', '')
            line = clean(line)
            tokens = []
            for span in regex.findall(line.strip()):
                result = analyzer.analyze(span)
                for term in result.toSimpleWordList():
                    field = term.toString().split('/')
                    word = field[0]
                    if convert2zh:
                        word = HanLP.convertToTraditionalChinese(word)
                    tokens.append(word.lower() + '_' + field[1])
                # Separator between consecutive spans.
                tokens.append(',_,')
            # The trailing span separator (if any) becomes the sentence
            # terminator; a line with no spans yields only the terminator.
            if tokens:
                tokens[-1] = '。_。'
            else:
                tokens.append('。_。')
            fw.write(' '.join(tokens) + '\n')
def convertToTraditionalChinese(simplifiedChineseString):
    """Convert Simplified Chinese text to Traditional Chinese.

    Delegates to ``HanLP.convertToTraditionalChinese``.

    :param simplifiedChineseString: Simplified Chinese text.
    :return: The Traditional Chinese text.
    """
    traditional = HanLP.convertToTraditionalChinese(simplifiedChineseString)
    return traditional
def ws(line, convert2zh=False):
    """Segment one line of text into a list of lower-cased word tokens.

    Curly double quotes are removed and the text is passed through ``clean``.
    It is then split into runs of CJK ideographs / ASCII letters / digits, and
    each run is analysed with ``analyzer.analyze``. Runs are separated by a
    ``,`` token and the result always ends with a ``。`` token.

    :param line: The raw text to segment.
    :param convert2zh: When true, each word is passed through
        ``HanLP.convertToTraditionalChinese`` before being lower-cased.
    :return: List of token strings.
    """
    pattern = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]+')
    unquoted = line.strip().replace('“', '').replace('”', '')
    cleaned = clean(unquoted)

    pieces = []
    for chunk in pattern.findall(cleaned.strip()):
        analysis = analyzer.analyze(chunk)
        for term in analysis.toSimpleWordList():
            head = term.toString().split('/')[0]
            if convert2zh:
                head = HanLP.convertToTraditionalChinese(head)
            pieces.append(head.lower())
        # Separator between consecutive chunks.
        pieces.append(',')

    # Every piece is followed by one space; the text after the last comma is
    # dropped and replaced by the sentence terminator before tokenising.
    joined = ''.join(piece + ' ' for piece in pieces)
    sentence = joined.rsplit(',', 1)[0] + '。'
    return sentence.split()
def zh_traditional_standard(line, server_model):
    """Return ``line`` passed through ``HanLP.convertToTraditionalChinese``.

    :param line: The text to convert.
    :param server_model: Accepted but not used by this function.
    :return: The value returned by ``HanLP.convertToTraditionalChinese``.
    """
    converted = HanLP.convertToTraditionalChinese(line)
    return converted