def ws(filename, convert2zh=False):
    """Segment and POS-tag a tab-separated file, writing ``<filename>.seg.sc``.

    The input is read from ``REPO_DIR/filename`` as UTF-8. For every input
    line, the text after the first tab is stripped, has the curly quotes
    removed, is passed through ``clean`` and is then cut into runs of CJK
    ideographs / ASCII letters / digits. Each run is analysed by the
    module-level ``analyzer``; every resulting term is written as
    ``word_pos`` (word lower-cased) followed by a space. Runs are separated
    by ``,_, `` and every output line ends with ``。_。``.

    :param filename: name of the input file inside ``REPO_DIR``.
    :param convert2zh: when true, each word is passed through
        ``HanLP.convertToTraditionalChinese`` before being written.
    :raises IndexError: if an input line contains no tab character, or a
        term's string form contains no ``/`` separator.
    """
    # exist_ok avoids the race between a separate existence check and creation.
    os.makedirs(REPO_DIR, exist_ok=True)

    path = os.path.join(REPO_DIR, filename)
    regex = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]+')

    # Both handles are context-managed so the output file is flushed and
    # closed even when a malformed line or the analyzer raises mid-way.
    with codecs.open(path, 'r', encoding='utf-8') as fr, \
            codecs.open(path + '.seg.sc', 'w', encoding='utf-8') as fw:
        for line in fr:
            # Keep only the part after the first tab (the text column).
            text = line.split('\t', 1)[1].strip().replace('“', '').replace('”', '')
            text = clean(text)

            chunks = []
            for span in regex.findall(text.strip()):
                tagged = []
                for term in analyzer.analyze(span).toSimpleWordList():
                    # The term's string form is split on '/' into word and tag.
                    field = term.toString().split('/')
                    word = field[0]
                    if convert2zh:
                        word = HanLP.convertToTraditionalChinese(word)
                    tagged.append(word.lower() + '_' + field[1] + ' ')
                chunks.append(''.join(tagged))

            # Joining with the separator gives the same text as appending
            # ',_, ' after every run and trimming the last one afterwards.
            fw.write(',_, '.join(chunks) + '。_。\n')
Пример #2
0
def convertToTraditionalChinese(simplifiedChineseString):
    """Convert Simplified Chinese text to Traditional Chinese.

    Thin wrapper that delegates to ``HanLP.convertToTraditionalChinese``.

    :param simplifiedChineseString: the Simplified Chinese text.
    :return: the Traditional Chinese text produced by HanLP.
    """
    traditional = HanLP.convertToTraditionalChinese(simplifiedChineseString)
    return traditional
def ws(line, convert2zh=False):
    """Segment a single line of text into a list of lower-cased tokens.

    The line is stripped, has the curly quotes removed, is passed through
    ``clean`` and is cut into runs of CJK ideographs / ASCII letters /
    digits. Each run is analysed by the module-level ``analyzer`` and the
    word part of every term (the text before ``/`` in its string form) is
    kept. Runs are separated by a ``,`` token and the result always ends
    with ``。``.

    :param line: the raw input text.
    :param convert2zh: when true, each word is passed through
        ``HanLP.convertToTraditionalChinese`` before lower-casing.
    :return: list of tokens obtained by whitespace-splitting the assembled
        sequence.
    """
    pattern = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]+')

    text = line.strip()
    for quote in ('“', '”'):
        text = text.replace(quote, '')
    text = clean(text)

    pieces = []
    for span in pattern.findall(text.strip()):
        words = []
        for term in analyzer.analyze(span).toSimpleWordList():
            head = term.toString().split('/')[0]
            if convert2zh:
                head = HanLP.convertToTraditionalChinese(head)
            words.append(head.lower() + " ")
        pieces.append(''.join(words))

    # Same text as appending ', ' after every run and cutting at the last comma.
    return (', '.join(pieces) + '。').split()
Пример #4
0
def zh_traditional_standard(line, server_model):
    """Return ``line`` converted by ``HanLP.convertToTraditionalChinese``.

    :param line: the text to convert.
    :param server_model: accepted but not used by this function.
    :return: the value returned by ``HanLP.convertToTraditionalChinese``.
    """
    converted = HanLP.convertToTraditionalChinese(line)
    return converted