Пример #1
0
def cut_td_idf(sources_path, target_path):
    """
    Segment every line of a text file with jieba (Chinese word segmentation)
    and write the space-separated tokens to another file.

    Each input line has its ASCII spaces removed and its ends stripped, then
    goes through ``strQ2B`` and ``get_syboml``. Both helpers are defined
    elsewhere; presumably they do full-width to half-width conversion and
    symbol handling -- confirm against their definitions.

    :param sources_path: path handed to ``txtRead`` to load the input lines
    :param target_path: path handed to ``txtWrite`` for the segmented lines
    :return: None
    """
    print("cut_td_idf start! ")

    # Pass 1: normalise every raw line before any segmentation happens.
    cleaned_lines = [
        get_syboml(strQ2B(raw_line.replace(' ', '').strip())).strip()
        for raw_line in txtRead(sources_path)
    ]

    # Pass 2: tokenise with jieba and join the tokens with single spaces.
    segmented_lines = [' '.join(jieba.lcut(text)) for text in cleaned_lines]

    # Pass 3: replace triple spaces, then double spaces, with one space
    # (a single replace pass each), trim the ends and terminate the line.
    output_lines = []
    for segmented in segmented_lines:
        squeezed = segmented.replace('   ', ' ')
        squeezed = squeezed.replace('  ', ' ')
        output_lines.append(squeezed.strip() + '\n')

    txtWrite(output_lines, target_path)
    print("cut_td_idf ok! " + sources_path)
def clear_sentence(sentence):
    """
    Clean a single sentence (data cleaning, full-width to half-width).

    The input is coerced with ``str()``, its ASCII spaces are removed and its
    ends stripped. The result is then passed through ``strQ2B`` and
    ``get_syboml``, both defined elsewhere; presumably ``strQ2B`` performs
    the full-width to half-width conversion -- confirm against its
    definition.

    :param sentence: input sentence; converted with ``str()`` before cleaning
    :return: the value returned by ``get_syboml`` for the cleaned sentence
    """
    stripped = str(sentence).replace(' ', '').strip()
    return get_syboml(strQ2B(stripped))
Пример #3
0
def cut_td_idf_pinyin(sources_path, target_path):  # convert to pinyin
    """
    Convert every line of a Chinese text file to pinyin and write the result
    to another file, one converted line per input line.

    Each input line has its ASCII spaces removed and its ends stripped, then
    goes through ``strQ2B`` and ``get_syboml``. Both helpers are defined
    elsewhere; presumably they do full-width to half-width conversion and
    symbol handling -- confirm against their definitions. The cleaned text is
    converted with ``xpinyin.Pinyin.get_pinyin``.

    :param sources_path: path handed to ``txtRead`` to load the input lines
    :param target_path: path handed to ``txtWrite`` for the pinyin lines
    :return: None
    """
    pin = xpinyin.Pinyin()
    pinyin_lines = []
    for raw_line in txtRead(sources_path):
        cleaned = get_syboml(strQ2B(raw_line.replace(' ', '').strip()))
        # Remove triple spaces, then double spaces (one replace pass each),
        # that the helpers may have introduced, then trim the ends.
        compact = cleaned.replace('   ', '').replace('  ', '').strip()
        # ' ' is the second positional argument of get_pinyin, i.e. the
        # separator placed between syllables in xpinyin's API.
        pinyin_lines.append(pin.get_pinyin(compact, ' ') + '\n')
    txtWrite(pinyin_lines, target_path)
    print("cut_td_idf_pinyin ok! " + sources_path)