def cut_td_idf(sources_path, target_path):
    """Clean a corpus, segment it with jieba and write the tokenised text.

    Every item returned by ``txtRead(sources_path)`` is cleaned with
    ``clear_sentence``, tokenised with ``jieba.lcut`` and stored as tokens
    separated by exactly one space, terminated by a newline. The resulting
    list is handed to ``txtWrite``.

    :param sources_path: str, path of the raw corpus passed to ``txtRead``
    :param target_path: str, path passed to ``txtWrite`` for the output
    :return: None
    """
    print("cut_td_idf start! ")
    segmented_lines = []
    for raw_line in txtRead(sources_path):
        # Reuse the shared cleaning pipeline instead of duplicating it here.
        cleaned = clear_sentence(raw_line).strip()
        joined = ' '.join(jieba.lcut(cleaned))
        # A token that is itself a space turns into a run of spaces once the
        # tokens are joined. Dropping the empty pieces collapses a run of any
        # length to a single separator, which the previous chained
        # ``replace`` calls did not do.
        collapsed = ' '.join(piece for piece in joined.split(' ') if piece)
        segmented_lines.append(collapsed.strip() + '\n')
    txtWrite(segmented_lines, target_path)
    print("cut_td_idf ok! " + sources_path)
def clear_sentence(sentence):
    """Clean one sentence, including full-width to half-width conversion.

    The input is converted with ``str()``, all space characters are removed,
    surrounding whitespace is trimmed, and the text is then passed through
    ``strQ2B`` followed by ``get_syboml``.

    :param sentence: str, input sentence
    :return: str, cleaned sentence (the value returned by ``get_syboml``)
    """
    text = str(sentence)
    without_spaces = text.replace(' ', '')
    # Trimming once is enough: stripping an already stripped string is a no-op.
    trimmed = without_spaces.strip()
    half_width = strQ2B(trimmed)
    return get_syboml(half_width)
def cut_td_idf_pinyin(sources_path, target_path):
    """Convert a Chinese corpus to pinyin and write the result.

    Every item returned by ``txtRead(sources_path)`` is cleaned with
    ``clear_sentence``, stripped of any remaining spaces and converted with
    ``xpinyin.Pinyin.get_pinyin`` using a single space as the splitter.
    Each converted item gets a trailing newline and the list is handed to
    ``txtWrite``.

    :param sources_path: str, path of the raw corpus passed to ``txtRead``
    :param target_path: str, path passed to ``txtWrite`` for the output
    :return: None
    """
    converter = xpinyin.Pinyin()
    pinyin_lines = []
    for raw_line in txtRead(sources_path):
        # Reuse the shared cleaning pipeline instead of duplicating it here.
        cleaned = clear_sentence(raw_line)
        # Remove spaces still present after cleaning, so they are not passed
        # on to get_pinyin alongside the splitter.
        compact = cleaned.replace(' ', '').strip()
        pinyin_lines.append(converter.get_pinyin(compact, ' ') + '\n')
    txtWrite(pinyin_lines, target_path)
    print("cut_td_idf_pinyin ok! " + sources_path)