def make_biake_corpora(baike_input_file_path, output_file_path):
    """Build a word-id map from a line-oriented input file and save it as text.

    Each line of the input file is stripped and wrapped in a ``Document``.
    The document is split into sentences and then into words, using a fresh
    ``WhiteSpaceSegmenter`` for each step, and is added to a single
    ``WordIdMap``. After all lines are consumed, the map is written with
    ``WordIdMap.save_as_text``.

    Args:
        baike_input_file_path: Path of the input file, read line by line.
        output_file_path: Path passed to ``WordIdMap.save_as_text``.
    """
    vocabulary = WordIdMap()

    with open(baike_input_file_path, 'r') as source_file:
        for raw_line in source_file:
            doc = Document(raw_line.strip())

            sentence_splitter = WhiteSpaceSegmenter()
            doc.split_sentences(sentence_splitter)

            # NOTE(review): the method name is spelled `segement_words` at
            # this call site; Document is defined elsewhere, so it is kept.
            word_splitter = WhiteSpaceSegmenter()
            doc.segement_words(word_splitter)

            vocabulary.add_document(doc)

    vocabulary.save_as_text(output_file_path)
def make_sentence_corpora(baike_input_file_path, output_file_path):
    """Write one tab-separated line per sentence found in the input file.

    Each line of the UTF-8 input file is stripped and wrapped in a
    ``Document``, which is split into sentences with a ``SentenceSegmenter``.
    For every resulting sentence a line of the form
    ``<document.id>\\t<sentence.id>\\t<sentence.content>\\n`` is written to
    the UTF-8 output file.

    Args:
        baike_input_file_path: Path of the UTF-8 input file, read line by line.
        output_file_path: Path of the UTF-8 output file; it is created or
            truncated.

    Raises:
        IOError/OSError: If either file cannot be opened.
    """
    # The input is opened first so that a missing or unreadable input file
    # does not create or truncate the output file.
    with codecs.open(baike_input_file_path, 'r', 'utf-8') as source_file:
        # The output must be opened for writing ('w'); opening it with 'r'
        # fails when the file is absent and fails on the first write otherwise.
        # The `with` block also guarantees the handle is flushed and closed.
        with codecs.open(output_file_path, 'w', 'utf-8') as output:
            for line in source_file:
                document = Document(line.strip())
                document.split_sentences(SentenceSegmenter())
                for sentence in document.sentences:
                    output.write(
                        str(document.id) + "\t"
                        + str(sentence.id) + "\t"
                        + sentence.content + "\n"
                    )