# NOTE(review): everything down to the print() call is the tail of a
# definition whose header lies before this chunk. The first statement
# presumably sits under a loop/condition that is not visible here -- restore
# the enclosing indentation when merging.
start_num += len(v)
for k, v in end_dic.items():
    # Only keys that map to more than one entry add to the end count.
    if len(v) > 1:
        end_num += len(v)
print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
    # Script entry point: read the embeddings and the three GENIA files,
    # batch them, pickle each split, then print per-split statistics.
    reader = Reader()
    reader.read_and_gen_vectors_pubmed_word2vec(config.embed_path)
    reader.read_all_data(
        "./data/genia/", "genia.train", "genia.dev", "genia.test")

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch(
        config.batch_size)

    # `with` guarantees each output file is closed even if pickling fails
    # part-way through (the manual open/close pairs leaked the handle).
    outputs = (
        (config.train_data_path, train_batches),
        (config.dev_data_path, dev_batches),
        (config.test_data_path, test_batches),
    )
    for out_path, batches in outputs:
        with open(out_path, 'wb') as out_file:
            pickle.dump(batches, out_file)

    for batches in (train_batches, dev_batches, test_batches):
        batch_stat(batches)
# NOTE(review): everything down to the print() call is the tail of a
# definition whose header lies before this chunk -- restore the enclosing
# indentation when merging.
for k, v in start_dic.items():
    # Only keys that map to more than one entry add to the start count.
    if len(v) > 1:
        start_num += len(v)
for k, v in end_dic.items():
    # Same rule for the end count.
    if len(v) > 1:
        end_num += len(v)
print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
    # Script entry point: build a Reader from `config`, read the data,
    # batch it, and pickle each split next to `config.data_path`.
    reader = Reader(config)
    reader.read_all_data()

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch()

    # `with` guarantees each output file is closed even if pickling fails
    # part-way through (the manual open/close pairs leaked the handle).
    outputs = (
        ("_train.pkl", train_batches),
        ("_dev.pkl", dev_batches),
        ("_test.pkl", test_batches),
    )
    for suffix, batches in outputs:
        with open(config.data_path + suffix, 'wb') as out_file:
            pickle.dump(batches, out_file)

    # Statistics reporting is left disabled, as in the original script.
    # batch_stat(train_batches)
    # batch_stat(dev_batches)
    # batch_stat(test_batches)