# NOTE(review): whitespace-mangled Python — an entire script tail has been
# collapsed onto one physical line, so this is NOT valid syntax as-is. Also
# note the embedded "# print reader.train_sents[0]" marker: once collapsed,
# everything after that "#" is lexically a comment, so the pickle-dump calls
# that follow it are dead text until the original line breaks are restored.
# The line appears to contain two logical units:
#   (1) the tail of an entity-statistics routine (enclosing def and loop
#       headers are outside this view — do not guess their nesting): entities
#       sharing an end key (ent[1], ent[2]) are grouped in end_dic, then
#       start_dic/end_dic groups with more than one member are tallied into
#       start_num/end_num and a summary line is printed;
#   (2) a script entry point for the GENIA dataset: builds a Reader(), loads
#       pubmed word2vec vectors from config.embed_path, reads
#       ./data/genia/{genia.train,genia.dev,genia.test}, batches with
#       config.batch_size, and pickles train/dev/test batch lists to the
#       config.*_data_path files.
# TODO(review): recover the original indentation before this can run; this is
# one of five divergent copies of the same script tail in this chunk —
# consider deduplicating once restored.
end_dic[(ent[1], ent[2])].append(ent) all_num += 1 for k, v in start_dic.items(): if len(v) > 1: start_num += len(v) for k, v in end_dic.items(): if len(v) > 1: end_num += len(v) print("All {}, start {}, end {}".format(all_num, start_num, end_num)) if __name__ == "__main__": reader = Reader() reader.read_and_gen_vectors_pubmed_word2vec(config.embed_path) reader.read_all_data("./data/genia/", "genia.train", "genia.dev", "genia.test") # print reader.train_sents[0] train_batches, dev_batches, test_batches = reader.to_batch( config.batch_size) f = open(config.train_data_path, 'wb') pickle.dump(train_batches, f) f.close() f = open(config.dev_data_path, 'wb') pickle.dump(dev_batches, f) f.close() f = open(config.test_data_path, 'wb') pickle.dump(test_batches, f) f.close()
# NOTE(review): whitespace-mangled Python — a whole script tail collapsed onto
# one physical line; not valid syntax as-is. The "# print ..." marker midway
# turns the remainder of the collapsed line into a comment, so the pickle
# dumps after it are inert until line breaks are restored.
# Apparent contents:
#   (1) tail of an entity-statistics routine (enclosing scopes not visible —
#       nesting must not be guessed): entities are grouped by start key
#       (ent[0], ent[2]) and end key (ent[1], ent[2]); groups with >1 member
#       feed start_num/end_num, then a summary is printed;
#   (2) ACE2005 entry point: Reader(config.bert_model) — presumably a
#       BERT-based reader, TODO confirm against the Reader definition — reads
#       ./data/ace2005/{ace2005.train,ace2005.dev,ace2005.test}, batches with
#       config.batch_size, and pickles the three batch lists to
#       config.{train,dev,test}_data_path.
# TODO(review): restore original indentation; duplicate of the script tail on
# the neighboring lines with different dataset/Reader wiring.
start_dic[(ent[0], ent[2])].append(ent) end_dic[(ent[1], ent[2])].append(ent) all_num += 1 for k, v in start_dic.items(): if len(v) > 1: start_num += len(v) for k, v in end_dic.items(): if len(v) > 1: end_num += len(v) print("All {}, start {}, end {}".format(all_num, start_num, end_num)) if __name__ == "__main__": reader = Reader(config.bert_model) reader.read_all_data("./data/ace2005/", "ace2005.train", "ace2005.dev", "ace2005.test") # print reader.train_sents[0] train_batches, dev_batches, test_batches = reader.to_batch( config.batch_size) f = open(config.train_data_path, 'wb') pickle.dump(train_batches, f) f.close() f = open(config.dev_data_path, 'wb') pickle.dump(dev_batches, f) f.close() f = open(config.test_data_path, 'wb') pickle.dump(test_batches, f) f.close()
# NOTE(review): whitespace-mangled Python — script tail collapsed onto one
# physical line; not valid syntax as-is, and the inline "# print ..." marker
# comments out everything after it in the collapsed form.
# Apparent contents:
#   (1) tail of an entity-statistics routine (outer def/loops outside this
#       view): end_dic groups entities by (ent[1], ent[2]); start_dic/end_dic
#       groups with more than one entry are summed into start_num/end_num and
#       printed;
#   (2) entry point for a GENIA sample split: Reader() + pubmed word2vec
#       vectors from config.embed_path, data from
#       ./data/genia_sample/{train,dev,test}.data, batched via
#       reader.to_batch(config.batch_size), results pickled to the
#       config.*_data_path files.
# TODO(review): recover indentation; near-duplicate of the other collapsed
# script tails in this chunk (differs only in data paths/filenames).
end_dic[(ent[1], ent[2])].append(ent) all_num += 1 for k, v in start_dic.items(): if len(v) > 1: start_num += len(v) for k, v in end_dic.items(): if len(v) > 1: end_num += len(v) print("All {}, start {}, end {}".format(all_num, start_num, end_num)) if __name__ == "__main__": reader = Reader() reader.read_and_gen_vectors_pubmed_word2vec(config.embed_path) reader.read_all_data("./data/genia_sample/", "train.data", "dev.data", "test.data") # print reader.train_sents[0] train_batches, dev_batches, test_batches = reader.to_batch(config.batch_size) f = open(config.train_data_path, 'wb') pickle.dump(train_batches, f) f.close() f = open(config.dev_data_path, 'wb') pickle.dump(dev_batches, f) f.close() f = open(config.test_data_path, 'wb') pickle.dump(test_batches, f) f.close()
# NOTE(review): whitespace-mangled Python — script tail collapsed onto one
# physical line; invalid syntax as-is, and the inline "# print ..." marker
# lexically comments out the rest of the collapsed line (the to_batch call
# and all pickle dumps) until line breaks are restored.
# Apparent contents:
#   (1) tail of an entity-statistics routine (enclosing scopes not visible):
#       end_dic groups with >1 member are tallied into end_num and the
#       all/start/end counts are printed;
#   (2) PolEval entry point: loads a SentencePiece BPE tokenizer from
#       tokenization/polish-roberta-large/ (vocab.json + merges.txt), installs
#       a RobertaProcessing post-processor (sep "</s>" id 2, cls "<s>" id 0 —
#       presumably matching the Polish RoBERTa vocab, TODO confirm), builds
#       Reader("polish", tokenizer, cls="<s>", sep="</s>", threshold=8), reads
#       ./data/poleval/{poleval.train,poleval.dev,poleval.test}, batches with
#       config.batch_size, and pickles the batch lists to the
#       config.*_data_path files.
# TODO(review): recover indentation; another divergent copy of the shared
# script tail in this chunk.
for k, v in end_dic.items(): if len(v) > 1: end_num += len(v) print("All {}, start {}, end {}".format(all_num, start_num, end_num)) if __name__ == "__main__": tokenizer_dir = "tokenization/polish-roberta-large/" tokenizer = SentencePieceBPETokenizer(f"{tokenizer_dir}/vocab.json", f"{tokenizer_dir}/merges.txt") getattr(tokenizer, "_tokenizer").post_processor = RobertaProcessing(sep=("</s>", 2), cls=("<s>", 0)) reader = Reader("polish", tokenizer, cls="<s>", sep="</s>", threshold=8) reader.read_all_data("./data/poleval/", "poleval.train", "poleval.dev", "poleval.test") # print reader.train_sents[0] train_batches, dev_batches, test_batches = reader.to_batch( config.batch_size) f = open(config.train_data_path, 'wb') pickle.dump(train_batches, f) f.close() f = open(config.dev_data_path, 'wb') pickle.dump(dev_batches, f) f.close() f = open(config.test_data_path, 'wb') pickle.dump(test_batches, f) f.close()
# NOTE(review): whitespace-mangled Python — script tail collapsed onto one
# physical line; invalid syntax as-is. The inline "# print ..." marker turns
# the remainder of the collapsed line into a comment, so the batching and
# pickle-dump calls after it are dead text until line breaks are restored.
# Apparent contents:
#   (1) tail of an entity-statistics routine (enclosing def/loops outside this
#       view — nesting must not be guessed): entities grouped by start key
#       (ent[0], ent[2]) and end key (ent[1], ent[2]); multi-member groups
#       feed start_num/end_num, then a summary line is printed;
#   (2) config-driven entry point: Reader(config) with no-argument
#       read_all_data()/to_batch() — presumably this Reader variant pulls
#       paths and batch size from config, TODO confirm — then pickles the
#       three batch lists to config.data_path + "_train.pkl"/"_dev.pkl"/
#       "_test.pkl" (unlike the sibling variants, which use separate
#       config.*_data_path attributes).
# TODO(review): recover indentation; fifth divergent copy of the shared
# script tail in this chunk.
start_dic[(ent[0], ent[2])].append(ent) end_dic[(ent[1], ent[2])].append(ent) all_num += 1 for k, v in start_dic.items(): if len(v) > 1: start_num += len(v) for k, v in end_dic.items(): if len(v) > 1: end_num += len(v) print("All {}, start {}, end {}".format(all_num, start_num, end_num)) if __name__ == "__main__": reader = Reader(config) reader.read_all_data() # print reader.train_sents[0] train_batches, dev_batches, test_batches = reader.to_batch() f = open(config.data_path + "_train.pkl", 'wb') pickle.dump(train_batches, f) f.close() f = open(config.data_path + "_dev.pkl", 'wb') pickle.dump(dev_batches, f) f.close() f = open(config.data_path + "_test.pkl", 'wb') pickle.dump(test_batches, f) f.close()