# SQUAD seq2seq dev moses tokenized DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer", "squad_seq2seq_dev_moses_tokenized") coqa_format_test_save_file = os.path.join( DATA_DIR, "squad_seq2seq_dev_moses_test_coqa_format.json") src_squad_seq2seq_predicted_responses_file = os.path.join( DATA_DIR, "src_squad_seq2seq_dev_moses_test.txt") predictions_save_file = "coqa_predictions_on_squad_seq2seq_dev_moses_test.txt" test_data = coqa_reader.read(coqa_format_test_save_file, 'test') evaluator = CoQAEvaluator(coqa_format_test_save_file) best_model_path = os.path.join('models', 'best_weights') bert_dir = 'uncased_L-12_H-768_A-12' bert_data_helper = BertDataHelper(bert_dir) test_data = bert_data_helper.convert(test_data, data='coqa') model = BertCoQA(bert_dir=bert_dir, answer_verification=True) print("loading model") model.load(best_model_path) print("model loaded") my_batch_size = 6 test_batch_generator = BatchGenerator( vocab, test_data, training=False, batch_size=my_batch_size, additional_fields=[ 'input_ids', 'segment_ids', 'input_mask', 'start_position',
# Script chunk: prepare SQuAD v1.1 train/dev data for BERT fine-tuning.
# Relies on `random`, `logging`, `SquadReader`, `SquadEvaluator`,
# `Vocabulary` and `BertDataHelper` being imported earlier (outside this view).
random.seed(1234)  # fixed seed so the train-set shuffle below is reproducible
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# NOTE(review): hard-coded Windows drive paths — machine-specific; consider
# making these configurable.
data_folder = 'E:/dataset/SQuAD1.0/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
random.shuffle(train_data)
eval_data = reader.read(dev_file)
# Official SQuAD evaluator built over the raw dev file.
evaluator = SquadEvaluator(dev_file)
vocab = Vocabulary(do_lowercase=True)
# Pretrained BERT-base uncased checkpoint directory (12 layers, 768 hidden).
bert_dir = 'H:/result/BERT/uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
# Vocabulary is built over both splits; rare words/chars are thresholded out.
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# covert data to bert format
train_data = bert_data_helper.convert(train_data, data='squad')
eval_data = bert_data_helper.convert(eval_data, data='squad')
# NOTE(review): mid-script import — conventionally this belongs at the top of
# the file with the other imports.
from sogou_mrc.data.batch_generator import BatchGenerator
train_batch_generator = BatchGenerator(
    vocab, train_data,
    training=True,  # training mode (e.g. shuffled batches)
    batch_size=2,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        # NOTE(review): source chunk is truncated here — the remainder of
        # `additional_fields` and the rest of the script are outside this view.
# Script chunk: load (or build and save) the vocabulary, then evaluate a
# trained BertCoQA model on the CoQA dev set.
# NOTE(review): this chunk begins inside an if/else whose `if` header is
# outside this view — presumably `if os.path.exists(vocab_filepath):` (or
# similar); the indentation below reconstructs that structure — TODO confirm.
    vocab.load(vocab_filepath)
else:
    print("creating vocab as new")
    # No saved vocab: build it from the train+dev splits and persist it.
    train_data = coqa_reader.read(data_folder + train_filename, 'train')
    eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
    vocab.build_vocab(train_data + eval_data)
    vocab.save(vocab_filepath)
DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer",
                        "squad_seq2seq_train_moses_tokenized")
# Re-read the dev split for evaluation and build the official CoQA evaluator.
val_data = coqa_reader.read(data_folder + eval_filename, 'dev')
evaluator = CoQAEvaluator(data_folder + eval_filename)
best_model_path = os.path.join('models', 'best_weights')
# Pretrained BERT-base uncased checkpoint directory (12 layers, 768 hidden).
bert_dir = 'uncased_L-12_H-768_A-12'
bert_data_helper = BertDataHelper(bert_dir)
# Convert the CoQA-format examples into BERT input features.
val_data = bert_data_helper.convert(val_data, data='coqa')
model = BertCoQA(bert_dir=bert_dir, answer_verification=True)
print("loading model")
model.load(best_model_path)
print("model loaded")
my_batch_size = 6
eval_batch_generator = BatchGenerator(
    vocab, val_data,
    training=False,  # evaluation only: no training-time shuffling/behavior
    batch_size=my_batch_size,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        # NOTE(review): source chunk is truncated here — the remainder of
        # `additional_fields` and the rest of the script are outside this view.