# SQuAD seq2seq dev, Moses-tokenized data
# Assumes `coqa_reader`, `vocab`, and the sogou_mrc classes used below
# (CoQAEvaluator, BertDataHelper, BertCoQA, BatchGenerator) were set up earlier.
import os

DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer",
                        "squad_seq2seq_dev_moses_tokenized")
coqa_format_test_save_file = os.path.join(
    DATA_DIR, "squad_seq2seq_dev_moses_test_coqa_format.json")
src_squad_seq2seq_predicted_responses_file = os.path.join(
    DATA_DIR, "src_squad_seq2seq_dev_moses_test.txt")
predictions_save_file = "coqa_predictions_on_squad_seq2seq_dev_moses_test.txt"

test_data = coqa_reader.read(coqa_format_test_save_file, 'test')
evaluator = CoQAEvaluator(coqa_format_test_save_file)

best_model_path = os.path.join('models', 'best_weights')
bert_dir = 'uncased_L-12_H-768_A-12'
bert_data_helper = BertDataHelper(bert_dir)
test_data = bert_data_helper.convert(test_data, data='coqa')

model = BertCoQA(bert_dir=bert_dir, answer_verification=True)
print("loading model")
model.load(best_model_path)
print("model loaded")

my_batch_size = 6
test_batch_generator = BatchGenerator(
    vocab,
    test_data,
    training=False,
    batch_size=my_batch_size,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        # remaining fields (e.g. 'end_position') were truncated in the source
    ])
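
# The example stops after building the batch generator. As a sketch of the likely
# next steps, the lines below score the loaded BertCoQA model on the test split
# and write one predicted answer per line to the predictions file named above.
# `model.evaluate(...)` and collecting answers into a plain list are assumptions,
# not APIs confirmed by this snippet.
model.evaluate(test_batch_generator, evaluator)  # assumed evaluation entry point
predictions = []  # hypothetical: the model's predicted answer strings go here
with open(os.path.join(DATA_DIR, predictions_save_file), "w", encoding="utf-8") as out_file:
    for answer in predictions:
        out_file.write(answer + "\n")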

Example #2

import logging
import random

from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.libraries.BertWrapper import BertDataHelper  # assumed module path

random.seed(1234)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
data_folder = 'E:/dataset/SQuAD1.0/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
random.shuffle(train_data)

eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)
vocab = Vocabulary(do_lowercase=True)
bert_dir = 'H:/result/BERT/uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
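
# As an alternative to the unconditional build above, the vocabulary could be
# cached the way the CoQA example later in this listing does with
# vocab.save()/vocab.load(). The cache path here is illustrative, not from the
# source.
import os
vocab_filepath = data_folder + "squad_vocab_cache"  # hypothetical cache location
if os.path.exists(vocab_filepath):
    vocab.load(vocab_filepath)
else:
    vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
    vocab.save(vocab_filepath)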

# convert data to BERT format
train_data = bert_data_helper.convert(train_data, data='squad')
eval_data = bert_data_helper.convert(eval_data, data='squad')

from sogou_mrc.data.batch_generator import BatchGenerator

train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       training=True,
                                       batch_size=2,
                                       additional_fields=[
                                           'input_ids', 'segment_ids',
                                           'input_mask', 'start_position',
                                           # remaining fields (e.g. 'end_position')
                                           # were truncated in the source
                                       ])
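
# The source shows only the training generator. A dev-set generator would
# presumably mirror it with training=False, as the other BatchGenerator calls in
# this listing do; the field list is abbreviated here just like the one above.
eval_batch_generator = BatchGenerator(vocab,
                                      eval_data,
                                      training=False,
                                      batch_size=2,
                                      additional_fields=[
                                          'input_ids', 'segment_ids',
                                          'input_mask', 'start_position',
                                      ])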

Example #3

# The start of this example was truncated in the source; it assumes `os`,
# `coqa_reader`, `vocab`, `data_folder`, `train_filename`, `eval_filename`, and
# `vocab_filepath` are already defined. The `if` guard below is a reconstruction:
# load a previously saved vocabulary when one exists, otherwise build and cache it.
if os.path.exists(vocab_filepath):
    vocab.load(vocab_filepath)
else:
    print("creating vocab as new")
    train_data = coqa_reader.read(data_folder + train_filename, 'train')
    eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
    vocab.build_vocab(train_data + eval_data)
    vocab.save(vocab_filepath)

DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer",
                        "squad_seq2seq_train_moses_tokenized")
val_data = coqa_reader.read(data_folder + eval_filename, 'dev')
evaluator = CoQAEvaluator(data_folder + eval_filename)

best_model_path = os.path.join('models', 'best_weights')
bert_dir = 'uncased_L-12_H-768_A-12'
bert_data_helper = BertDataHelper(bert_dir)
val_data = bert_data_helper.convert(val_data, data='coqa')

model = BertCoQA(bert_dir=bert_dir, answer_verification=True)
print("loading model")
model.load(best_model_path)
print("model loaded")

my_batch_size = 6
eval_batch_generator = BatchGenerator(
    vocab,
    val_data,
    training=False,
    batch_size=my_batch_size,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        # remaining fields (e.g. 'end_position') were truncated in the source
    ])