import logging
import tensorflow as tf
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.model.bidaf import BiDAF

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.6B.100d.txt")

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=60, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model = BiDAF(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2)  # 'eposides' is the toolkit's own spelling
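# The inference example that follows loads weights from 'models/bidaf/best_weights',
# so the training run above has to checkpoint somewhere. A minimal sketch, assuming
# train_and_evaluate accepts the same save_dir argument the QANET and BiDAF 2.0
# examples further down pass explicitly (the path itself is illustrative):
save_dir = 'models/bidaf'
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2, save_dir=save_dir)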
import logging
import tensorflow as tf
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.model.bidaf import BiDAF

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

base_folder = '/Users/huihui/git/SogouMRCToolkit/'
data_folder = base_folder + 'data/'
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary()
vocab_save_path = base_folder + 'data/vocab.json'
vocab.load(vocab_save_path)  # load the vocab saved during training

test_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model_dir = base_folder + 'models/bidaf/best_weights'
model = BiDAF(vocab)
model.load(model_dir)
model.session.run(tf.local_variables_initializer())

model.inference(test_batch_generator)            # run inference on the test data
model.evaluate(test_batch_generator, evaluator)  # score predictions with the evaluator
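# The commented-out fragments in the original hinted at per-example scoring. A minimal
# sketch, assuming SquadEvaluator exposes the official SQuAD metrics as static methods
# taking a prediction string and a ground-truth string (both strings are made up):
prediction = "Denver Broncos"
ground_truth = "Denver Broncos"
print(SquadEvaluator.exact_match_score(prediction, ground_truth))
print(SquadEvaluator.f1_score(prediction, ground_truth))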
# (continues from a SQuAD setup like the first example: reader, train_data,
#  eval_data, and evaluator are already defined)
from sogou_mrc.data.batch_generator import BatchGenerator

vocab = Vocabulary(do_lowercase=True)
bert_dir = 'H:/result/BERT/uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)

# convert data to BERT format
train_data = bert_data_helper.convert(train_data, data='squad')
eval_data = bert_data_helper.convert(eval_data, data='squad')

train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=2,
    additional_fields=['input_ids', 'segment_ids', 'input_mask',
                       'start_position', 'end_position'])
eval_batch_generator = BatchGenerator(
    vocab, eval_data, training=False, batch_size=2,
    additional_fields=['input_ids', 'segment_ids', 'input_mask',
                       'start_position', 'end_position'])

model = BertBaseline(bert_dir=bert_dir, version_2_with_negative=False)
warmup_proportion = 0.1
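# The snippet stops right after defining warmup_proportion. A minimal sketch of how
# the warmup schedule would plug in, assuming compile() takes a learning rate plus
# train/warmup step counts, as BERT fine-tuning setups usually do (the epoch count
# and batch size are illustrative):
epochs, batch_size = 2, 2
num_train_steps = int(len(train_data) / batch_size * epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)
model.compile(3e-5, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps)
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=epochs, eposides=1)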
# (continues from a SQuAD setup: reader, train_file, dev_file, and
#  embedding_folder are already defined)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
vocab.save("/root/ZX/SMRCToolkit/vocab_save_folder/testvocab.json")
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# save vocab
vocab_save_path = '/root/ZX/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=50, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()

content = ""   # placeholders left empty in the original
question = ""

# train and save checkpoints in save_dir
save_dir = '/root/ZX/SMRCToolkit/model_save_folder'
model = QANET(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 1e-3)
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2, save_dir=save_dir)
# (the call above was truncated in the original; epochs/eposides mirror the
#  other examples, and save_dir follows the comment that precedes it)
# (continues from a SQuAD setup: reader, train_data, dev_file, and embedding_folder
#  are already defined; FeatureExtractor and DrQA come from the toolkit)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names=set(['pos', 'ner']),
    word_counter=vocab.get_word_counter())
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)

train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
eval_batch_generator = BatchGenerator(
    vocab, eval_data, batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)

model = DrQA(vocab, word_embedding,
             features=feature_transformer.features,
             feature_vocab=feature_transformer.vocab)
model.compile()  # default optimizer; the original paper uses Adamax
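# The DrQA example stops at compile(). A minimal sketch of the training call,
# mirroring the pattern of the other models here (the epoch count is illustrative):
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2)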
# (SQuAD 2.0: train_file, dev_file, and embedding_folder are defined as in the
#  earlier examples, pointing at the v2.0 JSON files)
reader = SquadV2Reader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)

vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.6B.100d.txt")

vocab_save_path = 'H:/result/bidafv2/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=60, training=True,
                                       additional_fields=['is_impossible'])
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60,
                                      additional_fields=['is_impossible'])

save_dir = 'H:/result/bidafv2'
model = BiDAF(vocab, pretrained_word_embedding=word_embedding, enable_na_answer=True)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2, save_dir=save_dir)
# (the call above was truncated in the original; completed to mirror the
#  other examples)
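# Once training has written vocab.json and the checkpoints, the v2.0 model can be
# restored the same way the BiDAF inference example above does it. A sketch; the
# exact checkpoint subdirectory may differ (e.g. save_dir + '/best_weights'):
vocab = Vocabulary()
vocab.load(vocab_save_path)
model = BiDAF(vocab, enable_na_answer=True)
model.load(save_dir)
model.session.run(tf.local_variables_initializer())
model.evaluate(eval_batch_generator, evaluator)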
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.42B.300d.txt")

train_batch_generator = BatchGenerator(
    vocab, train_data, batch_size=60, training=True,
    additional_fields=['context_word_len', 'question_word_len'])
eval_batch_generator = BatchGenerator(
    vocab, eval_data, batch_size=60,
    additional_fields=['context_word_len', 'question_word_len'])

model = BiDAFPlusSQuad(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2)
# (continues from a CoQA setup: coqa_reader, vocab, train_data, data_folder, and
#  eval_filename are already defined)
from sogou_mrc.data.batch_generator import BatchGenerator

eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
vocab.build_vocab(train_data + eval_data)
evaluator = CoQAEvaluator(data_folder + eval_filename)

bert_dir = 'model'
bert_data_helper = BertDataHelper(bert_dir)
train_data = bert_data_helper.convert(train_data, data='coqa')
eval_data = bert_data_helper.convert(eval_data, data='coqa')

train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=6,
    additional_fields=['input_ids', 'segment_ids', 'input_mask',
                       'start_position', 'end_position', 'question_mask',
                       'rationale_mask', 'yes_mask', 'extractive_mask',
                       'no_mask', 'unk_mask', 'qid'])
eval_batch_generator = BatchGenerator(
    vocab, eval_data, training=False, batch_size=12,
    additional_fields=['input_ids', 'segment_ids', 'input_mask',
                       'start_position', 'end_position', 'question_mask',
                       'rationale_mask', 'yes_mask', 'extractive_mask',
                       'no_mask', 'unk_mask', 'qid'])
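# The CoQA snippet ends before a model is built. A sketch of the remaining steps,
# mirroring the SQuAD BERT example above; the BertCoQA class name is an assumption
# about the toolkit, and the schedule numbers are illustrative:
model = BertCoQA(bert_dir=bert_dir)
warmup_proportion = 0.1
epochs = 2
num_train_steps = int(len(train_data) / 6 * epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)
model.compile(3e-5, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps)
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=epochs, eposides=1)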
# (continues from a SQuAD 2.0 setup: reader, train_data, dev_file, and the
#  timer t0 are already defined; time and logging are imported)
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)
cost = time.time() - t0
logging.info("seg cost=%.3f" % cost)

t0 = time.time()
vocab = Vocabulary(do_lowercase=True)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding("glove.840B.300d.txt", init_scale=0.05)
cost = time.time() - t0
logging.info("make vocab cost=%.3f" % cost)

train_batch_generator = BatchGenerator(
    vocab, train_data, batch_size=16, training=True,
    additional_fields=["abstractive_answer_mask"])
eval_batch_generator = BatchGenerator(
    vocab, eval_data, batch_size=16, training=False,
    additional_fields=["abstractive_answer_mask"])

use_elmo = True
save_path = "squad2_elmo"
if use_elmo:
    model = BiDAFPlus(vocab, pretrained_word_embedding=word_embedding)
    # (the original is cut off mid-call here; the remaining, presumably
    #  ELMo-related, arguments are lost)
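# A sketch of how this script likely finishes, mirroring the other examples in this
# section (optimizer, learning rate, and epoch count are illustrative, not from the
# original):
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2, save_dir=save_path)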