def get_padded_batch(seq_batch, vocab):
    # Convert each sentence to ids, bracketed by SOS/EOS, mapping unknown tokens to OOV
    seq_batch_ids = [
        [vocab[loader.SOS]]
        + [vocab[tok if tok in vocab else loader.OOV] for tok in tokenise(sent, asbytes=False)]
        + [vocab[loader.EOS]]
        for sent in seq_batch
    ]
    # Right-pad every sequence to the length of the longest in the batch
    max_seq_len = max(len(seq) for seq in seq_batch_ids)
    padded_batch = np.asarray([
        seq + [vocab[loader.PAD]] * (max_seq_len - len(seq))
        for seq in seq_batch_ids
    ])
    return padded_batch
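# Usage sketch (illustrative only, not part of the original file): assumes a vocab
# dict mapping the loader's special tokens and words to integer ids; the ids shown
# are hypothetical.
# vocab = {loader.PAD: 0, loader.OOV: 1, loader.SOS: 2, loader.EOS: 3, 'what': 4, ...}
# batch = get_padded_batch(["What colour is it?", "Why?"], vocab)
# batch.shape -> (2, max_seq_len); shorter sequences are right-padded with vocab[loader.PAD]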
def get_q(self, ctxt, ans, ans_pos):
    ctxt_filt, ans_pos = preprocessing.filter_context(ctxt, ans_pos, 0, 30)
    ans_toks = preprocessing.tokenise(ans, asbytes=False)
    doc = self.nlp(ctxt_filt)
    ctxt_toks = [str(tok).lower() for tok in doc]

    # Locate the answer in the tokenised context; fall back to a char-position
    # lookup if the first answer token doesn't appear verbatim
    if ans_toks[0] not in ctxt_toks:
        ans_ix = preprocessing.char_pos_to_word(ctxt_filt, ctxt_toks, ans_pos, asbytes=False)
    else:
        ans_ix = ctxt_toks.index(ans_toks[0])

    # Most common entity type across the answer span
    ans_type = Counter([
        doc[i].ent_type_
        for i in range(ans_ix, min(ans_ix + len(ans_toks), len(doc)))
    ]).most_common()[0][0]

    type_distances = []
    verb_distances = []
    for offset in range(len(ctxt_toks)):
        if str(doc[offset]).lower() not in ans_toks:
            # Token distance from this position to the nearest edge of the answer span
            dist = max(offset - ans_ix - len(ans_toks) + 1, ans_ix - offset)
            if doc[offset].pos_ == 'NOUN':
                type_distances.append((dist, 'THING', doc[offset], offset))
            if doc[offset].ent_type_ != '' \
                    and not (doc[offset].ent_iob_ == 'B' and str(doc[min(offset + 1, len(doc) - 1)]).lower() in ans_toks) \
                    and self.type_translate(doc[offset].ent_type_) != 'CARDINAL':
                type_distances.append((dist, doc[offset].ent_type_, doc[offset], offset))
            if doc[offset].tag_ in ['VBG', 'VBN']:
                verb_distances.append((dist, doc[offset].tag_, doc[offset], offset))

    nearest_verb = sorted(verb_distances, key=lambda x: x[0])[0] if len(verb_distances) > 0 else (0, 'VBG', 'is', 0)

    if len(type_distances) > 0:
        nearest_entity = sorted(type_distances, key=lambda x: x[0])[0]
        ix = nearest_entity[3]
        entity_ixs = [ix]
        # Extend the entity span to cover any continuation (I-tagged) tokens
        while ix + 1 < len(doc) and doc[ix + 1].ent_iob_ == 'I':
            entity_ixs.append(ix + 1)
            ix += 1
        entity_toks = [str(tok) for tok in doc[entity_ixs[0]:entity_ixs[-1] + 1]]
        entity_type = nearest_entity[1]
    else:
        entity_toks = ["thing"]
        entity_type = "THING"

    return self.format_q(self.type_translate(ans_type), self.type_translate(entity_type), entity_toks, nearest_verb[2])
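# A minimal worked example of the distance measure used above (illustrative helper,
# not part of the original code): max(offset - ans_ix - len(ans_toks) + 1, ans_ix - offset)
# is the token distance from `offset` to the nearest edge of the answer span, and is
# <= 0 for tokens inside the span.
def _span_distance(offset, ans_ix, ans_len):
    # ans_ix is the index of the first answer token, ans_len the span length
    return max(offset - ans_ix - ans_len + 1, ans_ix - offset)

# e.g. for an answer spanning tokens 5..7 (ans_ix=5, ans_len=3):
# _span_distance(4, 5, 3) == 1, _span_distance(8, 5, 3) == 1, _span_distance(6, 5, 3) == -1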
def get_ans(self, contexts, questions):
    # Predict answer spans for a batch of context/question pairs,
    # then map the spans back to surface strings
    toks = [tokenise(ctxt, asbytes=False) for ctxt in contexts]
    padded_batch_cs = self.get_padded_batch(contexts)
    padded_batch_qs = self.get_padded_batch(questions)
    spans = self.sess.run(self.model.pred_span, feed_dict={
        self.model.context_in: padded_batch_cs,
        self.model.question_in: padded_batch_qs
    })
    return [
        " ".join(toks[i][span[0]:span[1] + 1])
        for i, span in enumerate(spans)
    ]
def main(_): train_data = loader.load_squad_triples("./data/", False) print('Loaded SQuAD with ',len(train_data),' triples') train_contexts, train_qs, train_as,train_a_pos = zip(*train_data) qa = MpcmQaInstance() qa.load_from_chkpt(FLAGS.model_dir+'saved/qatest') vocab = qa.vocab questions = ["What colour is the car?","When was the car made?","Where was the date?", "What was the dog called?","Who was the oldest cat?"] contexts=["The car is green, and was built in 1985. This sentence should make it less likely to return the date, when asked about a cat. The oldest cat was called creme puff and lived for many years!" for i in range(len(questions))] spans = qa.get_ans(contexts, questions) print(contexts[0]) for i, q in enumerate(questions): toks = tokenise(contexts[i], asbytes=False) print(q, "->", toks[spans[i,0]:spans[i,1]])
def main(_):
    model_type = FLAGS.model_type
    disc_path = FLAGS.model_dir + 'saved/discriminator-trained-latent'
    chkpt_path = FLAGS.model_dir + 'qgen/' + model_type + '/' + FLAGS.eval_model_id

    # Load dataset
    dev_data = loader.load_squad_triples(FLAGS.data_path, dev=FLAGS.eval_on_dev, test=FLAGS.eval_on_test)
    if len(dev_data) < FLAGS.num_eval_samples:
        exit('***ERROR*** Eval dataset is smaller than the num_eval_samples flag!')
    if len(dev_data) > FLAGS.num_eval_samples:
        print('***WARNING*** Eval dataset is larger than the num_eval_samples flag!')

    dev_contexts_unfilt, _, _, dev_a_pos_unfilt = zip(*dev_data)

    if FLAGS.filter_window_size_before > -1:
        dev_data = preprocessing.filter_squad(dev_data,
                                              window_size_before=FLAGS.filter_window_size_before,
                                              window_size_after=FLAGS.filter_window_size_after,
                                              max_tokens=FLAGS.filter_max_tokens)

    print('Loaded SQuAD dev set with', len(dev_data), 'triples')
    dev_contexts, dev_qs, dev_as, dev_a_pos = zip(*dev_data)

    with open(chkpt_path + '/vocab.json') as f:
        vocab = json.load(f)

    with SquadStreamer(vocab, FLAGS.eval_batch_size, 1, shuffle=False) as dev_data_source:
        glove_embeddings = loader.load_glove(FLAGS.data_path)

        # Create model
        if model_type[:7] == "SEQ2SEQ":
            model = Seq2SeqModel(vocab, training_mode=False)
        elif model_type[:2] == "RL":
            # No need to spin up the LM or QA model at eval time
            FLAGS.qa_weight = 0
            FLAGS.lm_weight = 0
            model = RLModel(vocab, training_mode=False)
        else:
            exit("Unrecognised model type: " + model_type)

        with model.graph.as_default():
            saver = tf.train.Saver()

        if FLAGS.eval_metrics:
            lm = LstmLmInstance()
            qa = QANetInstance()
            lm.load_from_chkpt(FLAGS.model_dir + 'saved/lmtest')
            qa.load_from_chkpt(FLAGS.model_dir + 'saved/qanet2')
            discriminator = DiscriminatorInstance(trainable=False, path=disc_path)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_limit)
        with tf.Session(graph=model.graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
            if not os.path.exists(chkpt_path):
                exit("Checkpoint path doesn't exist! " + chkpt_path)
            saver.restore(sess, tf.train.latest_checkpoint(chkpt_path))

            num_steps = FLAGS.num_eval_samples // FLAGS.eval_batch_size

            # Initialise the dataset
            dev_data_source.initialise(dev_data)

            f1s = []
            bleus = []
            qa_scores = []
            qa_scores_gold = []
            lm_scores = []
            nlls = []
            disc_scores = []
            sowe_similarities = []
            copy_probs = []

            qgolds = []
            qpreds = []
            qpred_ids = []
            qgold_ids = []
            ctxts = []
            answers = []
            ans_positions = []

            metric_individuals = []
            res = []

            for e in range(1):
                for i in tqdm(range(num_steps), desc='Epoch ' + str(e)):
                    dev_batch, curr_batch_size = dev_data_source.get_batch()
                    pred_batch, pred_beam, pred_beam_lens, pred_ids, pred_lens, gold_batch, gold_lens, gold_ids, ctxt, ctxt_len, ans, ans_len, nll, copy_prob = sess.run(
                        [model.q_hat_beam_string, model.q_hat_full_beam_str, model.q_hat_full_beam_lens,
                         model.q_hat_beam_ids, model.q_hat_beam_lens, model.question_raw,
                         model.question_length, model.question_ids, model.context_raw,
                         model.context_length, model.answer_locs, model.answer_length,
                         model.nll, model.mean_copy_prob],
                        feed_dict={model.input_batch: dev_batch, model.is_training: False})

                    unfilt_ctxt_batch = [dev_contexts_unfilt[ix] for ix in dev_batch[3]]
                    a_text_batch = ops.byte_token_array_to_str(dev_batch[2][0], dev_batch[2][2], is_array=False)
                    unfilt_apos_batch = [dev_a_pos_unfilt[ix] for ix in dev_batch[3]]

                    # Subtract 1 from the length to remove the end-of-sentence token
                    pred_q_batch = [q.replace(' </Sent>', "").replace(" <PAD>", "")
                                    for q in ops.byte_token_array_to_str(pred_batch, pred_lens - 1)]

                    ctxts.extend(unfilt_ctxt_batch)
                    answers.extend(a_text_batch)
                    ans_positions.extend([dev_a_pos_unfilt[ix] for ix in dev_batch[3]])
                    copy_probs.extend(copy_prob.tolist())

                    # Get QA score
                    gold_ans = ops.byte_token_array_to_str(dev_batch[2][0], dev_batch[2][2], is_array=False)

                    nlls.extend(nll.tolist())

                    if FLAGS.eval_metrics:
                        qa_pred = qa.get_ans(unfilt_ctxt_batch, ops.byte_token_array_to_str(pred_batch, pred_lens))
                        gold_qa_pred = qa.get_ans(unfilt_ctxt_batch, ops.byte_token_array_to_str(dev_batch[1][0], dev_batch[1][3]))
                        qa_score_batch = [metrics.f1(metrics.normalize_answer(gold_ans[b]), metrics.normalize_answer(qa_pred[b]))
                                          for b in range(curr_batch_size)]
                        qa_score_gold_batch = [metrics.f1(metrics.normalize_answer(gold_ans[b]), metrics.normalize_answer(gold_qa_pred[b]))
                                               for b in range(curr_batch_size)]
                        lm_score_batch = lm.get_seq_perplexity(pred_q_batch).tolist()
                        disc_score_batch = discriminator.get_pred(unfilt_ctxt_batch, pred_q_batch, gold_ans, unfilt_apos_batch).tolist()

                    for b, pred in enumerate(pred_batch):
                        pred_str = pred_q_batch[b].replace(' </Sent>', "").replace(" <PAD>", "")
                        gold_str = tokens_to_string(gold_batch[b][:gold_lens[b] - 1])
                        f1s.append(metrics.f1(gold_str, pred_str))
                        bleus.append(metrics.bleu(gold_str, pred_str))
                        qgolds.append(gold_str)
                        qpreds.append(pred_str)

                        # Calc cosine similarity between sums of word embeddings
                        pred_sowe = np.sum(np.asarray(
                            [glove_embeddings[w] if w in glove_embeddings else np.zeros((FLAGS.embedding_size,))
                             for w in preprocessing.tokenise(pred_str, asbytes=False)]), axis=0)
                        gold_sowe = np.sum(np.asarray(
                            [glove_embeddings[w] if w in glove_embeddings else np.zeros((FLAGS.embedding_size,))
                             for w in preprocessing.tokenise(gold_str, asbytes=False)]), axis=0)
                        this_similarity = np.inner(pred_sowe, gold_sowe) / np.linalg.norm(pred_sowe, ord=2) / np.linalg.norm(gold_sowe, ord=2)
                        sowe_similarities.append(this_similarity)

                        this_metric_dict = {
                            'f1': f1s[-1],
                            'bleu': bleus[-1],
                            'nll': nlls[-1],
                            'sowe': sowe_similarities[-1]
                        }
                        if FLAGS.eval_metrics:
                            this_metric_dict = {
                                **this_metric_dict,
                                'qa': qa_score_batch[b],
                                'lm': lm_score_batch[b],
                                'disc': disc_score_batch[b]
                            }
                            qa_scores.extend(qa_score_batch)
                            lm_scores.extend(lm_score_batch)
                            disc_scores.extend(disc_score_batch)
                        metric_individuals.append(this_metric_dict)

                        res.append({
                            'c': unfilt_ctxt_batch[b],
                            'q_pred': pred_str,
                            'q_gold': gold_str,
                            'a_pos': unfilt_apos_batch[b],
                            'a_text': a_text_batch[b],
                            'metrics': this_metric_dict,
                            'q_pred_ids': pred_ids.tolist()[b],
                            'q_gold_ids': dev_batch[1][1][b].tolist()
                        })

                    # Quick output
                    if i == 0:
                        pred_str = tokens_to_string(pred_batch[0][:pred_lens[0] - 1])
                        gold_str = tokens_to_string(gold_batch[0][:gold_lens[0] - 1])
                        print(qpreds[0])
                        print(gold_str)

                        title = chkpt_path
                        out_str = output_eval(title, pred_batch, pred_ids, pred_lens, gold_batch, gold_lens, ctxt, ctxt_len, ans, ans_len)
                        with open(FLAGS.log_directory + 'out_eval_' + model_type + '.htm', 'w', encoding='utf-8') as fp:
                            fp.write(out_str)

            metric_dict = {
                'f1': np.mean(f1s),
                'bleu': metrics.bleu_corpus(qgolds, qpreds),
                'nll': np.mean(nlls),
                'sowe': np.mean(sowe_similarities)
            }
            if FLAGS.eval_metrics:
                metric_dict = {
                    **metric_dict,
                    'qa': np.mean(qa_scores),
                    'lm': np.mean(lm_scores),
                    'disc': np.mean(disc_scores)
                }

            with open(FLAGS.log_directory + 'out_eval_' + model_type
                      + ("_test" if FLAGS.eval_on_test else "")
                      + ("_train" if (not FLAGS.eval_on_dev and not FLAGS.eval_on_test) else "")
                      + '.json', 'w', encoding='utf-8') as fp:
                json.dump({"metrics": metric_dict, "results": res}, fp)

            print("F1: ", np.mean(f1s))
            print("BLEU: ", metrics.bleu_corpus(qgolds, qpreds))
            print("NLL: ", np.mean(nlls))
            print("SOWE: ", np.mean(sowe_similarities))
            print("Copy prob: ", np.mean(copy_probs))
            if FLAGS.eval_metrics:
                print("QA: ", np.mean(qa_scores))
                print("LM: ", np.mean(lm_scores))
                print("Disc: ", np.mean(disc_scores))
# Results from a previous run, for reference:
# QA: 0.06118585070207543
# LM: 534.2029482313792
# Filtered candidates by LM
# F1: 0.1523949117692406
# BLEU: 0.0015981573389975351
# QA: 0.060554504435173614
# LM: 480.1121001809438

for i in tqdm(range(len(train_data))):
    triple = train_data[i]
    ctxt, q, ans, ans_pos = triple
    ctxt_filt, ans_pos = preprocessing.filter_context(ctxt, ans_pos, 0, 30)
    ctxt_toks = preprocessing.tokenise(ctxt, asbytes=False)

    response = requests.post(url, data=ctxt_filt.encode('utf-8'))
    if response.status_code != 200:
        exit("There was a problem connecting to the CoreNLP server!")
    res = response.json()

    candidates = []

    # Run NER to get question word
    for ent in res['sentences'][0]['entitymentions']:
        # NOTE: str.find() returns an index (-1 if absent, 0 at the start), so the
        # original truthy check misfired; test membership instead
        if ans in ent['text']:
            ner = ent['ner']
        else:
            ner = "UNK"
def main(_):
    train_data = loader.load_squad_triples(FLAGS.data_path, False)
    dev_data = loader.load_squad_triples(FLAGS.data_path, True)[:500]

    chkpt_path = FLAGS.model_dir + 'saved/qatest'

    print('Loaded SQuAD with', len(train_data), 'triples')
    train_contexts, train_qs, train_as, train_a_pos = zip(*train_data)
    dev_contexts, dev_qs, dev_as, dev_a_pos = zip(*dev_data)

    with open(chkpt_path + '/vocab.json') as f:
        vocab = json.load(f)

    model = MpcmQa(vocab, training_mode=False)
    with model.graph.as_default():
        saver = tf.train.Saver()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_limit)
    with tf.Session(graph=model.graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        if not os.path.exists(chkpt_path):
            os.makedirs(chkpt_path)
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir + 'qa/' + str(int(time.time())), sess.graph)

        saver.restore(sess, chkpt_path + '/model.checkpoint')

        num_steps = len(dev_data) // FLAGS.batch_size

        f1s = []
        exactmatches = []

        for e in range(1):
            np.random.shuffle(train_data)
            train_contexts, train_qs, train_as, train_a_pos = zip(*train_data)
            for i in tqdm(range(num_steps), desc='Epoch ' + str(e)):
                # TODO: this keeps coming up - refactor it
                batch_contexts = dev_contexts[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
                batch_questions = dev_qs[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
                batch_ans_text = dev_as[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
                batch_answer_charpos = dev_a_pos[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]

                batch_answers = []
                for j, ctxt in enumerate(batch_contexts):
                    ans_span = char_pos_to_word(ctxt.encode(),
                                                [t.encode() for t in tokenise(ctxt, asbytes=False)],
                                                batch_answer_charpos[j])
                    ans_span = (ans_span, ans_span + len(tokenise(batch_ans_text[j], asbytes=False)))
                    batch_answers.append(ans_span)

                summ, pred = sess.run([model.eval_summary, model.pred_span],
                                      feed_dict={model.context_in: get_padded_batch(batch_contexts, vocab),
                                                 model.question_in: get_padded_batch(batch_questions, vocab),
                                                 model.answer_spans_in: batch_answers,
                                                 model.is_training: False})

                summary_writer.add_summary(summ, global_step=(e * num_steps + i))

                gold_str = []
                pred_str = []
                for b in range(FLAGS.batch_size):
                    gold_str.append(" ".join(tokenise(batch_contexts[b], asbytes=False)[batch_answers[b][0]:batch_answers[b][1]]))
                    pred_str.append(" ".join(tokenise(batch_contexts[b], asbytes=False)[pred[b][0]:pred[b][1]]))

                f1s.extend([f1(gold_str[b], pred_str[b]) for b in range(FLAGS.batch_size)])
                exactmatches.extend([np.product(pred[b] == batch_answers[b]) * 1.0 for b in range(FLAGS.batch_size)])

                if i % FLAGS.eval_freq == 0:
                    out_str = "<h1>" + "Eval - Dev set" + "</h1>"
                    for b in range(FLAGS.batch_size):
                        out_str += batch_contexts[b] + '<br/>'
                        out_str += batch_questions[b] + '<br/>'
                        out_str += str(batch_answers[b]) + str(tokenise(batch_contexts[b], asbytes=False)[batch_answers[b][0]:batch_answers[b][1]]) + '<br/>'
                        out_str += str(pred[b]) + str(tokenise(batch_contexts[b], asbytes=False)[pred[b][0]:pred[b][1]]) + '<br/>'
                        out_str += batch_ans_text[b] + '<br/>'
                        out_str += pred_str[b] + '<br/>'
                        out_str += "F1: " + str(f1(gold_str[b], pred_str[b])) + '<br/>'
                        out_str += "EM: " + str(np.product(pred[b] == batch_answers[b]) * 1.0)
                        out_str += "<hr/>"
                    with open(FLAGS.log_dir + 'out_qa_eval.htm', 'w') as fp:
                        fp.write(out_str)

        print("F1: ", np.mean(f1s))
        print("EM: ", np.mean(exactmatches))
counts = {k: 0 for k in q_words}
word_gold = ["other" for i in range(len(results))]
word_pred = ["other" for i in range(len(results))]
gold_pred_bleu = []
gold_pred_f1 = []
nlls = []
x = []
for i, res in enumerate(results):
    qpred, qgold, ctxt, answer, a_pos = res['q_pred'], res['q_gold'], res['c'], res['a_text'], res['a_pos']

    gold_pred_bleu.append(metrics.bleu(qgold, qpred))
    nlls.append(res['metrics']['nll'])
    x.append(len(preprocessing.tokenise(qgold, asbytes=False)))
    gold_pred_f1.append(metrics.f1(qgold, qpred))

    # Tag each example with the question word it uses, in both the gold and predicted questions
    triggered = False
    for q in q_words:
        if q != "other" and q in qpred.lower():
            word_pred[i] = q
        if q != "other" and q in qgold.lower():
            counts[q] += 1
            word_gold[i] = q
            triggered = True
    if not triggered:
# glove_short = list(loader.get_glove_vocab('./data/', size=2000, d=200).keys())[4:]
squad_vocab = set()
squad_count = Counter()
start = time()

max_context_len = 0
max_pos = None
debugstr = ""
c_lens = []
q_lens = []
for i, triple in enumerate(squad):
    # filtered, new_pos = preprocessing.filter_context(triple[0], triple[3], 1, 100)
    c_toks = preprocessing.tokenise(triple[0], asbytes=False)
    q_toks = preprocessing.tokenise(triple[1], asbytes=False)

    # Accumulate token counts and sequence lengths over contexts and questions
    squad_count.update(c_toks)
    squad_count.update(q_toks)
    c_lens.append(len(c_toks))
    q_lens.append(len(q_toks))
end = time()
def main(_):
    model = FileLoaderModel('./models/BASELINE')
    squad = loader.load_squad_triples(FLAGS.data_path, True, as_dict=True)

    disc_path = FLAGS.model_dir + 'saved/discriminator-trained-latent'
    glove_embeddings = loader.load_glove(FLAGS.data_path)

    if FLAGS.eval_metrics:
        lm = LstmLmInstance()
        qa = QANetInstance()
        lm.load_from_chkpt(FLAGS.model_dir + 'saved/lmtest')
        qa.load_from_chkpt(FLAGS.model_dir + 'saved/qanet')
        discriminator = DiscriminatorInstance(trainable=False, path=disc_path)

    f1s = []
    bleus = []
    qa_scores = []
    qa_scores_gold = []
    lm_scores = []
    nlls = []
    disc_scores = []
    sowe_similarities = []

    qgolds = []
    qpreds = []
    ctxts = []
    answers = []
    ans_positions = []

    metric_individuals = []
    res = []
    missing = 0

    for id, el in tqdm(squad.items()):
        unfilt_ctxt_batch = [el[0]]
        a_text_batch = [el[2]]
        a_pos_batch = [el[3]]

        ctxts.extend(unfilt_ctxt_batch)
        answers.extend(a_text_batch)
        ans_positions.extend(a_pos_batch)

        pred_str = model.get_q(id)
        if pred_str is None:
            missing += 1
            continue
        gold_str = el[1]

        if FLAGS.eval_metrics:
            qa_pred = qa.get_ans(unfilt_ctxt_batch, [pred_str])
            gold_qa_pred = qa.get_ans(unfilt_ctxt_batch, [gold_str])
            qa_score = metrics.f1(el[2].lower(), qa_pred[0].lower())
            qa_score_gold = metrics.f1(el[2].lower(), gold_qa_pred[0].lower())
            lm_score = lm.get_seq_perplexity([pred_str]).tolist()
            disc_score = discriminator.get_pred(unfilt_ctxt_batch, [pred_str], a_text_batch, a_pos_batch).tolist()[0]

        f1s.append(metrics.f1(gold_str, pred_str))
        bleus.append(metrics.bleu(gold_str, pred_str))
        qgolds.append(gold_str)
        qpreds.append(pred_str)

        # Calc cosine similarity between sums of word embeddings
        pred_sowe = np.sum(np.asarray([
            glove_embeddings[w] if w in glove_embeddings else np.zeros((FLAGS.embedding_size,))
            for w in preprocessing.tokenise(pred_str, asbytes=False)
        ]), axis=0)
        gold_sowe = np.sum(np.asarray([
            glove_embeddings[w] if w in glove_embeddings else np.zeros((FLAGS.embedding_size,))
            for w in preprocessing.tokenise(gold_str, asbytes=False)
        ]), axis=0)
        this_similarity = np.inner(pred_sowe, gold_sowe) / np.linalg.norm(pred_sowe, ord=2) / np.linalg.norm(gold_sowe, ord=2)
        sowe_similarities.append(this_similarity)

        this_metric_dict = {
            'f1': f1s[-1],
            'bleu': bleus[-1],
            'nll': 0,
            'sowe': sowe_similarities[-1]
        }
        if FLAGS.eval_metrics:
            this_metric_dict = {
                **this_metric_dict,
                'qa': qa_score,
                'lm': lm_score,
                'disc': disc_score
            }
            qa_scores.append(qa_score)
            lm_scores.append(lm_score)
            disc_scores.append(disc_score)
        metric_individuals.append(this_metric_dict)

        res.append({
            'c': el[0],
            'q_pred': pred_str,
            'q_gold': gold_str,
            'a_pos': el[3],
            'a_text': el[2],
            'metrics': this_metric_dict
        })

    metric_dict = {
        'f1': np.mean(f1s),
        'bleu': np.mean(bleus),
        'nll': 0,
        'sowe': np.mean(sowe_similarities)
    }
    if FLAGS.eval_metrics:
        metric_dict = {
            **metric_dict,
            'qa': np.mean(qa_scores),
            'lm': np.mean(lm_scores),
            'disc': np.mean(disc_scores)
        }

    with open(FLAGS.log_dir + 'out_eval_BASELINE' + ("_train" if not FLAGS.eval_on_dev else "") + '.json', 'w', encoding='utf-8') as fp:
        json.dump({"metrics": metric_dict, "results": res}, fp)

    print("F1: ", np.mean(f1s))
    print("BLEU: ", np.mean(bleus))
    print("NLL: ", 0)
    print("SOWE: ", np.mean(sowe_similarities))
    if FLAGS.eval_metrics:
        print("QA: ", np.mean(qa_scores))
        print("LM: ", np.mean(lm_scores))
        print("Disc: ", np.mean(disc_scores))
    print(missing, " ids were missing")
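# A self-contained sketch (not part of the original repo) of the sum-of-word-embeddings
# (SOWE) similarity computed inline above: sum the GloVe vectors of each question's
# tokens and take the cosine between the two sums.
import numpy as np

def sowe_similarity(pred_toks, gold_toks, embeddings, dim):
    # Unknown words contribute a zero vector, as in the eval loops above
    pred_sowe = np.sum([embeddings.get(w, np.zeros(dim)) for w in pred_toks], axis=0)
    gold_sowe = np.sum([embeddings.get(w, np.zeros(dim)) for w in gold_toks], axis=0)
    return np.inner(pred_sowe, gold_sowe) / (np.linalg.norm(pred_sowe) * np.linalg.norm(gold_sowe))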
print("LM loaded") qa.load_from_chkpt(FLAGS.model_dir + 'saved/qatest') print("QA loaded") lm_vocab = lm.vocab qa_vocab = qa.vocab f1s = [] bleus = [] qa_scores = [] lm_scores = [] for i in tqdm(range(len(train_data))): triple = train_data[i] ctxt, q, ans, ans_pos = triple ctxt_toks = preprocessing.tokenise(ctxt, asbytes=False) # print(triple[0]) gen_q = model.get_q(triple[0], triple[2], triple[3]) gen_q_toks = preprocessing.tokenise(gen_q, asbytes=False) f1s.append(metrics.f1(triple[1], gen_q)) bleus.append(metrics.bleu(triple[1], gen_q)) qhat_for_lm = preprocessing.lookup_vocab(gen_q_toks, lm_vocab, do_tokenise=False, asbytes=False) ctxt_for_lm = preprocessing.lookup_vocab(ctxt_toks, lm_vocab, do_tokenise=False,
def main(_):
    run_id = str(int(time.time()))
    chkpt_path = FLAGS.model_dir + 'lm/' + run_id
    if not os.path.exists(chkpt_path):
        os.makedirs(chkpt_path)

    train_data = loader.load_squad_triples(FLAGS.data_path, False)
    dev_data = loader.load_squad_triples(FLAGS.data_path, True)
    np.random.shuffle(train_data)

    print('Loaded SQuAD with', len(train_data), 'triples')
    train_contexts, train_qs, train_as, train_a_pos = zip(*train_data)
    _, dev_qs, _, _ = zip(*dev_data)

    vocab = loader.get_vocab(train_qs, tf.app.flags.FLAGS.lm_vocab_size)
    with open(chkpt_path + '/vocab.json', 'w') as outfile:
        json.dump(vocab, outfile)

    # Train on unique questions only, to avoid over-weighting repeated ones
    unique_sents = list(set(train_qs))
    print(len(unique_sents), " unique sentences")

    # Create model
    model = LstmLm(vocab, num_units=FLAGS.lm_units)
    with model.graph.as_default():
        saver = tf.train.Saver()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_limit)
    with tf.Session(graph=model.graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        if not os.path.exists(chkpt_path):
            os.makedirs(chkpt_path)
        summary_writer = tf.summary.FileWriter(FLAGS.log_directory + 'lm/' + run_id, sess.graph)

        if FLAGS.restore:
            print('Loading not implemented yet')
        else:
            print("Building graph, loading glove")
            sess.run(tf.global_variables_initializer())

        num_steps = len(unique_sents) // FLAGS.batch_size

        best_perp = 1e6
        for e in range(FLAGS.lm_num_epochs):
            np.random.shuffle(unique_sents)
            for i in tqdm(range(num_steps), desc='Epoch ' + str(e)):
                seq_batch = unique_sents[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
                # Convert to ids bracketed by SOS/EOS and right-pad the batch
                seq_batch_ids = [
                    [vocab[loader.SOS]]
                    + [vocab[tok if tok in vocab else loader.OOV] for tok in tokenise(sent, asbytes=False)]
                    + [vocab[loader.EOS]]
                    for sent in seq_batch
                ]
                max_seq_len = max(len(seq) for seq in seq_batch_ids)
                padded_batch = np.asarray([
                    seq + [vocab[loader.PAD]] * (max_seq_len - len(seq))
                    for seq in seq_batch_ids
                ])

                summ, _, pred, gold, seq = sess.run(
                    [model.train_summary, model.optimise, model.preds, model.tgt_output, model.input_seqs],
                    feed_dict={model.input_seqs: padded_batch})

                summary_writer.add_summary(summ, global_step=(e * num_steps + i))

            # Evaluate perplexity on the dev questions after each epoch
            perps = []
            num_steps_dev = len(dev_qs) // FLAGS.batch_size
            for i in tqdm(range(num_steps_dev), desc="Eval"):
                seq_batch = dev_qs[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
                seq_batch_ids = [
                    [vocab[loader.SOS]]
                    + [vocab[tok if tok in vocab else loader.OOV] for tok in tokenise(sent, asbytes=False)]
                    + [vocab[loader.EOS]]
                    for sent in seq_batch
                ]
                max_seq_len = max(len(seq) for seq in seq_batch_ids)
                padded_batch = np.asarray([
                    seq + [vocab[loader.PAD]] * (max_seq_len - len(seq))
                    for seq in seq_batch_ids
                ])
                perp = sess.run(model.perplexity, feed_dict={model.input_seqs: padded_batch})
                perps.extend(perp)

            perpsummary = tf.Summary(value=[
                tf.Summary.Value(tag="dev_perf/perplexity", simple_value=sum(perps) / len(perps))
            ])
            summary_writer.add_summary(perpsummary, global_step=((e + 1) * num_steps))

            # Only checkpoint when dev perplexity improves
            if np.mean(perps) < best_perp:
                print(np.mean(perps), " Saving!")
                saver.save(sess, chkpt_path + '/model.checkpoint')
                best_perp = np.mean(perps)
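# Note: the two inline padding blocks above duplicate the get_padded_batch helper
# defined elsewhere in this repo; assuming that helper is importable here, each
# block reduces to a single call:
# padded_batch = get_padded_batch(seq_batch, vocab)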
def main(_):
    if FLAGS.testing:
        print('TEST MODE - reducing model size')
        FLAGS.qa_encoder_units = 32
        FLAGS.qa_match_units = 32
        FLAGS.qa_batch_size = 16
        FLAGS.embedding_size = 50

    run_id = str(int(time.time()))
    chkpt_path = FLAGS.model_dir + 'qa/' + run_id
    restore_path = FLAGS.model_dir + 'qa/1529056867'
    if not os.path.exists(chkpt_path):
        os.makedirs(chkpt_path)

    train_data = loader.load_squad_triples(FLAGS.data_path, False)
    dev_data = loader.load_squad_triples(FLAGS.data_path, dev=True, ans_list=True)

    train_data = filter_squad(train_data, window_size=FLAGS.filter_window_size, max_tokens=FLAGS.filter_max_tokens)

    if FLAGS.testing:
        train_data = train_data[:1000]
        num_dev_samples = 100
    else:
        num_dev_samples = 3000

    print('Loaded SQuAD with', len(train_data), 'triples')
    train_contexts, train_qs, train_as, train_a_pos = zip(*train_data)
    dev_contexts, dev_qs, dev_as, dev_a_pos = zip(*dev_data)

    if FLAGS.restore:
        with open(restore_path + '/vocab.json') as f:
            vocab = json.load(f)
    else:
        vocab = loader.get_vocab(train_contexts + train_qs, tf.app.flags.FLAGS.qa_vocab_size)
        with open(chkpt_path + '/vocab.json', 'w') as outfile:
            json.dump(vocab, outfile)

    model = MpcmQa(vocab)
    with model.graph.as_default():
        saver = tf.train.Saver()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_limit, allow_growth=True)
    with tf.Session(graph=model.graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        summary_writer = tf.summary.FileWriter(FLAGS.log_directory + 'qa/' + run_id, sess.graph)

        if FLAGS.restore:
            saver.restore(sess, restore_path + '/model.checkpoint')
            start_e = 40  # FLAGS.qa_num_epochs
            print('Loaded model')
        else:
            print("Building graph, loading glove")
            start_e = 0
            sess.run(tf.global_variables_initializer())

        num_steps_train = len(train_data) // FLAGS.qa_batch_size
        num_steps_dev = num_dev_samples // FLAGS.qa_batch_size

        f1summary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/f1", simple_value=0.0)])
        emsummary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/em", simple_value=0.0)])
        summary_writer.add_summary(f1summary, global_step=start_e * num_steps_train)
        summary_writer.add_summary(emsummary, global_step=start_e * num_steps_train)

        best_oos_nll = 1e6

        for e in range(start_e, start_e + FLAGS.qa_num_epochs):
            np.random.shuffle(train_data)
            train_contexts, train_qs, train_as, train_a_pos = zip(*train_data)

            for i in tqdm(range(num_steps_train), desc='Epoch ' + str(e)):
                # TODO: this keeps coming up - refactor it
                batch_contexts = train_contexts[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]
                batch_questions = train_qs[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]
                batch_ans_text = train_as[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]
                batch_answer_charpos = train_a_pos[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]

                batch_answers = []
                for j, ctxt in enumerate(batch_contexts):
                    # Convert the char-level answer position to an inclusive token span
                    ans_span = char_pos_to_word(ctxt.encode(),
                                                [t.encode() for t in tokenise(ctxt, asbytes=False)],
                                                batch_answer_charpos[j])
                    ans_span = (ans_span, ans_span + len(tokenise(batch_ans_text[j], asbytes=False)) - 1)
                    batch_answers.append(ans_span)

                _, summ, pred = sess.run([model.optimizer, model.train_summary, model.pred_span],
                                         feed_dict={model.context_in: get_padded_batch(batch_contexts, vocab),
                                                    model.question_in: get_padded_batch(batch_questions, vocab),
                                                    model.answer_spans_in: batch_answers,
                                                    model.is_training: True})

                summary_writer.add_summary(summ, global_step=(e * num_steps_train + i))

                if i % FLAGS.eval_freq == 0:
                    gold_str = []
                    pred_str = []
                    f1s = []
                    exactmatches = []
                    for b in range(FLAGS.qa_batch_size):
                        gold_str.append(" ".join(tokenise(batch_contexts[b], asbytes=False)[batch_answers[b][0]:batch_answers[b][1] + 1]))
                        pred_str.append(" ".join(tokenise(batch_contexts[b], asbytes=False)[pred[b][0]:pred[b][1] + 1]))

                    f1s.extend([f1(gold_str[b], pred_str[b]) for b in range(FLAGS.qa_batch_size)])
                    exactmatches.extend([np.product(pred[b] == batch_answers[b]) * 1.0 for b in range(FLAGS.qa_batch_size)])

                    f1summary = tf.Summary(value=[tf.Summary.Value(tag="train_perf/f1", simple_value=sum(f1s) / len(f1s))])
                    emsummary = tf.Summary(value=[tf.Summary.Value(tag="train_perf/em", simple_value=sum(exactmatches) / len(exactmatches))])
                    summary_writer.add_summary(f1summary, global_step=(e * num_steps_train + i))
                    summary_writer.add_summary(emsummary, global_step=(e * num_steps_train + i))

            # Evaluate on a sample of the dev set at the end of each epoch
            f1s = []
            exactmatches = []
            nlls = []

            np.random.shuffle(dev_data)
            dev_subset = dev_data[:num_dev_samples]
            for i in tqdm(range(num_steps_dev), desc='Eval ' + str(e)):
                dev_contexts, dev_qs, dev_as, dev_a_pos = zip(*dev_subset)
                batch_contexts = dev_contexts[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]
                batch_questions = dev_qs[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]
                batch_ans_text = dev_as[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]
                batch_answer_charpos = dev_a_pos[i * FLAGS.qa_batch_size:(i + 1) * FLAGS.qa_batch_size]

                batch_answers = []
                for j, ctxt in enumerate(batch_contexts):
                    # Dev answers are lists; use the first one to build the gold span
                    ans_span = char_pos_to_word(ctxt.encode(),
                                                [t.encode() for t in tokenise(ctxt, asbytes=False)],
                                                batch_answer_charpos[j][0])
                    ans_span = (ans_span, ans_span + len(tokenise(batch_ans_text[j][0], asbytes=False)) - 1)
                    batch_answers.append(ans_span)

                pred, nll = sess.run([model.pred_span, model.nll],
                                     feed_dict={model.context_in: get_padded_batch(batch_contexts, vocab),
                                                model.question_in: get_padded_batch(batch_questions, vocab),
                                                model.answer_spans_in: batch_answers,
                                                model.is_training: False})

                gold_str = []
                pred_str = []
                for b in range(FLAGS.qa_batch_size):
                    pred_str = " ".join(tokenise(batch_contexts[b], asbytes=False)[pred[b][0]:pred[b][1] + 1])
                    # Score against all reference answers and keep the best
                    this_f1 = []
                    this_em = []
                    for a in range(len(batch_ans_text[b])):
                        this_f1.append(f1(normalize_answer(batch_ans_text[b][a]), normalize_answer(pred_str)))
                        this_em.append(1.0 * (normalize_answer(batch_ans_text[b][a]) == normalize_answer(pred_str)))
                    f1s.append(max(this_f1))
                    exactmatches.append(max(this_em))
                nlls.extend(nll.tolist())

            f1summary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/f1", simple_value=sum(f1s) / len(f1s))])
            emsummary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/em", simple_value=sum(exactmatches) / len(exactmatches))])
            nllsummary = tf.Summary(value=[tf.Summary.Value(tag="dev_perf/nll", simple_value=np.mean(nlls))])

            summary_writer.add_summary(f1summary, global_step=((e + 1) * num_steps_train))
            summary_writer.add_summary(emsummary, global_step=((e + 1) * num_steps_train))
            summary_writer.add_summary(nllsummary, global_step=((e + 1) * num_steps_train))

            # Only checkpoint when the out-of-sample NLL improves
            mean_nll = np.mean(nlls)
            if mean_nll < best_oos_nll:
                print("New best NLL! ", mean_nll, " Saving... F1: ", np.mean(f1s))
                best_oos_nll = mean_nll
                saver.save(sess, chkpt_path + '/model.checkpoint')
            else:
                print("NLL not improved ", mean_nll)
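# Illustrative example (hypothetical values, not part of the original file) of the
# char-to-token span conversion used when building batch_answers: char_pos_to_word
# returns the index of the token containing the answer's first character, and the
# span end is derived from the tokenised answer length.
# ctxt = "The car is green."
# toks = tokenise(ctxt, asbytes=False)   # e.g. ['The', 'car', 'is', 'green', '.']
# start = char_pos_to_word(ctxt.encode(), [t.encode() for t in toks], 11)  # -> 3 ('green')
# span = (start, start + len(tokenise("green", asbytes=False)) - 1)        # inclusive span (3, 3)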
def bleu(gold, prediction, order=4):
    return compute_bleu([[tokenise(gold, asbytes=False)]],
                        [tokenise(prediction, asbytes=False)],
                        smooth=False, max_order=order)[0]
def bleu_corpus(golds, preds, order=4):
    return compute_bleu([[tokenise(gold, asbytes=False)] for gold in golds],
                        [tokenise(pred, asbytes=False) for pred in preds],
                        smooth=False, max_order=order)[0]
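# Usage sketch: bleu() scores a single prediction against one reference, while
# bleu_corpus() aggregates over aligned lists of golds and predictions; both return
# the score component of compute_bleu (assumed here to be the Google NMT-style
# implementation, whose result tuple has the BLEU score as its first element).
# score = bleu("what colour is the car ?", "what colour was the car ?")
# corpus_score = bleu_corpus(golds=["who is it ?"], preds=["who was it ?"])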