def valid_eval(data_in, task='FNER', eval_type=None, final=False):
    m1 = data_in['mention']
    l1 = data_in['left_context']
    r1 = data_in['right_context']
    lab = data_in['label']
    lf_id = pad_single(l1)
    rt_id = pad_single(r1)
    m_ = pad_single(m1)
    # m_, lf_id, rt_id = pad_method(m1, l1, r1)
    collector = []
    true = []
    total_loss = []
    iters = 0
    p1 = 100  # evaluation mini-batch size
    # Score the evaluation set in mini-batches, accumulating predictions,
    # gold labels and the per-batch loss.
    for k in range(0, len(m_), p1):
        s = Model.predict(lf_id[k:k + p1], rt_id[k:k + p1],
                          context_data=None,
                          mention_representation_data=m_[k:k + p1],
                          feature_data=None, doc_vector=None)
        loss_val = Model.error(lf_id[k:k + p1], rt_id[k:k + p1], lab[k:k + p1],
                               context_data=None,
                               mention_representation_data=m_[k:k + p1],
                               feature_data=None, doc_vector=None)
        r = lab[k:k + p1]
        collector.append(s)
        true.append(r)
        total_loss.append(loss_val)
        iters += 1
    average_eval_loss = sum(total_loss) / iters
    print(task + " Loss: ", average_eval_loss)
    collector = np.squeeze(np.vstack(collector))
    true = np.vstack(true)
    print(collector.shape, true.shape)
    strict_f1 = acc_hook(collector, true)
    logging.info(str(eval_type) + " FNER loss: {}".format(average_eval_loss))
    if final:
        fname = (args.dataset + "_" + args.encoder + "_" + str(args.feature)
                 + "_" + str(args.hier) + "_" + str(args.dataset_kge) + ".txt")
        save_predictions(collector, true, dicts["id2label"], fname)
    return strict_f1
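# NOTE: pad_single is called above but is not defined in this fragment. The helper
# below is a hypothetical sketch (not the repository's implementation), assuming its
# job is to right-pad variable-length token-id sequences to a fixed width so they can
# be sliced into equal-sized mini-batches.
import numpy as np

def pad_single_sketch(sequences, max_len=100, pad_id=0):
    # Right-pad (or truncate) each id sequence to max_len and stack into one matrix.
    out = np.full((len(sequences), max_len), pad_id, dtype=np.int64)
    for i, seq in enumerate(sequences):
        seq = list(seq)[:max_len]
        out[i, :len(seq)] = seq
    return out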
def valid_eval(data_in, task, eval_type=None, final=False):
    if task == 'FNER':
        collector = []
        true = []
        iters = 0
        total_loss = []
        c_, m_, lab, f, d, s_in, m_id, l_id, r_id = data_in.next()
        lf_id = pad_single(l_id)
        rt_id = pad_single(r_id)
        rt_id = np.flip(rt_id, axis=-1)
        p1 = 100
        for k in range(0, len(c_), p1):
            s = Model.predict(lf_id[k:k + p1], rt_id[k:k + p1],
                              context_data=None,
                              mention_representation_data=m_[k:k + p1],
                              feature_data=f[k:k + p1], doc_vector=None)
            loss_val = Model.error(lf_id[k:k + p1], rt_id[k:k + p1],
                                   lab[k:k + p1], context_data=None,
                                   mention_representation_data=m_[k:k + p1],
                                   feature_data=f[k:k + p1], doc_vector=None)
            r = lab[k:k + p1]
            collector.append(s)
            true.append(r)
            total_loss.append(loss_val)
            iters += 1
        average_eval_loss = sum(total_loss) / iters
        print(task + " Loss: ", average_eval_loss)
        collector = np.squeeze(np.vstack(collector))
        true = np.vstack(true)
        print(collector.shape, true.shape)
        # print(collector)
        # print(true)
        strict_f1 = acc_hook(collector, true)
        logging.info(str(eval_type) + " FNER loss: {}".format(average_eval_loss))
        if final:
            fname = (args.dataset + "_" + args.encoder + "_" + str(args.feature)
                     + "_" + str(args.hier) + "_" + str(args.dataset_kge) + ".txt")
            save_predictions(collector, true, dicts["id2label"], fname)
        return strict_f1
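# NOTE: acc_hook is not defined in this fragment. In fine-grained entity typing it
# conventionally reports strict accuracy plus loose macro/micro F1 over the predicted
# label sets. The sketch below is a hedged approximation of that convention, assuming
# `scores` and `targets` are (num_mentions, num_types) arrays and that a type is
# predicted when its score exceeds 0.5, falling back to the highest-scoring type.
import numpy as np

def strict_and_micro_f1_sketch(scores, targets, threshold=0.5):
    preds = (scores > threshold).astype(int)
    # Guarantee at least one predicted type per mention: keep the argmax type.
    preds[np.arange(len(scores)), scores.argmax(axis=1)] = 1
    # Strict accuracy: the predicted set must equal the gold set exactly.
    strict = float(np.mean((preds == targets).all(axis=1)))
    # Loose micro F1 over all (mention, type) decisions.
    tp = float((preds * targets).sum())
    precision = tp / max(preds.sum(), 1)
    recall = tp / max(targets.sum(), 1)
    micro_f1 = 2 * precision * recall / max(precision + recall, 1e-12)
    return strict, micro_f1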
print "\r%d" % i, batch_data = train_batcher.next() # if i%(step_par_epoch/10)==0: # loss = model.printer([model.LM_loss_total, model.type_loss], batch_data) # print(loss) model.train(batch_data) print "------dev--------" batch_data = dev_batcher.next() scores = model.predict(batch_data) acc_hook(scores, batch_data["Y"]) model.load_all("./Models/" + d + "/lamb" + str(args.lamb) + "/model") print "-----test--------" batch_data = test_batcher.next() scores = model.predict(batch_data) acc_hook(scores, batch_data["Y"]) print # model.save_all(epoch, save_id) print "Training completed. Below are the final test scores: " print "-----test--------" batch_data = test_batcher.next() scores = model.predict(batch_data) acc_hook(scores, batch_data["Y"]) fname = args.dataset + "_" + args.encoder + "_" + str( args.feature) + "_" + str(args.hier) + ".txt" save_predictions(scores, batch_data["Y"], dicts["id2label"], fname) print "Cheers!"
    np.savetxt(args.resultpath + "/scores_epoch" + str(epoch), scores, fmt='%f')
    scores = np.sort(a=scores, axis=1)
    np.savetxt(args.resultpath + "/sorted_scores_epoch" + str(epoch), scores, fmt='%f')
    print "-----test--------"
    context_data, mention_representation_data, target_data, feature_data = test_batcher.next()
    scores = model.predict(context_data, mention_representation_data, feature_data,
                           args.gaussian)
    acc_hook(scores, target_data, args.gaussian, 0, 1, args.path, label_hierarchy)

endtime = datetime.datetime.now()
print "total train time: " + str((endtime - time1).total_seconds())
print "Training completed. Below are the final test scores: "
print "-----test--------"
context_data, mention_representation_data, target_data, feature_data = test_batcher.next()
scores = model.predict(context_data, mention_representation_data, feature_data, 0)
acc_hook(scores, target_data, args.gaussian, args.path, label_hierarchy)
fname = args.dataset + "_" + args.encoder + "_" + str(args.feature) + "_" + str(args.hier) + ".txt"
# fname = args.resultpath + "/prediction"
save_predictions(scores, target_data, dicts["id2label"], fname, args.gaussian)
print "Cheers!"
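# NOTE: np.sort(scores, axis=1) above sorts each mention's type scores in ascending
# order, so the sorted_scores_epoch* file is no longer aligned with the type indices;
# a tiny illustration of that behaviour:
import numpy as np

s = np.array([[0.9, 0.1, 0.4],
              [0.2, 0.8, 0.5]])
print(np.sort(s, axis=1))
# [[0.1 0.4 0.9]
#  [0.2 0.5 0.8]]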
print "decode_size: ", decode_size # if decode_size is 0 (i.e. the decode_dataset is empty), the serif-instances file must have been empty # this can happen if the serif name-list adder did not find any names to add as mentions (e.g. if you are using a very short text) # in such a case, exit with success if decode_size == 0: print "Exiting decoding since decode dataset is empty!" # save an empty output file; this will avoid other steps in the CauseEX pipeline from failing with (open(args.output, "w")) as fp: pass sys.exit(0) if decode_dataset["data"].shape[0] == 0: print "Dataset is empty. Exit" sys.exit() print "Creating batcher..." test_batcher = Batcher(decode_dataset["storage"], decode_dataset["data"], decode_dataset["data"].shape[0], 10, dicts["id2vec"]) print "Getting bacther.next..." context_data, mention_representation_data, target_data, feature_data = test_batcher.next( ) print "Running decoding..." scores = model.predict(context_data, mention_representation_data, feature_data) acc_hook(scores, target_data) save_predictions(scores, target_data, dicts["id2label"], args.output) print "Finished decoding! Predicted labels written to: " + args.output
# Bonan: step_par_epoch is the number of mini-batches per epoch; step_par_epoch * 1000
# (the batch size) should not exceed the total number of training instances
step_par_epoch = int(train_dataset["data"].shape[0] / 1000)

print "start training"
for epoch in range(5):
    train_batcher.shuffle()
    print "epoch", epoch
    for i in range(step_par_epoch):
        print "step", i
        context_data, mention_representation_data, target_data, feature_data = train_batcher.next()
        model.train(context_data, mention_representation_data, target_data, feature_data)
    print "------dev--------"
    context_data, mention_representation_data, target_data, feature_data = dev_batcher.next()
    scores = model.predict(context_data, mention_representation_data, feature_data)
    acc_hook(scores, target_data)

print "Training completed. Below are the final test scores: "
print "Saving model"
# Uses the hard-coded prefix 'NFGEC_tf_session'; the same prefix is expected by decode.py.
model.save(os.path.join(args.model_output_dir, "NFGEC_tf_session"), "data/" + d + "/dicts_figer.pkl")
print "-----test--------"
context_data, mention_representation_data, target_data, feature_data = test_batcher.next()
scores = model.predict(context_data, mention_representation_data, feature_data)
acc_hook(scores, target_data)
fname = args.dataset + "_" + args.encoder + "_" + str(args.feature) + "_" + str(args.hier) + ".txt"
save_predictions(scores, target_data, dicts["id2label"], fname)
print "Cheers!"