def evaluate_joint_ace05_test(estimator, evaluator, test, ere_test, params,
                              ace05_kshot_data=None, ere_kshot_data=None):
    # for joint model
    with open(params['score_file'] + '_ace', 'a') as score_output:
        print('bc0 %4.2f' % (100.0 * evaluator.best_dev), end=' ')
        print('bc0 %4.2f' % (100.0 * evaluator.best_dev), end=' ',
              file=score_output)
        all_preds, all_truth = [], []
        macro_avg = 0.0
        for corpus in test:
            if params['kshot']:
                test_input_fn = make_joint_kshot_eval_inputs(
                    test[corpus], ace05_kshot_data, ere_test, ere_kshot_data,
                    params)
            else:
                test_input_fn = make_joint_eval_inputs(test[corpus], ere_test)
            score, preds = evaluator.evaluate_dataset(
                estimator, test_input_fn, test[corpus][1],
                params['ace05_num_classes'], 'ace05_relation')
            all_preds += [preds]
            all_truth += [test[corpus][1]]
            macro_avg += score
            print(corpus, '%4.2f' % (100.0 * score), end=' ')
            print(corpus, '%4.2f' % (100.0 * score), end=' ',
                  file=score_output)
        all_preds = np.concatenate(all_preds, axis=0)
        all_truth = np.concatenate(all_truth, axis=0)
        micro_avg = metrics.f1_metric(all_truth, all_preds,
                                      params['ace05_num_classes'])
        macro_avg /= float(len(test))
        print('micro', '%4.2f' % (100.0 * micro_avg), end=' ')
        print('micro', '%4.2f' % (100.0 * micro_avg), end=' ',
              file=score_output)
        print('macro', '%4.2f' % (100.0 * macro_avg), end=' ')
        print('macro', '%4.2f' % (100.0 * macro_avg), end=' ',
              file=score_output)
        print('epoch', evaluator.best_epoch, end=' ')
        print('epoch', evaluator.best_epoch, end=' ', file=score_output)
        print('dir', evaluator.best_ckpt, file=score_output)
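# Note on the two averages computed above: macro-F1 averages the per-corpus
# scores (each corpus weighted equally), while micro-F1 pools all predictions
# before scoring (each instance weighted equally). The following is a minimal
# illustrative sketch of that distinction, assuming scikit-learn is available;
# it is not part of the evaluation pipeline, which uses the project's own
# metrics.f1_metric.
def _micro_vs_macro_f1_sketch():
    import numpy as np
    from sklearn.metrics import f1_score
    # Hypothetical per-corpus labels and predictions (binary for simplicity).
    truth = {'bc': np.array([1, 1, 0]), 'wl': np.array([0, 1])}
    preds = {'bc': np.array([1, 0, 0]), 'wl': np.array([0, 1])}
    # Macro: average the per-corpus F1 scores.
    macro = sum(f1_score(truth[c], preds[c]) for c in truth) / float(len(truth))
    # Micro: concatenate everything, then score once.
    micro = f1_score(np.concatenate(list(truth.values())),
                     np.concatenate(list(preds.values())))
    return micro, macro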
def evaluate_joint_ere_test(estimator, evaluator, test, ace05_test, params,
                            ere_label_list=None, output_types=None,
                            ace05_kshot_data=None, ere_kshot_data=None):
    if params['kshot']:
        test_input_fn = make_joint_kshot_eval_inputs(
            ace05_test, ace05_kshot_data, test, ere_kshot_data, params)
    else:
        test_input_fn = make_joint_eval_inputs(ace05_test, test)
    test_score, preds = evaluator.evaluate_dataset(
        estimator, test_input_fn, test[1], params['ere_num_classes'],
        'ere_relation', output_types=output_types)
    all_scores = metrics.f1_metric(test[1], preds, params['ere_num_classes'],
                                   average=None)
    print(all_scores)
    with open(params['score_file'] + '_ere', 'a') as score_output:
        print('dev %4.2f' % (100.0 * evaluator.best_dev), end=' ')
        print('dev %4.2f' % (100.0 * evaluator.best_dev), end=' ',
              file=score_output)
        print('test %4.2f' % (100.0 * test_score), end=' ')
        print('test %4.2f' % (100.0 * test_score), end=' ', file=score_output)
        if ere_label_list is not None:
            # range, not xrange: the print-function calls above already
            # require Python 3 semantics.
            for i in range(len(all_scores)):
                print(str(ere_label_list[i]) +
                      ' %4.2f' % (100.0 * all_scores[i]), end=' ')
                print(str(ere_label_list[i]) +
                      ' %4.2f' % (100.0 * all_scores[i]), end=' ',
                      file=score_output)
        print('epoch', evaluator.best_epoch, end=' ')
        print('epoch', evaluator.best_epoch, end=' ', file=score_output)
        print('dir', evaluator.best_ckpt, file=score_output)
        print()
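# A hedged usage sketch for the two helpers above: evaluate both sides of the
# joint model after training. The argument shapes are inferred from how the
# helpers index them (`ace05_test` a dict of per-corpus (X, y) pairs,
# `ere_test` a single (X, y) pair); this driver is illustrative, not the
# repository's canonical entry point.
def _evaluate_joint_model_sketch(estimator, ace05_evaluator, ere_evaluator,
                                 ace05_test, ere_test, params, ere_labels):
    # ACE05 side: per-corpus F1 plus micro/macro averages, appended to
    # params['score_file'] + '_ace'.
    evaluate_joint_ace05_test(estimator, ace05_evaluator, ace05_test,
                              ere_test, params)
    # ERE side: overall and per-label F1, appended to
    # params['score_file'] + '_ere'.
    evaluate_joint_ere_test(estimator, ere_evaluator, ere_test, ace05_test,
                            params, ere_label_list=ere_labels)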
def experiment_ace05(params):
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info(params)
    ace05_data, embed = ace05.load_dataset(max_len=params['max_len'])
    ace05_data = preprocess_ace05(ace05_data, params)
    train, dev, test = ace05_data
    params['embed'] = embed
    trainX, trainY = train
    devX, devY = dev
    trainX, trainY = take_percentage(trainX, trainY, params['percent_train'])
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x=trainX, y=trainY, num_epochs=1, batch_size=params['batch_size'],
        shuffle=True)
    # Integer division: `steps` passed to estimator.train must be an int.
    num_steps_per_epoch = trainY.shape[0] // params['batch_size'] + 1
    params['learning_rate_decay_step'] = num_steps_per_epoch * params[
        'lr_decay_epoch']
    if params['stack']:
        model_fn = relation_stack_model
    else:
        model_fn = relation_model
    config = tf.estimator.RunConfig()
    config = config.replace(
        tf_random_seed=params['random_seed'],
        keep_checkpoint_max=1,
    )
    estimator = tf.estimator.Estimator(model_dir=params['model_dir'],
                                       model_fn=model_fn, params=params,
                                       config=config)
    print('num steps per epoch', num_steps_per_epoch)
    print('start training')
    evaluator = metrics.EvaluatorACE05Hook(estimator, ace05_data)
    if params['debug']:
        print('debug mode')
        num_steps_per_epoch = 100
        params['epoch'] = 1
    for epoch in range(1, params['epoch'] + 1):
        print('==========')
        print('epoch', epoch)
        estimator.train(input_fn=train_input_fn, steps=num_steps_per_epoch,
                        hooks=[evaluator])
    print('finish training, best dev (%4.4f) found at epoch: %d' %
          (evaluator.best_dev, evaluator.best_epoch))
    with open(params['score_file'], 'a') as score_output:
        # Point the estimator at the best checkpoint found during training.
        estimator._model_dir = evaluator.best_ckpt
        print('bc0 %4.2f' % (100.0 * evaluator.best_dev.item()), end=' ')
        print('bc0 %4.2f' % (100.0 * evaluator.best_dev.item()), end=' ',
              file=score_output)
        all_preds, all_truth = [], []
        for corpus in test:
            score, preds = metrics.evaluate_predict(estimator, test[corpus],
                                                    params['num_classes'])
            all_preds += [preds]
            all_truth += [test[corpus][1]]
            print(corpus, '%4.2f' % (100.0 * score.item()), end=' ')
            print(corpus, '%4.2f' % (100.0 * score.item()), end=' ',
                  file=score_output)
        all_preds = np.concatenate(all_preds, axis=0)
        all_truth = np.concatenate(all_truth, axis=0)
        micro_avg = metrics.f1_metric(all_truth, all_preds,
                                      params['num_classes'])
        print('micro', '%4.2f' % (100.0 * micro_avg), end=' ')
        print('micro', '%4.2f' % (100.0 * micro_avg), end=' ',
              file=score_output)
        print('epoch', evaluator.best_epoch)
        print('epoch', evaluator.best_epoch, file=score_output)
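# A hedged sketch of the `params` dict that experiment_ace05 expects, collected
# from the keys the function reads above. The concrete values are placeholders
# for illustration, not recommended settings; 'embed' and
# 'learning_rate_decay_step' are filled in by the function itself.
def _example_ace05_params():
    return {
        'max_len': 100,          # sentence length cap for ace05.load_dataset
        'percent_train': 1.0,    # fraction of training data to keep
        'batch_size': 32,
        'epoch': 30,
        'lr_decay_epoch': 10,    # used to derive learning_rate_decay_step
        'stack': False,          # True selects relation_stack_model
        'random_seed': 42,
        'model_dir': '/tmp/ace05_model',
        'score_file': '/tmp/ace05_scores',
        'num_classes': 19,       # placeholder; depends on the label set
        'debug': False,          # True caps training at 100 steps, 1 epoch
    }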
def dev_step(split, global_step):
    if split == 'test_seen':
        test_loader = test_seen_loader
    elif split == 'test_unseen':
        test_loader = test_unseen_loader
    else:
        raise ValueError
    dis_model.eval()
    gen_model.eval()
    n_token, test_loss = 0, 0.0  # token count and summed loss, for perplexity
    test_hyp, test_ref = [], []
    count = 0
    with torch.no_grad():
        for knowledges, histories, users, responses, knowledge_lens in test_loader:
            knowledges = [know.split('\n\n') for know in knowledges]
            histories = [his.split('\n\n') for his in histories]
            # Discriminator selects one knowledge sentence per example.
            dis_args = dis_batcher(knowledges, histories, knowledge_lens,
                                   args.n_sent)
            dis_out = dis_model(*dis_args)
            dis_knowledges = [[knowledges[bi][dis_out[0][bi].item()]]
                              for bi in range(len(knowledges))]
            # Generator loss on the gold response (teacher forcing).
            gen_args = gen_batcher(dis_knowledges, histories, users, responses,
                                   args.segment, True)
            loss = gen_criterion(
                gen_model(gen_args[0], token_type_ids=gen_args[1])[0],
                gen_args[2])
            n_token += loss.size(0)
            test_loss += loss.sum().item()
            # Decode one example at a time for the generation metrics.
            for bi in range(len(dis_knowledges)):
                dec_in = gen_batcher(dis_knowledges[bi:bi + 1],
                                     histories[bi:bi + 1], users[bi:bi + 1],
                                     segment=args.segment, training=False)
                dec_out = gen_model.batch_decode(
                    dec_in, args.max_length, args.min_length,
                    args.early_stopping, args.beam_size,
                    args.repetition_penalty, gen_batcher.eos_id,
                    args.length_penalty, args.no_repeat_ngram_size)
                # Keep only the newly generated tokens, not the prompt.
                dec_out = dec_out[0].tolist()[dec_in.size(1):]
                _hyp = gen_batcher.tokenizer.decode(
                    dec_out, skip_special_tokens=True,
                    clean_up_tokenization_spaces=False)
                _ref = responses[bi]
                test_hyp.append(_hyp)
                test_ref.append(_ref)
                count += 1
                if count % 1000 == 0:
                    print(count)
    with open(
            os.path.join(out_dir,
                         '{}-decoded-iter-{}.txt'.format(split, global_step)),
            'w') as f:
        for _hyp, _ref in zip(test_hyp, test_ref):
            f.write('{} ||| {}\n'.format(_hyp, _ref))
    MeanLoss = test_loss / n_token
    b1, b2, b3, b4 = bleu_metric(test_hyp, test_ref)
    d1, d2 = distinct_metric(test_hyp)
    f1 = f1_metric(test_hyp, test_ref)
    time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("**********************************")
    print("{} results..........".format(split))
    print('hypothesis: ', len(test_hyp))
    print("Step: %d \t| ppl: %.3f \t| %s" %
          (global_step, math.exp(MeanLoss), time_str))
    print("BLEU-1/2/3/4: {:.4f}/{:.4f}/{:.4f}/{:.4f}".format(b1, b2, b3, b4))
    print("Distinct-1/2: {:.4f}/{:.4f}".format(d1, d2))
    print("F1: {:.4f}".format(f1))
    print("**********************************")
    return {
        'f1': f1,
        'loss': MeanLoss,
        'bleu1': b1,
        'bleu2': b2,
        'bleu3': b3,
        'bleu4': b4,
        'distinct1': d1,
        'distinct2': d2
    }
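# A hedged sketch of how dev_step might be driven from the enclosing training
# loop. The `best_f1` bookkeeping and the choice to score the unseen split only
# on improvement are assumptions, not the repository's actual schedule; the
# models, loaders, and batchers are closed over from the surrounding scope.
def _periodic_eval_sketch(global_step, best_f1):
    results = dev_step('test_seen', global_step)  # also writes a decoded file
    if results['f1'] > best_f1:
        best_f1 = results['f1']
        dev_step('test_unseen', global_step)
    return best_f1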
def s3dg_fn(features, labels, mode, params):
    # Compute logits.
    with slim.arg_scope(s3dg_arg_scope(weight_decay=params['weight_decay'])):
        logits, endpoints = s3dg(
            features,
            num_classes=params['num_classes'],
            dropout_keep_prob=1. - params['dropout_rate'],
            is_training=mode == tf.estimator.ModeKeys.TRAIN,
            prediction_fn=scoped_sigmoid,
            min_depth=params['min_depth'],
            depth_multiplier=params['depth_multiplier'])

    # Compute predictions using round instead of argmax since our prediction
    # function is sigmoid (for multi-label classification) and not softmax
    # (for multi-class classification).
    predicted_classes = tf.round(endpoints['Predictions'])

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes,
            'probabilities': endpoints['Predictions'],
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Compute primary loss.
    sigmoid_loss = tf.losses.sigmoid_cross_entropy(labels, logits)
    tf.summary.scalar('Losses/sigmoid_loss', sigmoid_loss)

    # L1 loss is not included by default, but helps with our particular task.
    for var in tf.trainable_variables():
        if var.op.name.find(r'weights') > 0 \
                and var not in tf.get_collection(tf.GraphKeys.WEIGHTS):
            tf.add_to_collection(tf.GraphKeys.WEIGHTS, var)
    l1_loss = tf.contrib.layers.apply_regularization(
        regularizer=tf.contrib.layers.l1_regularizer(
            scale=params['weight_decay']),
        weights_list=tf.get_collection(tf.GraphKeys.WEIGHTS))
    tf.summary.scalar('Losses/l1_loss', l1_loss)

    # L2 loss is already computed when utilizing the slim argument scope,
    # including the weight decay argument. Just display the existing value.
    l2_loss = tf.reduce_sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    tf.summary.scalar('Losses/l2_loss', l2_loss)

    regularization_loss = tf.add(l1_loss, l2_loss)
    tf.summary.scalar('Losses/regularization_loss', regularization_loss)
    total_loss = tf.add(sigmoid_loss, regularization_loss)
    tf.summary.scalar('Losses/total_loss', total_loss)

    # Compute evaluation metrics.
    auc = tf.metrics.auc(labels=labels, predictions=predicted_classes,
                         name='auc_op', weights=params['metric_weights'])
    precision = tf.metrics.precision(labels=labels,
                                     predictions=predicted_classes,
                                     name='precision_op',
                                     weights=params['metric_weights'])
    recall = tf.metrics.recall(labels=labels, predictions=predicted_classes,
                               name='recall_op',
                               weights=params['metric_weights'])
    f1 = f1_metric(labels=labels, predictions=predicted_classes, name='f1_op',
                   weights=params['metric_weights'])

    if mode == tf.estimator.ModeKeys.EVAL:
        metrics = {
            'Metrics/eval/auc': auc,
            'Metrics/eval/f1': f1,
            'Metrics/eval/precision': precision,
            'Metrics/eval/recall': recall
        }
        return tf.estimator.EstimatorSpec(mode, loss=total_loss,
                                          eval_metric_ops=metrics)

    # Create training op.
    assert mode == tf.estimator.ModeKeys.TRAIN

    if params['add_image_summaries']:
        for batch_num in range(params['batch_size']):
            tf.summary.image(
                'processed_video_frame',
                tf.expand_dims(
                    features[batch_num, int(params['clip_length'] / 2)], 0))

    # Add summaries for end_points.
    for endpoint in endpoints:
        x = endpoints[endpoint]
        tf.summary.histogram('activations/' + endpoint, x)
        tf.summary.scalar('sparsity/' + endpoint, tf.nn.zero_fraction(x))

    # Add metric summaries only when training; when evaluating, the estimator
    # spec adds them automatically.
    tf.summary.scalar('Metrics/train/auc', auc[1])
    tf.summary.scalar('Metrics/train/precision', precision[1])
    tf.summary.scalar('Metrics/train/recall', recall[1])
    tf.summary.scalar('Metrics/train/f1', f1[1])

    # Add histograms for variables.
    for variable in tf.global_variables():
        tf.summary.histogram(variable.op.name, variable)

    # Prepare the optimizer.
    if params['optimizer'] == 'momentum':
        # SGD + momentum is the optimizer used to pre-train s3dg.
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=params['learning_rate'],
            momentum=params['momentum'])
    else:
        # Plain SGD is a safe optimizer to use when troubleshooting problems
        # restoring momentum variables from checkpoints via the Estimator API.
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=params['learning_rate'])

    variables_to_train = get_variables_to_train(params['variables_to_train'])
    train_op = tf.contrib.training.create_train_op(
        total_loss=total_loss,
        optimizer=optimizer,
        variables_to_train=variables_to_train)
    return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
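# A hedged sketch of wiring s3dg_fn into the TF 1.x Estimator API. The
# hyperparameter values are placeholders; only the keys mirror what s3dg_fn
# actually reads above.
def _build_s3dg_estimator_sketch(model_dir, num_classes, metric_weights):
    params = {
        'num_classes': num_classes,
        'dropout_rate': 0.2,
        'weight_decay': 1e-7,        # scales both the L1 and L2 regularizers
        'min_depth': 16,
        'depth_multiplier': 1.0,
        'metric_weights': metric_weights,
        'add_image_summaries': False,
        'batch_size': 8,
        'clip_length': 64,           # middle frame is used for image summaries
        'optimizer': 'momentum',     # anything else falls back to plain SGD
        'learning_rate': 1e-3,
        'momentum': 0.9,
        'variables_to_train': None,  # interpreted by get_variables_to_train
    }
    return tf.estimator.Estimator(model_fn=s3dg_fn, model_dir=model_dir,
                                  params=params)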