import argparse
import json
import logging

import numpy as np
import scikits.bootstrap as boot

# Project-internal imports: Parser, predict_rst_trees_for_eval, and
# compute_rst_eval_results are used below; the exact module paths here are
# assumed rather than taken from the original excerpt.
from discourseparsing.discourse_parsing import Parser
from discourseparsing.rst_eval import (compute_rst_eval_results,
                                       predict_rst_trees_for_eval)


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set',
                        help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                             'additional time this flag is specified, ' +
                             'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('--metric_name',
                        help='Name of the evaluation metric to use',
                        choices=["labeled_precision", "labeled_recall",
                                 "labeled_f1", "nuc_precision", "nuc_recall",
                                 "nuc_f1", "span_precision", "span_recall",
                                 "span_f1"],
                        required=True)
    parser.add_argument('--n_samples', type=int, default=10000)
    parser.add_argument('--alpha', type=float, default=0.05)
    args = parser.parse_args()

    # Convert the verbose flag to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from the built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=log_level)
    logger = logging.getLogger(__name__)

    # Read the models.
    logger.info('Loading models')
    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    pred_edu_tokens_lists, pred_trees, gold_edu_tokens_lists, gold_trees = \
        predict_rst_trees_for_eval(None, None, rst_parser, eval_data)

    # Pack predictions and gold standards into one array so the bootstrap
    # can resample whole documents at a time.
    data = np.array(list(zip(pred_edu_tokens_lists, pred_trees,
                             gold_edu_tokens_lists, gold_trees)))

    # Score without bootstrapping, and check that the bootstrap scoring
    # function reproduces the same value on the full data set.
    orig_score = compute_rst_eval_results(pred_edu_tokens_lists, pred_trees,
                                          gold_edu_tokens_lists,
                                          gold_trees)[args.metric_name]
    tmp_score = make_score_func(args.metric_name)(data)
    assert tmp_score == orig_score

    # Compute a bias-corrected and accelerated (BCa) bootstrap confidence
    # interval for the selected metric.
    boot_ci_lower, boot_ci_upper = \
        boot.ci(data, make_score_func(args.metric_name),
                n_samples=args.n_samples, method='bca', alpha=args.alpha)

    print("evaluation_set: {}".format(args.evaluation_set.name))
    print("alpha: {}".format(args.alpha))
    print("n_samples: {}".format(args.n_samples))
    print("metric: {}".format(args.metric_name))
    print("original score: {}".format(orig_score))
    print("CI: ({}, {})".format(boot_ci_lower, boot_ci_upper))
def make_score_func(metric_name):
    # Note: this enclosing wrapper is assumed; it is implied by the
    # make_score_func(args.metric_name) calls in main(), which need a
    # closure over metric_name.
    def score_func(data):
        # Each row of data holds (pred_edu_tokens, pred_tree,
        # gold_edu_tokens, gold_tree) for one document.
        return compute_rst_eval_results(data[:, 0], data[:, 1],
                                        data[:, 2], data[:, 3])[metric_name]
    return score_func
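

# Entry-point guard so the module can be run directly as a script. This guard
# is not part of the excerpt above; it is an assumed addition, and the example
# invocation below (script name and argument values) is purely illustrative:
#
#   python bootstrap_rst_eval.py dev_set.json -p rst_parsing_model \
#       --metric_name labeled_f1 --n_samples 10000 --alpha 0.05
if __name__ == '__main__':
    main()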