import json
import logging
import os

from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from functools import partial

from nltk.tree import ParentedTree

# Parser, extract_parse_actions, collapse_rst_labels, and
# train_and_eval_model are assumed to be importable from the surrounding
# package; the exact module paths depend on the repository layout.


def test_extract_parse_actions():
    tree = ParentedTree.fromstring(
        '(ROOT (satellite:attribution (text 0))'
        ' (nucleus:span (satellite:condition (text 1))'
        ' (nucleus:span (nucleus:span (nucleus:same-unit (text 2))'
        ' (nucleus:same-unit (satellite:temporal (text 3))'
        ' (nucleus:span (text 4))))'
        ' (satellite:conclusion (text 5)))))')
    # I think the tree above would be for something like this silly
    # little example:
    # "John said that if Bob bought this excellent book, then before the
    # end of next week Bob would finish it, and therefore he would be
    # happy."

    actions = extract_parse_actions(tree)

    num_shifts = len([x for x in actions if x.type == 'S'])
    assert num_shifts == 6
    assert actions[0].type == 'S'
    assert actions[1].type == 'U'
    assert actions[1].label == 'satellite:attribution'
    assert actions[2].type == 'S'
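
# A minimal sketch (not part of the test suite) of what the extracted
# transition sequence encodes: 'S' shifts the next EDU onto a stack and
# 'U' wraps the top item under a new nuclearity:relation label, per the
# assertions above; a binary reduce (called 'B' here; the exact type
# string is an assumption) merges the top two subtrees. The replay()
# helper below is hypothetical and only illustrates the mechanics on a
# two-EDU tree.
def _replay_actions_sketch():
    from nltk.tree import Tree

    def replay(actions, num_edus):
        stack = []
        edus = iter(range(num_edus))
        for act_type, label in actions:
            if act_type == 'S':    # shift: push the next EDU
                stack.append(Tree('text', [str(next(edus))]))
            elif act_type == 'U':  # unary reduce: relabel the top subtree
                stack.append(Tree(label, [stack.pop()]))
            elif act_type == 'B':  # binary reduce: merge the top two
                right, left = stack.pop(), stack.pop()
                stack.append(Tree(label, [left, right]))
        return stack[0]

    tree = replay([('S', None), ('U', 'satellite:attribution'),
                   ('S', None), ('U', 'nucleus:span'),
                   ('B', 'ROOT')], num_edus=2)
    print(tree)
    # (ROOT (satellite:attribution (text 0)) (nucleus:span (text 1)))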

def test_reconstruct_training_examples():
    '''
    This code goes through the training data and makes sure that the
    actions extracted from the trees can be used to reconstruct those
    trees from a list of EDUs.
    '''
    train_path = 'rst_discourse_tb_edus_TRAINING_TRAIN.json'
    with open(train_path) as f:
        data = json.load(f)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    for doc_dict in data:
        tree_orig = ParentedTree.fromstring(doc_dict['rst_tree'])
        actions = extract_parse_actions(tree_orig)
        tree2 = next(rst_parser.parse(doc_dict,
                                      gold_actions=actions,
                                      make_features=False))['tree']
        logging.info('test_reconstruct_training_examples verified tree '
                     'for {}'.format(doc_dict['path_basename']))
        assert tree2 == tree_orig
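
# For reference, each doc_dict in the JSON training file needs at least
# the two fields the test above touches, along with the document's EDUs
# that the parser consumes; the values below are illustrative only:
#
#     {
#         "path_basename": "wsj_0601.out",
#         "rst_tree": "(ROOT (nucleus:span (text 0)) ...)"
#     }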

def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('train_file',
                        help='Path to JSON training file.',
                        type=argparse.FileType('r'))
    parser.add_argument('eval_file',
                        help='Path to JSON dev or test file for '
                             'tuning/evaluation.',
                        type=argparse.FileType('r'))
    parser.add_argument('model_path',
                        help='Prefix for the path to where the model '
                             'should be stored. A suffix with the C value '
                             'will be added.')
    parser.add_argument('-w', '--working_path',
                        help='Path to where intermediate files should be '
                             'stored',
                        default='working')
    parser.add_argument('-C', '--C_values',
                        help='Comma-separated list of model complexity '
                             'parameter settings to evaluate.',
                        default=','.join([str(2.0 ** x)
                                          for x in range(-4, 5)]))
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    parser.add_argument('-s', '--single_process', action='store_true',
                        help='Run in a single process for all '
                             'hyperparameter grid points, to simplify '
                             'debugging.')
    args = parser.parse_args()

    if os.path.exists(args.working_path):
        raise IOError("{} already exists. Stopping here to avoid the "
                      "possibility of overwriting files that are currently "
                      "being used.".format(args.working_path))
    os.makedirs(args.working_path)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)

    # Convert the verbose flag to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from the built-in warnings module get formatted more
    # nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'),
                        level=log_level)
    logger = logging.getLogger(__name__)

    logger.info('Extracting examples')
    train_data = json.load(args.train_file)
    eval_data = json.load(args.eval_file)

    train_examples = []
    for doc_dict in train_data:
        path_basename = doc_dict['path_basename']
        logger.info('Extracting examples for {}'.format(path_basename))
        tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        collapse_rst_labels(tree)
        actions = extract_parse_actions(tree)

        for i, (action_str, feats) in \
                enumerate(rst_parser.parse(doc_dict, gold_actions=actions)):
            example_id = "{}_{}".format(path_basename, i)
            example = {"x": Counter(feats), "y": action_str,
                       "id": example_id}
            train_examples.append(example)
            # print("{} {}".format(action_str, " ".join(feats)))

    # Train and evaluate a model for each value of C.
    best_labeled_f1 = -1.0
    best_C = None

    # Train and evaluate models with different C values in parallel.
    C_values = [float(x) for x in args.C_values.split(',')]
    partial_train_and_eval_model = partial(train_and_eval_model,
                                           args.working_path,
                                           args.model_path, eval_data)

    # Make the SKLL jsonlines feature file.
    train_path = os.path.join(args.working_path, 'rst_parsing.jsonlines')
    with open(train_path, 'w') as train_file:
        for example in train_examples:
            train_file.write('{}\n'.format(json.dumps(example)))

    if args.single_process:
        all_results = [partial_train_and_eval_model(C_value)
                       for C_value in C_values]
    else:
        n_workers = len(C_values)
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            all_results = executor.map(partial_train_and_eval_model,
                                       C_values)

    for C_value, results in zip(C_values, all_results):
        results["C"] = C_value
        print(json.dumps(sorted(results.items())))
        if results["labeled_f1"] > best_labeled_f1:
            best_labeled_f1 = results["labeled_f1"]
            best_C = C_value

    print("best labeled F1 = {}, with C = {}".format(best_labeled_f1,
                                                     best_C))
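
# Example invocation (the script name and the dev file name here are
# illustrative; the training file name matches the one used in the tests
# above):
#
#     python tune_rst_parser.py rst_discourse_tb_edus_TRAINING_TRAIN.json \
#         rst_discourse_tb_edus_TRAINING_DEV.json models/rst_parsing \
#         -C 0.25,1.0,4.0 -v
if __name__ == '__main__':
    main()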