def manual_eval(dataset, model_dir=None, decode_sig=None):
    if model_dir is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    print("(Manual) evaluating " + model_dir)
    return eval_tools.manual_eval(
        model_dir, decode_sig, dataset, FLAGS, top_k=3, num_examples=100,
        verbose=True)
def gen_eval_sheet(dataset):
    model_dir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    print("evaluating " + decode_sig)
    output_path = os.path.join(FLAGS.model_root_dir, model_dir,
                               "predictions.csv")
    eval_tools.gen_eval_sheet(decode_sig, dataset, FLAGS, output_path)
    print("prediction results saved to {}".format(output_path))
def eval(data_set, model_dir=None, decode_sig=None, verbose=True):
    if model_dir is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    print("evaluating " + model_dir)
    return eval_tools.automatic_eval(
        model_dir, decode_sig, data_set, top_k=3, FLAGS=FLAGS, verbose=verbose)
def manual_eval(dataset, prediction_path=None):
    if prediction_path is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
        prediction_path = os.path.join(
            model_dir, 'predictions.{}.latest'.format(decode_sig))
    print("(Manual) evaluating " + prediction_path)
    return eval_tools.manual_eval(
        prediction_path, dataset, FLAGS, top_k=3, num_examples=100,
        verbose=True)
def eval(dataset, prediction_path=None, verbose=True):
    if prediction_path is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
        prediction_path = os.path.join(
            model_dir, 'predictions.{}.latest'.format(decode_sig))
    print("(Auto) evaluating " + prediction_path)
    return eval_tools.automatic_eval(
        prediction_path, dataset, top_k=3, FLAGS=FLAGS, verbose=verbose)
def load_model_predictions():
    # `grouped_dataset` and `top_k` are assumed to be defined in the
    # surrounding scope; this helper only loads and sanity-checks the
    # prediction list against the grouped ground truth.
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    prediction_list = load_predictions(model_dir, decode_sig, top_k)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth list and prediction list length must "
                         "be equal: {} vs. {}".format(len(grouped_dataset),
                                                      len(prediction_list)))
    return prediction_list
def gen_error_analysis_sheets(dataset, model_dir=None, decode_sig=None,
                              group_by_utility=False):
    if model_dir is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    if group_by_utility:
        eval_tools.gen_error_analysis_csv_by_utility(
            model_dir, decode_sig, dataset, FLAGS)
    else:
        eval_tools.gen_error_analysis_csv(
            model_dir, decode_sig, dataset, FLAGS)
def manual_eval(dataset, num_eval):
    _, decode_sig = graph_utils.get_decode_signature(FLAGS)
    eval_tools.manual_eval(
        decode_sig, dataset, FLAGS, FLAGS.model_root_dir, num_eval)
def gen_manual_evaluation_csv_single_model(dataset, FLAGS):
    """
    Generate a .csv spreadsheet for manual evaluation on dev/test set examples
    for a specific model.
    """
    # Group dataset
    tokenizer_selector = "cm" if FLAGS.explain else "nl"
    grouped_dataset = data_utils.group_parallel_data(
        dataset, use_bucket=True, tokenizer_selector=tokenizer_selector)

    # Load model predictions
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    prediction_list = load_predictions(model_dir, decode_sig, top_k=3)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth list and prediction list length must "
                         "be equal: {} vs. {}".format(len(grouped_dataset),
                                                      len(prediction_list)))

    # Load additional ground truths
    template_translations, command_translations = \
        load_cached_correct_translations(FLAGS.data_dir)

    # Load cached evaluation results
    structure_eval_cache, command_eval_cache = load_cached_evaluations(
        os.path.join(FLAGS.data_dir, 'manual_judgements'))

    eval_bash = FLAGS.dataset.startswith("bash")
    cmd_parser = data_tools.bash_parser if eval_bash \
        else data_tools.paren_parser

    output_path = os.path.join(model_dir, 'manual.evaluations.single.model')
    with open(output_path, 'w') as o_f:
        # write spreadsheet header
        o_f.write('id,description,command,correct template,correct command\n')
        for example_id in range(len(grouped_dataset)):
            data_group = grouped_dataset[example_id][1]
            sc_txt = data_group[0].sc_txt.strip()
            sc_key = get_example_nl_key(sc_txt)
            command_gts = [dp.tg_txt for dp in data_group]
            command_gts = set(command_gts + command_translations[sc_key])
            command_gt_asts = [
                data_tools.bash_parser(cmd) for cmd in command_gts]
            template_gts = [
                data_tools.cmd2template(cmd, loose_constraints=True)
                for cmd in command_gts]
            template_gts = set(template_gts + template_translations[sc_key])
            template_gt_asts = [
                data_tools.bash_parser(temp) for temp in template_gts]
            predictions = prediction_list[example_id]
            for i in range(3):
                if i >= len(predictions):
                    o_f.write(',,,n,n\n')
                    continue
                pred_cmd = predictions[i]
                pred_tree = cmd_parser(pred_cmd)
                pred_temp = data_tools.ast2template(
                    pred_tree, loose_constraints=True)
                temp_match = tree_dist.one_match(
                    template_gt_asts, pred_tree, ignore_arg_value=True)
                str_match = tree_dist.one_match(
                    command_gt_asts, pred_tree, ignore_arg_value=False)
                # Match ground truths & existing judgements
                command_example_sig = '{}<NL_PREDICTION>{}'.format(
                    sc_key, pred_cmd)
                structure_example_sig = '{}<NL_PREDICTION>{}'.format(
                    sc_key, pred_temp)
                command_eval, structure_eval = '', ''
                if str_match:
                    command_eval = 'y'
                    structure_eval = 'y'
                elif temp_match:
                    structure_eval = 'y'
                if command_eval_cache and \
                        command_example_sig in command_eval_cache:
                    command_eval = command_eval_cache[command_example_sig]
                if structure_eval_cache and \
                        structure_example_sig in structure_eval_cache:
                    structure_eval = structure_eval_cache[
                        structure_example_sig]
                if i == 0:
                    o_f.write('{},"{}","{}",{},{}\n'.format(
                        example_id, sc_txt.replace('"', '""'),
                        pred_cmd.replace('"', '""'),
                        structure_eval, command_eval))
                else:
                    o_f.write(',,"{}",{},{}\n'.format(
                        pred_cmd.replace('"', '""'),
                        structure_eval, command_eval))
    print('manual evaluation spreadsheet saved to {}'.format(output_path))
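# NOTE: hypothetical helper, not part of the original module. It sketches how
# the spreadsheet written by gen_manual_evaluation_csv_single_model() could be
# tallied once the empty judgement cells have been filled in with 'y'/'n'.
# The function name and the accuracy definition (top-1, i.e. the first
# prediction row of each example) are assumptions.
def summarize_manual_evaluation(csv_path):
    import csv  # local import keeps this sketch self-contained

    num_examples = 0
    top1_template_correct, top1_command_correct = 0, 0
    with open(csv_path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the spreadsheet header row
        for row in reader:
            example_id, _, _, structure_eval, command_eval = row
            # Only the first prediction row of an example carries its id.
            if example_id != '':
                num_examples += 1
                top1_template_correct += (structure_eval == 'y')
                top1_command_correct += (command_eval == 'y')
    if num_examples:
        print('top-1 template accuracy: {:.3f}'.format(
            top1_template_correct / num_examples))
        print('top-1 command accuracy: {:.3f}'.format(
            top1_command_correct / num_examples))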
def save_hyperparameters():
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    with open(os.path.join(FLAGS.model_root_dir, model_subdir,
                           'hyperparameters.pkl'), 'wb') as o_f:
        pickle.dump(FLAGS, o_f)
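# NOTE: hypothetical counterpart to save_hyperparameters(), not part of the
# original module. It shows how the pickled 'hyperparameters.pkl' file could
# be read back for inspection; the name `load_hyperparameters` and its
# argument are assumptions, and `os`/`pickle` are assumed to be imported at
# module level as elsewhere in this file.
def load_hyperparameters(model_dir):
    # Restores whatever object was pickled: the FLAGS object above, or the
    # per-flag dict written at the end of training in main().
    with open(os.path.join(model_dir, 'hyperparameters.pkl'), 'rb') as i_f:
        return pickle.load(i_f)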
def main(_):
    # set GPU device
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # set up data and model directories
    FLAGS.data_dir = os.path.join(
        os.path.dirname(__file__), "..", "data", FLAGS.dataset)
    print("Reading data from {}".format(FLAGS.data_dir))

    # set up encoder/decoder dropout rate
    if FLAGS.universal_keep >= 0 and FLAGS.universal_keep < 1:
        FLAGS.sc_input_keep = FLAGS.universal_keep
        FLAGS.sc_output_keep = FLAGS.universal_keep
        FLAGS.tg_input_keep = FLAGS.universal_keep
        FLAGS.tg_output_keep = FLAGS.universal_keep
        FLAGS.attention_input_keep = FLAGS.universal_keep
        FLAGS.attention_output_keep = FLAGS.universal_keep

    # adjust hyperparameters for batch normalization
    if FLAGS.recurrent_batch_normalization:
        # larger batch size
        FLAGS.batch_size *= 4
        # larger initial learning rate
        FLAGS.learning_rate *= 10

    if FLAGS.decoder_topology in ['basic_tree']:
        FLAGS.model_root_dir = os.path.join(
            os.path.dirname(__file__), "..", FLAGS.model_root_dir, "seq2tree")
    elif FLAGS.decoder_topology in ['rnn']:
        FLAGS.model_root_dir = os.path.join(
            os.path.dirname(__file__), "..", FLAGS.model_root_dir, "seq2seq")
    else:
        raise ValueError("Unrecognized decoder topology: {}.".format(
            FLAGS.decoder_topology))
    print("Saving models to {}".format(FLAGS.model_root_dir))

    if FLAGS.process_data:
        process_data()
    else:
        train_set, dev_set, test_set = data_utils.load_data(
            FLAGS, use_buckets=False, load_features=False)
        dataset = test_set if FLAGS.test else dev_set
        if FLAGS.eval:
            eval(dataset, FLAGS.prediction_file)
        elif FLAGS.manual_eval:
            manual_eval(dataset, FLAGS.prediction_file)
        elif FLAGS.gen_error_analysis_sheet:
            gen_error_analysis_sheets(dataset, group_by_utility=True)
        elif FLAGS.gen_manual_evaluation_sheet:
            error_analysis.gen_manual_evaluation_csv(dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_sheet_single_model:
            error_analysis.gen_manual_evaluation_csv_single_model(
                dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_table:
            if FLAGS.test:
                eval_tools.gen_manual_evaluation_table(dataset, FLAGS)
            else:
                eval_tools.gen_manual_evaluation_table(
                    dataset, FLAGS, num_examples=100)
        elif FLAGS.gen_auto_evaluation_table:
            eval_tools.gen_automatic_evaluation_table(dataset, FLAGS)
        elif FLAGS.tabulate_example_predictions:
            error_analysis.tabulate_example_predictions(
                dataset, FLAGS, num_examples=100)
        else:
            train_set, dev_set, test_set = data_utils.load_data(
                FLAGS, use_buckets=True)
            dataset = test_set if FLAGS.test else dev_set
            vocab = data_utils.load_vocabulary(FLAGS)

            print("Set dataset parameters")
            FLAGS.max_sc_length = train_set.max_sc_length \
                if not train_set.buckets else train_set.buckets[-1][0]
            FLAGS.max_tg_length = train_set.max_tg_length \
                if not train_set.buckets else train_set.buckets[-1][1]
            FLAGS.sc_vocab_size = len(vocab.sc_vocab)
            FLAGS.tg_vocab_size = len(vocab.tg_vocab)
            FLAGS.max_sc_token_size = vocab.max_sc_token_size
            FLAGS.max_tg_token_size = vocab.max_tg_token_size

            if FLAGS.gen_slot_filling_training_data:
                gen_slot_filling_training_data(
                    FLAGS, [train_set, dev_set, test_set])
            elif FLAGS.decode:
                model = decode(dataset, buckets=train_set.buckets)
                if not FLAGS.explain:
                    eval(dataset, verbose=False)
            elif FLAGS.demo:
                demo(buckets=train_set.buckets)
            elif FLAGS.grid_search:
                meta_experiments.grid_search(
                    train, decode, eval, train_set, dataset, FLAGS)
            elif FLAGS.schedule_experiments:
                schedule_experiments(train, decode, eval, train_set, dataset)
            else:
                # Train the model.
                train(train_set, dataset)

                if FLAGS.normalized:
                    tf.compat.v1.reset_default_graph()
                    gen_slot_filling_training_data(
                        FLAGS, [train_set, dev_set, test_set])
                    FLAGS.fill_argument_slots = True

                # save model hyperparameters
                model_subdir, decode_sig = graph_utils.get_decode_signature(
                    FLAGS)
                with open(os.path.join(FLAGS.model_root_dir, model_subdir,
                                       'hyperparameters.pkl'), 'wb') as o_f:
                    flag_dict = dict()
                    for flag in dir(FLAGS):
                        flag_dict[flag] = getattr(FLAGS, flag)
                    pickle.dump(flag_dict, o_f)

                # Decode the new model on the development set.
                tf.compat.v1.reset_default_graph()
                model = decode(dataset, buckets=train_set.buckets)

                # Run automatic evaluation on the development set.
                if not FLAGS.explain:
                    eval(dataset, verbose=True)