Example #1
def manual_eval(dataset, model_dir=None, decode_sig=None):
    if model_dir is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    print("(Manual) evaluating " + model_dir)

    return eval_tools.manual_eval(model_dir, decode_sig, dataset, FLAGS, top_k=3, num_examples=100, verbose=True)
Example #2
def gen_eval_sheet(dataset):
    model_dir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    print("evaluating " + decode_sig)

    output_path = os.path.join(FLAGS.model_root_dir, model_dir,
                               "predictions.csv")
    eval_tools.gen_eval_sheet(decode_sig, dataset, FLAGS, output_path)
    print("prediction results saved to {}".format(output_path))
Example #3
def eval(data_set, model_dir=None, decode_sig=None, verbose=True):
    if model_dir is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    print("evaluating " + model_dir)

    return eval_tools.automatic_eval(model_dir, decode_sig, data_set,
        top_k=3, FLAGS=FLAGS, verbose=verbose)
Example #4
def manual_eval(dataset, prediction_path=None):
    if prediction_path is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
        prediction_path = os.path.join(model_dir, 'predictions.{}.latest'.format(decode_sig))
    print("(Manual) evaluating " + prediction_path)

    return eval_tools.manual_eval(prediction_path, dataset, FLAGS, top_k=3, num_examples=100, verbose=True)
Example #5
def eval(dataset, prediction_path=None, verbose=True):
    if prediction_path is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
        prediction_path = os.path.join(model_dir, 'predictions.{}.latest'.format(decode_sig))
    print("(Auto) evaluating " + prediction_path)

    return eval_tools.automatic_eval(prediction_path, dataset, top_k=3, FLAGS=FLAGS, verbose=verbose)
Example #6
def load_model_predictions():
    # Note: grouped_dataset and top_k are assumed to come from the enclosing
    # scope; this function appears to be defined inside a larger evaluation
    # routine rather than at module level.
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    prediction_list = load_predictions(model_dir, decode_sig, top_k)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth list and prediction list length must "
                         "be equal: {} vs. {}".format(len(grouped_dataset),
                                                      len(prediction_list)))
    return prediction_list
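The length check above expects exactly one group of candidate commands per grouped source sentence. A minimal, runnable illustration of that expected shape; the data literals below are hypothetical and not taken from the repository:

# Hypothetical data showing the structure load_model_predictions() returns:
# one list of top-k candidate commands per grouped natural-language query.
grouped_dataset = [
    ('find text files', None),        # (nl key, data group) pairs after grouping
    ('count lines in a file', None),
]
prediction_list = [
    ["find . -name '*.txt'", "find . -type f -name '*.txt'"],  # top-k for example 0
    ['wc -l file.txt'],                                         # top-k for example 1
]
assert len(grouped_dataset) == len(prediction_list)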
Example #7
def gen_error_analysis_sheets(dataset, model_dir=None, decode_sig=None,
                              group_by_utility=False):
    if model_dir is None:
        model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
        model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    if group_by_utility:
        eval_tools.gen_error_analysis_csv_by_utility(
            model_dir, decode_sig, dataset, FLAGS)
    else:
        eval_tools.gen_error_analysis_csv(
            model_dir, decode_sig, dataset, FLAGS)
Example #8
def manual_eval(dataset, num_eval):
    _, decode_sig = graph_utils.get_decode_signature(FLAGS)
    eval_tools.manual_eval(decode_sig, dataset, FLAGS, FLAGS.model_root_dir,
                           num_eval)
Example #9
def gen_manual_evaluation_csv_single_model(dataset, FLAGS):
    """
    Generate a .csv spreadsheet for manual evaluation on dev/test set
    examples for a specific model.
    """
    # Group dataset
    tokenizer_selector = "cm" if FLAGS.explain else "nl"
    grouped_dataset = data_utils.group_parallel_data(
        dataset, use_bucket=True, tokenizer_selector=tokenizer_selector)

    # Load model predictions
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    prediction_list = load_predictions(model_dir, decode_sig, top_k=3)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth list and prediction list length must "
                         "be equal: {} vs. {}".format(len(grouped_dataset),
                                                      len(prediction_list)))

    # Load additional ground truths
    template_translations, command_translations = load_cached_correct_translations(
        FLAGS.data_dir)

    # Load cached evaluation results
    structure_eval_cache, command_eval_cache = load_cached_evaluations(
        os.path.join(FLAGS.data_dir, 'manual_judgements'))

    eval_bash = FLAGS.dataset.startswith("bash")
    cmd_parser = data_tools.bash_parser if eval_bash else data_tools.paren_parser

    output_path = os.path.join(model_dir, 'manual.evaluations.single.model')
    with open(output_path, 'w') as o_f:
        # write spreadsheet header
        o_f.write('id,description,command,correct template,correct command\n')
        for example_id in range(len(grouped_dataset)):
            data_group = grouped_dataset[example_id][1]
            sc_txt = data_group[0].sc_txt.strip()
            sc_key = get_example_nl_key(sc_txt)
            command_gts = [dp.tg_txt for dp in data_group]
            command_gts = set(command_gts + command_translations[sc_key])
            command_gt_asts = [
                data_tools.bash_parser(cmd) for cmd in command_gts
            ]
            template_gts = [
                data_tools.cmd2template(cmd, loose_constraints=True)
                for cmd in command_gts
            ]
            template_gts = set(template_gts + template_translations[sc_key])
            template_gt_asts = [
                data_tools.bash_parser(temp) for temp in template_gts
            ]
            predictions = prediction_list[example_id]
            for i in range(3):
                if i >= len(predictions):
                    o_f.write(',,,n,n\n')
                    continue
                pred_cmd = predictions[i]
                pred_tree = cmd_parser(pred_cmd)
                pred_temp = data_tools.ast2template(pred_tree,
                                                    loose_constraints=True)
                temp_match = tree_dist.one_match(template_gt_asts,
                                                 pred_tree,
                                                 ignore_arg_value=True)
                str_match = tree_dist.one_match(command_gt_asts,
                                                pred_tree,
                                                ignore_arg_value=False)
                # Match ground truths & existing judgements
                command_example_sig = '{}<NL_PREDICTION>{}'.format(
                    sc_key, pred_cmd)
                structure_example_sig = '{}<NL_PREDICTION>{}'.format(
                    sc_key, pred_temp)
                command_eval, structure_eval = '', ''
                if str_match:
                    command_eval = 'y'
                    structure_eval = 'y'
                elif temp_match:
                    structure_eval = 'y'
                if command_eval_cache and \
                        command_example_sig in command_eval_cache:
                    command_eval = command_eval_cache[command_example_sig]
                if structure_eval_cache and \
                        structure_example_sig in structure_eval_cache:
                    structure_eval = structure_eval_cache[
                        structure_example_sig]
                if i == 0:
                    o_f.write('{},"{}","{}",{},{}\n'.format(
                        example_id, sc_txt.replace('"', '""'),
                        pred_cmd.replace('"', '""'), structure_eval,
                        command_eval))
                else:
                    o_f.write(',,"{}",{},{}\n'.format(
                        pred_cmd.replace('"', '""'), structure_eval,
                        command_eval))
    print('manual evaluation spreadsheet saved to {}'.format(output_path))
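The spreadsheet writer above escapes embedded double quotes by doubling them, which is the standard CSV quoting convention, so the file can be read back with Python's csv module. A minimal sketch, assuming the output path produced above; read_manual_eval_sheet is a hypothetical helper, not part of the original code:

import csv

def read_manual_eval_sheet(path):
    # Read back the manual-evaluation spreadsheet written by
    # gen_manual_evaluation_csv_single_model; returns a list of row dicts keyed
    # by the header: id, description, command, correct template, correct command.
    with open(path) as f:
        return list(csv.DictReader(f))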
Example #10
def save_hyperparameters():
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    with open(os.path.join(FLAGS.model_root_dir, model_subdir, 'hyperparameters.pkl'), 'wb') as o_f:
        pickle.dump(FLAGS, o_f)
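save_hyperparameters pickles the FLAGS object directly, while the variant inside main() in Example #11 pickles a plain dict of flag values, which is easier to unpickle without TensorFlow's flags machinery loaded. A minimal sketch of reading the saved file back, assuming the path layout above; load_hyperparameters is a hypothetical helper, not part of the original code:

import os
import pickle

def load_hyperparameters(model_root_dir, model_subdir):
    # Load whatever was saved: a pickled FLAGS object here, or a plain dict of
    # flag values if the file was written by the variant in Example #11.
    path = os.path.join(model_root_dir, model_subdir, 'hyperparameters.pkl')
    with open(path, 'rb') as i_f:
        return pickle.load(i_f)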
Example #11
def main(_):
    # set GPU device
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # set up data and model directories
    FLAGS.data_dir = os.path.join(os.path.dirname(__file__), "..", "data",
                                  FLAGS.dataset)
    print("Reading data from {}".format(FLAGS.data_dir))

    # set up encoder/decoder dropout rate
    if FLAGS.universal_keep >= 0 and FLAGS.universal_keep < 1:
        FLAGS.sc_input_keep = FLAGS.universal_keep
        FLAGS.sc_output_keep = FLAGS.universal_keep
        FLAGS.tg_input_keep = FLAGS.universal_keep
        FLAGS.tg_output_keep = FLAGS.universal_keep
        FLAGS.attention_input_keep = FLAGS.universal_keep
        FLAGS.attention_output_keep = FLAGS.universal_keep

    # adjust hyperparameters for batch normalization
    if FLAGS.recurrent_batch_normalization:
        # larger batch size
        FLAGS.batch_size *= 4
        # larger initial learning rate
        FLAGS.learning_rate *= 10

    if FLAGS.decoder_topology in ['basic_tree']:
        FLAGS.model_root_dir = os.path.join(os.path.dirname(__file__), "..",
                                            FLAGS.model_root_dir, "seq2tree")
    elif FLAGS.decoder_topology in ['rnn']:
        FLAGS.model_root_dir = os.path.join(os.path.dirname(__file__), "..",
                                            FLAGS.model_root_dir, "seq2seq")
    else:
        raise ValueError("Unrecognized decoder topology: {}.".format(
            FLAGS.decoder_topology))
    print("Saving models to {}".format(FLAGS.model_root_dir))

    if FLAGS.process_data:
        process_data()

    else:
        train_set, dev_set, test_set = data_utils.load_data(
            FLAGS, use_buckets=False, load_features=False)
        dataset = test_set if FLAGS.test else dev_set

        if FLAGS.eval:
            eval(dataset, FLAGS.prediction_file)
        elif FLAGS.manual_eval:
            manual_eval(dataset, FLAGS.prediction_file)
        elif FLAGS.gen_error_analysis_sheet:
            gen_error_analysis_sheets(dataset, group_by_utility=True)
        elif FLAGS.gen_manual_evaluation_sheet:
            error_analysis.gen_manual_evaluation_csv(dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_sheet_single_model:
            error_analysis.gen_manual_evaluation_csv_single_model(
                dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_table:
            if FLAGS.test:
                eval_tools.gen_manual_evaluation_table(dataset, FLAGS)
            else:
                eval_tools.gen_manual_evaluation_table(dataset,
                                                       FLAGS,
                                                       num_examples=100)
        elif FLAGS.gen_auto_evaluation_table:
            eval_tools.gen_automatic_evaluation_table(dataset, FLAGS)
        elif FLAGS.tabulate_example_predictions:
            error_analysis.tabulate_example_predictions(dataset,
                                                        FLAGS,
                                                        num_examples=100)
        else:
            train_set, dev_set, test_set = data_utils.load_data(
                FLAGS, use_buckets=True)
            dataset = test_set if FLAGS.test else dev_set
            vocab = data_utils.load_vocabulary(FLAGS)

            print("Set dataset parameters")
            FLAGS.max_sc_length = train_set.max_sc_length if not train_set.buckets else \
                train_set.buckets[-1][0]
            FLAGS.max_tg_length = train_set.max_tg_length if not train_set.buckets else \
                train_set.buckets[-1][1]
            FLAGS.sc_vocab_size = len(vocab.sc_vocab)
            FLAGS.tg_vocab_size = len(vocab.tg_vocab)
            FLAGS.max_sc_token_size = vocab.max_sc_token_size
            FLAGS.max_tg_token_size = vocab.max_tg_token_size

            if FLAGS.gen_slot_filling_training_data:
                gen_slot_filling_training_data(FLAGS,
                                               [train_set, dev_set, test_set])

            elif FLAGS.decode:
                model = decode(dataset, buckets=train_set.buckets)
                if not FLAGS.explain:
                    eval(dataset, verbose=False)

            elif FLAGS.demo:
                demo(buckets=train_set.buckets)

            elif FLAGS.grid_search:
                meta_experiments.grid_search(train, decode, eval, train_set,
                                             dataset, FLAGS)
            elif FLAGS.schedule_experiments:
                schedule_experiments(train, decode, eval, train_set, dataset)
            else:
                # Train the model.
                train(train_set, dataset)

                if FLAGS.normalized:
                    tf.compat.v1.reset_default_graph()
                    gen_slot_filling_training_data(
                        FLAGS, [train_set, dev_set, test_set])
                    FLAGS.fill_argument_slots = True

                # save model hyperparameters
                model_subdir, decode_sig = graph_utils.get_decode_signature(
                    FLAGS)
                with open(
                        os.path.join(FLAGS.model_root_dir, model_subdir,
                                     'hyperparameters.pkl'), 'wb') as o_f:
                    flag_dict = dict()
                    for flag in dir(FLAGS):
                        flag_dict[flag] = getattr(FLAGS, flag)
                    pickle.dump(flag_dict, o_f)

                # Decode the new model on the development set.
                tf.compat.v1.reset_default_graph()
                model = decode(dataset, buckets=train_set.buckets)

                # Run automatic evaluation on the development set.
                if not FLAGS.explain:
                    eval(dataset, verbose=True)