def manual_eval(model, dataset, rev_nl_vocab, FLAGS, output_dir, num_eval=30):
    num_top1_correct_temp = 0.0
    num_top3_correct_temp = 0.0
    num_top5_correct_temp = 0.0
    num_top10_correct_temp = 0.0
    num_top1_correct = 0.0
    num_top3_correct = 0.0
    num_top5_correct = 0.0
    num_top10_correct = 0.0
    num_evaled = 0

    grouped_dataset = data_utils.group_data_by_nl(dataset, use_bucket=False) \
        .values()
    # Fixed-seed shuffle so repeated runs walk the examples in the same order.
    random.shuffle(grouped_dataset, lambda: 0.5208484091114275)

    cmd_parser = data_tools.bash_parser if FLAGS.dataset == "bash" \
        else data_utils.parse_brackets

    o_f = open(os.path.join(output_dir, "manual.eval.results"), 'w')

    rejudge = False

    with DBConnection() as db:
        db.create_schema()
        while num_evaled < len(grouped_dataset):
            nl_strs, cm_strs, nls, search_historys = grouped_dataset[num_evaled]
            nl_str = nl_strs[0].decode('utf-8')

            if num_evaled == num_eval:
                break

            gt_trees = [cmd_parser(cmd) for cmd in cm_strs]
            predictions = db.get_top_k_predictions(model, nl_str, k=10)

            top1_correct_temp, top3_correct_temp, top5_correct_temp, \
                top10_correct_temp = False, False, False, False
            top1_correct, top3_correct, top5_correct, top10_correct = \
                False, False, False, False

            # evaluation ignoring ordering of flags
            print("Example %d (%d)" % (num_evaled+1, len(cm_strs)))
            o_f.write("Example %d (%d)" % (num_evaled+1, len(cm_strs)) + "\n")
            print("English: " + nl_str.strip())
            o_f.write("English: " + nl_str.encode('utf-8') + "\n")
            for j in xrange(len(cm_strs)):
                print("GT Command %d: " % (j+1) + cm_strs[j].strip())
                o_f.write("GT Command %d: " % (j+1) + cm_strs[j].strip() + "\n")

            pred_id = 0
            while pred_id < min(1, len(predictions)):
                pred_cmd, score = predictions[pred_id]
                tree = cmd_parser(pred_cmd)
                print("Prediction {}: {} ({})".format(pred_id+1, pred_cmd, score))
                o_f.write("Prediction {}: {} ({})\n".format(pred_id+1, pred_cmd, score))
                print()
                pred_temp = data_tools.ast2template(tree, loose_constraints=True)
                str_judge = db.get_str_judgement((nl_str, pred_cmd))
                temp_judge = db.get_temp_judgement((nl_str, pred_temp))

                # reuse a cached template judgement unless the user asked to
                # re-judge this example
                if temp_judge is not None and not rejudge:
                    judgement_str = "y" if temp_judge == 1 \
                        else "n ({})".format(error_types[temp_judge])
                    print("Correct template [y/n]: %s" % judgement_str)
                else:
                    temp_judge = ast_based.one_match(
                        gt_trees, tree, rewrite=False, ignore_arg_value=True)
                    if not temp_judge:
                        inp = raw_input("Correct template [y/n]: ")
                        if inp == "REVERSE":
                            rejudge = True
                        else:
                            if inp == "y":
                                temp_judge = True
                                db.add_temp_judgement((nl_str, pred_temp, 1))
                            else:
                                temp_judge = False
                                error_type = raw_input(
                                    "Error type: \n"
                                    "(2) extra utility \n"
                                    "(3) missing utility \n"
                                    "(4) confused utility \n"
                                    "(5) extra flag \n"
                                    "(6) missing flag \n"
                                    "(7) confused flag \n"
                                    "(8) logic error\n"
                                    "(9) count error\n"
                                )
                                db.add_temp_judgement(
                                    (nl_str, pred_temp, int(error_type)))
                            rejudge = False
                    else:
                        print("Correct template [y/n]: y")

                if temp_judge == 1:
                    if pred_id < 1:
                        top1_correct_temp = True
                        top3_correct_temp = True
                        top5_correct_temp = True
                        top10_correct_temp = True
                    elif pred_id < 3:
                        top3_correct_temp = True
                        top5_correct_temp = True
                        top10_correct_temp = True
                    elif pred_id < 5:
                        top5_correct_temp = True
                        top10_correct_temp = True
                    elif pred_id < 10:
                        top10_correct_temp = True
                    o_f.write("C")

                    if str_judge is not None and not rejudge:
                        judgement_str = "y" if str_judge == 1 \
                            else "n ({})".format(error_types[str_judge])
                        print("Correct command [y/n]: %s" % judgement_str)
                    else:
                        str_judge = ast_based.one_match(
                            gt_trees, tree, rewrite=False, ignore_arg_value=False)
                        if not str_judge:
                            inp = raw_input("Correct command [y/n]: ")
                            if inp == "REVERSE":
                                rejudge = True
                                continue
                            elif inp == "y":
                                str_judge = True
                                o_f.write("C")
                                db.add_str_judgement((nl_str, pred_cmd, 1))
                            else:
                                str_judge = False
                                o_f.write("W")
                                db.add_str_judgement((nl_str, pred_cmd, 0))
                        else:
                            print("Correct command [y/n]: y")

                    if str_judge == 1:
                        if pred_id < 1:
                            top1_correct = True
                            top3_correct = True
                            top5_correct = True
                            top10_correct = True
                        elif pred_id < 3:
                            top3_correct = True
                            top5_correct = True
                            top10_correct = True
                        elif pred_id < 5:
                            top5_correct = True
                            top10_correct = True
                        elif pred_id < 10:
                            top10_correct = True
                        o_f.write("C")
                    else:
                        o_f.write("W")
                else:
                    o_f.write("WW")

                o_f.write("\n")
                o_f.write("\n")
                pred_id += 1

            if rejudge:
                # "REVERSE" was entered: step back and re-judge.
                num_evaled -= 1
            else:
                num_evaled += 1
                if top1_correct_temp:
                    num_top1_correct_temp += 1
                if top3_correct_temp:
                    num_top3_correct_temp += 1
                if top5_correct_temp:
                    num_top5_correct_temp += 1
                if top10_correct_temp:
                    num_top10_correct_temp += 1
                if top1_correct:
                    num_top1_correct += 1
                if top3_correct:
                    num_top3_correct += 1
                if top5_correct:
                    num_top5_correct += 1
                if top10_correct:
                    num_top10_correct += 1
                rejudge = False

            print()

    print()
    print("%d examples evaluated" % num_eval)
    print("Top 1 Template Match Score = %.2f" % (num_top1_correct_temp/num_eval))
    print("Top 1 String Match Score = %.2f" % (num_top1_correct/num_eval))
    if len(predictions) > 3:
        print("Top 5 Template Match Score = %.2f" % (num_top5_correct_temp/num_eval))
        print("Top 5 String Match Score = %.2f" % (num_top5_correct/num_eval))
        print("Top 10 Template Match Score = %.2f" % (num_top10_correct_temp/num_eval))
        print("Top 10 String Match Score = %.2f" % (num_top10_correct/num_eval))
    print()

    o_f.write("%d examples evaluated" % num_eval + "\n")
    o_f.write("Top 1 Template Match Score = %.2f" % (num_top1_correct_temp/num_eval) + "\n")
    o_f.write("Top 1 String Match Score = %.2f" % (num_top1_correct/num_eval) + "\n")
    if len(predictions) > 1:
        o_f.write("Top 5 Template Match Score = %.2f" % (num_top5_correct_temp/num_eval) + "\n")
        o_f.write("Top 5 String Match Score = %.2f" % (num_top5_correct/num_eval) + "\n")
        o_f.write("Top 10 Template Match Score = %.2f" % (num_top10_correct_temp/num_eval) + "\n")
        o_f.write("Top 10 String Match Score = %.2f" % (num_top10_correct/num_eval) + "\n")
    o_f.write("\n")
    o_f.close()
def eval_set(model, dataset, rev_nl_vocab, FLAGS, verbose=True):
    num_top1_correct_temp = 0.0
    num_top3_correct_temp = 0.0
    num_top5_correct_temp = 0.0
    num_top10_correct_temp = 0.0
    total_top1_temp_dist = 0.0
    total_top3_temp_dist = 0.0
    total_top5_temp_dist = 0.0
    total_top10_temp_dist = 0.0
    num_top1_correct = 0.0
    num_top3_correct = 0.0
    num_top5_correct = 0.0
    num_top10_correct = 0.0
    total_top1_dist = 0.0
    total_top3_dist = 0.0
    total_top5_dist = 0.0
    total_top10_dist = 0.0
    num_eval = 0

    eval_bash = FLAGS.dataset.startswith("bash")
    cmd_parser = data_tools.bash_parser if eval_bash \
        else data_tools.paren_parser
    use_bucket = False if model == "knn" else True

    grouped_dataset = data_utils.group_data_by_nl(
        dataset, use_bucket=use_bucket, use_nl_temp=eval_bash)

    with DBConnection() as db:
        for nl_temp in grouped_dataset:
            nl_strs, cm_strs, nls, search_historys = grouped_dataset[nl_temp]
            nl_str = nl_strs[0].decode('utf-8')

            gt_trees = [cmd_parser(cmd) for cmd in cm_strs]
            num_gts = len(gt_trees)
            gt_trees = gt_trees + [cmd_parser(cmd) for cmd in
                                   db.get_correct_temps(nl_str)]

            predictions = db.get_top_k_predictions(model, nl_str, k=10)

            if verbose:
                print("Example %d (%d)" % (num_eval, len(cm_strs)))
                print("Original English: " + nl_str.strip())
                print("English: " + nl_temp)
                for j in xrange(len(cm_strs)):
                    print("GT Command {}: ".format(j+1) + cm_strs[j].strip())

            num_eval += (1 if eval_bash else num_gts)

            top1_correct_temp, top3_correct_temp, top5_correct_temp, \
                top10_correct_temp = False, False, False, False
            top1_correct, top3_correct, top5_correct, top10_correct = \
                False, False, False, False
            top1_temp_dist = sys.maxint
            top3_temp_dist = sys.maxint
            top5_temp_dist = sys.maxint
            top10_temp_dist = sys.maxint
            top1_dist = sys.maxint
            top3_dist = sys.maxint
            top5_dist = sys.maxint
            top10_dist = sys.maxint

            for i in xrange(min(1, len(predictions))):
                pred_cmd, score = predictions[i]
                tree = cmd_parser(pred_cmd)
                # evaluation ignoring flag orders
                temp_match = ast_based.one_match(gt_trees, tree, ignore_arg_value=True)
                str_match = ast_based.one_match(gt_trees, tree, ignore_arg_value=False)
                min_temp_dist = ast_based.min_dist(gt_trees, tree, ignore_arg_value=True)
                min_dist = ast_based.min_dist(gt_trees, tree, ignore_arg_value=False)

                if temp_match:
                    if i < 1:
                        top1_correct_temp = True
                    if i < 3:
                        top3_correct_temp = True
                    if i < 5:
                        top5_correct_temp = True
                    if i < 10:
                        top10_correct_temp = True
                if str_match:
                    if i < 1:
                        top1_correct = True
                    if i < 3:
                        top3_correct = True
                    if i < 5:
                        top5_correct = True
                    if i < 10:
                        top10_correct = True
                if i < 1:
                    if min_temp_dist < top1_temp_dist:
                        top1_temp_dist = min_temp_dist
                    if min_dist < top1_dist:
                        top1_dist = min_dist
                if i < 3:
                    if min_temp_dist < top3_temp_dist:
                        top3_temp_dist = min_temp_dist
                    if min_dist < top3_dist:
                        top3_dist = min_dist
                if i < 5:
                    if min_temp_dist < top5_temp_dist:
                        top5_temp_dist = min_temp_dist
                    if min_dist < top5_dist:
                        top5_dist = min_dist
                if i < 10:
                    if min_temp_dist < top10_temp_dist:
                        top10_temp_dist = min_temp_dist
                    if min_dist < top10_dist:
                        top10_dist = min_dist

                if verbose:
                    print("Prediction {}: {} ({}) ({})".format(
                        i+1, pred_cmd, score, temp_match))
            if verbose:
                print()

            if top1_correct_temp:
                num_top1_correct_temp += (1 if eval_bash else num_gts)
            if top3_correct_temp:
                num_top3_correct_temp += (1 if eval_bash else num_gts)
            if top5_correct_temp:
                num_top5_correct_temp += (1 if eval_bash else num_gts)
            if top10_correct_temp:
                num_top10_correct_temp += (1 if eval_bash else num_gts)
            if top1_correct:
                num_top1_correct += (1 if eval_bash else num_gts)
            if top3_correct:
                num_top3_correct += (1 if eval_bash else num_gts)
            if top5_correct:
                num_top5_correct += (1 if eval_bash else num_gts)
            if top10_correct:
                num_top10_correct += (1 if eval_bash else num_gts)
            total_top1_temp_dist += top1_temp_dist
            total_top3_temp_dist += top3_temp_dist
            total_top5_temp_dist += top5_temp_dist
            total_top10_temp_dist += top10_temp_dist
            total_top1_dist += top1_dist
            total_top3_dist += top3_dist
            total_top5_dist += top5_dist
            total_top10_dist += top10_dist

    # TODO: compute top-K matching scores
    top1_temp_match_score = num_top1_correct_temp / num_eval
    top1_string_match_score = num_top1_correct / num_eval
    avg_top1_temp_dist = (total_top1_temp_dist + 0.0) / num_eval
    avg_top1_dist = (total_top1_dist + 0.0) / num_eval

    print("%d examples evaluated" % num_eval)
    print("Percentage of top 1 Match (template-only) = %.3f" % top1_temp_match_score)
    print("Percentage of top 1 Match (whole-string) = %.3f" % top1_string_match_score)
    print("Average top 1 Tree Edit Distance (template-only) = %.2f" % avg_top1_temp_dist)
    print("Average top 1 Tree Edit Distance (whole-string) = %.2f" % avg_top1_dist)

    if len(predictions) > 1:
        print("Top 3 Template Match Score = %.3f" % (num_top3_correct_temp/num_eval))
        print("Top 3 String Match Score = %.3f" % (num_top3_correct/num_eval))
        avg_top3_temp_dist = (total_top3_temp_dist + 0.0) / num_eval
        avg_top3_dist = (total_top3_dist + 0.0) / num_eval
        print("Average top 3 Tree Edit Distance (template-only) = %.2f" % avg_top3_temp_dist)
        print("Average top 3 Tree Edit Distance (whole-string) = %.2f" % avg_top3_dist)
    if len(predictions) > 3:
        print("Top 5 Template Match Score = %.3f" % (num_top5_correct_temp/num_eval))
        print("Top 5 String Match Score = %.3f" % (num_top5_correct/num_eval))
        avg_top5_temp_dist = (total_top5_temp_dist + 0.0) / num_eval
        avg_top5_dist = (total_top5_dist + 0.0) / num_eval
        print("Average top 5 Tree Edit Distance (template-only) = %.2f" % avg_top5_temp_dist)
        print("Average top 5 Tree Edit Distance (whole-string) = %.2f" % avg_top5_dist)
    if len(predictions) > 5:
        print("Top 10 Template Match Score = %.3f" % (num_top10_correct_temp/num_eval))
        print("Top 10 String Match Score = %.3f" % (num_top10_correct/num_eval))
        avg_top10_temp_dist = (total_top10_temp_dist + 0.0) / num_eval
        avg_top10_dist = (total_top10_dist + 0.0) / num_eval
        print("Average top 10 Tree Edit Distance (template-only) = %.2f" % avg_top10_temp_dist)
        print("Average top 10 Tree Edit Distance (whole-string) = %.2f" % avg_top10_dist)
    print()

    return top1_temp_match_score, top1_string_match_score, \
        avg_top1_temp_dist, avg_top1_dist
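
# eval_set tracks, for each cutoff k, the smallest tree edit distance seen
# among the top-k predictions. That bookkeeping reduces to a running minimum
# per cutoff; below is a sketch using a dict keyed by cutoff (hypothetical
# helper, assuming the same 1/3/5/10 cutoffs and sys.maxint sentinels).
def _update_min_dists(dists, rank, dist, cutoffs=(1, 3, 5, 10)):
    """Lower dists[k] to dist for every cutoff k that covers the 0-based rank."""
    for k in cutoffs:
        if rank < k and dist < dists[k]:
            dists[k] = dist
    return dists

# Example: starting from {1: sys.maxint, 3: sys.maxint, ...}, a distance of 2
# observed at rank 1 updates the top-3/5/10 minima but leaves top-1 untouched.
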
def decode_set(model, dataset, rev_sc_vocab, rev_tg_vocab, verbose=True):
    # NOTE: `model_name` and `FLAGS` are assumed to be defined at module scope.
    grouped_dataset = data_utils.group_data_by_nl(dataset, use_bucket=False,
                                                  use_temp=False)

    with DBConnection() as db:
        db.remove_model(model_name)

        num_eval = 0
        for sc_temp in grouped_dataset:
            batch_sc_strs, batch_tg_strs, batch_scs, batch_cmds = \
                grouped_dataset[sc_temp]
            _, entities = tokenizer.ner_tokenizer(sc_temp)
            nl_fillers = entities[-1]
            if nl_fillers is not None:
                cm_slots = {}

            sc_str = batch_sc_strs[0]
            nl = batch_scs[0]
            if verbose:
                print("Example {}".format(num_eval+1))
                print("Original English: " + sc_str.strip())
                print("English: " + sc_temp)
                for j in xrange(len(batch_tg_strs)):
                    print("GT Command {}: {}".format(j+1, batch_tg_strs[j].strip()))

            # retrieve top-ranked command template
            top_k_results = model.test(nl, 100)
            count = 0
            for i in xrange(len(top_k_results)):
                nn, output_tokens, score = top_k_results[i]
                nn_str = ' '.join([rev_sc_vocab[j] for j in nn])
                tokens = []
                for j in xrange(1, len(output_tokens)-1):
                    pred_token = rev_tg_vocab[output_tokens[j]]
                    if "@@" in pred_token:
                        pred_token = pred_token.split("@@")[-1]
                    if nl_fillers is not None and \
                            pred_token in constants._ENTITIES:
                        if j > 0 and slot_filling.is_min_flag(
                                rev_tg_vocab[output_tokens[j-1]]):
                            pred_token_type = 'Timespan'
                        else:
                            pred_token_type = pred_token
                        cm_slots[j] = (pred_token, pred_token_type)
                    tokens.append(pred_token)
                pred_cmd = ' '.join(tokens)

                if FLAGS.dataset.startswith("bash"):
                    pred_cmd = re.sub(r'( ;\s+)|( ;$)', r' \; ', pred_cmd)
                    tree = data_tools.bash_parser(pred_cmd)
                else:
                    tree = data_tools.paren_parser(pred_cmd)

                # check if the predicted command template has enough slots to
                # hold the fillers (to rule out templates that are trivially
                # unqualified)
                if nl_fillers is None or len(cm_slots) >= len(nl_fillers):
                    # Step 2: check if the predicted command template is
                    # grammatical; filter out non-grammatical output
                    if tree is not None:
                        matched = slot_filling.heuristic_slot_filling(tree, nl_fillers)
                if tree is not None:
                    slot_filling.fill_default_value(tree)
                    pred_cmd = data_tools.ast2command(tree)

                if verbose:
                    print("NN: {}".format(nn_str))
                    print("Prediction {}: {} ({})".format(i+1, pred_cmd, score))
                db.add_prediction(model_name, sc_str, pred_cmd, float(score),
                                  update_mode=False)
                count += 1
                if count == 10:
                    break
            print("")
            num_eval += 1
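
# decode_set normalizes predicted tokens of the form "a@@b" to their final
# segment before assembling the command string. A standalone sketch of that
# normalization follows (`_strip_token_prefix` is a hypothetical helper that
# mirrors the split performed inline above).
def _strip_token_prefix(token):
    """Return the segment after the last "@@" in a token; pass others through."""
    return token.split("@@")[-1] if "@@" in token else token

# e.g. _strip_token_prefix("find@@File") == "File"
#      _strip_token_prefix("find") == "find"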