def automatic_eval(model_dir, decode_sig, dataset, FLAGS, top_k,
                   num_samples=-1, verbose=False):
    """
    Generate automatic evaluation metrics on a dev/test set.

    The following metrics are computed at top 1, 3, 5, 10:
        1. Structure accuracy
        2. Full command accuracy
        3. Command keyword overlap
        4. BLEU
    """
    use_bucket = False if "knn" in model_dir else True
    grouped_dataset = data_utils.group_parallel_data(
        dataset, use_bucket=use_bucket)
    vocabs = data_utils.load_vocabulary(FLAGS)

    # Load predictions
    prediction_list = load_predictions(model_dir, decode_sig, top_k)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth and predictions length must be equal: "
                         "{} vs. {}".format(len(grouped_dataset),
                                            len(prediction_list)))

    M = get_automatic_evaluation_metrics(grouped_dataset, prediction_list,
                                         vocabs, FLAGS, top_k, num_samples,
                                         verbose)
    return M
def automatic_eval(prediction_path, dataset, FLAGS, top_k,
                   num_samples=-1, verbose=False):
    """
    Generate automatic evaluation metrics on a dev/test set.

    The following metrics are computed at top 1, 3, 5, 10:
        1. Structure accuracy
        2. Full command accuracy
        3. Command keyword overlap
        4. BLEU
    """
    grouped_dataset = data_utils.group_parallel_data(dataset)
    try:
        vocabs = data_utils.load_vocabulary(FLAGS)
    except ValueError:
        vocabs = None

    # Load predictions
    prediction_list = load_predictions(prediction_path, top_k)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth and predictions length must be equal: "
                         "{} vs. {}".format(len(grouped_dataset),
                                            len(prediction_list)))

    metrics = get_automatic_evaluation_metrics(
        grouped_dataset, prediction_list, vocabs, FLAGS, top_k, num_samples,
        verbose)
    return metrics
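# Illustrative usage (a sketch; the prediction path and flag values below are
# hypothetical, and `dev_set`/`FLAGS` are assumed to be set up as in main()
# further down):
#
#   metrics = automatic_eval(
#       'model/seq2seq/predictions.beam_search.latest', dev_set, FLAGS,
#       top_k=10, verbose=True)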
def demo(sess, model, FLAGS):
    """
    Simple command line decoding interface.
    """
    # Decode from standard input.
    sys.stdout.write('> ')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    vocabs = data_utils.load_vocabulary(FLAGS)
    while sentence:
        if FLAGS.fill_argument_slots:
            slot_filling_classifier = get_slot_filling_classifer(FLAGS)
            batch_outputs, sequence_logits = translate_fun(
                sentence, sess, model, vocabs, FLAGS,
                slot_filling_classifier=slot_filling_classifier)
        else:
            batch_outputs, sequence_logits = translate_fun(
                sentence, sess, model, vocabs, FLAGS)
        if FLAGS.token_decoding_algorithm == 'greedy':
            tree, pred_cmd, outputs = batch_outputs[0]
            score = sequence_logits[0]
            print('{} ({})'.format(pred_cmd, score))
        elif FLAGS.token_decoding_algorithm == 'beam_search':
            if batch_outputs:
                top_k_predictions = batch_outputs[0]
                top_k_scores = sequence_logits[0]
                for j in xrange(min(FLAGS.beam_size, 10,
                                    len(batch_outputs[0]))):
                    if len(top_k_predictions) <= j:
                        break
                    top_k_pred_tree, top_k_pred_cmd = top_k_predictions[j]
                    print('Prediction {}: {} ({})'.format(
                        j + 1, top_k_pred_cmd, top_k_scores[j]))
                print()
            else:
                print(APOLOGY_MSG)
        print('> ', end='')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
def gen_automatic_evaluation_table(dataset, FLAGS):
    # Group dataset
    grouped_dataset = data_utils.group_parallel_data(dataset, use_bucket=True)
    vocabs = data_utils.load_vocabulary(FLAGS)

    model_names, model_predictions = load_all_model_predictions(
        grouped_dataset, FLAGS, top_k=3)
    auto_evaluation_metrics = {}
    for model_id, model_name in enumerate(model_names):
        prediction_list = model_predictions[model_id]
        M = get_automatic_evaluation_metrics(
            grouped_dataset, prediction_list, vocabs, FLAGS, top_k=3)
        auto_evaluation_metrics[model_name] = \
            [M['top_bleu'][0], M['top_bleu'][1],
             M['top_cms'][0], M['top_cms'][1]]

    metrics_names = ['BLEU1', 'BLEU3', 'TM1', 'TM3']
    print_table(model_names, metrics_names, auto_evaluation_metrics)
def gen_automatic_evaluation_table(dataset, FLAGS):
    # Group dataset
    grouped_dataset = data_utils.group_parallel_data(dataset)
    vocabs = data_utils.load_vocabulary(FLAGS)

    model_names, model_predictions = load_all_model_predictions(
        grouped_dataset, FLAGS, top_k=3)
    auto_eval_metrics = {}
    for model_id, model_name in enumerate(model_names):
        prediction_list = model_predictions[model_id]
        if prediction_list is not None:
            M = get_automatic_evaluation_metrics(
                grouped_dataset, prediction_list, vocabs, FLAGS, top_k=3)
            auto_eval_metrics[model_name] = [
                M['bleu'][0], M['bleu'][1], M['cms'][0], M['cms'][1]
            ]
        else:
            print('Model {} skipped in evaluation'.format(model_name))

    metrics_names = ['BLEU1', 'BLEU3', 'TM1', 'TM3']
    print_eval_table(model_names, metrics_names, auto_eval_metrics)
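# A minimal sketch of the model-by-metric table the helper above feeds into
# print_eval_table. The formatting here is hypothetical; the real printer
# lives elsewhere in the repo.
def _print_eval_table_sketch(model_names, metrics_names, metrics):
    # Header row: model-name column followed by one column per metric.
    print('model'.ljust(24) + ''.join(m.rjust(10) for m in metrics_names))
    for name in model_names:
        if name in metrics:
            # One row per model that was not skipped in evaluation.
            print(name.ljust(24) +
                  ''.join('{:10.3f}'.format(v) for v in metrics[name]))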
FLAGS.sc_vocab_size = 1324
FLAGS.tg_vocab_size = 1219
FLAGS.max_sc_token_size = 100
FLAGS.max_tg_token_size = 100

buckets = [(13, 57), (18, 57), (42, 57)]

# Create TensorFlow session
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=FLAGS.log_device_placement))

# Create model and load neural model parameters.
model = translate.define_model(sess, forward_only=True, buckets=buckets)
print('loading models from {}'.format(FLAGS.model_dir))

vocabs = data_utils.load_vocabulary(FLAGS)

if FLAGS.fill_argument_slots:
    # Create slot filling classifier
    model_param_dir = os.path.join(FLAGS.model_dir, 'train.mappings.X.Y.npz')
    train_X, train_Y = data_utils.load_slot_filling_data(model_param_dir)
    slot_filling_classifier = classifiers.KNearestNeighborModel(
        FLAGS.num_nn_slot_filling, train_X, train_Y)
    print('Slot filling classifier parameters loaded.')
else:
    slot_filling_classifier = None

def translate_fun(sentence, slot_filling_classifier=slot_filling_classifier):
    print('translating |{}|'.format(sentence))
    # The original call was truncated here; the remaining arguments are
    # completed to match how decode_tools.translate_fun is invoked elsewhere
    # in this module.
    list_of_translations = decode_tools.translate_fun(
        sentence, sess, model, vocabs, FLAGS,
        slot_filling_classifier=slot_filling_classifier)
    return list_of_translations
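# Illustrative call to the wrapper defined above (hypothetical input; the
# wrapper closes over sess, model, vocabs, and FLAGS from this snippet):
#
#   translations = translate_fun('display the contents of foo.txt')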
def grid_search(train_fun, decode_fun, eval_fun, train_set, dev_set, FLAGS):
    """
    Perform hyperparameter tuning of a model using grid search.

    Usage: ./run-script.sh --grid_search --tuning hp1,...

    :param train_fun: Function to train the model.
    :param decode_fun: Function to decode from the trained model.
    :param eval_fun: Function to evaluate the decoding results.
    :param train_set: Training dataset.
    :param dev_set: Development dataset.
    :param FLAGS: General model hyperparameters.
    """
    FLAGS.create_fresh_params = True

    hyperparameters = FLAGS.tuning.split(',')
    num_hps = len(hyperparameters)
    hp_range = hyperparam_range

    print('======== Grid Search ========')
    print('%d hyperparameter(s): ' % num_hps)
    for i in xrange(num_hps):
        print('{}: {}'.format(
            hyperparameters[i], hp_range[hyperparameters[i]]))
    print()

    if FLAGS.dataset.startswith('bash'):
        metrics = ['top1_temp_ms', 'top1_cms', 'top3_temp_ms', 'top3_cms',
                   'top1_str_ms', 'top3_str_ms']
        metrics_weights = [0.1875, 0.1875, 0.0625, 0.0625, 0.25, 0.25]
    else:
        metrics = ['top1_temp_ms']
        metrics_weights = [1]
    metrics_signature = '+'.join(
        ['{}x{}'.format(m, mw) for m, mw in zip(metrics, metrics_weights)])

    # Grid search experiment log
    grid_search_log_file_name = 'grid_search_log.{}'.format(FLAGS.channel)
    if FLAGS.use_copy:
        grid_search_log_file_name += '.{}'.format(FLAGS.copy_fun)
    if FLAGS.normalized:
        grid_search_log_file_name += '.normalized'
    grid_search_log_file = open(
        os.path.join(FLAGS.model_root_dir, grid_search_log_file_name), 'w')

    # Generate grid
    param_grid = [v for v in hp_range[hyperparameters[0]]]
    for i in xrange(1, num_hps):
        param_grid = itertools.product(param_grid,
                                       hp_range[hyperparameters[i]])

    # Initialize metrics value
    best_hp_set = [-1] * num_hps
    best_seed = -1
    best_metrics_value = 0

    for row in param_grid:
        row = nest.flatten(row)
        # Set current hyperparameter set
        for i in xrange(num_hps):
            setattr(FLAGS, hyperparameters[i], row[i])
            if hyperparameters[i] == 'universal_keep':
                setattr(FLAGS, 'sc_input_keep', row[i])
                setattr(FLAGS, 'sc_output_keep', row[i])
                setattr(FLAGS, 'tg_input_keep', row[i])
                setattr(FLAGS, 'tg_output_keep', row[i])
                setattr(FLAGS, 'attention_input_keep', row[i])
                setattr(FLAGS, 'attention_output_keep', row[i])

        print('Trying parameter set: ')
        for i in xrange(num_hps):
            print('* {}: {}'.format(hyperparameters[i], row[i]))

        # Try different random seeds if tuning initialization
        num_trials = 5 if FLAGS.initialization else 1

        if 'min_vocab_frequency' in hyperparameters or \
                'num_buckets' in hyperparameters:
            # Read train and dev sets from disk
            train_set, dev_set, test_set = data_utils.load_data(
                FLAGS, use_buckets=True, load_mappings=False)
            vocab = data_utils.load_vocabulary(FLAGS)
            FLAGS.sc_vocab_size = len(vocab.sc_vocab)
            FLAGS.tg_vocab_size = len(vocab.tg_vocab)
            FLAGS.max_sc_token_size = vocab.max_sc_token_size
            FLAGS.max_tg_token_size = vocab.max_tg_token_size

        for t in xrange(num_trials):
            seed = random.getrandbits(32)
            tf.set_random_seed(seed)
            metrics_value = single_round_model_eval(
                train_fun, decode_fun, eval_fun, train_set, dev_set,
                metrics, metrics_weights)

            print('Parameter set: ')
            for i in xrange(num_hps):
                print('* {}: {}'.format(hyperparameters[i], row[i]))
            print('random seed: {}'.format(seed))
            print('{} = {}'.format(metrics_signature, metrics_value))

            grid_search_log_file.write('Parameter set: \n')
            for i in xrange(num_hps):
                grid_search_log_file.write('* {}: {}\n'.format(
                    hyperparameters[i], row[i]))
            grid_search_log_file.write('random seed: {}\n'.format(seed))
            grid_search_log_file.write('{} = {}\n\n'.format(
                metrics_signature, metrics_value))

            print('Best parameter set so far: ')
            for i in xrange(num_hps):
                print('* {}: {}'.format(hyperparameters[i], best_hp_set[i]))
            print('Best random seed so far: {}'.format(best_seed))
            print('Best evaluation metrics so far = {}'.format(
                best_metrics_value))

            if metrics_value > best_metrics_value:
                best_hp_set = row
                best_seed = seed
                best_metrics_value = metrics_value
                print('☺ New best parameter setting found\n')
    print()

    print('*****************************')
    print('Best parameter set: ')
    for i in xrange(num_hps):
        print('* {}: {}'.format(hyperparameters[i], best_hp_set[i]))
    print('Best seed = {}'.format(best_seed))
    print('Best {} = {}'.format(metrics, best_metrics_value))
    print('*****************************')

    grid_search_log_file.write('*****************************\n')
    grid_search_log_file.write('Best parameter set: \n')
    for i in xrange(num_hps):
        grid_search_log_file.write('* {}: {}\n'.format(
            hyperparameters[i], best_hp_set[i]))
    grid_search_log_file.write('Best seed = {}\n'.format(best_seed))
    grid_search_log_file.write('Best {} = {}\n'.format(
        metrics, best_metrics_value))
    grid_search_log_file.write('*****************************')
    grid_search_log_file.close()
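# A self-contained sketch (illustrative names only) of how the nested
# itertools.product calls in grid_search enumerate the grid, and how a
# weighted combination of the kind the metrics/metrics_weights pairing
# suggests single_round_model_eval computes could look:
import itertools

hp_range_demo = {'learning_rate': [1e-3, 1e-4], 'batch_size': [64, 128]}
hp_names = ['learning_rate', 'batch_size']

grid = [v for v in hp_range_demo[hp_names[0]]]
for name in hp_names[1:]:
    # Each product() wraps the previous grid, so rows come out nested when
    # more than two hyperparameters are tuned, e.g. ((0.001, 64), 32);
    # nest.flatten in grid_search unnests them back into a flat row.
    grid = itertools.product(grid, hp_range_demo[name])

def weighted_score(metric_values, metric_weights):
    # Weighted sum pairing each metric value with its weight (an assumed
    # combination rule, mirroring the metrics/metrics_weights lists above).
    return sum(v * w for v, w in zip(metric_values, metric_weights))

# e.g. weighted_score([0.62, 0.48], [0.75, 0.25]) == 0.585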
def decode_set(sess, model, dataset, top_k, FLAGS, verbose=False):
    """
    Compute top-k predictions on the dev/test dataset and write the
    predictions to disk.

    :param sess: A TensorFlow session.
    :param model: Prediction model object.
    :param top_k: Number of top predictions to compute.
    :param FLAGS: Training/testing hyperparameter settings.
    :param verbose: If set, also print decoding results to screen.
    """
    nl2bash = FLAGS.dataset.startswith('bash') and not FLAGS.explain

    tokenizer_selector = 'cm' if FLAGS.explain else 'nl'
    grouped_dataset = data_utils.group_parallel_data(
        dataset, tokenizer_selector=tokenizer_selector)
    vocabs = data_utils.load_vocabulary(FLAGS)
    rev_sc_vocab = vocabs.rev_sc_vocab

    ts = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H%M%S')
    pred_file_path = os.path.join(model.model_dir, 'predictions.{}.{}'.format(
        model.decode_sig, ts))
    pred_file = open(pred_file_path, 'w')
    eval_file_path = os.path.join(model.model_dir,
                                  'predictions.{}.{}.csv'.format(
                                      model.decode_sig, ts))
    eval_file = open(eval_file_path, 'w')
    eval_file.write('example_id, description, ground_truth, prediction, ' +
                    'correct template, correct command\n')

    for example_id in xrange(len(grouped_dataset)):
        key, data_group = grouped_dataset[example_id]

        sc_txt = data_group[0].sc_txt.strip()
        sc_tokens = [rev_sc_vocab[i] for i in data_group[0].sc_ids]
        if FLAGS.channel == 'char':
            sc_temp = ''.join(sc_tokens)
            sc_temp = sc_temp.replace(constants._SPACE, ' ')
        else:
            sc_temp = ' '.join(sc_tokens)
        tg_txts = [dp.tg_txt for dp in data_group]
        tg_asts = [data_tools.bash_parser(tg_txt) for tg_txt in tg_txts]
        if verbose:
            print('\nExample {}:'.format(example_id))
            print('Original Source: {}'.format(sc_txt.encode('utf-8')))
            print('Source: {}'.format(sc_temp.encode('utf-8')))
            for j in xrange(len(data_group)):
                print('GT Target {}: {}'.format(
                    j + 1, data_group[j].tg_txt.encode('utf-8')))

        if FLAGS.fill_argument_slots:
            slot_filling_classifier = get_slot_filling_classifer(FLAGS)
            batch_outputs, sequence_logits = translate_fun(
                data_group, sess, model, vocabs, FLAGS,
                slot_filling_classifier=slot_filling_classifier)
        else:
            batch_outputs, sequence_logits = translate_fun(
                data_group, sess, model, vocabs, FLAGS)
        if FLAGS.tg_char:
            batch_outputs, batch_char_outputs = batch_outputs

        eval_row = '{},"{}",'.format(example_id, sc_txt.replace('"', '""'))

        if batch_outputs:
            if FLAGS.token_decoding_algorithm == 'greedy':
                tree, pred_cmd = batch_outputs[0]
                if nl2bash:
                    pred_cmd = data_tools.ast2command(
                        tree, loose_constraints=True)
                score = sequence_logits[0]
                if verbose:
                    print('Prediction: {} ({})'.format(pred_cmd, score))
                pred_file.write('{}\n'.format(pred_cmd))
            elif FLAGS.token_decoding_algorithm == 'beam_search':
                top_k_predictions = batch_outputs[0]
                if FLAGS.tg_char:
                    top_k_char_predictions = batch_char_outputs[0]
                top_k_scores = sequence_logits[0]
                num_preds = min(FLAGS.beam_size, top_k,
                                len(top_k_predictions))
                for j in xrange(num_preds):
                    if j > 0:
                        eval_row = ',,'
                    if j < len(tg_txts):
                        eval_row += '"{}",'.format(
                            tg_txts[j].strip().replace('"', '""'))
                    else:
                        eval_row += ','
                    top_k_pred_tree, top_k_pred_cmd = top_k_predictions[j]
                    if nl2bash:
                        pred_cmd = data_tools.ast2command(
                            top_k_pred_tree, loose_constraints=True)
                    else:
                        pred_cmd = top_k_pred_cmd
                    pred_file.write('{}|||'.format(pred_cmd.encode('utf-8')))
                    eval_row += '"{}",'.format(pred_cmd.replace('"', '""'))
                    temp_match = tree_dist.one_match(
                        tg_asts, top_k_pred_tree, ignore_arg_value=True)
                    str_match = tree_dist.one_match(
                        tg_asts, top_k_pred_tree, ignore_arg_value=False)
                    if temp_match:
                        eval_row += 'y,'
                    if str_match:
                        eval_row += 'y'
                    eval_file.write('{}\n'.format(eval_row.encode('utf-8')))
                    if verbose:
                        print('Prediction {}: {} ({})'.format(
                            j + 1, pred_cmd.encode('utf-8'),
                            top_k_scores[j]))
                        if FLAGS.tg_char:
                            print('Character-based prediction {}: {}'.format(
                                j + 1,
                                top_k_char_predictions[j].encode('utf-8')))
                pred_file.write('\n')
        else:
            print(APOLOGY_MSG)
            pred_file.write('\n')
            eval_file.write('{}\n'.format(eval_row))
            eval_file.write('\n')
            eval_file.write('\n')

    pred_file.close()
    eval_file.close()

    shutil.copyfile(pred_file_path, os.path.join(
        FLAGS.model_dir,
        'predictions.{}.latest'.format(model.decode_sig)))
    shutil.copyfile(eval_file_path, os.path.join(
        FLAGS.model_dir,
        'predictions.{}.latest.csv'.format(model.decode_sig)))
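# Illustrative reader for the prediction file written above: each example
# occupies one line, with the top-k beam-search commands joined by '|||'
# (greedy decoding writes a single command per line). This is a sketch, not
# the repo's actual load_predictions helper.
def _read_predictions_sketch(pred_file_path, top_k):
    predictions = []
    with open(pred_file_path) as f:
        for line in f:
            # Trailing '|||' leaves an empty field; drop it.
            cmds = [c for c in line.strip().split('|||') if c]
            predictions.append(cmds[:top_k])
    return predictions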
def main(_):
    # Set GPU device
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # Set up data and model directories
    FLAGS.data_dir = os.path.join(
        os.path.dirname(__file__), "..", "data", FLAGS.dataset)
    print("Reading data from {}".format(FLAGS.data_dir))

    # Set up encoder/decoder dropout rate
    if FLAGS.universal_keep >= 0 and FLAGS.universal_keep < 1:
        FLAGS.sc_input_keep = FLAGS.universal_keep
        FLAGS.sc_output_keep = FLAGS.universal_keep
        FLAGS.tg_input_keep = FLAGS.universal_keep
        FLAGS.tg_output_keep = FLAGS.universal_keep
        FLAGS.attention_input_keep = FLAGS.universal_keep
        FLAGS.attention_output_keep = FLAGS.universal_keep

    # Adjust hyperparameters for batch normalization
    if FLAGS.recurrent_batch_normalization:
        # Larger batch size
        FLAGS.batch_size *= 4
        # Larger initial learning rate
        FLAGS.learning_rate *= 10

    if FLAGS.decoder_topology in ['basic_tree']:
        FLAGS.model_root_dir = os.path.join(
            os.path.dirname(__file__), "..", FLAGS.model_root_dir,
            "seq2tree")
    elif FLAGS.decoder_topology in ['rnn']:
        FLAGS.model_root_dir = os.path.join(
            os.path.dirname(__file__), "..", FLAGS.model_root_dir,
            "seq2seq")
    else:
        raise ValueError("Unrecognized decoder topology: {}.".format(
            FLAGS.decoder_topology))
    print("Saving models to {}".format(FLAGS.model_root_dir))

    if FLAGS.process_data:
        process_data()
    else:
        train_set, dev_set, test_set = data_utils.load_data(
            FLAGS, use_buckets=True, load_mappings=False)
        vocab = data_utils.load_vocabulary(FLAGS)

        print("Set dataset parameters")
        FLAGS.max_sc_length = train_set.max_sc_length \
            if not train_set.buckets else train_set.buckets[-1][0]
        FLAGS.max_tg_length = train_set.max_tg_length \
            if not train_set.buckets else train_set.buckets[-1][1]
        FLAGS.sc_vocab_size = len(vocab.sc_vocab)
        FLAGS.tg_vocab_size = len(vocab.tg_vocab)
        FLAGS.max_sc_token_size = vocab.max_sc_token_size
        FLAGS.max_tg_token_size = vocab.max_tg_token_size

        dataset = test_set if FLAGS.test else dev_set
        if FLAGS.eval:
            eval(dataset)
            save_hyperparameters()
        elif FLAGS.gen_error_analysis_sheet:
            gen_error_analysis_sheets(dataset, group_by_utility=True)
        elif FLAGS.gen_manual_evaluation_sheet:
            error_analysis.gen_manual_evaluation_csv(dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_sheet_single_model:
            error_analysis.gen_manual_evaluation_csv_single_model(
                dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_table:
            if FLAGS.test:
                eval_tools.gen_evaluation_table(dataset, FLAGS)
            else:
                eval_tools.gen_evaluation_table(dataset, FLAGS,
                                                num_examples=100)
        elif FLAGS.gen_auto_evaluation_table:
            eval_tools.gen_automatic_evaluation_table(dataset, FLAGS)
        elif FLAGS.tabulate_example_predictions:
            error_analysis.tabulate_example_predictions(dataset, FLAGS,
                                                        num_examples=100)
        elif FLAGS.gen_slot_filling_training_data:
            gen_slot_filling_training_data(FLAGS,
                                           [train_set, dev_set, test_set])
        elif FLAGS.decode:
            model = decode(dataset, buckets=train_set.buckets)
            if not FLAGS.explain:
                eval(dataset, model.model_dir, model.decode_sig,
                     verbose=False)
        elif FLAGS.demo:
            demo(buckets=train_set.buckets)
        elif FLAGS.grid_search:
            meta_experiments.grid_search(
                train, decode, eval, train_set, dataset, FLAGS)
        elif FLAGS.schedule_experiments:
            schedule_experiments(train, decode, eval, train_set, dataset)
        else:
            # Train the model.
            train(train_set, dataset)
            if FLAGS.normalized:
                tf.reset_default_graph()
                gen_slot_filling_training_data(
                    FLAGS, [train_set, dev_set, test_set])
                FLAGS.fill_argument_slots = True

            # Save model hyperparameters
            save_hyperparameters()

            # Decode the new model on the development set.
            tf.reset_default_graph()
            model = decode(dataset, buckets=train_set.buckets)

            # Run automatic evaluation on the development set.
            if not FLAGS.explain:
                eval(dataset, model.model_dir, model.decode_sig,
                     verbose=False)