def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim,
         feat_input_dim, epochs, layers, optimization, regularization, learning_rate, plot, override, eval_only,
         ensemble):
    """Train (or load) an inflection model, predict the test set and write the results.

    The model stored as ``results_file_path + '_bestmodel.txt'`` is loaded when that file exists
    and ``override`` is falsy; otherwise a fresh model is built. Unless ``eval_only`` is set the
    model is trained with ``train_model`` (train data for fitting, dev data passed alongside).
    Predictions come from ``predict_with_ensemble_majority`` when ``ensemble`` is truthy and from
    ``predict_sequences`` otherwise. If any predictions were produced they are scored with
    ``evaluate_model`` and handed, with the hyper parameters, to
    ``common.write_results_file_and_evaluate_externally``.

    Returns None.
    """
    # NOTE: print(...) is always given one argument, so the output matches the Python 2 print statement.
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name, value in hyper_params.items():
        print(name + '=' + str(value))

    # read the three data splits
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    dev_words, dev_lemmas, dev_feat_dicts = prepare_sigmorphon_data.load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # symbols used for character dropout
    alphabet.extend([NULL, UNK])
    # symbols used during decoding
    alphabet.extend([EPSILON, BEGIN_WORD, END_WORD])
    # position markers - indicate copying a character from the lemma to the word
    alphabet.extend(str(position) for position in xrange(MAX_PREDICTION_LEN))

    # character <-> integer id tables
    alphabet_index = {char: char_id for char_id, char in enumerate(alphabet)}
    inverse_alphabet_index = dict((char_id, char) for char, char_id in alphabet_index.items())

    # feature -> integer id table
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = {feat: feat_id for feat_id, feat in enumerate(feature_alphabet)}

    model_file_name = results_file_path + '_bestmodel.txt'
    reuse_saved_model = os.path.isfile(model_file_name) and not override
    if reuse_saved_model:
        print('loading existing model from {}'.format(model_file_name))
        model, encoder_frnn, encoder_rrnn, decoder_rnn = task1_attention_implementation.load_best_model(
            alphabet, results_file_path, input_dim, hidden_dim, layers, feature_alphabet, feat_input_dim,
            feature_types)
        print('loaded existing model successfully')
    else:
        print('could not find existing model or explicit override was requested. '
              'starting training from scratch...')
        model, encoder_frnn, encoder_rrnn, decoder_rnn = build_model(alphabet, input_dim, hidden_dim, layers,
                                                                     feature_types, feat_input_dim,
                                                                     feature_alphabet)

    if eval_only:
        print('skipped training, evaluating on test set...')
    else:
        # training replaces the loaded/built model with the trained one
        model, last_epoch, best_epoch = train_model(model, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas,
                                                    train_feat_dicts, train_words, dev_lemmas, dev_feat_dicts,
                                                    dev_words, alphabet_index, inverse_alphabet_index, epochs,
                                                    optimization, results_file_path, feat_index, feature_types,
                                                    plot)
        print('last epoch is {}'.format(last_epoch))
        print('best epoch is {}'.format(best_epoch))
        print('finished training')

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(alphabet, alphabet_index, ensemble, feat_index,
                                                             feat_input_dim, feature_alphabet, feature_types,
                                                             hidden_dim, input_dim, inverse_alphabet_index, layers,
                                                             test_feat_dicts, test_lemmas, test_words)
    else:
        predicted_sequences = predict_sequences(model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                                                inverse_alphabet_index, test_lemmas, test_feat_dicts, feat_index,
                                                feature_types)

    # nothing predicted - nothing to evaluate or write
    if len(predicted_sequences) == 0:
        return

    # internal evaluation of the final model on the test set
    amount, accuracy = evaluate_model(predicted_sequences, test_lemmas, test_feat_dicts, test_words,
                                      feature_types, print_results=False)
    print('initial eval: {}% accuracy'.format(accuracy))

    # predictions are keyed by "lemma:morph-string"; re-key them by test position
    final_results = {}
    for position, lemma in enumerate(test_lemmas):
        feat_dict = test_feat_dicts[position]
        joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
        final_results[position] = (lemma, feat_dict, ''.join(predicted_sequences[joint_index]))

    # external evaluation
    common.write_results_file_and_evaluate_externally(hyper_params, accuracy, train_path, test_path,
                                                      results_file_path + '.external_eval.txt',
                                                      sigmorphon_root_dir, final_results)
    return
def evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_cluster_to_data_indices, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_cluster_to_data_indices, train_path, train_words):
    """Evaluate the per-cluster best models on the test data and write the results.

    For every cluster key of ``train_cluster_to_data_indices`` that has training examples, the
    matching test examples are predicted - by majority vote over the comma-separated model paths
    in ``ensemble`` when it is truthy, otherwise with the best model under ``results_file_path`` -
    then scored with ``evaluate_model`` and stored under their original test index. Macro and
    micro averaged accuracies are printed and passed, together with the predictions, to
    ``common.write_results_file_and_evaluate_externally``.

    A cluster whose prediction step fails is reported and skipped, and the averages fall back to
    0.0 when no cluster could be scored, so one failure does not abort the whole run.

    Returns None.
    """
    # NOTE: print(...) is always given one argument, so the output matches the Python 2 print statement.
    accuracies = []
    final_results = {}

    # factored model: one model per cluster type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # cluster-specific training data (only its size is needed here)
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only {} samples for this inflection type. skipping'.format(str(len(train_cluster_words))))
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # a KeyError anywhere below is reported as "cluster missing from the test data"
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
            test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
            test_cluster_words = [test_words[i] for i in test_indices]
            test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

            if ensemble:
                # load the ensemble models for this cluster
                ensemble_model_names = ensemble.split(',')
                print('ensemble paths:\n')
                print('\n'.join(ensemble_model_names))
                ensemble_models = []
                for model_path in ensemble_model_names:
                    model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                        str(cluster_index), alphabet, model_path, input_dim, hidden_dim, layers,
                        feature_alphabet, feat_input_dim, feature_types)
                    ensemble_models.append((model, encoder_frnn, encoder_rrnn, decoder_rnn))

                # predict the whole cluster with each model of the ensemble
                ensemble_predictions = []
                for model, encoder_frnn, encoder_rrnn, decoder_rnn in ensemble_models:
                    ensemble_predictions.append(
                        predict_templates(model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                                          inverse_alphabet_index, test_cluster_lemmas, test_cluster_feat_dicts,
                                          feat_index, feature_types))

                predicted_templates = {}
                string_to_template = {}
                cluster_size = len(test_cluster_lemmas)

                # vote per test input - joint_index is a lemma+feats representation
                test_data = zip(test_cluster_lemmas, test_cluster_feat_dicts, test_cluster_words)
                for i, (lemma, feat_dict, _word) in enumerate(test_data):
                    joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
                    prediction_counter = defaultdict(int)
                    for prediction in ensemble_predictions:
                        template = prediction[joint_index]
                        prediction_str = ''.join(instantiate_template(template, lemma))
                        prediction_counter[prediction_str] += 1
                        string_to_template[prediction_str] = template
                        print(u'template: {} prediction: {}'.format(template, prediction_str))

                    # the most frequently predicted output wins
                    predicted_template_string = max(prediction_counter, key=prediction_counter.get)

                    # hack: if chosen without majority, pick shortest prediction
                    if prediction_counter[predicted_template_string] == 1:
                        predicted_template_string = min(prediction_counter, key=len)

                    print(u'chosen:{} with {} votes\n'.format(predicted_template_string,
                                                              prediction_counter[predicted_template_string]))
                    predicted_templates[joint_index] = string_to_template[predicted_template_string]

                    # progress indication
                    sys.stdout.write("\r%d%%" % (float(i) / cluster_size * 100))
                    sys.stdout.flush()
            else:
                # load best model - no ensemble
                best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                    str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers,
                    feature_alphabet, feat_input_dim, feature_types)
                print('starting to predict for cluster: {}'.format(cluster_type))
                try:
                    predicted_templates = predict_templates(best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                                                            alphabet_index, inverse_alphabet_index,
                                                            test_cluster_lemmas, test_cluster_feat_dicts,
                                                            feat_index, feature_types)
                except Exception as e:
                    print(e)
                    traceback.print_exc()
                    # without predictions there is nothing to score or write for this cluster;
                    # continuing would use an unbound name or the previous cluster's predictions
                    print('prediction failed for cluster: {} - skipping'.format(cluster_type))
                    continue

            print('evaluating predictions for cluster: {}'.format(cluster_type))
            try:
                accuracy = evaluate_model(predicted_templates, test_cluster_lemmas, test_cluster_feat_dicts,
                                          test_cluster_words, feature_types, print_results=True)
                accuracies.append(accuracy)
            except Exception as e:
                # scoring is best effort - the predictions are still written below
                print(e)
                traceback.print_exc()

            # store the predictions under their original test index so they can later be
            # written in the order of the original file, in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = instantiate_template(predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)
            print('clusters in test are: {}'.format(test_cluster_to_data_indices.keys()))
            print('clusters in train are: {}'.format(train_cluster_to_data_indices.keys()))

    # each entry is indexed as (weight, accuracy) - presumably (number of examples, accuracy)
    accuracy_vals = [entry[1] for entry in accuracies]
    if accuracies:
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('no cluster was evaluated successfully - reporting 0.0 accuracy')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(entry[0] * entry[1] for entry in accuracies)
    mic_denom = sum(entry[0] for entry in accuracies)
    # guard the division: there may be no scored examples at all
    micro_average_accuracy = float(mic_nom) / mic_denom if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'
    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path, test_path,
                                                      results_file_path + suffix, sigmorphon_root_dir,
                                                      final_results)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim):
    """Evaluate the saved per-POS-cluster models on the test data and write the results.

    The train data is only used to rebuild the alphabet, the feature alphabet and the clusters
    (``common.cluster_data_by_pos``). For every training cluster the best saved model is loaded
    with ``load_best_model``, the matching test examples are predicted and scored, and the
    instantiated inflections are stored under their original test index. Macro and micro
    averaged accuracies are printed and passed, with the predictions, to
    ``common.write_results_file_and_evaluate_externally``.

    The averages fall back to 0.0 when no cluster could be scored (previously a
    ZeroDivisionError).

    Returns None.
    """
    # NOTE: print(...) is always given one argument, so the output matches the Python 2 print statement.
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by POS type (features); common.cluster_data_by_morph_type(feat_dicts, feature_types)
    # is the alternative that clusters by inflection type
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    accuracies = []
    final_results = {}

    # factored model: one model per cluster type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # cluster-specific training data (only its size is needed here)
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # a KeyError anywhere below is reported as "cluster missing from the test data"
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
            test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
            test_cluster_words = [test_words[i] for i in test_indices]
            test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers, feature_alphabet,
                feat_input_dim, feature_types)

            predicted_templates = task1_joint_structured_inflection_feedback_fix.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index, inverse_alphabet_index,
                test_cluster_lemmas, test_cluster_feat_dicts, feat_index, feature_types)

            accuracy = task1_joint_structured_inflection_feedback_fix.evaluate_model(
                predicted_templates, test_cluster_lemmas, test_cluster_feat_dicts, test_cluster_words,
                feature_types, print_results=False)
            accuracies.append(accuracy)

            # store the predictions under their original test index so they can later be
            # written in the order of the original file, in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = task1_joint_structured_inflection_feedback_fix.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)

    # each entry is indexed as (weight, accuracy) - presumably (number of examples, accuracy)
    accuracy_vals = [entry[1] for entry in accuracies]
    if accuracies:
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('no cluster was evaluated successfully - reporting 0.0 accuracy')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(entry[0] * entry[1] for entry in accuracies)
    mic_denom = sum(entry[0] for entry in accuracies)
    # guard the division: there may be no scored examples at all
    micro_average_accuracy = float(mic_nom) / mic_denom if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'
    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path, test_path,
                                                      results_file_path + suffix, sigmorphon_root_dir,
                                                      final_results)
def evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path, test_words, train_path,
                  print_results=False):
    """Predict the whole test set with the best model (or an ensemble) and write the results.

    When ``ensemble`` is truthy it is split on ',' into model paths; every model predicts the
    test set and the final output per input is chosen by majority vote (the shortest candidate
    when no output got more than one vote). Otherwise the best model under
    ``results_file_path`` is used. The predictions are scored with ``evaluate_model``, stripped
    of ``STEP`` symbols, and passed with the averaged accuracies to
    ``common.write_results_file_and_evaluate_externally``.

    ``print_results`` only controls the per-vote debug printing of the ensemble path.

    If the single-model prediction fails, the error is reported and the function returns without
    writing results. If only the internal scoring fails, the accuracies fall back to 0.0 and the
    predictions are still written.

    Returns None.
    """
    # NOTE: print(...) is always given one argument, so the output matches the Python 2 print statement.
    print("<<<<<<<<<<<<<<<<<< DEBUG ==>evaluate ndst")
    accuracies = []
    final_results = {}

    if ensemble:
        # load ensemble models
        ensemble_model_names = ensemble.split(',')
        print('ensemble paths:\n')
        print('\n'.join(ensemble_model_names))
        ensemble_models = []
        for model_path in ensemble_model_names:
            # keep all eight components: every one of them is needed again for prediction
            # (storing only four made the unpacking below fail with a ValueError)
            ensemble_models.append(load_best_model(alphabet, model_path, input_dim, hidden_dim, layers,
                                                   feature_alphabet, feat_input_dim, feature_types))

        # predict the entire test set with each model in the ensemble
        print('predicting...')
        ensemble_predictions = []
        for count, em in enumerate(ensemble_models, 1):
            model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn = em
            ensemble_predictions.append(
                predict_sequences(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn,
                                  decoder_rnn, alphabet_index, inverse_alphabet_index, test_lemmas,
                                  test_feat_dicts, feat_index, feature_types))
            print('finished to predict with ensemble: {}/{}'.format(count, len(ensemble_model_names)))

        predicted_sequences = {}
        string_to_sequence = {}
        test_size = len(test_lemmas)

        # perform voting for each test input - joint_index is a lemma+feats representation
        test_data = zip(test_lemmas, test_feat_dicts, test_words)
        for i, (lemma, feat_dict, _word) in enumerate(test_data):
            joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
            prediction_counter = defaultdict(int)

            # count votes
            for prediction in ensemble_predictions:
                sequence = prediction[joint_index]
                prediction_str = ''.join(sequence).replace(STEP, '')
                prediction_counter[prediction_str] += 1
                string_to_sequence[prediction_str] = sequence
                if print_results:
                    print('template: {} prediction: {}'.format(sequence.encode('utf8'),
                                                               prediction_str.encode('utf8')))

            # the most frequently predicted output wins
            predicted_sequence_string = max(prediction_counter, key=prediction_counter.get)

            # hack: if chosen without majority, pick shortest prediction
            if prediction_counter[predicted_sequence_string] == 1:
                predicted_sequence_string = min(prediction_counter, key=len)

            if print_results:
                print('chosen:{} with {} votes\n'.format(predicted_sequence_string.encode('utf8'),
                                                         prediction_counter[predicted_sequence_string]))

            predicted_sequences[joint_index] = string_to_sequence[predicted_sequence_string]

            # progress indication
            sys.stdout.write("\r%d%%" % (float(i) / test_size * 100))
            sys.stdout.flush()
    else:
        # load best model - no ensemble
        best_model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
            alphabet, results_file_path, input_dim, hidden_dim, layers, feature_alphabet, feat_input_dim,
            feature_types)
        try:
            print("predicting")
            predicted_sequences = predict_sequences(best_model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                    encoder_rrnn, decoder_rnn, alphabet_index,
                                                    inverse_alphabet_index, test_lemmas, test_feat_dicts,
                                                    feat_index, feature_types)
        except Exception as e:
            print("except1!")
            print(e)
            traceback.print_exc()
            # without predictions there is nothing to score or write; going on would only
            # fail later with a NameError that hides the real cause
            print('prediction failed - no results were written')
            return

    # run internal evaluation (best effort - the predictions are written either way)
    try:
        accuracy = evaluate_model(predicted_sequences, test_lemmas, test_feat_dicts, test_words, feature_types,
                                  print_results=False)
        accuracies.append(accuracy)
    except Exception as e:
        print("except2!")
        print(e)
        traceback.print_exc()

    # store the predictions in the order of the original file so they can later be
    # written in the task format
    for i, lemma in enumerate(test_lemmas):
        joint_index = lemma + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
        inflection = ''.join(predicted_sequences[joint_index]).replace(STEP, '')
        final_results[i] = (lemma, test_feat_dicts[i], inflection)

    # each entry is indexed as (weight, accuracy) - presumably (number of examples, accuracy)
    accuracy_vals = [entry[1] for entry in accuracies]
    if accuracies:
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('internal evaluation produced no accuracy - reporting 0.0')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(entry[0] * entry[1] for entry in accuracies)
    mic_denom = sum(entry[0] for entry in accuracies)
    # guard the division: there may be no scored examples at all
    micro_average_accuracy = float(mic_nom) / mic_denom if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'
    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path, test_path,
                                                      results_file_path + suffix, sigmorphon_root_dir,
                                                      final_results)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim):
    """Evaluate the saved per-POS-cluster NDST models on the test data and write the results.

    The train data is only used to rebuild the alphabet (including ``3 * MAX_PREDICTION_LEN``
    position markers and the ``STEP`` symbol), the feature alphabet and the clusters
    (``common.cluster_data_by_pos``). For every training cluster the best saved model is loaded
    with ``load_best_model``, the matching test examples are predicted and scored through
    ``task1_ndst``, and the instantiated inflections are stored under their original test index.
    Macro and micro averaged accuracies are printed and passed, with the predictions, to
    ``common.write_results_file_and_evaluate_externally``.

    The averages fall back to 0.0 when no cluster could be scored (previously a
    ZeroDivisionError).

    Returns None.
    """
    # NOTE: print(...) is always given one argument, so the output matches the Python 2 print statement.
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train and test data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # cluster the data by POS type (features); common.cluster_data_by_morph_type(feat_dicts, feature_types)
    # is the alternative that clusters by inflection type
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    accuracies = []
    final_results = {}

    # factored model: one model per cluster type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # cluster-specific training data (only its size is needed here)
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # a KeyError anywhere below is reported as "cluster missing from the test data"
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
            test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
            test_cluster_words = [test_words[i] for i in test_indices]
            test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers, feature_alphabet,
                feat_input_dim, feature_types)

            predicted_templates = task1_ndst.predict_templates(best_model, decoder_rnn, encoder_frnn,
                                                               encoder_rrnn, alphabet_index,
                                                               inverse_alphabet_index, test_cluster_lemmas,
                                                               test_cluster_feat_dicts, feat_index, feature_types)

            accuracy = task1_ndst.evaluate_model(predicted_templates, test_cluster_lemmas,
                                                 test_cluster_feat_dicts, test_cluster_words, feature_types,
                                                 print_results=True)
            accuracies.append(accuracy)

            # store the predictions under their original test index so they can later be
            # written in the order of the original file, in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = task1_ndst.instantiate_template(predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)

    # each entry is indexed as (weight, accuracy) - presumably (number of examples, accuracy)
    accuracy_vals = [entry[1] for entry in accuracies]
    if accuracies:
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('no cluster was evaluated successfully - reporting 0.0 accuracy')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(entry[0] * entry[1] for entry in accuracies)
    mic_denom = sum(entry[0] for entry in accuracies)
    # guard the division: there may be no scored examples at all
    micro_average_accuracy = float(mic_nom) / mic_denom if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'
    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path, test_path,
                                                      results_file_path + suffix, sigmorphon_root_dir,
                                                      final_results)
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim,
         feat_input_dim, epochs, layers, optimization, regularization, learning_rate, plot, override, eval_only,
         ensemble):
    """Train (or load) an inflection model, predict the test set and write the results.

    The model stored as ``results_file_path + '_bestmodel.txt'`` is loaded when that file exists
    and ``override`` is falsy; otherwise a fresh model is built. Unless ``eval_only`` is set the
    model is trained with ``train_model`` (train data for fitting, dev data passed alongside).
    Predictions come from ``predict_with_ensemble_majority`` when ``ensemble`` is truthy and from
    ``predict_sequences`` otherwise. If any predictions were produced they are scored with
    ``evaluate_model`` and handed, with the hyper parameters, to
    ``common.write_results_file_and_evaluate_externally``.

    Returns None.
    """
    # NOTE: print(...) is always given one argument, so the output matches the Python 2 print statement.
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name, value in hyper_params.items():
        print(name + '=' + str(value))

    # read the three data splits
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    dev_words, dev_lemmas, dev_feat_dicts = prepare_sigmorphon_data.load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # symbols used for character dropout
    alphabet.extend([NULL, UNK])
    # symbols used during decoding
    alphabet.extend([EPSILON, BEGIN_WORD, END_WORD])
    # position markers - indicate copying a character from the lemma to the word
    alphabet.extend(str(position) for position in xrange(MAX_PREDICTION_LEN))

    # character <-> integer id tables
    alphabet_index = {char: char_id for char_id, char in enumerate(alphabet)}
    inverse_alphabet_index = dict((char_id, char) for char, char_id in alphabet_index.items())

    # feature -> integer id table
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = {feat: feat_id for feat_id, feat in enumerate(feature_alphabet)}

    model_file_name = results_file_path + '_bestmodel.txt'
    reuse_saved_model = os.path.isfile(model_file_name) and not override
    if reuse_saved_model:
        print('loading existing model from {}'.format(model_file_name))
        model, encoder_frnn, encoder_rrnn, decoder_rnn = task1_attention_implementation.load_best_model(
            alphabet, results_file_path, input_dim, hidden_dim, layers, feature_alphabet, feat_input_dim,
            feature_types)
        print('loaded existing model successfully')
    else:
        print('could not find existing model or explicit override was requested. '
              'starting training from scratch...')
        model, encoder_frnn, encoder_rrnn, decoder_rnn = build_model(alphabet, input_dim, hidden_dim, layers,
                                                                     feature_types, feat_input_dim,
                                                                     feature_alphabet)

    if eval_only:
        print('skipped training, evaluating on test set...')
    else:
        # training replaces the loaded/built model with the trained one
        model, last_epoch, best_epoch = train_model(model, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas,
                                                    train_feat_dicts, train_words, dev_lemmas, dev_feat_dicts,
                                                    dev_words, alphabet_index, inverse_alphabet_index, epochs,
                                                    optimization, results_file_path, feat_index, feature_types,
                                                    plot)
        print('last epoch is {}'.format(last_epoch))
        print('best epoch is {}'.format(best_epoch))
        print('finished training')

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(alphabet, alphabet_index, ensemble, feat_index,
                                                             feat_input_dim, feature_alphabet, feature_types,
                                                             hidden_dim, input_dim, inverse_alphabet_index, layers,
                                                             test_feat_dicts, test_lemmas, test_words)
    else:
        predicted_sequences = predict_sequences(model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                                                inverse_alphabet_index, test_lemmas, test_feat_dicts, feat_index,
                                                feature_types)

    # nothing predicted - nothing to evaluate or write
    if len(predicted_sequences) == 0:
        return

    # internal evaluation of the final model on the test set
    amount, accuracy = evaluate_model(predicted_sequences, test_lemmas, test_feat_dicts, test_words,
                                      feature_types, print_results=False)
    print('initial eval: {}% accuracy'.format(accuracy))

    # predictions are keyed by "lemma:morph-string"; re-key them by test position
    final_results = {}
    for position, lemma in enumerate(test_lemmas):
        feat_dict = test_feat_dicts[position]
        joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
        final_results[position] = (lemma, feat_dict, ''.join(predicted_sequences[joint_index]))

    # external evaluation
    common.write_results_file_and_evaluate_externally(hyper_params, accuracy, train_path, test_path,
                                                      results_file_path + '.external_eval.txt',
                                                      sigmorphon_root_dir, final_results)
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, epochs, layers, optimization, feat_input_dim,
         nbest, ensemble, majority):
    """Predict the test set with the best saved model(s) and write the results.

    Loads the train and test data, rebuilds the character and feature indices
    from the training data, loads either the single best model or an ensemble
    of models, predicts templates for the test examples (1-best or n-best) and
    hands the instantiated predictions to
    ``common.write_results_file_and_evaluate_externally``.

    Args:
        train_path: path of the training data; its last path component is
            also used to derive the language name printed with the accuracy.
        test_path: path of the data to predict. If it contains the substring
            'test' the results file gets the suffix '.best.test', otherwise
            '.best'.
        results_file_path: prefix handed to ``load_best_model`` and prefix of
            the written results file.
        sigmorphon_root_dir: passed through to the external evaluation.
        input_dim, hidden_dim, layers, feat_input_dim: passed to
            ``load_best_model``.
        epochs, optimization: only recorded in the hyper-parameter dict.
        nbest: 1 for plain 1-best prediction (accuracy is computed); any
            other value produces n-best lists and the accuracy is reported
            as -1.
        ensemble: falsy for a single model, otherwise a comma-separated
            string of model prefixes, each handed to ``load_best_model``.
        majority: with an ensemble, selects
            ``predict_templates_with_ensemble_majority`` instead of
            ``predict_templates_with_ensemble``.

    Raises:
        ValueError: if n-best output is requested together with an ensemble.
    """
    # n-best decoding uses the single best model, which is never loaded when
    # an ensemble is given. This combination used to die with an unbound
    # `best_model` only after every ensemble model had been loaded and run,
    # so reject it up front with a readable message instead.
    if ensemble and nbest != 1:
        raise ValueError('n-best prediction (nbest != 1) is not supported '
                         'together with an ensemble')

    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'NBEST': nbest
    }

    # single-argument parenthesised prints behave exactly like the print
    # statement under Python 2
    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to
    # word
    for position in xrange(MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # feat 2 int
    feat_index = {feat: idx for idx, feat in enumerate(feature_alphabet)}

    # char 2 int
    alphabet_index = {char: idx for idx, char in enumerate(alphabet)}
    inverse_alphabet_index = {
        index: char for char, index in alphabet_index.items()
    }

    # The data is not really factored here: both sides go through
    # get_single_pseudo_cluster. Clustering by POS
    # (common.cluster_data_by_pos) or by inflection type
    # (common.cluster_data_by_morph_type) were the alternatives.
    train_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(
        train_feat_dicts)
    test_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(
        test_feat_dicts)

    # Bound before the loop so that the final write still works when a
    # cluster is skipped; -1 is the "no accuracy computed" value that the
    # n-best branch reports as well.
    is_nbest = nbest != 1
    micro_average_accuracy = -1
    final_results = {}

    # language name, e.g. the file name without the '-task1-train' part
    lang = train_path.split('/')[-1].replace('-task{0}-train'.format('1'), '')

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [
            train_words[i]
            for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) +
                  ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) +
              '/' + str(len(train_cluster_to_data_indices)) + ': ' +
              cluster_type + ' with ' + str(len(train_cluster_words)) +
              ' examples')

        # test best model
        test_indices = test_cluster_to_data_indices[cluster_type]
        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        # handle model ensemble
        if ensemble:
            ensemble_model_names = ensemble.split(',')
            print('\n'.join(ensemble_model_names))
            ensemble_models = []
            for ens in ensemble_model_names:
                model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                    str(cluster_index), alphabet, ens, input_dim, hidden_dim,
                    layers, feature_alphabet, feat_input_dim, feature_types)
                ensemble_models.append(
                    (model, encoder_frnn, encoder_rrnn, decoder_rnn))

            # predict using the ensemble
            if not majority:
                predicted_templates = task1_single_ms2s.predict_templates_with_ensemble(
                    ensemble_models, alphabet_index, inverse_alphabet_index,
                    test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                    feature_types)
            else:
                predicted_templates = task1_single_ms2s.predict_templates_with_ensemble_majority(
                    ensemble_models, alphabet_index, inverse_alphabet_index,
                    test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                    feature_types)
        else:
            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim,
                hidden_dim, layers, feature_alphabet, feat_input_dim,
                feature_types)

        if not is_nbest:
            if not ensemble:
                predicted_templates = task1_single_ms2s.predict_templates(
                    best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                    alphabet_index, inverse_alphabet_index,
                    test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                    feature_types)

            # compute the predictions accuracy
            accuracy = task1_single_ms2s.evaluate_model(
                predicted_templates,
                test_cluster_lemmas,
                test_cluster_feat_dicts,
                test_cluster_words,
                feature_types,
                print_results=True)
            print('{0} {1} accuracy: {2}'.format(lang, cluster_type,
                                                 accuracy[1]))

            # get predicted_templates in the same order they appeared in the
            # original file: results are keyed by the original example index
            # and looked up through the lemma + morph-string key
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                inflection = task1_single_ms2s.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    inflection)

            micro_average_accuracy = accuracy[1]
        else:
            # handle the creation of nbest lists
            predicted_nbset_templates = task1_single_ms2s.predict_nbest_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index, test_cluster_lemmas,
                test_cluster_feat_dicts, feat_index, feature_types, nbest,
                test_cluster_words)

            # same ordering scheme as above, but every example maps to the
            # list of its instantiated n-best templates
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                # each n-best entry is a pair; only its first item (the
                # template) is needed here
                nbest_inflections = [
                    task1_single_ms2s.instantiate_template(
                        template, test_lemmas[i])
                    for (template, _) in predicted_nbset_templates[joint_index]
                ]
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    nbest_inflections)

            # no accuracy is computed for n-best lists
            micro_average_accuracy = -1

    suffix = '.best.test' if 'test' in test_path else '.best'

    common.write_results_file_and_evaluate_externally(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results,
        is_nbest)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, epochs, layers, optimization, feat_input_dim,
         nbest, ensemble, majority):
    """Predict the test set with the best saved model(s) and write the results.

    Loads the train and test data, rebuilds the character and feature indices
    from the training data, loads either the single best model or an ensemble
    of models, predicts templates for the test examples (1-best or n-best) and
    hands the instantiated predictions to
    ``common.write_results_file_and_evaluate_externally``.

    Args:
        train_path: path of the training data; its last path component is
            also used to derive the language name printed with the accuracy.
        test_path: path of the data to predict. If it contains the substring
            'test' the results file gets the suffix '.best.test', otherwise
            '.best'.
        results_file_path: prefix handed to ``load_best_model`` and prefix of
            the written results file.
        sigmorphon_root_dir: passed through to the external evaluation.
        input_dim, hidden_dim, layers, feat_input_dim: passed to
            ``load_best_model``.
        epochs, optimization: only recorded in the hyper-parameter dict.
        nbest: 1 for plain 1-best prediction (accuracy is computed); any
            other value produces n-best lists and the accuracy is reported
            as -1.
        ensemble: falsy for a single model, otherwise a comma-separated
            string of model prefixes, each handed to ``load_best_model``.
        majority: with an ensemble, selects
            ``predict_templates_with_ensemble_majority`` instead of
            ``predict_templates_with_ensemble``.

    Raises:
        ValueError: if n-best output is requested together with an ensemble.
    """
    # n-best decoding uses the single best model, which is never loaded when
    # an ensemble is given. This combination used to die with an unbound
    # `best_model` only after every ensemble model had been loaded and run,
    # so reject it up front with a readable message instead.
    if ensemble and nbest != 1:
        raise ValueError('n-best prediction (nbest != 1) is not supported '
                         'together with an ensemble')

    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'NBEST': nbest
    }

    # single-argument parenthesised prints behave exactly like the print
    # statement under Python 2
    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to
    # word
    for position in xrange(MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # feat 2 int
    feat_index = {feat: idx for idx, feat in enumerate(feature_alphabet)}

    # char 2 int
    alphabet_index = {char: idx for idx, char in enumerate(alphabet)}
    inverse_alphabet_index = {
        index: char for char, index in alphabet_index.items()
    }

    # The data is not really factored here: both sides go through
    # get_single_pseudo_cluster. Clustering by POS
    # (common.cluster_data_by_pos) or by inflection type
    # (common.cluster_data_by_morph_type) were the alternatives.
    train_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(
        train_feat_dicts)
    test_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(
        test_feat_dicts)

    # Bound before the loop so that the final write still works when a
    # cluster is skipped; -1 is the "no accuracy computed" value that the
    # n-best branch reports as well.
    is_nbest = nbest != 1
    micro_average_accuracy = -1
    final_results = {}

    # language name, e.g. the file name without the '-task1-train' part
    lang = train_path.split('/')[-1].replace('-task{0}-train'.format('1'), '')

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [
            train_words[i]
            for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) +
                  ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) +
              '/' + str(len(train_cluster_to_data_indices)) + ': ' +
              cluster_type + ' with ' + str(len(train_cluster_words)) +
              ' examples')

        # test best model
        test_indices = test_cluster_to_data_indices[cluster_type]
        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        # handle model ensemble
        if ensemble:
            ensemble_model_names = ensemble.split(',')
            print('\n'.join(ensemble_model_names))
            ensemble_models = []
            for ens in ensemble_model_names:
                model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                    str(cluster_index), alphabet, ens, input_dim, hidden_dim,
                    layers, feature_alphabet, feat_input_dim, feature_types)
                ensemble_models.append(
                    (model, encoder_frnn, encoder_rrnn, decoder_rnn))

            # predict using the ensemble
            if not majority:
                predicted_templates = task1_single_ms2s.predict_templates_with_ensemble(
                    ensemble_models, alphabet_index, inverse_alphabet_index,
                    test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                    feature_types)
            else:
                predicted_templates = task1_single_ms2s.predict_templates_with_ensemble_majority(
                    ensemble_models, alphabet_index, inverse_alphabet_index,
                    test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                    feature_types)
        else:
            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim,
                hidden_dim, layers, feature_alphabet, feat_input_dim,
                feature_types)

        if not is_nbest:
            if not ensemble:
                predicted_templates = task1_single_ms2s.predict_templates(
                    best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                    alphabet_index, inverse_alphabet_index,
                    test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                    feature_types)

            # compute the predictions accuracy
            accuracy = task1_single_ms2s.evaluate_model(
                predicted_templates,
                test_cluster_lemmas,
                test_cluster_feat_dicts,
                test_cluster_words,
                feature_types,
                print_results=True)
            print('{0} {1} accuracy: {2}'.format(lang, cluster_type,
                                                 accuracy[1]))

            # get predicted_templates in the same order they appeared in the
            # original file: results are keyed by the original example index
            # and looked up through the lemma + morph-string key
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                inflection = task1_single_ms2s.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    inflection)

            micro_average_accuracy = accuracy[1]
        else:
            # handle the creation of nbest lists
            predicted_nbset_templates = task1_single_ms2s.predict_nbest_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index, test_cluster_lemmas,
                test_cluster_feat_dicts, feat_index, feature_types, nbest,
                test_cluster_words)

            # same ordering scheme as above, but every example maps to the
            # list of its instantiated n-best templates
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                # each n-best entry is a pair; only its first item (the
                # template) is needed here
                nbest_inflections = [
                    task1_single_ms2s.instantiate_template(
                        template, test_lemmas[i])
                    for (template, _) in predicted_nbset_templates[joint_index]
                ]
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    nbest_inflections)

            # no accuracy is computed for n-best lists
            micro_average_accuracy = -1

    suffix = '.best.test' if 'test' in test_path else '.best'

    common.write_results_file_and_evaluate_externally(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results,
        is_nbest)
def evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index,
                  feat_input_dim, feature_alphabet, feature_types, hidden_dim,
                  hyper_params, input_dim, inverse_alphabet_index, layers,
                  results_file_path, sigmorphon_root_dir,
                  test_cluster_to_data_indices, test_feat_dicts, test_lemmas,
                  test_path, test_words, train_cluster_to_data_indices,
                  train_path, train_words):
    """Predict and score every cluster, then write the combined results.

    For each cluster key in ``train_cluster_to_data_indices`` the matching
    test examples are predicted, either with the single best model or by
    voting over an ensemble, scored with ``evaluate_model`` and collected
    under their original test index. Macro and micro averaged accuracies are
    printed and the results are handed to
    ``common.write_results_file_and_evaluate_externally``.

    Args:
        alphabet, feature_alphabet, feature_types, input_dim, hidden_dim,
            layers, feat_input_dim: passed to ``load_best_model``.
        alphabet_index, inverse_alphabet_index, feat_index: passed to
            ``predict_templates``.
        ensemble: falsy for a single model, otherwise a comma-separated
            string of model prefixes, each handed to ``load_best_model``.
        hyper_params, sigmorphon_root_dir, train_path: passed through to the
            results writer.
        results_file_path: prefix handed to ``load_best_model`` for the
            single-model case and prefix of the written results file.
        test_path: if it contains 'test' the results file gets the suffix
            '.best.test', otherwise '.best'.
        train_cluster_to_data_indices, test_cluster_to_data_indices:
            mappings from cluster key to indices into the train / test lists.
        train_words: only used to count the training examples of a cluster.
        test_lemmas, test_feat_dicts, test_words: the test data, indexed by
            the values of ``test_cluster_to_data_indices``.
    """
    accuracies = []
    final_results = {}

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [
            train_words[i]
            for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_words) < 1:
            print('only {} samples for this inflection type. skipping'.format(
                str(len(train_cluster_words))))
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) +
              '/' + str(len(train_cluster_to_data_indices)) + ': ' +
              cluster_type + ' with ' + str(len(train_cluster_words)) +
              ' examples')

        # test best model
        # NOTE: the KeyError handler at the end of this block reports a
        # cluster that is missing from the test data, but it deliberately
        # stays as broad as before: any KeyError raised further down (e.g. a
        # missing joint index) ends up there too and skips the cluster.
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
            test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
            test_cluster_words = [test_words[i] for i in test_indices]
            test_cluster_feat_dicts = [
                test_feat_dicts[i] for i in test_indices
            ]

            if ensemble:
                # load ensemble models
                ensemble_model_names = ensemble.split(',')
                print('ensemble paths:\n')
                print('\n'.join(ensemble_model_names))
                ensemble_models = []
                for ens in ensemble_model_names:
                    model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                        str(cluster_index), alphabet, ens, input_dim,
                        hidden_dim, layers, feature_alphabet, feat_input_dim,
                        feature_types)
                    ensemble_models.append(
                        (model, encoder_frnn, encoder_rrnn, decoder_rnn))

                # predict the entire test set with each model in the ensemble
                ensemble_predictions = []
                for (model, encoder_frnn, encoder_rrnn,
                     decoder_rnn) in ensemble_models:
                    ensemble_predictions.append(
                        predict_templates(
                            model, decoder_rnn, encoder_frnn, encoder_rrnn,
                            alphabet_index, inverse_alphabet_index,
                            test_cluster_lemmas, test_cluster_feat_dicts,
                            feat_index, feature_types))

                predicted_templates = {}
                # instantiated string -> a template that produced it
                string_to_template = {}

                # perform voting for each test input - joint_index is a
                # lemma+feats representation
                test_data = zip(test_cluster_lemmas, test_cluster_feat_dicts)
                for i, (lemma, feat_dict) in enumerate(test_data):
                    joint_index = lemma + ':' + common.get_morph_string(
                        feat_dict, feature_types)
                    prediction_counter = defaultdict(int)
                    for ens in ensemble_predictions:
                        prediction_str = ''.join(
                            instantiate_template(ens[joint_index], lemma))
                        prediction_counter[prediction_str] += 1
                        string_to_template[prediction_str] = ens[joint_index]
                        print(u'template: {} prediction: {}'.format(
                            ens[joint_index], prediction_str))

                    # return the most predicted output
                    predicted_template_string = max(
                        prediction_counter, key=prediction_counter.get)

                    # hack: if chosen without majority (every model voted
                    # for a different string), pick shortest prediction
                    if prediction_counter[predicted_template_string] == 1:
                        predicted_template_string = min(prediction_counter,
                                                        key=len)

                    print(u'chosen:{} with {} votes\n'.format(
                        predicted_template_string,
                        prediction_counter[predicted_template_string]))
                    predicted_templates[joint_index] = string_to_template[
                        predicted_template_string]

                    # progress indication
                    sys.stdout.write(
                        "\r%d%%" %
                        (float(i) / len(test_cluster_lemmas) * 100))
                    sys.stdout.flush()
            else:
                # load best model - no ensemble
                best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                    str(cluster_index), alphabet, results_file_path,
                    input_dim, hidden_dim, layers, feature_alphabet,
                    feat_input_dim, feature_types)
                print('starting to predict for cluster: {}'.format(
                    cluster_type))
                try:
                    predicted_templates = predict_templates(
                        best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                        alphabet_index, inverse_alphabet_index,
                        test_cluster_lemmas, test_cluster_feat_dicts,
                        feat_index, feature_types)
                except Exception as e:
                    print(e)
                    traceback.print_exc()
                    # Without predictions there is nothing to score or to
                    # write for this cluster. Falling through would read an
                    # unbound `predicted_templates` or, worse, the one left
                    # over from the previous cluster.
                    continue

            print('evaluating predictions for cluster: {}'.format(
                cluster_type))
            try:
                accuracy = evaluate_model(
                    predicted_templates,
                    test_cluster_lemmas,
                    test_cluster_feat_dicts,
                    test_cluster_words,
                    feature_types,
                    print_results=True)
                accuracies.append(accuracy)
            except Exception as e:
                # best effort: a scoring failure is logged, the predictions
                # of the cluster are still written below
                print(e)
                traceback.print_exc()

            # get predicted_templates in the same order they appeared in the
            # original file: results are keyed by the original example index
            # and looked up through the lemma + morph-string key
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                inflection = instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    inflection)

        except KeyError:
            print('could not find relevant examples in test data for cluster: '
                  + cluster_type)
            print('clusters in test are: {}'.format(
                test_cluster_to_data_indices.keys()))
            print('clusters in train are: {}'.format(
                train_cluster_to_data_indices.keys()))

    # Each collected entry is used as (weight, accuracy): entry[0] weighs the
    # micro average, entry[1] is the accuracy value.
    # The averages fall back to 0.0 instead of dividing by zero when no
    # cluster could be scored or when all weights are 0.
    accuracy_vals = [entry[1] for entry in accuracies]
    if accuracies:
        macro_avg_accuracy = sum(accuracy_vals) / len(accuracies)
    else:
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum([entry[0] * entry[1] for entry in accuracies])
    mic_denom = sum([entry[0] for entry in accuracies])
    if mic_denom:
        micro_average_accuracy = mic_nom / mic_denom
    else:
        micro_average_accuracy = 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'

    common.write_results_file_and_evaluate_externally(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results)