def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, override, eval_only, ensemble):
    """Train (or reload) one model, predict the test set and write the evaluation results.

    The character alphabet and the feature alphabet are built from the training
    file only. An existing ``<results_file_path>_bestmodel.txt`` is reloaded
    unless ``override`` is set, otherwise a new model is built. Training on the
    train/dev data is skipped when ``eval_only`` is set. Predictions come from
    ``predict_with_ensemble_majority`` when ``ensemble`` is truthy and from the
    single model otherwise. Nothing is evaluated or written when no predictions
    were produced.
    """
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': regularization,
        'LEARNING_RATE': learning_rate,
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name, value in hyper_params.items():
        print(name + '=' + str(value))

    # load the three data splits (train, then test, then dev)
    load_data = prepare_sigmorphon_data.load_data
    train_words, train_lemmas, train_feat_dicts = load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = load_data(test_path)
    dev_words, dev_lemmas, dev_feat_dicts = load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # NULL/UNK are used for character dropout, EPSILON/BEGIN_WORD/END_WORD during decoding
    alphabet.extend([NULL, UNK, EPSILON, BEGIN_WORD, END_WORD])

    # index markers - used to indicate when copying from lemma to word
    alphabet.extend(str(position) for position in range(MAX_PREDICTION_LEN))

    # char <-> int lookups
    alphabet_index = {}
    for position, char in enumerate(alphabet):
        alphabet_index[char] = position
    inverse_alphabet_index = dict((position, char) for char, position in alphabet_index.items())

    # feat -> int lookup
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = {}
    for position, feat in enumerate(feature_alphabet):
        feat_index[feat] = position

    model_file_name = results_file_path + '_bestmodel.txt'
    if override or not os.path.isfile(model_file_name):
        print('could not find existing model or explicit override was requested. starting training from scratch...')
        network = build_model(alphabet, input_dim, hidden_dim, layers, feature_types, feat_input_dim,
                              feature_alphabet)
        model, encoder_frnn, encoder_rrnn, decoder_rnn = network
    else:
        print('loading existing model from {}'.format(model_file_name))
        network = task1_attention_implementation.load_best_model(alphabet, results_file_path, input_dim,
                                                                 hidden_dim, layers, feature_alphabet,
                                                                 feat_input_dim, feature_types)
        model, encoder_frnn, encoder_rrnn, decoder_rnn = network
        print('loaded existing model successfully')

    if eval_only:
        print('skipped training, evaluating on test set...')
    else:
        # training replaces the loaded/built model with the trained one
        model, last_epoch, best_epoch = train_model(model, encoder_frnn, encoder_rrnn, decoder_rnn,
                                                    train_lemmas, train_feat_dicts, train_words, dev_lemmas,
                                                    dev_feat_dicts, dev_words, alphabet_index,
                                                    inverse_alphabet_index, epochs, optimization,
                                                    results_file_path, feat_index, feature_types, plot)
        print('last epoch is {}'.format(last_epoch))
        print('best epoch is {}'.format(best_epoch))
        print('finished training')

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(alphabet, alphabet_index, ensemble, feat_index,
                                                             feat_input_dim, feature_alphabet, feature_types,
                                                             hidden_dim, input_dim, inverse_alphabet_index, layers,
                                                             test_feat_dicts, test_lemmas, test_words)
    else:
        predicted_sequences = predict_sequences(model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                                                inverse_alphabet_index, test_lemmas, test_feat_dicts, feat_index,
                                                feature_types)

    # nothing to evaluate or write without predictions
    if len(predicted_sequences) == 0:
        return

    # evaluate last model on test
    amount, accuracy = evaluate_model(predicted_sequences, test_lemmas, test_feat_dicts, test_words, feature_types,
                                      print_results=False)
    print('initial eval: {}% accuracy'.format(accuracy))

    # predictions are keyed by "<lemma>:<morph string>"; re-key them by position in the test file
    final_results = {}
    for i, lemma in enumerate(test_lemmas):
        feat_dict = test_feat_dicts[i]
        joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
        final_results[i] = (lemma, feat_dict, ''.join(predicted_sequences[joint_index]))

    # evaluate best models
    common.write_results_file_and_evaluate_externally(hyper_params, accuracy, train_path, test_path,
                                                      results_file_path + '.external_eval.txt', sigmorphon_root_dir,
                                                      final_results)
def _vote_on_cluster_templates(ensemble_predictions, cluster_lemmas, cluster_feat_dicts, feature_types):
    """Return, per "<lemma>:<morph string>" key, the template chosen by ensemble majority vote."""
    voted_templates = {}
    total = len(cluster_lemmas)
    for i, (lemma, feat_dict) in enumerate(zip(cluster_lemmas, cluster_feat_dicts)):
        # joint_index is a lemma+feats representation
        joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
        prediction_counter = defaultdict(int)
        string_to_template = {}
        for predictions in ensemble_predictions:
            template = predictions[joint_index]
            prediction_str = ''.join(instantiate_template(template, lemma))
            prediction_counter[prediction_str] += 1
            string_to_template[prediction_str] = template
            print(u'template: {} prediction: {}'.format(template, prediction_str))

        # the most predicted output wins
        chosen = max(prediction_counter, key=prediction_counter.get)

        # hack: if chosen without majority, pick shortest prediction
        if prediction_counter[chosen] == 1:
            chosen = min(prediction_counter, key=len)

        print(u'chosen:{} with {} votes\n'.format(chosen, prediction_counter[chosen]))
        voted_templates[joint_index] = string_to_template[chosen]

        # progress indication
        sys.stdout.write("\r%d%%" % (float(i) / total * 100))
        sys.stdout.flush()
    return voted_templates


def evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_cluster_to_data_indices, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_cluster_to_data_indices, train_path, train_words):
    """Evaluate the per-cluster models on the test set and write the results file.

    For every cluster key of ``train_cluster_to_data_indices`` the matching model
    (identified by ``str(cluster_index)``) is loaded, the test examples of that
    cluster are predicted and scored with ``evaluate_model``, and the instantiated
    predictions are stored under their original test-file index.

    :param ensemble: comma-separated model paths; when truthy every listed model
        predicts the cluster and a majority vote picks the template. When falsy the
        single model stored under ``results_file_path`` is used.
    :param test_cluster_to_data_indices: cluster key -> indices into the test lists.
    :param train_cluster_to_data_indices: cluster key -> indices into ``train_words``.

    Each ``evaluate_model`` result is indexed as ``[0]`` (the weight used for the
    micro average) and ``[1]`` (the accuracy). A cluster whose prediction fails is
    skipped; if nothing could be evaluated both averages are reported as 0.
    """
    accuracies = []
    final_results = {}
    cluster_count = len(train_cluster_to_data_indices)

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only {} samples for this inflection type. skipping'.format(str(len(train_cluster_words))))
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(cluster_count) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # a cluster seen in training may be absent from the test data; only this
        # lookup gets the "no relevant examples" report
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)
            print('clusters in test are: {}'.format(test_cluster_to_data_indices.keys()))
            print('clusters in train are: {}'.format(train_cluster_to_data_indices.keys()))
            continue

        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        try:
            if ensemble:
                # load ensemble models
                ensemble_model_names = ensemble.split(',')
                print('ensemble paths:\n')
                print('\n'.join(ensemble_model_names))
                ensemble_models = []
                for ens in ensemble_model_names:
                    model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                        str(cluster_index), alphabet, ens, input_dim, hidden_dim, layers,
                        feature_alphabet, feat_input_dim, feature_types)
                    ensemble_models.append((model, encoder_frnn, encoder_rrnn, decoder_rnn))

                # predict the cluster's test examples with each model in the ensemble
                ensemble_predictions = []
                for model, encoder_frnn, encoder_rrnn, decoder_rnn in ensemble_models:
                    ensemble_predictions.append(predict_templates(model, decoder_rnn, encoder_frnn, encoder_rrnn,
                                                                  alphabet_index, inverse_alphabet_index,
                                                                  test_cluster_lemmas, test_cluster_feat_dicts,
                                                                  feat_index, feature_types))

                predicted_templates = _vote_on_cluster_templates(ensemble_predictions, test_cluster_lemmas,
                                                                 test_cluster_feat_dicts, feature_types)
            else:
                # load best model - no ensemble
                best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                    str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers,
                    feature_alphabet, feat_input_dim, feature_types)
                print('starting to predict for cluster: {}'.format(cluster_type))
                try:
                    predicted_templates = predict_templates(best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                                                            alphabet_index, inverse_alphabet_index,
                                                            test_cluster_lemmas, test_cluster_feat_dicts,
                                                            feat_index, feature_types)
                except Exception as e:
                    print(e)
                    traceback.print_exc()
                    # without predictions for this cluster there is nothing to score;
                    # carrying on would reuse the previous cluster's templates
                    print('prediction failed for cluster: {} - skipping it'.format(cluster_type))
                    continue

            print('evaluating predictions for cluster: {}'.format(cluster_type))
            try:
                accuracy = evaluate_model(predicted_templates, test_cluster_lemmas, test_cluster_feat_dicts,
                                          test_cluster_words, feature_types, print_results=True)
                accuracies.append(accuracy)
            except Exception as e:
                print(e)
                traceback.print_exc()

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = instantiate_template(predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError as e:
            # stay best-effort (move on to the next cluster) but report what was actually missing
            print('missing key {} while evaluating cluster: {} - skipping the rest of it'.format(repr(e),
                                                                                               cluster_type))
            traceback.print_exc()

    accuracy_vals = [result[1] for result in accuracies]
    if accuracies:
        # float() forces true division; the Python 2 `/` floors when both operands are ints
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('no cluster could be evaluated - reporting an accuracy of 0')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(result[0] * result[1] for result in accuracies)
    mic_denom = sum(result[0] for result in accuracies)
    micro_average_accuracy = mic_nom / float(mic_denom) if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'

    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path,
                                                      test_path, results_file_path + suffix, sigmorphon_root_dir,
                                                      final_results)
# Example #3
# 0
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, epochs, layers, optimization, feat_input_dim):
    """Evaluate previously trained per-POS-cluster models on the test file.

    The alphabets are rebuilt from the training file, the train and test examples
    are clustered with ``common.cluster_data_by_pos`` and, for every training
    cluster, the model stored under ``str(cluster_index)`` is loaded and used to
    predict and score the matching test examples. Predictions are collected under
    their original test-file index and handed, together with the micro-averaged
    accuracy, to ``common.write_results_file_and_evaluate_externally``.

    Each ``evaluate_model`` result is indexed as ``[0]`` (the weight used for the
    micro average) and ``[1]`` (the accuracy). If no cluster could be evaluated
    both averages are reported as 0.
    """
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        # recorded as well since it is needed to reload the model below
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # cluster the data by POS type (features)
    train_cluster_to_data_indices = common.cluster_data_by_pos(
        train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    accuracies = []
    final_results = {}
    cluster_count = len(train_cluster_to_data_indices)

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [
            train_words[i] for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) +
                  ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(cluster_count) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # a cluster seen in training may be absent from the test data; only this
        # lookup gets the "no relevant examples" report
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)
            continue

        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        try:
            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim,
                hidden_dim, layers, feature_alphabet, feat_input_dim,
                feature_types)

            predicted_templates = task1_joint_structured_inflection_feedback_fix.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index, test_cluster_lemmas,
                test_cluster_feat_dicts, feat_index, feature_types)

            accuracy = task1_joint_structured_inflection_feedback_fix.evaluate_model(
                predicted_templates,
                test_cluster_lemmas,
                test_cluster_feat_dicts,
                test_cluster_words,
                feature_types,
                print_results=False)
            accuracies.append(accuracy)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                inflection = task1_joint_structured_inflection_feedback_fix.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    inflection)

        except KeyError as e:
            # stay best-effort (move on to the next cluster) but report what was actually missing
            print('missing key ' + repr(e) + ' while evaluating cluster: ' + cluster_type)

    accuracy_vals = [result[1] for result in accuracies]
    if accuracies:
        # float() forces true division; the Python 2 `/` floors when both operands are ints
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('no cluster could be evaluated - reporting an accuracy of 0')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(result[0] * result[1] for result in accuracies)
    mic_denom = sum(result[0] for result in accuracies)
    micro_average_accuracy = mic_nom / float(mic_denom) if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'
    common.write_results_file_and_evaluate_externally(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results)
def evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_path, print_results=False):
    """Predict the whole test set with one model or an ensemble, score it and write the results file.

    :param ensemble: comma-separated model paths; when truthy every listed model
        predicts the test set and a majority vote over the predicted strings (with
        ``STEP`` symbols removed) picks the sequence per example. When falsy the
        single model stored under ``results_file_path`` is used.
    :param print_results: when true, print every ensemble member's prediction and
        the chosen one during voting.

    Predictions are keyed by "<lemma>:<morph string>" and re-keyed by test-file
    index before being written. The ``evaluate_model`` result is indexed as ``[0]``
    (the weight used for the micro average) and ``[1]`` (the accuracy); if the
    internal evaluation fails both averages are reported as 0.

    A failure of the single-model prediction is logged and re-raised, since there
    is nothing to evaluate or write without predictions.
    """
    print("<<<<<<<<<<<<<<<<<< DEBUG ==>evaluate ndst")
    accuracies = []
    final_results = {}
    if ensemble:
        # load ensemble models
        ensemble_model_names = ensemble.split(',')
        print('ensemble paths:\n')
        print('\n'.join(ensemble_model_names))
        ensemble_models = []
        for ens in ensemble_model_names:
            model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                alphabet,
                ens,
                input_dim,
                hidden_dim,
                layers,
                feature_alphabet,
                feat_input_dim,
                feature_types)

            # keep all eight components: the prediction loop below unpacks and uses every one of them
            ensemble_models.append((model, char_lookup, feat_lookup, R, bias,
                                    encoder_frnn, encoder_rrnn, decoder_rnn))

        # predict the entire test set with each model in the ensemble
        print('predicting...')
        ensemble_predictions = []
        for count, em in enumerate(ensemble_models, 1):
            model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn = em
            ensemble_predictions.append(predict_sequences(model, char_lookup, feat_lookup, R, bias,
                                                          encoder_frnn, encoder_rrnn, decoder_rnn,
                                                          alphabet_index,
                                                          inverse_alphabet_index,
                                                          test_lemmas,
                                                          test_feat_dicts,
                                                          feat_index,
                                                          feature_types))
            print('finished to predict with ensemble: {}/{}'.format(count, len(ensemble_model_names)))

        predicted_sequences = {}
        string_to_sequence = {}

        # perform voting for each test input - joint_index is a lemma+feats representation
        for i, (lemma, feat_dict) in enumerate(zip(test_lemmas, test_feat_dicts)):
            joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
            prediction_counter = defaultdict(int)

            # count votes
            for en in ensemble_predictions:
                prediction_str = ''.join(en[joint_index]).replace(STEP, '')
                prediction_counter[prediction_str] += 1
                string_to_sequence[prediction_str] = en[joint_index]
                if print_results:
                    print('template: {} prediction: {}'.format(en[joint_index].encode('utf8'),
                                                               prediction_str.encode('utf8')))

            # return the most predicted output
            predicted_sequence_string = max(prediction_counter, key=prediction_counter.get)

            # hack: if chosen without majority, pick shortest prediction
            if prediction_counter[predicted_sequence_string] == 1:
                predicted_sequence_string = min(prediction_counter, key=len)

            if print_results:
                print('chosen:{} with {} votes\n'.format(predicted_sequence_string.encode('utf8'),
                                                         prediction_counter[predicted_sequence_string]))

            predicted_sequences[joint_index] = string_to_sequence[predicted_sequence_string]

            # progress indication
            sys.stdout.write("\r%d%%" % (float(i) / len(test_lemmas) * 100))
            sys.stdout.flush()
    else:
        # load best model - no ensemble
        best_model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
            alphabet, results_file_path, input_dim, hidden_dim, layers,
            feature_alphabet, feat_input_dim, feature_types)
        try:
            print("predicting")
            predicted_sequences = predict_sequences(best_model,
                                                    char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                    encoder_rrnn, decoder_rnn,
                                                    alphabet_index,
                                                    inverse_alphabet_index,
                                                    test_lemmas,
                                                    test_feat_dicts,
                                                    feat_index,
                                                    feature_types)
        except Exception as e:
            print("except1!")
            print(e)
            traceback.print_exc()
            # predicted_sequences is unbound here; re-raise the real error instead of
            # failing further down with an unrelated NameError
            raise

    # run internal evaluation
    try:
        accuracy = evaluate_model(predicted_sequences,
                                  test_lemmas,
                                  test_feat_dicts,
                                  test_words,
                                  feature_types,
                                  print_results=False)
        accuracies.append(accuracy)
    except Exception as e:
        print("except2!")
        print(e)
        traceback.print_exc()

    # get predicted_sequences in the same order they appeared in the original file
    # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
    for i, lemma in enumerate(test_lemmas):
        joint_index = lemma + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
        inflection = ''.join(predicted_sequences[joint_index]).replace(STEP, '')
        final_results[i] = (lemma, test_feat_dicts[i], inflection)

    accuracy_vals = [result[1] for result in accuracies]
    if accuracies:
        # float() forces true division; the Python 2 `/` floors when both operands are ints
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('internal evaluation produced no result - reporting an accuracy of 0')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(result[0] * result[1] for result in accuracies)
    mic_denom = sum(result[0] for result in accuracies)
    micro_average_accuracy = mic_nom / float(mic_denom) if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'

    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path,
                                                      test_path, results_file_path + suffix, sigmorphon_root_dir,
                                                      final_results)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim):
    """Evaluate previously trained per-POS-cluster ``task1_ndst`` models on the test file.

    The alphabets are rebuilt from the training file (including
    ``3 * MAX_PREDICTION_LEN`` index markers and the ``STEP`` symbol), the train
    and test examples are clustered with ``common.cluster_data_by_pos`` and, for
    every training cluster, the model stored under ``str(cluster_index)`` is loaded
    and used to predict and score the matching test examples. Predictions are
    collected under their original test-file index and handed, together with the
    micro-averaged accuracy, to ``common.write_results_file_and_evaluate_externally``.

    Each ``evaluate_model`` result is indexed as ``[0]`` (the weight used for the
    micro average) and ``[1]`` (the accuracy). If no cluster could be evaluated
    both averages are reported as 0.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # cluster the data by POS type (features)
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    accuracies = []
    final_results = {}
    cluster_count = len(train_cluster_to_data_indices)

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(cluster_count) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # a cluster seen in training may be absent from the test data; only this
        # lookup gets the "no relevant examples" report
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)
            continue

        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        try:
            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(str(cluster_index), alphabet,
                                                                                  results_file_path, input_dim,
                                                                                  hidden_dim, layers,
                                                                                  feature_alphabet, feat_input_dim,
                                                                                  feature_types)

            predicted_templates = task1_ndst.predict_templates(best_model,
                                                               decoder_rnn,
                                                               encoder_frnn,
                                                               encoder_rrnn,
                                                               alphabet_index,
                                                               inverse_alphabet_index,
                                                               test_cluster_lemmas,
                                                               test_cluster_feat_dicts,
                                                               feat_index,
                                                               feature_types)

            accuracy = task1_ndst.evaluate_model(predicted_templates,
                                                 test_cluster_lemmas,
                                                 test_cluster_feat_dicts,
                                                 test_cluster_words,
                                                 feature_types,
                                                 print_results=True)
            accuracies.append(accuracy)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = task1_ndst.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError as e:
            # stay best-effort (move on to the next cluster) but report what was actually missing
            print('missing key ' + repr(e) + ' while evaluating cluster: ' + cluster_type)

    accuracy_vals = [result[1] for result in accuracies]
    if accuracies:
        # float() forces true division; the Python 2 `/` floors when both operands are ints
        macro_avg_accuracy = sum(accuracy_vals) / float(len(accuracies))
    else:
        print('no cluster could be evaluated - reporting an accuracy of 0')
        macro_avg_accuracy = 0.0
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(result[0] * result[1] for result in accuracies)
    mic_denom = sum(result[0] for result in accuracies)
    micro_average_accuracy = mic_nom / float(mic_denom) if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'
    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path,
                                                      test_path, results_file_path + suffix, sigmorphon_root_dir,
                                                      final_results)
def main(train_path, dev_path, test_path, results_file_path,
         sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot, override,
         eval_only, ensemble):
    """Prepare the data, obtain a model, optionally train it and evaluate on the test set.

    Steps, in order:
      1. print the paths and the hyper-parameters;
      2. load the train/test/dev files and build the character and feature
         indices from the training data plus the special symbols;
      3. load the model stored as `results_file_path` + '_bestmodel.txt' when
         that file exists and `override` is falsy, otherwise build a new one;
      4. train with `train_model` unless `eval_only` is truthy;
      5. predict the test set, with `predict_with_ensemble_majority` when
         `ensemble` is truthy and with `predict_sequences` otherwise;
      6. if any predictions came back, score them with `evaluate_model` and
         pass them to `common.write_results_file_and_evaluate_externally`
         with `results_file_path` + '.external_eval.txt' as output path.

    `regularization` and `learning_rate` are only recorded in the
    hyper-parameter dictionary here. Returns None.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim,
                    'FEAT_INPUT_DIM': feat_input_dim, 'EPOCHS': epochs,
                    'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE,
                    'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name, value in hyper_params.items():
        print(name + '=' + str(value))

    # read the three data splits; the alphabet comes from the training split only
    train_words, train_lemmas, train_feat_dicts = \
        prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = \
        prepare_sigmorphon_data.load_data(test_path)
    dev_words, dev_lemmas, dev_feat_dicts = \
        prepare_sigmorphon_data.load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # NULL and UNK are used for character dropout; EPSILON, BEGIN_WORD and
    # END_WORD are used during decoding
    for symbol in (NULL, UNK, EPSILON, BEGIN_WORD, END_WORD):
        alphabet.append(symbol)

    # position markers '0', '1', ... - used to indicate when copying from
    # lemma to word
    for position in range(MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # char <-> int
    alphabet_index = {char: idx for idx, char in enumerate(alphabet)}
    inverse_alphabet_index = {
        idx: char for char, idx in alphabet_index.items()}

    # feat -> int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = {feat: idx for idx, feat in enumerate(feature_alphabet)}

    model_file_name = results_file_path + '_bestmodel.txt'
    if override or not os.path.isfile(model_file_name):
        print('could not find existing model or explicit override was requested. starting training from scratch...')
        model, encoder_frnn, encoder_rrnn, decoder_rnn = build_model(
            alphabet, input_dim, hidden_dim, layers, feature_types,
            feat_input_dim, feature_alphabet)
    else:
        print('loading existing model from {}'.format(model_file_name))
        model, encoder_frnn, encoder_rrnn, decoder_rnn = \
            task1_attention_implementation.load_best_model(
                alphabet, results_file_path, input_dim, hidden_dim, layers,
                feature_alphabet, feat_input_dim, feature_types)
        print('loaded existing model successfully')

    if eval_only:
        print('skipped training, evaluating on test set...')
    else:
        # training replaces the model that is evaluated below
        model, last_epoch, best_epoch = train_model(
            model, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas,
            train_feat_dicts, train_words, dev_lemmas, dev_feat_dicts,
            dev_words, alphabet_index, inverse_alphabet_index, epochs,
            optimization, results_file_path, feat_index, feature_types, plot)
        print('last epoch is {}'.format(last_epoch))
        print('best epoch is {}'.format(best_epoch))
        print('finished training')

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(
            alphabet, alphabet_index, ensemble, feat_index, feat_input_dim,
            feature_alphabet, feature_types, hidden_dim, input_dim,
            inverse_alphabet_index, layers, test_feat_dicts, test_lemmas,
            test_words)
    else:
        predicted_sequences = predict_sequences(
            model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
            inverse_alphabet_index, test_lemmas, test_feat_dicts, feat_index,
            feature_types)

    # without predictions there is nothing to score or to write
    if len(predicted_sequences) == 0:
        return

    # evaluate last model on test
    _amount, accuracy = evaluate_model(predicted_sequences,
                                       test_lemmas,
                                       test_feat_dicts,
                                       test_words,
                                       feature_types,
                                       print_results=False)
    print('initial eval: {}% accuracy'.format(accuracy))

    # collect (lemma, features, predicted word) per test item, keyed by the
    # item's position in the test file
    final_results = {}
    for i, lemma in enumerate(test_lemmas):
        feat_dict = test_feat_dicts[i]
        joint_index = lemma + ':' + common.get_morph_string(feat_dict,
                                                            feature_types)
        predicted_word = ''.join(predicted_sequences[joint_index])
        final_results[i] = (lemma, feat_dict, predicted_word)

    # evaluate best models
    common.write_results_file_and_evaluate_externally(
        hyper_params, accuracy, train_path, test_path,
        results_file_path + '.external_eval.txt', sigmorphon_root_dir,
        final_results)
    return
# Example #7
# 0
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, epochs, layers, optimization, feat_input_dim,
         nbest, ensemble, majority):
    """Evaluate previously stored model(s) on the test set and write the results.

    The character and feature alphabets are rebuilt from the training data,
    the stored model (or every model of a comma-separated `ensemble`) is
    loaded with `load_best_model`, templates are predicted for the test items
    and the instantiated inflections are handed to
    `common.write_results_file_and_evaluate_externally`.

    Args:
        train_path: training data file; its last path component (minus
            '-task1-train') is printed next to the accuracy.
        test_path: data file to predict. When it contains 'test' the output
            suffix is '.best.test', otherwise '.best'.
        results_file_path: passed to `load_best_model` to locate the stored
            model (single-model mode) and used as prefix of the output file.
        sigmorphon_root_dir: forwarded to the external evaluation.
        input_dim, hidden_dim, layers, feat_input_dim: forwarded to
            `load_best_model`.
        epochs, optimization: only recorded in the hyper-parameter dictionary.
        nbest: 1 produces a single prediction per item and an accuracy; any
            other value produces n-best lists and reports accuracy -1.
        ensemble: falsy, or a comma-separated string of model names.
        majority: with an ensemble, truthy selects
            `predict_templates_with_ensemble_majority`, falsy
            `predict_templates_with_ensemble`.

    Raises:
        ValueError: if `ensemble` is combined with `nbest` != 1. The n-best
            prediction needs the single best model, which is not loaded in
            ensemble mode.
    """
    is_nbest = nbest != 1
    if ensemble and is_nbest:
        # this combination used to fail with UnboundLocalError on
        # `best_model`, and only after every ensemble model had been loaded
        raise ValueError('n-best prediction (nbest != 1) is not supported '
                         'together with an ensemble')

    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'NBEST': nbest
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # alternative clusterings that were used here before:
    #   common.cluster_data_by_pos(feat_dicts)
    #   common.cluster_data_by_morph_type(feat_dicts, feature_types)
    train_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(
        train_feat_dicts)
    test_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(
        test_feat_dicts)

    # language name shown in the accuracy line; the same for every cluster
    lang = train_path.split('/')[-1].replace('-task{0}-train'.format('1'),
                                             '')

    # -1 means "no accuracy available": n-best mode, or no cluster evaluated.
    # Initialised here so the final call cannot hit an unbound local when
    # every cluster is skipped.
    micro_average_accuracy = -1
    final_results = {}

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [
            train_words[i] for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) +
                  ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # test data of this cluster
        test_indices = test_cluster_to_data_indices[cluster_type]
        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        if ensemble:
            # handle model ensemble (single-best prediction only, see above)
            ensemble_model_names = ensemble.split(',')
            print('\n'.join(ensemble_model_names))
            ensemble_models = []
            for ens in ensemble_model_names:
                model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                    str(cluster_index), alphabet, ens, input_dim, hidden_dim,
                    layers, feature_alphabet, feat_input_dim, feature_types)
                ensemble_models.append(
                    (model, encoder_frnn, encoder_rrnn, decoder_rnn))

            # predict using the ensemble
            if majority:
                predict_with_ensemble = \
                    task1_single_ms2s.predict_templates_with_ensemble_majority
            else:
                predict_with_ensemble = \
                    task1_single_ms2s.predict_templates_with_ensemble
            predicted_templates = predict_with_ensemble(
                ensemble_models, alphabet_index, inverse_alphabet_index,
                test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                feature_types)
        else:
            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim,
                hidden_dim, layers, feature_alphabet, feat_input_dim,
                feature_types)

        if is_nbest:
            # handle the creation of nbest lists (no accuracy is computed)
            predicted_nbest_templates = task1_single_ms2s.predict_nbest_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index, test_cluster_lemmas,
                test_cluster_feat_dicts, feat_index, feature_types, nbest,
                test_cluster_words)

            # store the n-best inflections under the item's position in the
            # original file, to print later in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                nbest_inflections = [
                    task1_single_ms2s.instantiate_template(template,
                                                           test_lemmas[i])
                    for (template, _) in predicted_nbest_templates[joint_index]
                ]
                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    nbest_inflections)
            continue

        if not ensemble:
            predicted_templates = task1_single_ms2s.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                alphabet_index, inverse_alphabet_index,
                test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                feature_types)

        # compute the predictions accuracy
        accuracy = task1_single_ms2s.evaluate_model(
            predicted_templates,
            test_cluster_lemmas,
            test_cluster_feat_dicts,
            test_cluster_words,
            feature_types,
            print_results=True)
        print('{0} {1} accuracy: {2}'.format(lang, cluster_type,
                                             accuracy[1]))

        # store the inflection under the item's position in the original
        # file, to print later in the task format
        for i in test_indices:
            joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                test_feat_dicts[i], feature_types)
            inflection = task1_single_ms2s.instantiate_template(
                predicted_templates[joint_index], test_lemmas[i])
            final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                inflection)

        # NOTE(review): this is the accuracy of the last evaluated cluster;
        # presumably there is only the one pseudo cluster - confirm
        micro_average_accuracy = accuracy[1]

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    common.write_results_file_and_evaluate_externally(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results,
        is_nbest)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim, nbest, ensemble, majority):
    """Evaluate previously stored model(s) on the test set and write the results.

    The character and feature alphabets are rebuilt from the training data,
    the stored model (or every model of a comma-separated `ensemble`) is
    loaded with `load_best_model`, templates are predicted for the test items
    and the instantiated inflections are handed to
    `common.write_results_file_and_evaluate_externally`.

    Args:
        train_path: training data file; its last path component (minus
            '-task1-train') is printed next to the accuracy.
        test_path: data file to predict. When it contains 'test' the output
            suffix is '.best.test', otherwise '.best'.
        results_file_path: passed to `load_best_model` to locate the stored
            model (single-model mode) and used as prefix of the output file.
        sigmorphon_root_dir: forwarded to the external evaluation.
        input_dim, hidden_dim, layers, feat_input_dim: forwarded to
            `load_best_model`.
        epochs, optimization: only recorded in the hyper-parameter dictionary.
        nbest: 1 produces a single prediction per item and an accuracy; any
            other value produces n-best lists and reports accuracy -1.
        ensemble: falsy, or a comma-separated string of model names.
        majority: with an ensemble, truthy selects
            `predict_templates_with_ensemble_majority`, falsy
            `predict_templates_with_ensemble`.

    Raises:
        ValueError: if `ensemble` is combined with `nbest` != 1. The n-best
            prediction needs the single best model, which is not loaded in
            ensemble mode.
    """
    is_nbest = nbest != 1
    if ensemble and is_nbest:
        # this combination used to fail with UnboundLocalError on
        # `best_model`, and only after every ensemble model had been loaded
        raise ValueError('n-best prediction (nbest != 1) is not supported '
                         'together with an ensemble')

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization, 'NBEST': nbest}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # alternative clusterings that were used here before:
    #   common.cluster_data_by_pos(feat_dicts)
    #   common.cluster_data_by_morph_type(feat_dicts, feature_types)
    train_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(train_feat_dicts)
    test_cluster_to_data_indices = task1_single_ms2s.get_single_pseudo_cluster(test_feat_dicts)

    # language name shown in the accuracy line; the same for every cluster
    lang = train_path.split('/')[-1].replace('-task{0}-train'.format('1'), '')

    # -1 means "no accuracy available": n-best mode, or no cluster evaluated.
    # Initialised here so the final call cannot hit an unbound local when
    # every cluster is skipped.
    micro_average_accuracy = -1
    final_results = {}

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # test data of this cluster
        test_indices = test_cluster_to_data_indices[cluster_type]
        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        if ensemble:
            # handle model ensemble (single-best prediction only, see above)
            ensemble_model_names = ensemble.split(',')
            print('\n'.join(ensemble_model_names))
            ensemble_models = []
            for ens in ensemble_model_names:
                model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(str(cluster_index),
                                                                                alphabet,
                                                                                ens,
                                                                                input_dim,
                                                                                hidden_dim,
                                                                                layers,
                                                                                feature_alphabet,
                                                                                feat_input_dim,
                                                                                feature_types)
                ensemble_models.append((model, encoder_frnn, encoder_rrnn, decoder_rnn))

            # predict using the ensemble
            if majority:
                predict_with_ensemble = task1_single_ms2s.predict_templates_with_ensemble_majority
            else:
                predict_with_ensemble = task1_single_ms2s.predict_templates_with_ensemble
            predicted_templates = predict_with_ensemble(ensemble_models,
                                                        alphabet_index,
                                                        inverse_alphabet_index,
                                                        test_cluster_lemmas,
                                                        test_cluster_feat_dicts,
                                                        feat_index,
                                                        feature_types)
        else:
            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(str(cluster_index),
                                                                                  alphabet,
                                                                                  results_file_path,
                                                                                  input_dim,
                                                                                  hidden_dim,
                                                                                  layers,
                                                                                  feature_alphabet,
                                                                                  feat_input_dim,
                                                                                  feature_types)

        if is_nbest:
            # handle the creation of nbest lists (no accuracy is computed)
            predicted_nbest_templates = task1_single_ms2s.predict_nbest_templates(best_model,
                                                                                  decoder_rnn,
                                                                                  encoder_frnn,
                                                                                  encoder_rrnn,
                                                                                  alphabet_index,
                                                                                  inverse_alphabet_index,
                                                                                  test_cluster_lemmas,
                                                                                  test_cluster_feat_dicts,
                                                                                  feat_index,
                                                                                  feature_types,
                                                                                  nbest,
                                                                                  test_cluster_words)

            # store the n-best inflections under the item's position in the
            # original file, to print later in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                nbest_inflections = [task1_single_ms2s.instantiate_template(template, test_lemmas[i])
                                     for (template, _) in predicted_nbest_templates[joint_index]]
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], nbest_inflections)
            continue

        if not ensemble:
            predicted_templates = task1_single_ms2s.predict_templates(best_model,
                                                                      decoder_rnn,
                                                                      encoder_frnn,
                                                                      encoder_rrnn,
                                                                      alphabet_index,
                                                                      inverse_alphabet_index,
                                                                      test_cluster_lemmas,
                                                                      test_cluster_feat_dicts,
                                                                      feat_index,
                                                                      feature_types)

        # compute the predictions accuracy
        accuracy = task1_single_ms2s.evaluate_model(predicted_templates,
                                                    test_cluster_lemmas,
                                                    test_cluster_feat_dicts,
                                                    test_cluster_words,
                                                    feature_types,
                                                    print_results=True)
        print('{0} {1} accuracy: {2}'.format(lang, cluster_type, accuracy[1]))

        # store the inflection under the item's position in the original
        # file, to print later in the task format
        for i in test_indices:
            joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
            inflection = task1_single_ms2s.instantiate_template(predicted_templates[joint_index], test_lemmas[i])
            final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        # NOTE(review): this is the accuracy of the last evaluated cluster;
        # presumably there is only the one pseudo cluster - confirm
        micro_average_accuracy = accuracy[1]

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    common.write_results_file_and_evaluate_externally(hyper_params,
                                                      micro_average_accuracy,
                                                      train_path,
                                                      test_path,
                                                      results_file_path + suffix,
                                                      sigmorphon_root_dir,
                                                      final_results,
                                                      is_nbest)
# Example #9
# 0
def evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index,
                  feat_input_dim, feature_alphabet, feature_types, hidden_dim,
                  hyper_params, input_dim, inverse_alphabet_index, layers,
                  results_file_path, sigmorphon_root_dir,
                  test_cluster_to_data_indices, test_feat_dicts, test_lemmas,
                  test_path, test_words, train_cluster_to_data_indices,
                  train_path, train_words):
    """Evaluate the stored per-cluster model(s) on the test data and write the results.

    For every cluster in `train_cluster_to_data_indices` that has training
    words, the matching test items are predicted:

    * with an `ensemble` (comma-separated model names), every model is loaded
      with `load_best_model` and predicts all items of the cluster; per item
      the instantiated prediction with the most votes wins, and when the
      winner has a single vote the shortest predicted string is taken;
    * otherwise the model stored for `results_file_path` is loaded and used.
      If its prediction raises, the error is printed and the cluster is
      skipped.

    The predictions are scored with `evaluate_model`, whose result is indexed
    as [0] (used as weight, presumably the number of examples) and [1] (the
    accuracy). Each test item `i` gets `final_results[i] = (lemma, feat_dict,
    inflection)`. A KeyError anywhere in the per-cluster work is reported as
    a cluster missing from the test data and the loop moves on.

    Finally the macro and micro averaged accuracies are printed and the
    results are passed to `common.write_results_file_and_evaluate_externally`
    with suffix '.best.test' when `test_path` contains 'test', else '.best'.
    When no cluster could be scored (or all weights are zero) the accuracy
    passed on is -1, so the collected predictions are still written.

    Returns None.
    """
    accuracies = []
    final_results = {}
    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [
            train_words[i] for i in train_cluster_to_data_indices[cluster_type]
        ]
        if len(train_cluster_words) < 1:
            print('only {} samples for this inflection type. skipping'.format(
                str(len(train_cluster_words))))
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # test best model
        try:
            # raises KeyError when the cluster does not occur in the test data
            test_indices = test_cluster_to_data_indices[cluster_type]
            test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
            test_cluster_words = [test_words[i] for i in test_indices]
            test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

            if ensemble:
                # load ensemble models
                ensemble_model_names = ensemble.split(',')
                print('ensemble paths:\n')
                print('\n'.join(ensemble_model_names))
                ensemble_models = []
                for ens in ensemble_model_names:
                    model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                        str(cluster_index), alphabet, ens, input_dim,
                        hidden_dim, layers, feature_alphabet, feat_input_dim,
                        feature_types)
                    ensemble_models.append(
                        (model, encoder_frnn, encoder_rrnn, decoder_rnn))

                # predict the entire test set with each model in the ensemble
                ensemble_predictions = []
                for em in ensemble_models:
                    model, encoder_frnn, encoder_rrnn, decoder_rnn = em
                    ensemble_predictions.append(predict_templates(
                        model, decoder_rnn, encoder_frnn, encoder_rrnn,
                        alphabet_index, inverse_alphabet_index,
                        test_cluster_lemmas, test_cluster_feat_dicts,
                        feat_index, feature_types))

                predicted_templates = {}

                # perform voting for each test input - joint_index is a lemma+feats representation
                test_data = zip(test_cluster_lemmas, test_cluster_feat_dicts)
                for i, (lemma, feat_dict) in enumerate(test_data):
                    joint_index = lemma + ':' + common.get_morph_string(
                        feat_dict, feature_types)
                    prediction_counter = defaultdict(int)
                    # maps an instantiated prediction back to a template
                    # that produced it for this input
                    string_to_template = {}
                    for ens in ensemble_predictions:
                        prediction_str = ''.join(
                            instantiate_template(ens[joint_index], lemma))
                        prediction_counter[prediction_str] += 1
                        string_to_template[prediction_str] = ens[joint_index]
                        print(u'template: {} prediction: {}'.format(
                            ens[joint_index], prediction_str))

                    # return the most predicted output
                    predicted_template_string = max(prediction_counter,
                                                    key=prediction_counter.get)

                    # hack: if chosen without majority, pick shortest prediction
                    if prediction_counter[predicted_template_string] == 1:
                        predicted_template_string = min(prediction_counter,
                                                        key=len)

                    print(u'chosen:{} with {} votes\n'.format(
                        predicted_template_string,
                        prediction_counter[predicted_template_string]))
                    predicted_templates[joint_index] = string_to_template[
                        predicted_template_string]

                    # progress indication
                    sys.stdout.write(
                        "\r%d%%" % (float(i) / len(test_cluster_lemmas) * 100))
                    sys.stdout.flush()

            else:
                # load best model - no ensemble
                best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                    str(cluster_index), alphabet, results_file_path, input_dim,
                    hidden_dim, layers, feature_alphabet, feat_input_dim,
                    feature_types)
                print('starting to predict for cluster: {}'.format(
                    cluster_type))
                try:
                    predicted_templates = predict_templates(
                        best_model, decoder_rnn, encoder_frnn, encoder_rrnn,
                        alphabet_index, inverse_alphabet_index,
                        test_cluster_lemmas, test_cluster_feat_dicts,
                        feat_index, feature_types)
                except Exception as e:
                    print(e)
                    traceback.print_exc()
                    # there is nothing to score or report for this cluster;
                    # falling through would use templates that are unbound
                    # or left over from the previous cluster
                    continue

            print('evaluating predictions for cluster: {}'.format(cluster_type))
            try:
                accuracy = evaluate_model(predicted_templates,
                                          test_cluster_lemmas,
                                          test_cluster_feat_dicts,
                                          test_cluster_words,
                                          feature_types,
                                          print_results=True)
                accuracies.append(accuracy)
            except Exception as e:
                # best effort: a failed scoring does not discard the
                # predictions, they are still collected below
                print(e)
                traceback.print_exc()

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(
                    test_feat_dicts[i], feature_types)
                inflection = instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])

                final_results[i] = (test_lemmas[i], test_feat_dicts[i],
                                    inflection)

        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)
            print('clusters in test are: {}'.format(
                test_cluster_to_data_indices.keys()))
            print('clusters in train are: {}'.format(
                train_cluster_to_data_indices.keys()))

    # -1 marks "no accuracy available"; the averages below divide by the
    # number of scored clusters / summed weights, which may both be zero
    micro_average_accuracy = -1
    if accuracies:
        accuracy_vals = [entry[1] for entry in accuracies]
        macro_avg_accuracy = sum(accuracy_vals) / len(accuracies)
        print('macro avg accuracy: ' + str(macro_avg_accuracy))

        mic_nom = sum([entry[0] * entry[1] for entry in accuracies])
        mic_denom = sum([entry[0] for entry in accuracies])
        if mic_denom:
            micro_average_accuracy = mic_nom / mic_denom
            print('micro avg accuracy: ' + str(micro_average_accuracy))
        else:
            print('micro avg accuracy: not available (no evaluated examples)')
    else:
        print('no cluster was evaluated successfully, accuracy is not available')

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    common.write_results_file_and_evaluate_externally(
        hyper_params, micro_average_accuracy, train_path, test_path,
        results_file_path + suffix, sigmorphon_root_dir, final_results)