def main():
    #langs = ['russian', 'turkish', 'spanish', 'arabic', 'georgian', 'german', 'navajo', 'finnish']
    langs = ['arabic']
    sig_root = '/Users/roeeaharoni/GitHub/sigmorphon2016/'
    for lang in langs:
        train_path = '{0}/data/{1}-task1-train'.format(sig_root, lang)
        test_path = '{0}/data/{1}-task1-dev'.format(sig_root, lang)
        # load train and test data
        (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
        (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
        alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

        # align the words to the inflections; the alignment will later be used by the model
        print 'started aligning'
        train_word_pairs = zip(train_lemmas, train_words)
        test_word_pairs = zip(test_lemmas, test_words)
        align_symbol = '~'

        # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
        train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

        # TODO: align together?
        test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
        # random.shuffle(train_aligned_pairs)
        # for p in train_aligned_pairs[:100]:
        #    generate_template(p)
        print 'finished aligning'
        for i, p in enumerate(test_aligned_pairs):
            print i
            print p[0]
            print p[1] + '\n'
    return
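
The commented-out dumb_align baseline above is not defined in this snippet. Here is a minimal sketch of what such a padding aligner could look like, as a hypothetical stand-in (the project's real common.dumb_align may differ); it assumes the aligner only needs to pad the shorter string of each pair with the align symbol:

# Hypothetical sketch of a dumb_align-style baseline: pad the shorter string of
# each (lemma, word) pair with the align symbol so both strings have equal length.
def dumb_align_sketch(word_pairs, align_symbol):
    aligned = []
    for lemma, word in word_pairs:
        length = max(len(lemma), len(word))
        aligned.append((lemma.ljust(length, align_symbol),
                        word.ljust(length, align_symbol)))
    return aligned

# dumb_align_sketch([('fly', 'flying')], '~') -> [('fly~~~', 'flying')]
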
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, eval_only, ensemble):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'dev path = ' + str(dev_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # tells the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections; the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)

    # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    dev_pos_to_data_indices = common.cluster_data_by_pos(dev_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    dev_cluster_to_data_indices = dev_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(dev_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # dev_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, dev_lemmas, dev_feat_dicts, train_cluster_to_data_indices, dev_words,
                       dev_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, dev_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    if not eval_only:
        if parallelize_training:

            # set maxtasksperchild=1 to free finished processes
            p = Pool(4, maxtasksperchild=1)
            print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
            p.map(train_cluster_model_wrapper, params)
        else:
            print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
            last_epochs = []
            for p in params:
                cluster_index = p[3]
                cluster_name = p[4]
                trained_model, last_epoch = train_cluster_model(*p)

                # print when each model stopped
                epoch_output = 'cluster {0} - {1} stopped on epoch {2}'.format(cluster_index, cluster_name, last_epoch)
                last_epochs.append(epoch_output)
                print epoch_output

            with open(results_file_path + '.epochs', 'w') as f:
                f.writelines(line + '\n' for line in last_epochs)

        print 'finished training all models'
    else:
        print 'skipped training by request. evaluating best models:'

    # eval on dev
    print '=========DEV EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, dev_cluster_to_data_indices, dev_feat_dicts, dev_lemmas, dev_path,
                  dev_words, train_cluster_to_data_indices, train_path, train_words)

    # eval on test
    print '=========TEST EVALUATION:========='
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_cluster_to_data_indices, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_cluster_to_data_indices, train_path, train_words)

    return
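
The alphabet_index / inverse_alphabet_index pair built above is a plain char-to-int mapping and its inverse. A tiny self-contained round trip, shown only to make the encode/decode direction explicit:

# Self-contained round trip through a char 2 int mapping and its inverse,
# mirroring the alphabet_index / inverse_alphabet_index construction above.
alphabet = ['a', 'b', 'c', '~']
alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

encoded = [alphabet_index[c] for c in 'cab']                   # [2, 0, 1]
decoded = ''.join(inverse_alphabet_index[i] for i in encoded)  # 'cab'
assert decoded == 'cab'
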
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, eval_only, ensemble):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'dev path = ' + str(dev_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # tells the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    if not eval_only:

        # align the words to the inflections; the alignment will later be used by the model
        print 'started aligning'
        train_word_pairs = zip(train_lemmas, train_words)
        dev_word_pairs = zip(dev_lemmas, dev_words)

        # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
        train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

        # TODO: align together?
        dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
        print 'finished aligning'

        last_epochs = []
        trained_model, last_epoch = train_model_wrapper(input_dim, hidden_dim, layers, train_lemmas, train_feat_dicts,
                                                        train_words, dev_lemmas, dev_feat_dicts, dev_words,
                                                        alphabet, alphabet_index, inverse_alphabet_index, epochs,
                                                        optimization, results_file_path, train_aligned_pairs,
                                                        dev_aligned_pairs,
                                                        feat_index, feature_types, feat_input_dim, feature_alphabet,
                                                        plot)

        # print and record when the model stopped
        epoch_output = 'stopped on epoch {}'.format(last_epoch)
        last_epochs.append(epoch_output)
        print epoch_output

        with open(results_file_path + '.epochs', 'w') as f:
            f.writelines(line + '\n' for line in last_epochs)

        print 'finished training all models'
    else:
        print 'skipped training by request. evaluating best models:'

    # eval on dev
    #~ print '=========DEV EVALUATION:========='
    #~ evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  #~ hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  #~ sigmorphon_root_dir, dev_feat_dicts, dev_lemmas, dev_path,
                  #~ dev_words, train_path)

    # eval on test
    print '=========TEST EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_path)

    return
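
The numeric markers added to the alphabet above (str(i) for i up to 3 * MAX_PREDICTION_LEN) let the model emit "copy the lemma character at index i" instead of a literal character. The following is a hedged sketch of how such predictions could be decoded; the convention is an assumption made for illustration, not the project's actual decoder:

# Assumed decoding convention for the numeric copy markers (illustration only):
# a predicted symbol that is a digit string means "copy lemma[index]".
def decode_with_copy_markers(predicted_symbols, lemma):
    output = []
    for symbol in predicted_symbols:
        if symbol.isdigit():
            index = int(symbol)
            if index < len(lemma):
                output.append(lemma[index])
        else:
            output.append(symbol)
    return ''.join(output)

# decode_with_copy_markers(['0', '1', '2', 'i', 'n', 'g'], 'fly') -> 'flying'
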
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections; the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
                       test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    if parallelize_training:
        # set maxtasksperchild=1 to free finished processes
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
        models = p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
        for p in params:
            trained_model, last_epoch = train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system('python task1_evaluate_best_joint_structured_models_blstm_feed_fix.py --cnn-mem 6096 --input={0} --hidden={1} --feat-input={2} \
                 --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(input_dim, hidden_dim,
                                                                                      feat_input_dim, epochs,
                                                                                      layers, optimization, train_path,
                                                                                      test_path,
                                                                                      results_file_path,
                                                                                      sigmorphon_root_dir))
    return
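
common.cluster_data_by_pos is not shown in this snippet. A minimal sketch of POS-based clustering, assuming each feature dict carries a 'pos' key (the key name is an assumption), illustrating the {pos: [example indices]} shape consumed by the per-cluster training above:

# Hypothetical sketch of cluster_data_by_pos: group example indices by the
# 'pos' value of each feature dict (the key name is an assumption).
from collections import defaultdict

def cluster_by_pos_sketch(feat_dicts):
    pos_to_indices = defaultdict(list)
    for i, feat_dict in enumerate(feat_dicts):
        pos_to_indices[feat_dict.get('pos', 'NONE')].append(i)
    return dict(pos_to_indices)

# cluster_by_pos_sketch([{'pos': 'V'}, {'pos': 'N'}, {'pos': 'V'}])
# -> {'V': [0, 2], 'N': [1]}
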
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_target_words, train_source_words, train_target_feat_dicts,
     train_source_feat_dicts) = prepare_sigmorphon_data.load_data(train_path, 2)
    (test_target_words, test_source_words, test_target_feat_dicts,
     test_source_feat_dicts) = prepare_sigmorphon_data.load_data(test_path, 2)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_target_words, train_source_words,
                                                                   train_target_feat_dicts, train_source_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_source_feat_dicts + train_target_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections; the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_source_words, train_target_words)
    test_word_pairs = zip(test_source_words, test_target_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    # TODO: do we need to cluster on both source and target feats?
    #       probably enough to cluster on the source here because the POS will
    #       be the same (no derivational morphology in this task)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_source_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_source_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append(
            [input_dim, hidden_dim, layers, cluster_index, cluster_type, train_source_words, train_source_feat_dicts,
             train_target_words, train_target_feat_dicts, test_source_words, test_source_feat_dicts,
             train_cluster_to_data_indices, test_target_words, test_target_feat_dicts,
             test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
             optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
             feature_types, feat_input_dim, feature_alphabet, plot])

    if parallelize_training:
        # set maxtasksperchild=1 to free finished processes
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
        models = p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
        for p in params:
            trained_model, last_epoch = train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system('python task2_evaluate_best_joint_structured_models_blstm_feed_fix.py --cnn-mem 6096 --input={0} --hidden={1} --feat-input={2} \
                 --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(input_dim, hidden_dim,
                                                                                      feat_input_dim, epochs,
                                                                                      layers, optimization, train_path,
                                                                                      test_path,
                                                                                      results_file_path,
                                                                                      sigmorphon_root_dir))
    return
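
The Pool(4, maxtasksperchild=1) pattern above trades process-startup cost for memory: each worker process is replaced after a single task, so a finished model's memory is returned to the OS. A standalone toy illustration of the same call shape:

# Toy illustration of the Pool usage above: maxtasksperchild=1 replaces each
# worker process after one task, freeing whatever memory that task held.
from multiprocessing import Pool

def toy_task(n):
    return n * n

if __name__ == '__main__':
    pool = Pool(4, maxtasksperchild=1)
    print pool.map(toy_task, range(8))  # [0, 1, 4, 9, 16, 25, 36, 49]
    pool.close()
    pool.join()
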
def main(train_path, test_path, results_file_path, sigmorphon_root_dir,
         input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization):
    parallelize_training = PARALLELIZE
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': REGULARIZATION,
        'LEARNING_RATE': LEARNING_RATE
    }

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {
        index: char
        for char, index in alphabet_index.items()
    }

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections; the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # TODO: change build_model (done), train_model (in progress), predict (done), one word loss (done) etc. to take the
    # features in account

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(
            train_cluster_to_data_indices):
        params.append([
            input_dim, hidden_dim, layers, cluster_index, cluster_type,
            train_lemmas, train_feat_dicts, train_words, test_lemmas,
            test_feat_dicts, train_cluster_to_data_indices, test_words,
            test_cluster_to_data_indices, alphabet, alphabet_index,
            inverse_alphabet_index, epochs, optimization, results_file_path,
            train_aligned_pairs, test_aligned_pairs, feat_index, feature_types,
            feat_input_dim, feature_alphabet
        ])

    if parallelize_training:
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(
            len(train_cluster_to_data_indices))
        p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(
            len(train_cluster_to_data_indices))
        for p in params:
            train_cluster_model(*p)
    print 'finished training all models'

    # evaluate best models
    os.system(
        'python task1_evaluate_best_joint_structured_models.py --cnn-mem 6096 --input={0} --hidden={1} --feat-input={2} \
                 --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.
        format(input_dim, hidden_dim, feat_input_dim, epochs, layers,
               optimization, train_path, test_path, results_file_path,
               sigmorphon_root_dir))
    return
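
The os.system call above interpolates paths straight into a shell string. A sketch of the same invocation through subprocess.call with an explicit argument list (same script and flags as above; safer if a path ever contains spaces):

# Sketch of the evaluation call above via subprocess.call with a list of
# arguments instead of a shell string (same script and flags as the original).
import subprocess

def run_evaluation_sketch(input_dim, hidden_dim, feat_input_dim, epochs, layers,
                          optimization, train_path, test_path,
                          results_file_path, sigmorphon_root_dir):
    return subprocess.call(
        ['python', 'task1_evaluate_best_joint_structured_models.py',
         '--cnn-mem', '6096',
         '--input={0}'.format(input_dim),
         '--hidden={0}'.format(hidden_dim),
         '--feat-input={0}'.format(feat_input_dim),
         '--epochs={0}'.format(epochs),
         '--layers={0}'.format(layers),
         '--optimization={0}'.format(optimization),
         train_path, test_path, results_file_path, sigmorphon_root_dir])
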
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim, epochs,
         layers, optimization, regularization, learning_rate, plot):
    if plot:
        parallelize_training = False
        print 'plotting, parallelization is disabled!!!'
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # tells the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections; the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)

    # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, ALIGN_SYMBOL)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    # joint model: cluster the data by POS type (features)
    train_pos_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_pos_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    train_cluster_to_data_indices = train_pos_to_data_indices
    test_cluster_to_data_indices = test_pos_to_data_indices

    # factored model: cluster the data by inflection type (features)
    # train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
    # test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)
    # train_cluster_to_data_indices = train_morph_to_data_indices
    # test_cluster_to_data_indices = test_morph_to_data_indices

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
                       test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    # initialized here so the final epoch report below works in both branches
    last_epochs = []
    if parallelize_training:

        # set maxtasksperchild=1 to free finished processes
        p = Pool(4, maxtasksperchild=1)
        print 'now training {0} models in parallel'.format(len(train_cluster_to_data_indices))
        models = p.map(train_cluster_model_wrapper, params)
    else:
        print 'now training {0} models in loop'.format(len(train_cluster_to_data_indices))
        for p in params:
            cluster_index = p[3]
            cluster_name = p[4]
            trained_model, last_epoch = train_cluster_model(*p)

            # print when each model stopped
            epoch_output = 'cluster {0} - {1} stopped on epoch {2}'.format(cluster_index, cluster_name,
                                                                           last_epoch)
            last_epochs.append(epoch_output)
            print epoch_output

        with open(results_file_path + '.epochs', 'w') as f:
            f.writelines(line + '\n' for line in last_epochs)

    print 'finished training all models'

    # evaluate best models
    os.system('python task1_evaluate_best_nfst_models.py --cnn-mem 6096 --input={0} --hidden={1} \
    --feat-input={2} --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(input_dim, hidden_dim,
                                                                                      feat_input_dim, epochs,
                                                                                      layers, optimization, train_path,
                                                                                      test_path,
                                                                                      results_file_path,
                                                                                      sigmorphon_root_dir))
    for epoch_output in last_epochs:
        print epoch_output

    return
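
The feat 2 int mapping above follows the same zip-into-dict pattern as the character alphabet. A tiny standalone version, where the 'key=value' feature-string format is only an assumption about what common.get_feature_alphabet returns:

# Standalone feat 2 int example; the 'key=value' strings are an assumed format,
# and 'UNK_FEAT' stands in for the project's UNK_FEAT constant.
feature_alphabet = ['pos=V', 'tense=PST', 'num=SG']
feature_alphabet.append('UNK_FEAT')
feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))
# feat_index -> {'pos=V': 0, 'tense=PST': 1, 'num=SG': 2, 'UNK_FEAT': 3}
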
def train_model(model, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas,
                train_feat_dicts, train_words, dev_lemmas, dev_feat_dicts,
                dev_words, alphabet_index, inverse_alphabet_index, epochs,
                optimization, results_file_path, feat_index, feature_types,
                plot):
    print 'training...'

    np.random.seed(17)
    random.seed(17)

    if optimization == 'ADAM':
        trainer = pc.AdamTrainer(model,
                                 lam=REGULARIZATION,
                                 alpha=LEARNING_RATE,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = pc.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = pc.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = pc.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = pc.AdadeltaTrainer(model)
    else:
        trainer = pc.SimpleSGDTrainer(model)

    train_sanity_set_size = 100
    total_loss = 0
    best_avg_dev_loss = 999
    best_dev_accuracy = -1
    best_train_accuracy = -1
    best_dev_epoch = 0
    best_train_epoch = 0
    patience = 0
    train_len = len(train_words)
    epochs_x = []
    train_loss_y = []
    dev_loss_y = []
    train_accuracy_y = []
    dev_accuracy_y = []

    # progress bar init
    widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets,
                                                 maxval=epochs).start()
    avg_loss = -1
    e = 0

    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, align_symbol)

    print 'finished aligning'

    for e in xrange(epochs):

        # randomize the training set
        indices = range(train_len)
        random.shuffle(indices)
        train_set = zip(train_lemmas, train_feat_dicts, train_words,
                        train_aligned_pairs)
        train_set = [train_set[i] for i in indices]

        # compute loss for each example and update
        for i, example in enumerate(train_set):
            lemma, feats, word, alignment = example
            loss = compute_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn,
                                lemma, feats, word, alphabet_index, feat_index,
                                feature_types, alignment)
            loss_value = loss.value()
            total_loss += loss_value
            loss.backward()
            trainer.update()
            # running average over all examples seen so far (across epochs)
            avg_loss = total_loss / float(e * train_len + i + 1)

            if i % 100 == 0 and i > 0:
                print 'went through {} examples out of {}'.format(i, train_len)

        if EARLY_STOPPING:
            print 'starting epoch evaluation'

            # get train accuracy
            print 'train sanity prediction:'
            train_predictions = predict_sequences(
                model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                inverse_alphabet_index, train_lemmas[:train_sanity_set_size],
                train_feat_dicts[:train_sanity_set_size], feat_index,
                feature_types)
            print 'train sanity evaluation:'
            train_accuracy = evaluate_model(
                train_predictions, train_lemmas[:train_sanity_set_size],
                train_feat_dicts[:train_sanity_set_size],
                train_words[:train_sanity_set_size], feature_types, True)[1]

            if train_accuracy > best_train_accuracy:
                best_train_accuracy = train_accuracy
                best_train_epoch = e

            dev_accuracy = 0
            avg_dev_loss = 0

            if len(dev_lemmas) > 0:
                print 'dev prediction:'
                # get dev accuracy
                dev_predictions = predict_sequences(model, decoder_rnn,
                                                    encoder_frnn, encoder_rrnn,
                                                    alphabet_index,
                                                    inverse_alphabet_index,
                                                    dev_lemmas, dev_feat_dicts,
                                                    feat_index, feature_types)
                print 'dev evaluation:'
                # get dev accuracy
                dev_accuracy = evaluate_model(dev_predictions,
                                              dev_lemmas,
                                              dev_feat_dicts,
                                              dev_words,
                                              feature_types,
                                              print_results=True)[1]

                if dev_accuracy >= best_dev_accuracy:
                    best_dev_accuracy = dev_accuracy
                    best_dev_epoch = e

                    # save best model to disk
                    task1_attention_implementation.save_pycnn_model(
                        model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                # found "perfect" model
                if dev_accuracy == 1:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

                # get dev loss
                total_dev_loss = 0
                for i in xrange(len(dev_lemmas)):
                    total_dev_loss += compute_loss(
                        model, encoder_frnn, encoder_rrnn, decoder_rnn,
                        dev_lemmas[i], dev_feat_dicts[i], dev_words[i],
                        alphabet_index, feat_index, feature_types,
                        dev_aligned_pairs[i]).value()

                avg_dev_loss = total_dev_loss / float(len(dev_lemmas))
                if avg_dev_loss < best_avg_dev_loss:
                    best_avg_dev_loss = avg_dev_loss

                print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev accuracy: {3:.4f} train accuracy = {4:.4f} \
 best dev accuracy {5:.4f} (epoch {8}) best train accuracy: {6:.4f} (epoch {9}) patience = {7}'.format(
                    e, avg_loss, avg_dev_loss, dev_accuracy, train_accuracy,
                    best_dev_accuracy, best_train_accuracy, patience,
                    best_dev_epoch, best_train_epoch)

                if patience == MAX_PATIENCE:
                    print 'out of patience after {0} epochs'.format(str(e))
                    # TODO: would like to return best model but pycnn has a bug with save and load. Maybe copy via code?
                    # return best_model[0]
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e
            else:

                # if no dev set is present, optimize on train set
                print 'no dev set for early stopping, running all epochs until perfectly fitting ' \
                      'or patience was reached on the train set'

                if train_accuracy > best_train_accuracy:
                    best_train_accuracy = train_accuracy

                    # save best model to disk
                    task1_attention_implementation.save_pycnn_model(
                        model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                print 'epoch: {0} train loss: {1:.4f} train accuracy = {2:.4f} best train accuracy: {3:.4f} ' \
                      'patience = {4}'.format(e, avg_loss, train_accuracy,
                                              best_train_accuracy, patience)

                # found "perfect" model on train set or patience has reached
                if train_accuracy == 1 or patience == MAX_PATIENCE:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

            # update lists for plotting
            train_accuracy_y.append(train_accuracy)
            epochs_x.append(e)
            train_loss_y.append(avg_loss)
            dev_loss_y.append(avg_dev_loss)
            dev_accuracy_y.append(dev_accuracy)

        # finished epoch
        train_progress_bar.update(e)

        if plot:
            with plt.style.context('fivethirtyeight'):
                p1, = plt.plot(epochs_x, dev_loss_y, label='dev loss')
                p2, = plt.plot(epochs_x, train_loss_y, label='train loss')
                p3, = plt.plot(epochs_x, dev_accuracy_y, label='dev acc.')
                p4, = plt.plot(epochs_x, train_accuracy_y, label='train acc.')
                plt.legend(loc='upper left', handles=[p1, p2, p3, p4])
            plt.savefig(results_file_path + 'plot.png')

    train_progress_bar.finish()
    if plot:
        plt.cla()
    print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format(
        str(avg_loss), best_dev_epoch, best_train_epoch)
    # return the same (model, epoch) shape as the early-stopping returns above
    return model, e
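
A minimal standalone skeleton of the patience-based early stopping used above: training stops once the dev metric fails to improve MAX_PATIENCE epochs in a row, or a "perfect" model is found (the real loop also saves the best model and tracks losses):

# Minimal early-stopping skeleton mirroring the patience logic above.
MAX_PATIENCE = 10

def early_stopping_sketch(dev_accuracies):
    best_accuracy, patience = -1, 0
    for epoch, accuracy in enumerate(dev_accuracies):
        if accuracy > best_accuracy:
            best_accuracy, patience = accuracy, 0  # improvement: reset patience
        else:
            patience += 1
        if accuracy == 1 or patience == MAX_PATIENCE:
            return epoch  # "perfect" model, or out of patience
    return len(dev_accuracies) - 1
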
def main():
    # train_path = '../data/heb/hebrew-task1-train'
    # dev_path = '../data/heb/hebrew-task1-dev'
    # test_path = '../data/heb/hebrew-task1-test'

    # train_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/german-task1-train'
    # dev_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/german-task1-dev'
    # test_path = '../biu/gold/german-task1-test'

    train_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/finnish-task1-train'
    dev_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/finnish-task1-dev'
    test_path = '../biu/gold/finnish-task1-test'

    (train_words, train_lemmas,
     train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas,
     dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas,
     test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)
    align_symbol = '~'

    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    index2template = {}
    for i, aligned_pair in enumerate(train_aligned_pairs):
        template = task1_single_ms2s.generate_template_from_alignment(
            aligned_pair)
        index2template[i] = template

    dev_handled = 0
    print 'now trying all templates on dev'
    for pair in dev_word_pairs:
        lemma, inflection = pair
        for template in index2template.values():
            prediction = task1_single_ms2s.instantiate_template(
                template, lemma)
            if prediction == inflection:
                dev_handled += 1
                break

    print "train templates handled {} examples in dev out of {}, {}%".format(
        dev_handled, len(dev_lemmas),
        float(dev_handled) / len(dev_lemmas) * 100)

    test_handled = 0
    print 'now trying all templates on test'
    for pair in test_word_pairs:
        lemma, inflection = pair
        for template in index2template.values():
            prediction = task1_single_ms2s.instantiate_template(
                template, lemma)
            if prediction == inflection:
                test_handled += 1
                break

    print "train templates handled {} examples in test out of {}, {}%".format(
        test_handled, len(test_lemmas),
        float(test_handled) / len(test_lemmas) * 100)
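
task1_single_ms2s.generate_template_from_alignment and instantiate_template are not shown in this snippet. The following is a hedged sketch of the idea being measured above, under an assumed template representation (not necessarily the module's actual one): a template records, per output position, either "copy lemma position i" or a literal character, and instantiation replays it on a new lemma.

# Assumed template representation for illustration: integers copy a lemma
# position, strings are literal insertions. '~' is the alignment pad symbol.
def generate_template_sketch(aligned_pair):
    aligned_lemma, aligned_word = aligned_pair
    template, lemma_pos = [], 0
    for lemma_char, word_char in zip(aligned_lemma, aligned_word):
        if lemma_char == word_char:
            template.append(lemma_pos)      # copy this lemma position
        elif word_char != '~':
            template.append(word_char)      # insert a literal character
        if lemma_char != '~':
            lemma_pos += 1
    return template

def instantiate_template_sketch(template, lemma):
    output = []
    for item in template:
        if isinstance(item, int):
            if item < len(lemma):
                output.append(lemma[item])
        else:
            output.append(item)
    return ''.join(output)

# generate_template_sketch(('fly~~~', 'flying')) -> [0, 1, 2, 'i', 'n', 'g']
# instantiate_template_sketch([0, 1, 2, 'i', 'n', 'g'], 'cry') -> 'crying'
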
def train_model(model, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas, train_feat_dicts, train_words, dev_lemmas,
                dev_feat_dicts, dev_words, alphabet_index, inverse_alphabet_index, epochs, optimization,
                results_file_path, feat_index, feature_types, plot):
    print 'training...'

    np.random.seed(17)
    random.seed(17)

    if optimization == 'ADAM':
        trainer = pc.AdamTrainer(model, lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = pc.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = pc.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = pc.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = pc.AdadeltaTrainer(model)
    else:
        trainer = pc.SimpleSGDTrainer(model)

    train_sanity_set_size = 100
    total_loss = 0
    best_avg_dev_loss = 999
    best_dev_accuracy = -1
    best_train_accuracy = -1
    best_dev_epoch = 0
    best_train_epoch = 0
    patience = 0
    train_len = len(train_words)
    epochs_x = []
    train_loss_y = []
    dev_loss_y = []
    train_accuracy_y = []
    dev_accuracy_y = []

    # progress bar init
    widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start()
    avg_loss = -1
    e = 0

    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)
    align_symbol = '~'

    # train_aligned_pairs = dumb_align(train_word_pairs, align_symbol)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, align_symbol)

    print 'finished aligning'

    for e in xrange(epochs):

        # randomize the training set
        indices = range(train_len)
        random.shuffle(indices)
        train_set = zip(train_lemmas, train_feat_dicts, train_words, train_aligned_pairs)
        train_set = [train_set[i] for i in indices]

        # compute loss for each example and update
        for i, example in enumerate(train_set):
            lemma, feats, word, alignment = example
            loss = compute_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word,
                                alphabet_index, feat_index, feature_types, alignment)
            loss_value = loss.value()
            total_loss += loss_value
            loss.backward()
            trainer.update()
            # running average over all examples seen so far (across epochs)
            avg_loss = total_loss / float(e * train_len + i + 1)

            if i % 100 == 0 and i > 0:
                print 'went through {} examples out of {}'.format(i, train_len)

        if EARLY_STOPPING:
            print 'starting epoch evaluation'

            # get train accuracy
            print 'train sanity prediction:'
            train_predictions = predict_sequences(model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                                                  inverse_alphabet_index, train_lemmas[:train_sanity_set_size],
                                                  train_feat_dicts[:train_sanity_set_size],
                                                  feat_index,
                                                  feature_types)
            print 'train sanity evaluation:'
            train_accuracy = evaluate_model(train_predictions, train_lemmas[:train_sanity_set_size],
                                            train_feat_dicts[:train_sanity_set_size],
                                            train_words[:train_sanity_set_size],
                                            feature_types, True)[1]

            if train_accuracy > best_train_accuracy:
                best_train_accuracy = train_accuracy
                best_train_epoch = e

            dev_accuracy = 0
            avg_dev_loss = 0

            if len(dev_lemmas) > 0:
                print 'dev prediction:'
                # get dev accuracy
                dev_predictions = predict_sequences(model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                                                    inverse_alphabet_index, dev_lemmas, dev_feat_dicts, feat_index,
                                                    feature_types)
                print 'dev evaluation:'
                # get dev accuracy
                dev_accuracy = evaluate_model(dev_predictions, dev_lemmas, dev_feat_dicts, dev_words, feature_types,
                                              print_results=True)[1]

                if dev_accuracy >= best_dev_accuracy:
                    best_dev_accuracy = dev_accuracy
                    best_dev_epoch = e

                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                # found "perfect" model
                if dev_accuracy == 1:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

                # get dev loss
                total_dev_loss = 0
                for i in xrange(len(dev_lemmas)):
                    total_dev_loss += compute_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, dev_lemmas[i],
                                                   dev_feat_dicts[i], dev_words[i], alphabet_index, feat_index,
                                                   feature_types, dev_aligned_pairs[i]).value()

                avg_dev_loss = total_dev_loss / float(len(dev_lemmas))
                if avg_dev_loss < best_avg_dev_loss:
                    best_avg_dev_loss = avg_dev_loss

                print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev accuracy: {3:.4f} train accuracy = {4:.4f} \
 best dev accuracy {5:.4f} (epoch {8}) best train accuracy: {6:.4f} (epoch {9}) patience = {7}'.format(
                    e, avg_loss, avg_dev_loss, dev_accuracy, train_accuracy, best_dev_accuracy,
                    best_train_accuracy, patience, best_dev_epoch, best_train_epoch)

                if patience == MAX_PATIENCE:
                    print 'out of patience after {0} epochs'.format(str(e))
                    # TODO: would like to return best model but pycnn has a bug with save and load. Maybe copy via code?
                    # return best_model[0]
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e
            else:

                # if no dev set is present, optimize on train set
                print 'no dev set for early stopping, running all epochs until perfectly fitting ' \
                      'or patience was reached on the train set'

                if train_accuracy > best_train_accuracy:
                    best_train_accuracy = train_accuracy

                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                print 'epoch: {0} train loss: {1:.4f} train accuracy = {2:.4f} best train accuracy: {3:.4f} ' \
                      'patience = {4}'.format(e, avg_loss, train_accuracy, best_train_accuracy, patience)

                # found "perfect" model on train set or patience has reached
                if train_accuracy == 1 or patience == MAX_PATIENCE:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

            # update lists for plotting
            train_accuracy_y.append(train_accuracy)
            epochs_x.append(e)
            train_loss_y.append(avg_loss)
            dev_loss_y.append(avg_dev_loss)
            dev_accuracy_y.append(dev_accuracy)

        # finished epoch
        train_progress_bar.update(e)

        if plot:
            with plt.style.context('fivethirtyeight'):
                p1, = plt.plot(epochs_x, dev_loss_y, label='dev loss')
                p2, = plt.plot(epochs_x, train_loss_y, label='train loss')
                p3, = plt.plot(epochs_x, dev_accuracy_y, label='dev acc.')
                p4, = plt.plot(epochs_x, train_accuracy_y, label='train acc.')
                plt.legend(loc='upper left', handles=[p1, p2, p3, p4])
            plt.savefig(results_file_path + 'plot.png')

    train_progress_bar.finish()
    if plot:
        plt.cla()
    print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format(str(avg_loss),
                                                                                                     best_dev_epoch,
                                                                                                     best_train_epoch)
    # return the same (model, epoch) shape as the early-stopping returns above
    return model, e
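
The per-epoch shuffle above goes through an explicit index list so the lemma/feature/word/alignment tuples stay in sync with each other. The same pattern in isolation (Python 2, where zip and range return lists):

# Isolated version of the epoch-shuffling pattern above: shuffling an index
# list keeps all the parallel sequences aligned with each other.
import random

lemmas, words = ['a', 'b', 'c'], [1, 2, 3]
train_set = zip(lemmas, words)        # list of tuples in Python 2
indices = range(len(train_set))
random.shuffle(indices)
shuffled = [train_set[i] for i in indices]
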
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot, eval_only, ensemble):
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print 'train path = ' + str(train_path)
    print 'dev path = ' + str(dev_path)
    print 'test path = ' + str(test_path)
    for param in hyper_params:
        print param + '=' + str(hyper_params[param])

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections; the alignment will later be used by the model
    print 'started aligning'
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)

    # train_aligned_pairs = dumb_align(train_word_pairs, ALIGN_SYMBOL)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
    # random.shuffle(train_aligned_pairs)
    # for p in train_aligned_pairs[:100]:
    #    generate_template(p)
    print 'finished aligning'

    if not eval_only:
        last_epochs = []
        trained_model, last_epoch = train_model_wrapper(input_dim, hidden_dim, layers, train_lemmas, train_feat_dicts,
                                                        train_words, dev_lemmas, dev_feat_dicts, dev_words,
                                                        alphabet, alphabet_index, inverse_alphabet_index, epochs,
                                                        optimization, results_file_path, train_aligned_pairs,
                                                        dev_aligned_pairs,
                                                        feat_index, feature_types, feat_input_dim, feature_alphabet,
                                                        plot)

        # print and record when the model stopped
        epoch_output = 'stopped on epoch {}'.format(last_epoch)
        last_epochs.append(epoch_output)
        print epoch_output

        with open(results_file_path + '.epochs', 'w') as f:
            f.writelines(line + '\n' for line in last_epochs)

        print 'finished training all models'
    else:
        print 'skipped training by request. evaluating best models:'

    # eval on dev
    print '=========DEV EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, dev_feat_dicts, dev_lemmas, dev_path,
                  dev_words, train_path)

    # eval on test
    print '=========TEST EVALUATION:========='
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_path)

    return