def main():
    """Align the task-1 lemma/word pairs and print the aligned dev pairs.

    For every language in ``langs`` (currently only Arabic) the train and dev
    files under the hard-coded sigmorphon checkout are loaded, both splits are
    aligned with ``common.mcmc_align`` and every aligned dev pair is printed
    to stdout together with its running index.
    """
    # full language list:
    # ['russian', 'turkish', 'spanish', 'arabic', 'georgian', 'german', 'navajo', 'finnish']
    langs = ['arabic']
    sig_root = '/Users/roeeaharoni/GitHub/sigmorphon2016/'
    align_symbol = '~'

    for lang in langs:
        train_path = '{0}/data/{1}-task1-train'.format(sig_root, lang)
        test_path = '{0}/data/{1}-task1-dev'.format(sig_root, lang)

        # load train and test (dev) data
        train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
        test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
        alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                        train_feat_dicts)

        # align the words to the inflections, the alignment will later be used by the model
        print('started aligning')
        train_aligned_pairs = common.mcmc_align(zip(train_lemmas, train_words), align_symbol)
        # TODO: align together?
        test_aligned_pairs = common.mcmc_align(zip(test_lemmas, test_words), align_symbol)
        print('finished aligning')

        # dump every aligned dev pair: index, aligned lemma, aligned word + blank line
        for index, pair in enumerate(test_aligned_pairs):
            print(index)
            print(pair[0])
            print(pair[1] + '\n')
def main():
    """Run the MCMC aligner on the task-1 data and print the aligned dev pairs.

    Loads the train and dev files of each language in ``langs`` (only Arabic
    is enabled) from the hard-coded sigmorphon checkout, aligns the
    (lemma, word) pairs of both splits with ``common.mcmc_align`` and writes
    each aligned dev pair, preceded by its index, to stdout.
    """
    # full language list:
    # ['russian', 'turkish', 'spanish', 'arabic', 'georgian', 'german', 'navajo', 'finnish']
    langs = ['arabic']
    sig_root = '/Users/roeeaharoni/GitHub/sigmorphon2016/'
    align_symbol = '~'

    for lang in langs:
        train_path = '{0}/data/{1}-task1-train'.format(sig_root, lang)
        test_path = '{0}/data/{1}-task1-dev'.format(sig_root, lang)

        # load train and test (dev) data
        train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
        test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
        alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                        train_feat_dicts)

        # align the words to the inflections, the alignment will later be used by the model
        print('started aligning')
        train_pairs = zip(train_lemmas, train_words)
        dev_pairs = zip(test_lemmas, test_words)
        train_aligned_pairs = common.mcmc_align(train_pairs, align_symbol)
        # TODO: align together?
        test_aligned_pairs = common.mcmc_align(dev_pairs, align_symbol)
        print('finished aligning')

        # print index, aligned lemma and aligned word (followed by a blank line)
        for position, aligned in enumerate(test_aligned_pairs):
            print(position)
            print(aligned[0])
            print(aligned[1] + '\n')
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization):
    """Train one factored model per inflection type, then run the evaluation script.

    The train/test files are loaded, the character alphabet is extended with
    the special symbols, the examples are clustered by inflection type and one
    parameter list per cluster is handed to ``train_morph_model`` (through a
    4-process pool or sequentially). Finally
    ``task1_evaluate_best_factored_models.py`` is launched as a child process.

    Args:
        train_path: path of the training file.
        test_path: path of the test file.
        results_file_path: path prefix forwarded to training and evaluation.
        sigmorphon_root_dir: forwarded to the evaluation script.
        input_dim, hidden_dim, epochs, layers, optimization: hyper-parameters
            forwarded to training and to the evaluation script.
    """
    # local import: keeps the block self-contained when applied on its own
    import subprocess

    parallelize_training = True
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization,
                    'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': REGULARIZATION,
                    'LEARNING_RATE': LEARNING_RATE}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int (and back)
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # cluster the data by inflection type (features)
    train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feats)
    test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feats)

    # factored model: new model per inflection type
    params = [[input_dim, hidden_dim, layers, morph_index, morph_type, train_lemmas, train_words, test_lemmas,
               train_morph_to_data_indices, test_words, test_morph_to_data_indices, alphabet, alphabet_index,
               inverse_alphabet_index, epochs, optimization, results_file_path]
              for morph_index, morph_type in enumerate(train_morph_to_data_indices)]

    if parallelize_training:
        # maxtasksperchild=1: every model is trained in a fresh worker process
        pool = Pool(4, maxtasksperchild=1)
        try:
            pool.map(train_morph_model_wrapper, params)
        finally:
            # map() has returned or failed by now, so release the workers
            pool.terminate()
            pool.join()
    else:
        for model_params in params:
            train_morph_model(*model_params)
    print('finished training all models')

    # evaluate best models; an argument list (no shell) keeps paths with
    # spaces or shell meta-characters intact
    subprocess.call(['python', 'task1_evaluate_best_factored_models.py',
                     '--cnn-mem', '4096',
                     '--input={0}'.format(input_dim),
                     '--hidden={0}'.format(hidden_dim),
                     '--epochs={0}'.format(epochs),
                     '--layers={0}'.format(layers),
                     '--optimization={0}'.format(optimization),
                     str(train_path), str(test_path), str(results_file_path), str(sigmorphon_root_dir)])
    return
def init_model(dev_path, feat_input_dim, hidden_dim, input_dim, layers, results_file_path, test_path,
               train_path):
    """Rebuild the vocabularies and load the best saved hard-attention model.

    The character alphabet is derived from the training data and extended with
    the special symbols, ``3 * MAX_PREDICTION_LEN`` index markers and the STEP
    symbol; the feature alphabet is extended with UNK_FEAT. The model is then
    loaded through ``hard_attention.load_best_model``.

    Returns:
        A 15-tuple: the eight values returned by ``load_best_model``
        (initial_model, char_lookup, feat_lookup, R, bias, encoder_frnn,
        encoder_rrnn, decoder_rnn), followed by alphabet_index, feat_index,
        feature_types, inverse_alphabet_index, dev_words, dev_lemmas and
        dev_feat_dicts.
    """
    # load train, dev and test data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    dev_words, dev_lemmas, dev_feat_dicts = prepare_sigmorphon_data.load_data(dev_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                    train_feat_dicts)

    # NULL, UNK: used for character dropout
    # EPSILON, BEGIN_WORD, END_WORD: used during decoding
    for special_symbol in (NULL, UNK, EPSILON, BEGIN_WORD, END_WORD):
        alphabet.append(special_symbol)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for position in xrange(3 * MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = {}
    for char_id, char in enumerate(alphabet):
        alphabet_index[char] = char_id
    inverse_alphabet_index = dict((char_id, char) for char, char_id in alphabet_index.items())

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = {}
    for feat_id, feat in enumerate(feature_alphabet):
        feat_index[feat] = feat_id

    # not used below; load_best_model receives results_file_path itself
    model_file_name = results_file_path + '_bestmodel.txt'

    # load model and everything else needed for prediction
    loaded = hard_attention.load_best_model(alphabet, results_file_path, input_dim, hidden_dim, layers,
                                            feature_alphabet, feat_input_dim, feature_types)
    (initial_model, char_lookup, feat_lookup, R, bias,
     encoder_frnn, encoder_rrnn, decoder_rnn) = loaded

    print('loaded existing model successfully')

    return (initial_model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn,
            alphabet_index, feat_index, feature_types, inverse_alphabet_index,
            dev_words, dev_lemmas, dev_feat_dicts)
def init_model(dev_path, feat_input_dim, hidden_dim, input_dim, layers, results_file_path, test_path,
               train_path):
    """Prepare the lookup tables and load the best stored hard-attention model.

    Builds the character alphabet from the training data (plus special
    symbols, ``3 * MAX_PREDICTION_LEN`` index markers and STEP) and the
    feature alphabet (plus UNK_FEAT), then calls
    ``hard_attention.load_best_model``.

    Returns:
        A 15-tuple: initial_model, char_lookup, feat_lookup, R, bias,
        encoder_frnn, encoder_rrnn, decoder_rnn (as returned by
        ``load_best_model``), then alphabet_index, feat_index, feature_types,
        inverse_alphabet_index, dev_words, dev_lemmas and dev_feat_dicts.
    """
    # load train, dev and test data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    dev_words, dev_lemmas, dev_feat_dicts = prepare_sigmorphon_data.load_data(dev_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                    train_feat_dicts)

    # NULL, UNK: used for character dropout
    # EPSILON, BEGIN_WORD, END_WORD: used during decoding
    for extra_symbol in (NULL, UNK, EPSILON, BEGIN_WORD, END_WORD):
        alphabet.append(extra_symbol)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker_value in xrange(3 * MAX_PREDICTION_LEN):
        alphabet.append(str(marker_value))

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict((symbol, number) for number, symbol in enumerate(alphabet))
    inverse_alphabet_index = dict((number, symbol) for symbol, number in alphabet_index.items())

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict((feature, number) for number, feature in enumerate(feature_alphabet))

    # not used below; load_best_model receives results_file_path itself
    model_file_name = results_file_path + '_bestmodel.txt'

    # load model and everything else needed for prediction
    model_parts = hard_attention.load_best_model(alphabet, results_file_path, input_dim, hidden_dim, layers,
                                                 feature_alphabet, feat_input_dim, feature_types)
    (initial_model, char_lookup, feat_lookup, R, bias,
     encoder_frnn, encoder_rrnn, decoder_rnn) = model_parts

    print('loaded existing model successfully')

    return (initial_model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn,
            alphabet_index, feat_index, feature_types, inverse_alphabet_index,
            dev_words, dev_lemmas, dev_feat_dicts)
def convert_sigmorphon_to_morphtrans(sig_file, morphtrans_file, create_alphabet = True):
    """Convert a sigmorphon data file into the morphtrans text format.

    Every example becomes one output line of the form
    ``<begin> l e m m a <end>|<begin> w o r d <end>|<morph string>``, where the
    boundary symbols are BEGIN_WORD / END_WORD and the morph string comes from
    ``get_morph_string``.

    Args:
        sig_file: path of the sigmorphon file to read.
        morphtrans_file: path of the UTF-8 output file; also the prefix of the
            alphabet files.
        create_alphabet: when true, additionally write
            ``<morphtrans_file>.word_alphabet`` and
            ``<morphtrans_file>.morph_alphabet``.
    """
    (words, lemmas, feat_dicts) = prepare_sigmorphon_data.load_data(sig_file)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(words, lemmas, feat_dicts)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    if create_alphabet:
        # only symbols shorter than two characters are listed; the word
        # boundary symbols are appended explicitly at the end
        with codecs.open(morphtrans_file + '.word_alphabet', "w", encoding='utf8') as alphabet_file:
            single_chars = [c for c in alphabet if len(c) < 2]
            alphabet_file.write(' '.join(single_chars) + ' ' + END_WORD + ' ' + BEGIN_WORD)

        # NOTE(review): assumed to belong to the create_alphabet branch - confirm against the original layout
        morph2feats = common.cluster_data_by_morph_type(feat_dicts, feats)
        with codecs.open(morphtrans_file + '.morph_alphabet', "w", encoding='utf8') as alphabet_file:
            alphabet_file.write(' '.join(morph2feats.keys()))

    with codecs.open(morphtrans_file, "w", encoding='utf8') as output_file:
        # loop variable named feat_dict so the builtin ``dict`` is not shadowed
        for lemma, word, feat_dict in zip(lemmas, words, feat_dicts):
            # <s> a b g a s k l a p p e </s>|<s> a b g a s k l a p p e </s>|case=nominative:number=singular
            source_side = BEGIN_WORD + ' ' + ' '.join(lemma) + ' ' + END_WORD
            target_side = BEGIN_WORD + ' ' + ' '.join(word) + ' ' + END_WORD
            output_file.write(source_side + '|' + target_side + '|' + get_morph_string(feat_dict, feats) + '\n')
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot):
    """Train one joint structured model per POS cluster, then run the evaluation script.

    Loads the data, builds the character and feature indices, aligns the
    (lemma, word) pairs with ``common.mcmc_align``, clusters the examples by
    POS and trains one model per cluster with ``train_cluster_model`` (in a
    4-process pool, or sequentially when plotting or when PARALLELIZE is off).
    Afterwards
    ``task1_evaluate_best_joint_structured_models_blstm_feed_fix.py`` is
    launched as a child process.

    Args:
        train_path: path of the training file.
        test_path: path of the test file.
        results_file_path: path prefix forwarded to training and evaluation.
        sigmorphon_root_dir: forwarded to the evaluation script.
        input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization:
            hyper-parameters forwarded to training and evaluation.
        regularization, learning_rate: only reported in the printed
            hyper-parameter summary here.
        plot: forwarded to training; a true value disables parallel training.
    """
    # local import: keeps the block self-contained when applied on its own
    import subprocess

    if plot:
        parallelize_training = False
        print('plotting, parallelization is disabled!!!')
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train and test data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                    train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in xrange(MAX_PREDICTION_LEN):
        alphabet.append(str(marker))

    # char 2 int
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict((feat, index) for index, feat in enumerate(feature_alphabet))

    # align the words to the inflections, the alignment will later be used by the model
    print('started aligning')
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    align_symbol = '~'

    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)
    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    print('finished aligning')

    # joint model: cluster the data by POS type (features).
    # (for the factored variant, cluster with common.cluster_data_by_morph_type(feat_dicts, feature_types) instead)
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # create input for each model and then parallelize or run in loop.
    params = [[input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
               train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
               test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
               optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
               feature_types, feat_input_dim, feature_alphabet, plot]
              for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices)]

    if parallelize_training:
        # set maxtasksperchild=1 to free finished processes
        pool = Pool(4, maxtasksperchild=1)
        try:
            print('now training {0} models in parallel'.format(len(train_cluster_to_data_indices)))
            pool.map(train_cluster_model_wrapper, params)
        finally:
            # map() has returned or failed by now, so release the workers
            pool.terminate()
            pool.join()
    else:
        print('now training {0} models in loop'.format(len(train_cluster_to_data_indices)))
        for model_params in params:
            train_cluster_model(*model_params)
    print('finished training all models')

    # evaluate best models; an argument list (no shell) keeps paths with
    # spaces or shell meta-characters intact
    subprocess.call(['python', 'task1_evaluate_best_joint_structured_models_blstm_feed_fix.py',
                     '--cnn-mem', '6096',
                     '--input={0}'.format(input_dim),
                     '--hidden={0}'.format(hidden_dim),
                     '--feat-input={0}'.format(feat_input_dim),
                     '--epochs={0}'.format(epochs),
                     '--layers={0}'.format(layers),
                     '--optimization={0}'.format(optimization),
                     str(train_path), str(test_path), str(results_file_path), str(sigmorphon_root_dir)])
    return
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim,
         feat_input_dim, epochs, layers, optimization, regularization, learning_rate, plot, eval_only,
         ensemble):
    """Train the model (unless ``eval_only``) and evaluate it on the test set.

    Loads the train/dev/test files, builds the character and feature indices
    and, when training is requested, aligns the train and dev pairs with
    ``common.mcmc_align`` and calls ``train_model_wrapper``. The epoch on
    which training stopped is written to ``<results_file_path>.epochs``.
    Evaluation on the test set is delegated to ``evaluate_ndst``.

    Args:
        train_path, dev_path, test_path: paths of the data files.
        results_file_path: path prefix forwarded to training and evaluation.
        sigmorphon_root_dir: forwarded to ``evaluate_ndst``.
        input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization:
            hyper-parameters forwarded to training and evaluation.
        regularization, learning_rate: only stored in ``hyper_params`` here.
        plot: forwarded to ``train_model_wrapper``.
        eval_only: when true, skip alignment and training.
        ensemble: forwarded to ``evaluate_ndst``.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('dev path =' + str(dev_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train, dev and test data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    dev_words, dev_lemmas, dev_feat_dicts = prepare_sigmorphon_data.load_data(dev_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                    train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in xrange(3 * MAX_PREDICTION_LEN):
        alphabet.append(str(marker))

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict((feat, index) for index, feat in enumerate(feature_alphabet))

    if not eval_only:
        # align the words to the inflections, the alignment will later be used by the model
        print('started aligning')
        train_word_pairs = zip(train_lemmas, train_words)
        dev_word_pairs = zip(dev_lemmas, dev_words)

        train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)
        # TODO: align together?
        dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
        print('finished aligning')

        trained_model, last_epoch = train_model_wrapper(input_dim, hidden_dim, layers, train_lemmas,
                                                        train_feat_dicts, train_words, dev_lemmas,
                                                        dev_feat_dicts, dev_words, alphabet, alphabet_index,
                                                        inverse_alphabet_index, epochs, optimization,
                                                        results_file_path, train_aligned_pairs,
                                                        dev_aligned_pairs, feat_index, feature_types,
                                                        feat_input_dim, feature_alphabet, plot)

        # print when did the model stop and record it next to the results
        print('stopped on epoch {}'.format(last_epoch))
        with open(results_file_path + '.epochs', 'w') as f:
            f.write('{}\n'.format(last_epoch))

        print('finished training all models')
    else:
        print('skipped training by request. evaluating best models:')

    # evaluation on dev is currently disabled; to enable it, call evaluate_ndst
    # with dev_feat_dicts, dev_lemmas, dev_path and dev_words instead of the
    # test counterparts.

    # eval on test
    print('=========TEST EVALUATION:=========')
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet,
                  feature_types, hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers,
                  results_file_path, sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_path)
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim):
    """Evaluate the best saved model of every POS cluster and write the joint results.

    For each training cluster the best model is loaded with
    ``load_best_model``, templates are predicted for the matching test
    examples and instantiated, and the per-cluster accuracies are combined
    into macro and micro averages. The collected predictions are handed to
    ``common.write_results_file_and_evaluate_externally``.

    If no cluster could be evaluated the function reports that and returns
    without writing results, instead of failing with a division by zero.

    Args:
        train_path: path of the training file.
        test_path: path of the test file; if it contains ``'test'`` the
            results file gets the suffix ``.best.test``, otherwise ``.best``.
        results_file_path: path prefix of the saved models and results.
        sigmorphon_root_dir: forwarded to the external evaluation.
        input_dim, hidden_dim, layers, feat_input_dim: forwarded to
            ``load_best_model``.
        epochs, optimization: only stored in ``hyper_params``.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                    train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in xrange(MAX_PREDICTION_LEN):
        alphabet.append(str(marker))

    # feat 2 int
    feat_index = dict((feat, index) for index, feat in enumerate(feature_alphabet))

    # char 2 int
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # cluster the data by POS type (features).
    # (for the factored variant, cluster with common.cluster_data_by_morph_type(feat_dicts, feature_types) instead)
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    accuracies = []
    final_results = {}

    # one model per cluster
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the cluster-specific training data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # test best model; a KeyError anywhere below is reported as a cluster
        # that is missing from the test data and the cluster is skipped
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
            test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
            test_cluster_words = [test_words[i] for i in test_indices]
            test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers,
                feature_alphabet, feat_input_dim, feature_types)

            predicted_templates = task1_joint_structured_inflection_feedback_fix.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index, inverse_alphabet_index,
                test_cluster_lemmas, test_cluster_feat_dicts, feat_index, feature_types)

            accuracy = task1_joint_structured_inflection_feedback_fix.evaluate_model(
                predicted_templates, test_cluster_lemmas, test_cluster_feat_dicts, test_cluster_words,
                feature_types, print_results=False)
            accuracies.append(accuracy)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in
            # the task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = task1_joint_structured_inflection_feedback_fix.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)

    if not accuracies:
        # nothing was evaluated: the averages below would divide by zero
        print('no cluster could be evaluated, skipping accuracy computation')
        return

    # each entry is indexed as (weight, accuracy): [0] weights the micro
    # average, [1] is the per-cluster accuracy
    accuracy_vals = [entry[1] for entry in accuracies]
    macro_avg_accuracy = sum(accuracy_vals) / len(accuracies)
    print('macro avg accuracy: ' + str(macro_avg_accuracy))

    mic_nom = sum(entry[0] * entry[1] for entry in accuracies)
    mic_denom = sum(entry[0] for entry in accuracies)
    # guard against clusters that contributed no weight at all
    micro_average_accuracy = mic_nom / mic_denom if mic_denom else 0.0
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    suffix = '.best.test' if 'test' in test_path else '.best'

    common.write_results_file_and_evaluate_externally(hyper_params, micro_average_accuracy, train_path,
                                                      test_path, results_file_path + suffix,
                                                      sigmorphon_root_dir, final_results)
def main():
    """Print data-set statistics for every sigmorphon 2016 language.

    For each language the train and dev files of the selected task are loaded
    and the following is printed: examples per POS cluster (train and dev) and
    their totals, the number of morph types, the average number of training
    examples per morph type, the number of feature types, and the values
    returned by ``get_morpheme_stats`` per cluster and for the whole training
    set ('AGG').
    """
    langs = ['russian', 'georgian', 'finnish', 'arabic', 'navajo', 'spanish', 'turkish', 'german', 'hungarian',
             'maltese']

    for lang in langs:
        task_num = 1
        train_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/{0}-task{1}-train'.format(lang,
                                                                                               str(task_num))
        dev_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/{0}-task{1}-dev'.format(lang, str(task_num))

        if task_num == 1 or task_num == 3:
            (train_targets, train_sources, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
            (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
            alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_targets, train_sources,
                                                                            train_feat_dicts)
            train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
            test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
            train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feature_types)
            test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feature_types)

        if task_num == 2:
            # task 2 files carry feature dicts for both the source and the target form
            (train_targets, train_sources, train_target_feat_dicts,
             train_source_feat_dicts) = prepare_sigmorphon_data.load_data(train_path, task=2)
            (test_targets, test_sources, test_target_feat_dicts,
             test_source_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path, task=2)
            alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_targets, train_sources,
                                                                            train_target_feat_dicts,
                                                                            train_source_feat_dicts)
            train_cluster_to_data_indices = common.cluster_data_by_pos(train_target_feat_dicts)
            test_cluster_to_data_indices = common.cluster_data_by_pos(test_target_feat_dicts)
            train_morph_to_data_indices = common.cluster_data_by_morph_type(train_target_feat_dicts,
                                                                            feature_types)
            test_morph_to_data_indices = common.cluster_data_by_morph_type(test_target_feat_dicts,
                                                                           feature_types)

        # examples per cluster and in total, train split
        train_agg = 0
        for cluster in train_cluster_to_data_indices:
            train_agg += len(train_cluster_to_data_indices[cluster])
            print('train ' + lang + ' ' + cluster + ' : ' + str(len(train_cluster_to_data_indices[cluster])) +
                  ' examples')
        print('train ' + lang + ' ' + 'agg' + ' : ' + str(train_agg) + ' examples')

        # examples per cluster and in total, dev split
        dev_agg = 0
        for cluster in test_cluster_to_data_indices:
            dev_agg += len(test_cluster_to_data_indices[cluster])
            print('dev ' + lang + ' ' + cluster + ' : ' + str(len(test_cluster_to_data_indices[cluster])) +
                  ' examples')
        print('dev ' + lang + ' ' + 'agg' + ' : ' + str(dev_agg) + ' examples')

        print(lang + ' train morphs: ' + str(len(train_morph_to_data_indices)))
        # average over the example index lists of each morph type (the values),
        # not over the lengths of the morph-type key strings
        examples_per_morph = [len(train_morph_to_data_indices[morph]) for morph in train_morph_to_data_indices]
        print(lang + ' avg ex. per morph: ' +
              str(sum(examples_per_morph) / float(len(train_morph_to_data_indices))))
        print(lang + ' dev morphs: ' + str(len(test_morph_to_data_indices)))
        print(lang + ' num features: ' + str(len(feature_types)))

        # morpheme statistics per cluster
        for cluster in train_cluster_to_data_indices:
            train_cluster_words = [train_targets[i] for i in train_cluster_to_data_indices[cluster]]
            train_cluster_lemmas = [train_sources[i] for i in train_cluster_to_data_indices[cluster]]
            (prefix_count, suffix_count, same_count, circumfix_count, other_count, lev_avg,
             del_avg) = get_morpheme_stats(train_cluster_words, train_cluster_lemmas)
            print("train {0} {1} {2} & {3} & {4} & {5} & {6} & {7:.3f} & {8:.3f}".format(
                lang, cluster, prefix_count, suffix_count, same_count, circumfix_count, other_count, lev_avg,
                del_avg))

        for cluster in train_cluster_to_data_indices:
            print('train ' + lang + ' ' + cluster + ' : ' + str(len(train_cluster_to_data_indices[cluster])) +
                  ' examples')

        # morpheme statistics over the whole training set
        # NOTE(review): assumed to run once per language, after the loop above - confirm
        (prefix_count, suffix_count, same_count, circumfix_count, other_count, lev_avg,
         del_avg) = get_morpheme_stats(train_targets, train_sources)
        print("train {0} {1} {2} & {3} & {4} & {5} & {6} & {7:.3f} & {8:.3f}".format(
            lang, 'AGG', prefix_count, suffix_count, same_count, circumfix_count, other_count, lev_avg,
            del_avg))
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization):
    """Train one factored model per inflection type, then run the evaluation script.

    The train/test files are loaded, the character alphabet is extended with
    the special symbols, the examples are clustered by inflection type and one
    parameter list per cluster is handed to ``train_morph_model`` (through a
    4-process pool or sequentially). Finally
    ``task1_evaluate_best_factored_models.py`` is launched as a child process.

    Args:
        train_path: path of the training file.
        test_path: path of the test file.
        results_file_path: path prefix forwarded to training and evaluation.
        sigmorphon_root_dir: forwarded to the evaluation script.
        input_dim, hidden_dim, epochs, layers, optimization: hyper-parameters
            forwarded to training and to the evaluation script.
    """
    # local import: keeps the block self-contained when applied on its own
    import subprocess

    parallelize_training = True
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': REGULARIZATION,
        'LEARNING_RATE': LEARNING_RATE
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int (and back)
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # cluster the data by inflection type (features)
    train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feats)
    test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feats)

    # factored model: new model per inflection type
    params = [[input_dim, hidden_dim, layers, morph_index, morph_type, train_lemmas, train_words, test_lemmas,
               train_morph_to_data_indices, test_words, test_morph_to_data_indices, alphabet, alphabet_index,
               inverse_alphabet_index, epochs, optimization, results_file_path]
              for morph_index, morph_type in enumerate(train_morph_to_data_indices)]

    if parallelize_training:
        # maxtasksperchild=1: every model is trained in a fresh worker process
        pool = Pool(4, maxtasksperchild=1)
        try:
            pool.map(train_morph_model_wrapper, params)
        finally:
            # map() has returned or failed by now, so release the workers
            pool.terminate()
            pool.join()
    else:
        for model_params in params:
            train_morph_model(*model_params)
    print('finished training all models')

    # evaluate best models; an argument list (no shell) keeps paths with
    # spaces or shell meta-characters intact
    subprocess.call(['python', 'task1_evaluate_best_factored_models.py',
                     '--cnn-mem', '4096',
                     '--input={0}'.format(input_dim),
                     '--hidden={0}'.format(hidden_dim),
                     '--epochs={0}'.format(epochs),
                     '--layers={0}'.format(layers),
                     '--optimization={0}'.format(optimization),
                     str(train_path), str(test_path), str(results_file_path), str(sigmorphon_root_dir)])
    return
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim,
         feat_input_dim, epochs, layers, optimization, regularization, learning_rate, plot, override,
         eval_only, ensemble):
    """Load or build the attention model, optionally train it, and evaluate on the test set.

    An existing ``<results_file_path>_bestmodel.txt`` is loaded unless
    ``override`` is set; otherwise a fresh model is built. Unless ``eval_only``
    is set the model is trained with ``train_model``. Predictions for the test
    lemmas come either from ``predict_with_ensemble_majority`` (when
    ``ensemble`` is truthy) or from ``predict_sequences``; if there are any,
    they are scored with ``evaluate_model`` and written out through
    ``common.write_results_file_and_evaluate_externally``.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name in hyper_params:
        print(name + '=' + str(hyper_params[name]))

    # load train, test and dev data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    dev_words, dev_lemmas, dev_feat_dicts = prepare_sigmorphon_data.load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas,
                                                                    train_feat_dicts)

    # NULL, UNK: used for character dropout
    # EPSILON, BEGIN_WORD, END_WORD: used during decoding
    for special_symbol in (NULL, UNK, EPSILON, BEGIN_WORD, END_WORD):
        alphabet.append(special_symbol)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for position in xrange(MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # char 2 int
    alphabet_index = dict((char, char_id) for char_id, char in enumerate(alphabet))
    inverse_alphabet_index = dict((char_id, char) for char, char_id in alphabet_index.items())

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict((feat, feat_id) for feat_id, feat in enumerate(feature_alphabet))

    model_file_name = results_file_path + '_bestmodel.txt'
    if override or not os.path.isfile(model_file_name):
        print('could not find existing model or explicit override was requested. starting training from scratch...')
        model, encoder_frnn, encoder_rrnn, decoder_rnn = build_model(alphabet, input_dim, hidden_dim, layers,
                                                                     feature_types, feat_input_dim,
                                                                     feature_alphabet)
    else:
        print('loading existing model from {}'.format(model_file_name))
        model, encoder_frnn, encoder_rrnn, decoder_rnn = task1_attention_implementation.load_best_model(
            alphabet, results_file_path, input_dim, hidden_dim, layers, feature_alphabet, feat_input_dim,
            feature_types)
        print('loaded existing model successfully')

    if eval_only:
        print('skipped training, evaluating on test set...')
    else:
        # start training
        trained_model, last_epoch, best_epoch = train_model(model, encoder_frnn, encoder_rrnn, decoder_rnn,
                                                            train_lemmas, train_feat_dicts, train_words,
                                                            dev_lemmas, dev_feat_dicts, dev_words,
                                                            alphabet_index, inverse_alphabet_index, epochs,
                                                            optimization, results_file_path, feat_index,
                                                            feature_types, plot)
        model = trained_model
        print('last epoch is {}'.format(last_epoch))
        print('best epoch is {}'.format(best_epoch))
        print('finished training')

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(alphabet, alphabet_index, ensemble, feat_index,
                                                             feat_input_dim, feature_alphabet, feature_types,
                                                             hidden_dim, input_dim, inverse_alphabet_index,
                                                             layers, test_feat_dicts, test_lemmas, test_words)
    else:
        predicted_sequences = predict_sequences(model, decoder_rnn, encoder_frnn, encoder_rrnn,
                                                alphabet_index, inverse_alphabet_index, test_lemmas,
                                                test_feat_dicts, feat_index, feature_types)

    # nothing predicted: nothing to score or write
    if len(predicted_sequences) == 0:
        return

    # evaluate last model on test
    amount, accuracy = evaluate_model(predicted_sequences, test_lemmas, test_feat_dicts, test_words,
                                      feature_types, print_results=False)
    print('initial eval: {}% accuracy'.format(accuracy))

    # collect the predictions in file order, keyed by example position
    final_results = {}
    for position, lemma in enumerate(test_lemmas):
        joint_index = lemma + ':' + common.get_morph_string(test_feat_dicts[position], feature_types)
        inflection = predicted_sequences[joint_index]
        final_results[position] = (lemma, test_feat_dicts[position], ''.join(inflection))

    # evaluate best models
    common.write_results_file_and_evaluate_externally(hyper_params, accuracy, train_path, test_path,
                                                      results_file_path + '.external_eval.txt',
                                                      sigmorphon_root_dir, final_results)
    return
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim,
         feat_input_dim, epochs, layers, optimization, regularization, learning_rate, plot, eval_only,
         ensemble):
    """Train a model (unless ``eval_only`` is set) and evaluate it on dev and test.

    Loads the train/dev/test files, builds the character and feature indices,
    aligns lemma/word pairs with ``common.mcmc_align``, optionally trains through
    ``train_model_wrapper`` and finally calls ``evaluate_ndst`` once for the dev
    data and once for the test data.

    Args:
        train_path, dev_path, test_path: data files handed to
            ``prepare_sigmorphon_data.load_data``.
        results_file_path: prefix for output files; the epoch training stopped
            on is written to ``results_file_path + '.epochs'``.
        sigmorphon_root_dir: forwarded to ``evaluate_ndst``.
        input_dim, hidden_dim, feat_input_dim, layers: forwarded to training
            and evaluation.
        epochs, optimization, plot: forwarded to ``train_model_wrapper``.
        regularization, learning_rate: only recorded in ``hyper_params`` here.
        eval_only: when true, training is skipped.
        ensemble: forwarded to ``evaluate_ndst``.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('dev path =' + str(dev_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train, dev and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    alphabet.extend(str(i) for i in xrange(3 * MAX_PREDICTION_LEN))

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print('started aligning')
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)

    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
    print('finished aligning')

    if not eval_only:
        _trained_model, last_epoch = train_model_wrapper(input_dim, hidden_dim, layers, train_lemmas,
                                                         train_feat_dicts, train_words, dev_lemmas,
                                                         dev_feat_dicts, dev_words, alphabet, alphabet_index,
                                                         inverse_alphabet_index, epochs, optimization,
                                                         results_file_path, train_aligned_pairs,
                                                         dev_aligned_pairs, feat_index, feature_types,
                                                         feat_input_dim, feature_alphabet, plot)

        # report when the model stopped
        print('stopped on epoch {}'.format(last_epoch))

        # The original wrote an always-empty list here, so the '.epochs' file
        # never contained anything; record the epoch that was just printed.
        with open(results_file_path + '.epochs', 'w') as f:
            f.write('{}\n'.format(last_epoch))

        print('finished training all models')
    else:
        print('skipped training by request. evaluating best models:')

    # eval on dev
    print('=========DEV EVALUATION:=========')
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet,
                  feature_types, hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers,
                  results_file_path, sigmorphon_root_dir, dev_feat_dicts, dev_lemmas, dev_path, dev_words,
                  train_path)

    # eval on test
    print('=========TEST EVALUATION:=========')
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet,
                  feature_types, hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers,
                  results_file_path, sigmorphon_root_dir, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_path)

    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot):
    """Train one task-2 model per POS cluster, then launch the evaluation script.

    Loads the train/test files (``load_data(path, 2)`` yields target words,
    source words, target feature dicts and source feature dicts), builds the
    character and feature indices, aligns source/target word pairs with
    ``common.mcmc_align``, clusters the data with ``common.cluster_data_by_pos``
    on the source features and trains one model per cluster - in a process pool
    or sequentially. Finally runs
    ``task2_evaluate_best_joint_structured_models_blstm_feed_fix.py``.

    Args:
        train_path, test_path: data files handed to
            ``prepare_sigmorphon_data.load_data``.
        results_file_path, sigmorphon_root_dir: forwarded to training and to
            the evaluation script.
        input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization:
            forwarded to training and to the evaluation script.
        regularization, learning_rate: only recorded in ``hyper_params`` here.
        plot: forwarded to training; when true, parallel training is disabled.
    """
    # local import so the block does not depend on the file's import header
    import subprocess

    if plot:
        parallelize_training = False
        print('plotting, parallelization is disabled!!!')
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train and test data
    (train_target_words, train_source_words, train_target_feat_dicts,
     train_source_feat_dicts) = prepare_sigmorphon_data.load_data(train_path, 2)
    (test_target_words, test_source_words, test_target_feat_dicts,
     test_source_feat_dicts) = prepare_sigmorphon_data.load_data(test_path, 2)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_target_words, train_source_words,
                                                                   train_target_feat_dicts,
                                                                   train_source_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    alphabet.extend(str(i) for i in xrange(MAX_PREDICTION_LEN))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_source_feat_dicts + train_target_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print('started aligning')
    train_word_pairs = zip(train_source_words, train_target_words)
    test_word_pairs = zip(test_source_words, test_target_words)
    align_symbol = '~'

    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    print('finished aligning')

    # joint model: cluster the data by POS type (features)
    # TODO: do we need to cluster on both source and target feats?
    #       probably enough to cluster on source here because pos will be same
    #       (no derivational morphology in this task)
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_source_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_source_feat_dicts)

    # create the input for each model, then parallelize or run in a loop
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_source_words,
                       train_source_feat_dicts, train_target_words, train_target_feat_dicts,
                       test_source_words, test_source_feat_dicts, train_cluster_to_data_indices,
                       test_target_words, test_target_feat_dicts, test_cluster_to_data_indices, alphabet,
                       alphabet_index, inverse_alphabet_index, epochs, optimization, results_file_path,
                       train_aligned_pairs, test_aligned_pairs, feat_index, feature_types, feat_input_dim,
                       feature_alphabet, plot])

    if parallelize_training:
        # maxtasksperchild=1 frees each worker process once its model is done
        pool = Pool(4, maxtasksperchild=1)
        print('now training {0} models in parallel'.format(len(train_cluster_to_data_indices)))
        try:
            pool.map(train_cluster_model_wrapper, params)
        finally:
            # the original never released the pool's worker processes
            pool.close()
            pool.join()
    else:
        print('now training {0} models in loop'.format(len(train_cluster_to_data_indices)))
        for model_params in params:
            train_cluster_model(*model_params)

    print('finished training all models')

    # evaluate best models; an argument list (no shell) keeps paths containing
    # spaces or shell metacharacters intact
    command = ['python', 'task2_evaluate_best_joint_structured_models_blstm_feed_fix.py',
               '--cnn-mem', '6096',
               '--input={0}'.format(input_dim),
               '--hidden={0}'.format(hidden_dim),
               '--feat-input={0}'.format(feat_input_dim),
               '--epochs={0}'.format(epochs),
               '--layers={0}'.format(layers),
               '--optimization={0}'.format(optimization),
               str(train_path), str(test_path), str(results_file_path), str(sigmorphon_root_dir)]
    subprocess.call(command)
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization):
    """Train one task-1 model per POS cluster, then launch the evaluation script.

    Loads the train/test files, builds the character and feature indices,
    clusters the data with ``common.cluster_data_by_pos`` and trains one model
    per cluster (in a process pool, since ``parallelize_training`` is set to
    True here). Finally runs ``task1_evaluate_best_joint_models.py``.

    Args:
        train_path, test_path: data files handed to
            ``prepare_sigmorphon_data.load_data``.
        results_file_path, sigmorphon_root_dir: forwarded to training and to
            the evaluation script.
        input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization:
            forwarded to training and to the evaluation script.
    """
    # local import so the block does not depend on the file's import header
    import subprocess

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': REGULARIZATION,
                    'LEARNING_RATE': LEARNING_RATE}

    parallelize_training = True
    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # cluster the data by POS type (features); clustering by inflection type
    # (common.cluster_data_by_morph_type) was used as a sanity check instead
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # generate params for each model
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas,
                       train_feat_dicts, train_words, test_lemmas, test_feat_dicts,
                       train_cluster_to_data_indices, test_words, test_cluster_to_data_indices, alphabet,
                       alphabet_index, inverse_alphabet_index, epochs, optimization, results_file_path,
                       feat_index, feature_types, feat_input_dim, feature_alphabet])

    # train models in parallel or in loop
    if parallelize_training:
        pool = Pool(4, maxtasksperchild=1)
        print('now training {0} models in parallel'.format(len(train_cluster_to_data_indices)))
        try:
            pool.map(train_cluster_model_wrapper, params)
        finally:
            # the original never released the pool's worker processes
            pool.close()
            pool.join()
    else:
        print('now training {0} models in loop'.format(len(train_cluster_to_data_indices)))
        for model_params in params:
            train_cluster_model(*model_params)

    print('finished training all models')

    # evaluate best models; an argument list (no shell) keeps paths containing
    # spaces or shell metacharacters intact
    command = ['python', 'task1_evaluate_best_joint_models.py',
               '--cnn-mem', '4096',
               '--input={0}'.format(input_dim),
               '--hidden={0}'.format(hidden_dim),
               '--input-feat', str(feat_input_dim),
               '--epochs={0}'.format(epochs),
               '--layers={0}'.format(layers),
               '--optimization={0}'.format(optimization),
               str(train_path), str(test_path), str(results_file_path), str(sigmorphon_root_dir)]
    subprocess.call(command)
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim, nbest):
    """Evaluate the best saved task-1 model of every POS cluster and write the results.

    For each cluster found in the training data the matching best model is
    loaded with ``load_best_model`` and applied to the test examples of that
    cluster - greedily when ``nbest == 1``, otherwise producing an n-best list.
    Predictions are stored by original test index and written with
    ``common.write_results_file``.

    Args:
        train_path, test_path: data files handed to
            ``prepare_sigmorphon_data.load_data``; the results suffix is
            ``'.best.test'`` when ``'test'`` occurs in ``test_path`` and
            ``'.best'`` otherwise.
        results_file_path: prefix of the saved models and of the results file.
        sigmorphon_root_dir: forwarded to ``common.write_results_file``.
        input_dim, hidden_dim, layers, feat_input_dim: forwarded to
            ``load_best_model``.
        epochs, optimization: only recorded in ``hyper_params`` here.
        nbest: 1 for greedy prediction, anything else for n-best prediction.
    """
    # short alias for the module that provides prediction and evaluation
    model_module = task1_joint_structured_inflection_blstm_feedback_fix

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization, 'NBEST': nbest}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    alphabet.extend(str(i) for i in xrange(MAX_PREDICTION_LEN))

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by POS type (features)
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # Decided once up front so both values exist even if every cluster is
    # skipped (the original raised NameError in that case).
    is_nbest = nbest != 1

    # (number of test examples, accuracy) for every evaluated cluster
    cluster_scores = []
    final_results = {}

    # loop-invariant: only used to label the per-cluster accuracy line
    lang = train_path.split('/')[-1].replace('-task{0}-train'.format('1'), '')

    # one model per cluster
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the cluster-specific data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_words) < 1:
            print('only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # a cluster seen in training may be absent from the test data
        if cluster_type not in test_cluster_to_data_indices:
            print('no test samples for cluster ' + cluster_type + '. skipping')
            continue
        test_indices = test_cluster_to_data_indices[cluster_type]

        test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
        test_cluster_words = [test_words[i] for i in test_indices]
        test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

        # load best model
        best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
            str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers,
            feature_alphabet, feat_input_dim, feature_types)

        if not is_nbest:
            # greedy prediction
            predicted_templates = model_module.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                inverse_alphabet_index, test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                feature_types)

            accuracy = model_module.evaluate_model(predicted_templates, test_cluster_lemmas,
                                                   test_cluster_feat_dicts, test_cluster_words,
                                                   feature_types, print_results=False)
            cluster_scores.append((len(test_cluster_words), accuracy[1]))
            print('{0} {1} accuracy: {2}'.format(lang, cluster_type, accuracy[1]))

            # store predictions under their original position in the test file
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = model_module.instantiate_template(predicted_templates[joint_index],
                                                               test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)
        else:
            # n-best prediction
            predicted_nbest_templates = model_module.predict_nbest_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                inverse_alphabet_index, test_cluster_lemmas, test_cluster_feat_dicts, feat_index,
                feature_types, nbest, test_cluster_words)

            # store predictions under their original position in the test file
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                nbest_inflections = [model_module.instantiate_template(template, test_lemmas[i])
                                     for (template, _) in predicted_nbest_templates[joint_index]]
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], nbest_inflections)

    # The original reported only the last cluster's accuracy; weight every
    # cluster by its number of test examples instead. -1 is kept as the
    # "not available" value (n-best mode, or nothing evaluated).
    evaluated_examples = sum(size for size, _ in cluster_scores)
    if is_nbest or evaluated_examples == 0:
        micro_average_accuracy = -1
    elif len(cluster_scores) == 1:
        # single cluster: keep its accuracy untouched (no float round-off)
        micro_average_accuracy = cluster_scores[0][1]
    else:
        micro_average_accuracy = sum(size * score for size, score in cluster_scores) / float(evaluated_examples)

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    common.write_results_file(hyper_params, micro_average_accuracy, train_path, test_path,
                              results_file_path + suffix, sigmorphon_root_dir, final_results, is_nbest)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim, nbest):
    """Evaluate the best saved task-2 model(s) and write the results file.

    The data is placed in a single pseudo-cluster
    (``common.get_single_pseudo_cluster``). For each cluster the best model is
    loaded with ``load_best_model`` and applied to that cluster's test
    examples - greedily when ``nbest == 1``, otherwise producing an n-best
    list. Predictions are stored by original test index and written with
    ``task2_joint_structured_inflection.write_results_file``.

    Args:
        train_path, test_path: data files handed to
            ``prepare_sigmorphon_data.load_data(path, 2)``; the results suffix
            is ``'.best.test'`` when ``'test'`` occurs in ``test_path`` and
            ``'.best'`` otherwise.
        results_file_path: prefix of the saved models and of the results file.
        sigmorphon_root_dir: forwarded to ``write_results_file``.
        input_dim, hidden_dim, layers, feat_input_dim: forwarded to
            ``load_best_model``.
        epochs, optimization: only recorded in ``hyper_params`` here.
        nbest: 1 for greedy prediction, anything else for n-best prediction.
    """
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'NBEST': nbest
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_target_words, train_source_words, train_target_feat_dicts,
     train_source_feat_dicts) = prepare_sigmorphon_data.load_data(train_path, 2)
    (test_target_words, test_source_words, test_target_feat_dicts,
     test_source_feat_dicts) = prepare_sigmorphon_data.load_data(test_path, 2)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_target_words, train_source_words, train_target_feat_dicts, train_source_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_source_feat_dicts + train_target_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    alphabet.extend(str(i) for i in xrange(MAX_PREDICTION_LEN))

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # no clustering, single model (clustering by POS or by inflection type
    # through common.cluster_data_by_pos / cluster_data_by_morph_type are the
    # alternatives)
    train_cluster_to_data_indices = common.get_single_pseudo_cluster(train_source_feat_dicts)
    test_cluster_to_data_indices = common.get_single_pseudo_cluster(test_source_feat_dicts)

    # Decided once up front so both values exist even if every cluster is
    # skipped (the original raised NameError in that case).
    is_nbest = nbest != 1

    # (number of test examples, accuracy) for every evaluated cluster
    cluster_scores = []
    final_results = {}

    # Language label for the accuracy line. The original stripped the literal
    # '-task1-train', which presumably never matches this task's file names
    # (data is loaded with load_data(path, 2)); cut at '-task' so either task
    # number is handled.
    lang = train_path.split('/')[-1].split('-task')[0]

    # one model per cluster
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the cluster-specific data
        train_cluster_target_words = [train_target_words[i]
                                      for i in train_cluster_to_data_indices[cluster_type]]
        if len(train_cluster_target_words) < 1:
            print('only ' + str(len(train_cluster_target_words)) +
                  ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_target_words)) + ' examples')

        # a cluster seen in training may be absent from the test data
        if cluster_type not in test_cluster_to_data_indices:
            print('no test samples for cluster ' + cluster_type + '. skipping')
            continue
        test_indices = test_cluster_to_data_indices[cluster_type]

        test_cluster_source_words = [test_source_words[i] for i in test_indices]
        test_cluster_target_words = [test_target_words[i] for i in test_indices]
        test_cluster_source_feat_dicts = [test_source_feat_dicts[i] for i in test_indices]
        test_cluster_target_feat_dicts = [test_target_feat_dicts[i] for i in test_indices]

        # load best model
        best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
            str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers,
            feature_alphabet, feat_input_dim, feature_types)

        if not is_nbest:
            # handle greedy prediction
            predicted_templates = task2_ms2s.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                inverse_alphabet_index, test_cluster_source_words, test_cluster_source_feat_dicts,
                test_cluster_target_feat_dicts, feat_index, feature_types)

            accuracy = task2_ms2s.evaluate_model(
                predicted_templates, test_cluster_source_words, test_cluster_source_feat_dicts,
                test_cluster_target_words, test_cluster_target_feat_dicts, feature_types,
                print_results=False)
            cluster_scores.append((len(test_cluster_target_words), accuracy[1]))
            print('{0} {1} accuracy: {2}'.format(lang, cluster_type, accuracy[1]))

            # store predictions under their original position in the test file
            for i in test_indices:
                joint_index = (test_source_words[i] + ':' +
                               common.get_morph_string(test_source_feat_dicts[i], feature_types) + ':' +
                               common.get_morph_string(test_target_feat_dicts[i], feature_types))
                inflection = task2_ms2s.instantiate_template(predicted_templates[joint_index],
                                                             test_source_words[i])
                final_results[i] = (test_source_words[i], test_source_feat_dicts[i], inflection,
                                    test_target_feat_dicts[i])
        else:
            # handle n-best prediction
            predicted_nbest_templates = task2_ms2s.predict_nbest_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index,
                inverse_alphabet_index, test_cluster_source_words, test_cluster_source_feat_dicts,
                test_cluster_target_feat_dicts, feat_index, feature_types, nbest,
                test_cluster_target_words)

            # store predictions under their original position in the test file
            for i in test_indices:
                joint_index = (test_source_words[i] + ':' +
                               common.get_morph_string(test_source_feat_dicts[i], feature_types) + ':' +
                               common.get_morph_string(test_target_feat_dicts[i], feature_types))
                nbest_inflections = [task2_ms2s.instantiate_template(template, test_source_words[i])
                                     for (template, _) in predicted_nbest_templates[joint_index]]
                final_results[i] = (test_source_words[i], test_source_feat_dicts[i], nbest_inflections,
                                    test_target_feat_dicts[i])

    # The original reported only the last cluster's accuracy; weight every
    # cluster by its number of test examples instead. -1 is kept as the
    # "not available" value (n-best mode, or nothing evaluated).
    evaluated_examples = sum(size for size, _ in cluster_scores)
    if is_nbest or evaluated_examples == 0:
        micro_average_accuracy = -1
    elif len(cluster_scores) == 1:
        # single cluster: keep its accuracy untouched (no float round-off)
        micro_average_accuracy = cluster_scores[0][1]
    else:
        micro_average_accuracy = sum(size * score for size, score in cluster_scores) / float(evaluated_examples)

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    task2_joint_structured_inflection.write_results_file(
        hyper_params, micro_average_accuracy, train_path, test_path, results_file_path + suffix,
        sigmorphon_root_dir, final_results, is_nbest)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization):
    """Train one task-1 model per POS cluster, then launch the evaluation script.

    Loads the train/test files, builds the character and feature indices,
    clusters the data with ``common.cluster_data_by_pos`` and trains one model
    per cluster (in a process pool, since ``parallelize_training`` is set to
    True here). Finally runs ``task1_evaluate_best_joint_models.py``.

    Args:
        train_path, test_path: data files handed to
            ``prepare_sigmorphon_data.load_data``.
        results_file_path, sigmorphon_root_dir: forwarded to training and to
            the evaluation script.
        input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization:
            forwarded to training and to the evaluation script.
    """
    # local import so the block does not depend on the file's import header
    import subprocess

    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': REGULARIZATION,
        'LEARNING_RATE': LEARNING_RATE
    }

    parallelize_training = True
    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # cluster the data by POS type (features); clustering by inflection type
    # (common.cluster_data_by_morph_type) was used as a sanity check instead
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # generate params for each model
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([
            input_dim, hidden_dim, layers, cluster_index, cluster_type,
            train_lemmas, train_feat_dicts, train_words, test_lemmas,
            test_feat_dicts, train_cluster_to_data_indices, test_words,
            test_cluster_to_data_indices, alphabet, alphabet_index,
            inverse_alphabet_index, epochs, optimization, results_file_path,
            feat_index, feature_types, feat_input_dim, feature_alphabet
        ])

    # train models in parallel or in loop
    if parallelize_training:
        pool = Pool(4, maxtasksperchild=1)
        print('now training {0} models in parallel'.format(len(train_cluster_to_data_indices)))
        try:
            pool.map(train_cluster_model_wrapper, params)
        finally:
            # the original never released the pool's worker processes
            pool.close()
            pool.join()
    else:
        print('now training {0} models in loop'.format(len(train_cluster_to_data_indices)))
        for model_params in params:
            train_cluster_model(*model_params)

    print('finished training all models')

    # evaluate best models; an argument list (no shell) keeps paths containing
    # spaces or shell metacharacters intact
    command = [
        'python', 'task1_evaluate_best_joint_models.py',
        '--cnn-mem', '4096',
        '--input={0}'.format(input_dim),
        '--hidden={0}'.format(hidden_dim),
        '--input-feat', str(feat_input_dim),
        '--epochs={0}'.format(epochs),
        '--layers={0}'.format(layers),
        '--optimization={0}'.format(optimization),
        str(train_path), str(test_path), str(results_file_path), str(sigmorphon_root_dir)
    ]
    subprocess.call(command)
    return
def main():
    """Report how many dev and test examples the training templates can reproduce.

    Aligns the training lemma/word pairs, turns every alignment into a template
    and then, for each dev and test pair, checks whether at least one template
    instantiated on the lemma yields the expected inflection. The counts and
    percentages are printed.
    """
    # Alternative data sets used previously:
    #   '../data/heb/hebrew-task1-{train,dev,test}'
    #   '/Users/roeeaharoni/GitHub/sigmorphon2016/data/german-task1-{train,dev}'
    #       with '../biu/gold/german-task1-test'
    train_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/finnish-task1-train'
    dev_path = '/Users/roeeaharoni/GitHub/sigmorphon2016/data/finnish-task1-dev'
    test_path = '../biu/gold/finnish-task1-test'

    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(
        train_words, train_lemmas, train_feat_dicts)

    print('started aligning')
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)
    align_symbol = '~'

    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)

    # one template per aligned training pair, keyed by its position
    index2template = {}
    for position, aligned in enumerate(train_aligned_pairs):
        index2template[position] = task1_single_ms2s.generate_template_from_alignment(aligned)

    def count_handled(word_pairs):
        # number of pairs for which some template reproduces the inflection;
        # any() stops at the first matching template
        handled = 0
        for lemma, inflection in word_pairs:
            if any(task1_single_ms2s.instantiate_template(candidate, lemma) == inflection
                   for candidate in index2template.values()):
                handled += 1
        return handled

    print('now trying all templates on dev')
    dev_handled = count_handled(dev_word_pairs)
    print("train templates handled {} examples in dev out of {}, {}%".format(
        dev_handled, len(dev_lemmas), float(dev_handled) / len(dev_lemmas) * 100))

    print('now trying all templates on test')
    test_handled = count_handled(test_word_pairs)
    print("train templates handled {} examples in test out of {}, {}%".format(
        test_handled, len(test_lemmas), float(test_handled) / len(test_lemmas) * 100))
def _train_morph_model_from_list(model_params):
    """Unpack one parameter list into ``train_morph_model`` (``Pool.map`` passes a single argument)."""
    return train_morph_model(*model_params)


def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization):
    """Train one model per inflection type, then launch the evaluation script.

    Loads the train/test files, builds the character index, clusters the data
    with ``common.cluster_data_by_morph_type``, aligns lemma/word pairs with
    ``mcmc_align`` and trains one model per inflection type. In the sequential
    mode (the one selected here) models for which ``check_if_exists`` reports
    an existing result are skipped. Finally runs
    ``task1_evaluate_best_factored_structured_models.py``.

    Args:
        train_path, test_path: data files handed to
            ``prepare_sigmorphon_data.load_data``.
        results_file_path, sigmorphon_root_dir: forwarded to training and to
            the evaluation script.
        input_dim, hidden_dim, epochs, layers, optimization: forwarded to
            training and to the evaluation script.
    """
    # local import so the block does not depend on the file's import header
    import subprocess

    parallelize_training = False
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'CHAR_DROPOUT_PROB': CHAR_DROPOUT_PROB, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': REGULARIZATION,
                    'LEARNING_RATE': LEARNING_RATE}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    alphabet.extend(str(i) for i in xrange(MAX_PREDICTION_LEN))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by inflection type (features)
    train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feats)
    test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feats)

    # align the words to the inflections, the alignment will later be used by the model
    print('started aligning')
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    align_symbol = '~'

    train_aligned_pairs = mcmc_align(train_word_pairs, align_symbol)

    # TODO: align together?
    test_aligned_pairs = mcmc_align(test_word_pairs, align_symbol)
    print('finished aligning')

    # factored model: new model per inflection type. create the input for each
    # model and then parallelize or run in a loop.
    params = []
    for morph_index, morph_type in enumerate(train_morph_to_data_indices):
        params.append([input_dim, hidden_dim, layers, morph_index, morph_type, train_lemmas, train_words,
                       test_lemmas, train_morph_to_data_indices, test_words, test_morph_to_data_indices,
                       alphabet, alphabet_index, inverse_alphabet_index, epochs, optimization,
                       results_file_path, train_aligned_pairs, test_aligned_pairs])

    if parallelize_training:
        pool = Pool(4, maxtasksperchild=1)
        try:
            # Pool.map hands each parameter list over as ONE argument, so the
            # unpacking helper is mapped instead of train_morph_model itself
            # (the sequential branch below calls it with *model_params).
            pool.map(_train_morph_model_from_list, params)
        finally:
            # the original never released the pool's worker processes
            pool.close()
            pool.join()
        print('finished training all models')
    else:
        for model_params in params:
            # index 3 of every parameter list is the morph (model) index
            model_index = model_params[3]
            if not check_if_exists(results_file_path, model_index):
                train_morph_model(*model_params)
            else:
                print('model ' + str(model_index) + ' exists, skipping...')

    # evaluate best models; an argument list (no shell) keeps paths containing
    # spaces or shell metacharacters intact
    command = ['python', 'task1_evaluate_best_factored_structured_models.py',
               '--cnn-mem', '8192',
               '--input={0}'.format(input_dim),
               '--hidden={0}'.format(hidden_dim),
               '--epochs={0}'.format(epochs),
               '--layers={0}'.format(layers),
               '--optimization={0}'.format(optimization),
               str(train_path), str(test_path), str(results_file_path), str(sigmorphon_root_dir)]
    subprocess.call(command)
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim, ensemble):
    """Prepare the alphabets and POS clusters, then delegate to ``task1_ndst_twin_2.evaluate_ndst``.

    Args:
        train_path: training file, read with ``prepare_sigmorphon_data.load_data``.
        test_path: test file, read with ``prepare_sigmorphon_data.load_data``.
        results_file_path: forwarded to ``evaluate_ndst``.
        sigmorphon_root_dir: forwarded to ``evaluate_ndst``.
        input_dim, hidden_dim, layers, feat_input_dim: model sizes, forwarded to ``evaluate_ndst``.
        epochs, optimization: only recorded in the printed hyper-parameter dict here.
        ensemble: forwarded unchanged to ``evaluate_ndst``.
    """
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name, value in hyper_params.items():
        print(name + '=' + str(value))

    # load train and test data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # NULL and UNK are used for character dropout; EPSILON, BEGIN_WORD and END_WORD during decoding
    for symbol in (NULL, UNK, EPSILON, BEGIN_WORD, END_WORD):
        alphabet.append(symbol)

    # position markers - used to indicate when copying from lemma to word
    for position in xrange(3 * MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char <-> int lookup tables
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # feat -> int lookup table
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict((feat, index) for index, feat in enumerate(feature_alphabet))

    # cluster the data by POS type (features); common.cluster_data_by_morph_type(feat_dicts, feature_types)
    # is the per-inflection alternative
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    task1_ndst_twin_2.evaluate_ndst(
        alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
        hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
        sigmorphon_root_dir, test_cluster_to_data_indices, test_feat_dicts, test_lemmas, test_path,
        test_words, train_cluster_to_data_indices, train_path, train_words)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization, feat_input_dim):
    """Evaluate the best saved model of every POS cluster and write the results file.

    For each training cluster the best model is loaded with ``load_best_model``,
    templates are predicted for the matching test examples and scored, and the
    instantiated inflections are collected by their original test index. Macro
    and micro averaged accuracies are printed and the results are written with
    ``common.write_results_file``.

    Args:
        train_path: training file, read with ``prepare_sigmorphon_data.load_data``.
        test_path: test file; if the path contains ``'test'`` the results suffix is ``.best.test``,
            otherwise ``.best``.
        results_file_path: prefix used to load the models and to name the results file.
        sigmorphon_root_dir: forwarded to ``common.write_results_file``.
        input_dim, hidden_dim, layers, feat_input_dim: model sizes, forwarded to ``load_best_model``.
        epochs, optimization: only recorded in the hyper-parameter dict here.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # feat 2 int
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by POS type (features); common.cluster_data_by_morph_type(feat_dicts, feature_types)
    # is the per-inflection alternative
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    accuracies = []
    final_results = {}

    # factored model: new model per inflection type
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):

        # get the inflection-specific data
        train_cluster_words = [train_words[i] for i in train_cluster_to_data_indices[cluster_type]]
        if not train_cluster_words:
            print('only ' + str(len(train_cluster_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for cluster ' + str(cluster_index + 1) + '/' +
              str(len(train_cluster_to_data_indices)) + ': ' + cluster_type + ' with ' +
              str(len(train_cluster_words)) + ' examples')

        # test best model; a KeyError means the cluster has no examples in the test data
        try:
            test_indices = test_cluster_to_data_indices[cluster_type]
            test_cluster_lemmas = [test_lemmas[i] for i in test_indices]
            test_cluster_words = [test_words[i] for i in test_indices]
            test_cluster_feat_dicts = [test_feat_dicts[i] for i in test_indices]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(cluster_index), alphabet, results_file_path, input_dim, hidden_dim, layers,
                feature_alphabet, feat_input_dim, feature_types)

            predicted_templates = task1_joint_structured_inflection.predict_templates(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index, inverse_alphabet_index,
                test_cluster_lemmas, test_cluster_feat_dicts, feat_index, feature_types)

            accuracy = task1_joint_structured_inflection.evaluate_model(
                predicted_templates, test_cluster_lemmas, test_cluster_feat_dicts, test_cluster_words,
                feature_types, True)
            accuracies.append(accuracy)

            # get predicted_templates in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the
            # task format
            for i in test_indices:
                joint_index = test_lemmas[i] + ':' + common.get_morph_string(test_feat_dicts[i], feature_types)
                inflection = task1_joint_structured_inflection.instantiate_template(
                    predicted_templates[joint_index], test_lemmas[i])
                final_results[i] = (test_lemmas[i], test_feat_dicts[i], inflection)

        except KeyError:
            print('could not find relevant examples in test data for cluster: ' + cluster_type)

    # each entry of accuracies is indexed as [0] = weight used for the micro average, [1] = accuracy value
    if accuracies:
        accuracy_vals = [entry[1] for entry in accuracies]
        macro_avg_accuracy = sum(accuracy_vals) / len(accuracies)

        mic_nom = sum([entry[0] * entry[1] for entry in accuracies])
        mic_denom = sum([entry[0] for entry in accuracies])
        # guard the division: a zero total weight used to raise ZeroDivisionError
        micro_average_accuracy = mic_nom / mic_denom if mic_denom else 0.0
    else:
        # no cluster could be evaluated; previously this crashed with ZeroDivisionError before
        # anything was written
        print('no clusters were evaluated, reporting zero accuracy')
        macro_avg_accuracy = 0.0
        micro_average_accuracy = 0.0

    print('macro avg accuracy: ' + str(macro_avg_accuracy))
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    if 'test' in test_path:
        suffix = '.best.test'
    else:
        suffix = '.best'

    common.write_results_file(hyper_params, micro_average_accuracy, train_path, test_path,
                              results_file_path + suffix, sigmorphon_root_dir, final_results)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, epochs, layers,
         optimization):
    """Evaluate the best saved model of every inflection type and write the results file.

    For each morphological cluster found in the training data the best model is
    loaded with ``load_best_model``, predictions are produced for the matching
    test examples and scored, and the predictions are collected by their
    original test index. Macro and micro averaged accuracies are printed and
    everything is written with ``task1_factored_inflection.write_results_file``.

    Args:
        train_path: training file, read with ``prepare_sigmorphon_data.load_data``.
        test_path: test file, read with ``prepare_sigmorphon_data.load_data``.
        results_file_path: prefix used to load the models; results go to this path plus ``.best``.
        sigmorphon_root_dir: forwarded to ``write_results_file``.
        input_dim, hidden_dim, layers: model sizes, forwarded to ``load_best_model``.
        epochs, optimization: only recorded in the hyper-parameter dict here.
    """
    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'EPOCHS': epochs, 'LAYERS': layers,
                    'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN, 'OPTIMIZATION': optimization}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feats = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # cluster the data by inflection type (features)
    train_morph_to_data_indices = common.cluster_data_by_morph_type(train_feat_dicts, feats)
    test_morph_to_data_indices = common.cluster_data_by_morph_type(test_feat_dicts, feats)

    accuracies = []
    final_results = {}

    # factored model: new model per inflection type
    for morph_index, morph_type in enumerate(train_morph_to_data_indices):

        # get the inflection-specific data
        train_morph_words = [train_words[i] for i in train_morph_to_data_indices[morph_type]]
        if not train_morph_words:
            print('only ' + str(len(train_morph_words)) + ' samples for this inflection type. skipping')
            continue

        print('now evaluating model for morph ' + str(morph_index) + '/' +
              str(len(train_morph_to_data_indices)) + ': ' + morph_type + ' with ' +
              str(len(train_morph_words)) + ' examples')

        # test best model; a KeyError means the inflection type has no examples in the test data
        try:
            test_indices = test_morph_to_data_indices[morph_type]
            test_morph_lemmas = [test_lemmas[i] for i in test_indices]
            test_morph_words = [test_words[i] for i in test_indices]

            # load best model
            best_model, encoder_frnn, encoder_rrnn, decoder_rnn = load_best_model(
                str(morph_index), alphabet, results_file_path, input_dim, hidden_dim, layers)

            predictions = task1_factored_inflection.predict(
                best_model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index, inverse_alphabet_index,
                test_morph_lemmas, test_morph_words)

            test_data = zip(test_morph_lemmas, test_morph_words)
            accuracy = task1_factored_inflection.evaluate_model(predictions, test_data)
            accuracies.append(accuracy)

            # get predictions in the same order they appeared in the original file
            # iterate through them and foreach concat morph, lemma, features in order to print later in the
            # task format
            for i in test_indices:
                final_results[i] = (test_lemmas[i], predictions[test_lemmas[i]], morph_type)

        except KeyError:
            print('could not find relevant examples in test data for morph: ' + morph_type)

    # each entry of accuracies is indexed as [0] = weight used for the micro average, [1] = accuracy value
    if accuracies:
        accuracy_vals = [entry[1] for entry in accuracies]
        macro_avg_accuracy = sum(accuracy_vals) / len(accuracies)

        mic_nom = sum([entry[0] * entry[1] for entry in accuracies])
        mic_denom = sum([entry[0] for entry in accuracies])
        # guard the division: a zero total weight used to raise ZeroDivisionError
        micro_average_accuracy = mic_nom / mic_denom if mic_denom else 0.0
    else:
        # no inflection type could be evaluated; previously this crashed with ZeroDivisionError
        # before anything was written
        print('no morph types were evaluated, reporting zero accuracy')
        macro_avg_accuracy = 0.0
        micro_average_accuracy = 0.0

    print('macro avg accuracy: ' + str(macro_avg_accuracy))
    print('micro avg accuracy: ' + str(micro_average_accuracy))

    task1_factored_inflection.write_results_file(hyper_params, macro_avg_accuracy, micro_average_accuracy,
                                                 train_path, test_path, results_file_path + '.best',
                                                 sigmorphon_root_dir, final_results)
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim,
         feat_input_dim, epochs, layers, optimization, regularization, learning_rate, plot, override, eval_only,
         ensemble):
    """Load or build a single model, optionally train it, then predict and evaluate on the test set.

    An existing ``<results_file_path>_bestmodel.txt`` is loaded unless
    ``override`` is set; otherwise a fresh model is built. Training runs unless
    ``eval_only`` is set. Predictions come either from the given ensemble or
    from the single model, and non-empty predictions are scored and handed to
    ``common.write_results_file_and_evaluate_externally``.

    Args:
        train_path, dev_path, test_path: data files, read with ``prepare_sigmorphon_data.load_data``.
        results_file_path: prefix of the model file and of the ``.external_eval.txt`` output.
        sigmorphon_root_dir: forwarded to the external evaluation helper.
        input_dim, hidden_dim, feat_input_dim, layers: model sizes.
        epochs, optimization: forwarded to ``train_model``.
        regularization, learning_rate: only recorded in the hyper-parameter dict here.
        plot: forwarded to ``train_model``.
        override: when true, ignore an existing model file and build a new model.
        eval_only: when true, skip training.
        ensemble: when truthy, predict with ``predict_with_ensemble_majority`` instead of the single model.
    """
    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': regularization,
        'LEARNING_RATE': learning_rate,
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name, value in hyper_params.items():
        print(name + '=' + str(value))

    # load train, test and dev data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    dev_words, dev_lemmas, dev_feat_dicts = prepare_sigmorphon_data.load_data(dev_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # NULL and UNK are used for character dropout; EPSILON, BEGIN_WORD and END_WORD during decoding
    for symbol in (NULL, UNK, EPSILON, BEGIN_WORD, END_WORD):
        alphabet.append(symbol)

    # position markers - used to indicate when copying from lemma to word
    for position in xrange(MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # char <-> int lookup tables
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # feat -> int lookup table
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict((feat, index) for index, feat in enumerate(feature_alphabet))

    model_file_name = results_file_path + '_bestmodel.txt'
    reuse_saved_model = os.path.isfile(model_file_name) and not override
    if reuse_saved_model:
        print('loading existing model from {}'.format(model_file_name))
        model, encoder_frnn, encoder_rrnn, decoder_rnn = task1_attention_implementation.load_best_model(
            alphabet, results_file_path, input_dim, hidden_dim, layers, feature_alphabet, feat_input_dim,
            feature_types)
        print('loaded existing model successfully')
    else:
        print('could not find existing model or explicit override was requested. starting training from scratch...')
        model, encoder_frnn, encoder_rrnn, decoder_rnn = build_model(
            alphabet, input_dim, hidden_dim, layers, feature_types, feat_input_dim, feature_alphabet)

    if eval_only:
        print('skipped training, evaluating on test set...')
    else:
        # start training
        model, last_epoch, best_epoch = train_model(
            model, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas, train_feat_dicts, train_words,
            dev_lemmas, dev_feat_dicts, dev_words, alphabet_index, inverse_alphabet_index, epochs,
            optimization, results_file_path, feat_index, feature_types, plot)
        print('last epoch is {}'.format(last_epoch))
        print('best epoch is {}'.format(best_epoch))
        print('finished training')

    if ensemble:
        predicted_sequences = predict_with_ensemble_majority(
            alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
            hidden_dim, input_dim, inverse_alphabet_index, layers, test_feat_dicts, test_lemmas, test_words)
    else:
        predicted_sequences = predict_sequences(
            model, decoder_rnn, encoder_frnn, encoder_rrnn, alphabet_index, inverse_alphabet_index,
            test_lemmas, test_feat_dicts, feat_index, feature_types)

    if len(predicted_sequences) > 0:
        # evaluate last model on test
        amount, accuracy = evaluate_model(predicted_sequences, test_lemmas, test_feat_dicts, test_words,
                                          feature_types, print_results=False)
        print('initial eval: {}% accuracy'.format(accuracy))

        # predictions are keyed by '<lemma>:<morph string>'; re-key them by test position
        final_results = {}
        for i, lemma in enumerate(test_lemmas):
            feat_dict = test_feat_dicts[i]
            joint_index = lemma + ':' + common.get_morph_string(feat_dict, feature_types)
            inflection = predicted_sequences[joint_index]
            final_results[i] = (lemma, feat_dict, ''.join(inflection))

        # evaluate best models
        common.write_results_file_and_evaluate_externally(
            hyper_params, accuracy, train_path, test_path, results_file_path + '.external_eval.txt',
            sigmorphon_root_dir, final_results)
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization, regularization, learning_rate, plot):
    """Train one model per POS cluster and then run the external NFST evaluation script.

    The function loads the data, builds the character and feature alphabets,
    aligns every (lemma, word) pair, clusters the examples by POS and trains a
    model per training cluster, either in a process pool or sequentially. In
    sequential mode the epoch each model stopped on is written to
    ``<results_file_path>.epochs`` and printed again at the end. Finally
    ``task1_evaluate_best_nfst_models.py`` is run through ``os.system``.

    Args:
        train_path, test_path: data files, read with ``prepare_sigmorphon_data.load_data``.
        results_file_path: forwarded to the trainer and the evaluation script; prefix of the ``.epochs`` file.
        sigmorphon_root_dir: forwarded to the evaluation script.
        input_dim, hidden_dim, feat_input_dim, layers: model sizes.
        epochs, optimization: forwarded to the trainer and the evaluation script.
        regularization, learning_rate: only recorded in the hyper-parameter dict here.
        plot: forwarded to the trainer; when truthy, parallel training is disabled.
    """
    if plot:
        parallelize_training = False
        print('plotting, parallelization is disabled!!!')
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print('started aligning')
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)

    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, ALIGN_SYMBOL)
    print('finished aligning')

    # joint model: cluster the data by POS type (features); common.cluster_data_by_morph_type(feat_dicts,
    # feature_types) is the factored, per-inflection alternative
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
                       test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    # bound up front: the summary loop at the end reads it in both modes, and it used to be
    # defined only in the sequential branch (NameError after parallel training)
    last_epochs = []

    if parallelize_training:
        # set maxtasksperchild=1 to free finished processes
        pool = Pool(4, maxtasksperchild=1)
        print('now training {0} models in parallel'.format(len(train_cluster_to_data_indices)))
        try:
            pool.map(train_cluster_model_wrapper, params)
        finally:
            # release the worker processes once the map is done (or has failed)
            pool.close()
            pool.join()
    else:
        print('now training {0} models in loop'.format(len(train_cluster_to_data_indices)))
        for model_args in params:
            cluster_index = model_args[3]
            cluster_name = model_args[4]
            _, last_epoch = train_cluster_model(*model_args)

            # print when did each model stop
            epoch_output = 'cluster {0} - {1} stopped on epoch {2}'.format(cluster_index, cluster_name, last_epoch)
            last_epochs.append(epoch_output)
            print(epoch_output)

        with open(results_file_path + '.epochs', 'w') as f:
            # writelines adds no separators, so terminate every entry explicitly
            f.writelines(line + '\n' for line in last_epochs)

    print('finished training all models')

    # evaluate best models; the backslash continues the string literal, so the leading
    # whitespace of the next line ends up inside the shell command
    os.system('python task1_evaluate_best_nfst_models.py --cnn-mem 6096 --input={0} --hidden={1} \
    --feat-input={2} --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(
        input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization, train_path, test_path,
        results_file_path, sigmorphon_root_dir))

    for e in last_epochs:
        print('last epoch is {}'.format(e))

    return
def main(train_path, dev_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim,
         feat_input_dim, epochs, layers, optimization, regularization, learning_rate, plot, eval_only, ensemble):
    """Train one model per POS cluster (unless ``eval_only``) and evaluate on dev and test.

    The function loads train/dev/test data, builds the character and feature
    alphabets, aligns the train and dev (lemma, word) pairs, clusters the
    examples by POS and trains a model per training cluster using the dev set
    as held-out data. It then calls ``evaluate_ndst`` once for dev and once
    for test.

    Args:
        train_path, dev_path, test_path: data files, read with ``prepare_sigmorphon_data.load_data``.
        results_file_path: forwarded to the trainer and to ``evaluate_ndst``; prefix of the ``.epochs`` file.
        sigmorphon_root_dir: forwarded to ``evaluate_ndst``.
        input_dim, hidden_dim, feat_input_dim, layers: model sizes.
        epochs, optimization: forwarded to the trainer.
        regularization, learning_rate: only recorded in the hyper-parameter dict here.
        plot: forwarded to the trainer; when truthy, parallel training is disabled.
        eval_only: when true, skip training and only evaluate.
        ensemble: forwarded unchanged to ``evaluate_ndst``.
    """
    if plot:
        parallelize_training = False
        print('plotting, parallelization is disabled!!!')
    else:
        parallelize_training = PARALLELIZE

    hyper_params = {'INPUT_DIM': input_dim, 'HIDDEN_DIM': hidden_dim, 'FEAT_INPUT_DIM': feat_input_dim,
                    'EPOCHS': epochs, 'LAYERS': layers, 'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
                    'OPTIMIZATION': optimization, 'PATIENCE': MAX_PATIENCE, 'REGULARIZATION': regularization,
                    'LEARNING_RATE': learning_rate}

    print('train path = ' + str(train_path))
    print('dev path =' + str(dev_path))
    print('test path =' + str(test_path))
    for param in hyper_params:
        print(param + '=' + str(hyper_params[param]))

    # load train, dev and test data
    (train_words, train_lemmas, train_feat_dicts) = prepare_sigmorphon_data.load_data(train_path)
    (dev_words, dev_lemmas, dev_feat_dicts) = prepare_sigmorphon_data.load_data(dev_path)
    (test_words, test_lemmas, test_feat_dicts) = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # used for character dropout
    alphabet.append(NULL)
    alphabet.append(UNK)

    # used during decoding
    alphabet.append(EPSILON)
    alphabet.append(BEGIN_WORD)
    alphabet.append(END_WORD)

    # add indices to alphabet - used to indicate when copying from lemma to word
    for marker in [str(i) for i in xrange(3 * MAX_PREDICTION_LEN)]:
        alphabet.append(marker)

    # indicates the FST to step forward in the input
    alphabet.append(STEP)

    # char 2 int
    alphabet_index = dict(zip(alphabet, range(0, len(alphabet))))
    inverse_alphabet_index = {index: char for char, index in alphabet_index.items()}

    # feat 2 int
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict(zip(feature_alphabet, range(0, len(feature_alphabet))))

    # align the words to the inflections, the alignment will later be used by the model
    print('started aligning')
    train_word_pairs = zip(train_lemmas, train_words)
    dev_word_pairs = zip(dev_lemmas, dev_words)

    train_aligned_pairs = common.mcmc_align(train_word_pairs, ALIGN_SYMBOL)

    # TODO: align together?
    dev_aligned_pairs = common.mcmc_align(dev_word_pairs, ALIGN_SYMBOL)
    print('finished aligning')

    # joint model: cluster the data by POS type (features); common.cluster_data_by_morph_type(feat_dicts,
    # feature_types) is the factored, per-inflection alternative
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    dev_cluster_to_data_indices = common.cluster_data_by_pos(dev_feat_dicts)

    # create input for each model and then parallelize or run in loop.
    params = []
    for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices):
        params.append([input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
                       train_words, dev_lemmas, dev_feat_dicts, train_cluster_to_data_indices, dev_words,
                       dev_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
                       optimization, results_file_path, train_aligned_pairs, dev_aligned_pairs, feat_index,
                       feature_types, feat_input_dim, feature_alphabet, plot])

    if not eval_only:
        if parallelize_training:
            # set maxtasksperchild=1 to free finished processes
            pool = Pool(4, maxtasksperchild=1)
            print('now training {0} models in parallel'.format(len(train_cluster_to_data_indices)))
            try:
                pool.map(train_cluster_model_wrapper, params)
            finally:
                # release the worker processes once the map is done (or has failed)
                pool.close()
                pool.join()
        else:
            print('now training {0} models in loop'.format(len(train_cluster_to_data_indices)))
            last_epochs = []
            for model_args in params:
                cluster_index = model_args[3]
                cluster_name = model_args[4]
                _, last_epoch = train_cluster_model(*model_args)

                # print when did each model stop
                epoch_output = 'cluster {0} - {1} stopped on epoch {2}'.format(cluster_index, cluster_name,
                                                                              last_epoch)
                last_epochs.append(epoch_output)
                print(epoch_output)

            with open(results_file_path + '.epochs', 'w') as f:
                # writelines adds no separators, so terminate every entry explicitly
                f.writelines(line + '\n' for line in last_epochs)

        print('finished training all models')
    else:
        print('skipped training by request. evaluating best models:')

    # eval on dev
    print('=========DEV EVALUATION:=========')
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, dev_cluster_to_data_indices, dev_feat_dicts, dev_lemmas, dev_path,
                  dev_words, train_cluster_to_data_indices, train_path, train_words)

    # eval on test
    print('=========TEST EVALUATION:=========')
    # cluster the TEST examples: the indices are used to index test_lemmas/test_words/test_feat_dicts,
    # so they must be derived from test_feat_dicts (this used to cluster dev_feat_dicts by mistake)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)
    evaluate_ndst(alphabet, alphabet_index, ensemble, feat_index, feat_input_dim, feature_alphabet, feature_types,
                  hidden_dim, hyper_params, input_dim, inverse_alphabet_index, layers, results_file_path,
                  sigmorphon_root_dir, test_cluster_to_data_indices, test_feat_dicts, test_lemmas, test_path,
                  test_words, train_cluster_to_data_indices, train_path, train_words)
    return
def main(train_path, test_path, results_file_path, sigmorphon_root_dir, input_dim, hidden_dim, feat_input_dim,
         epochs, layers, optimization):
    """Train one joint model per POS cluster and then run the external evaluation script.

    The function loads the data, builds the character and feature alphabets,
    aligns every (lemma, word) pair, clusters the examples by POS and trains a
    model per training cluster, in a process pool or sequentially depending on
    ``PARALLELIZE``. Finally ``task1_evaluate_best_joint_structured_models.py``
    is run through ``os.system``.

    Args:
        train_path, test_path: data files, read with ``prepare_sigmorphon_data.load_data``.
        results_file_path: forwarded to the trainer and the evaluation script.
        sigmorphon_root_dir: forwarded to the evaluation script.
        input_dim, hidden_dim, feat_input_dim, layers: model sizes.
        epochs, optimization: forwarded to the trainer and the evaluation script.
    """
    parallelize_training = PARALLELIZE

    hyper_params = {
        'INPUT_DIM': input_dim,
        'HIDDEN_DIM': hidden_dim,
        'FEAT_INPUT_DIM': feat_input_dim,
        'EPOCHS': epochs,
        'LAYERS': layers,
        'MAX_PREDICTION_LEN': MAX_PREDICTION_LEN,
        'OPTIMIZATION': optimization,
        'PATIENCE': MAX_PATIENCE,
        'REGULARIZATION': REGULARIZATION,
        'LEARNING_RATE': LEARNING_RATE,
    }

    print('train path = ' + str(train_path))
    print('test path =' + str(test_path))
    for name, value in hyper_params.items():
        print(name + '=' + str(value))

    # load train and test data
    train_words, train_lemmas, train_feat_dicts = prepare_sigmorphon_data.load_data(train_path)
    test_words, test_lemmas, test_feat_dicts = prepare_sigmorphon_data.load_data(test_path)
    alphabet, feature_types = prepare_sigmorphon_data.get_alphabet(train_words, train_lemmas, train_feat_dicts)

    # NULL and UNK are used for character dropout; EPSILON, BEGIN_WORD and END_WORD during decoding
    for symbol in (NULL, UNK, EPSILON, BEGIN_WORD, END_WORD):
        alphabet.append(symbol)

    # position markers - used to indicate when copying from lemma to word
    for position in xrange(MAX_PREDICTION_LEN):
        alphabet.append(str(position))

    # char <-> int lookup tables
    alphabet_index = dict((char, index) for index, char in enumerate(alphabet))
    inverse_alphabet_index = dict((index, char) for char, index in alphabet_index.items())

    # feat -> int lookup table
    feature_alphabet = common.get_feature_alphabet(train_feat_dicts)
    feature_alphabet.append(UNK_FEAT)
    feat_index = dict((feat, index) for index, feat in enumerate(feature_alphabet))

    # align the words to the inflections, the alignment will later be used by the model
    print('started aligning')
    align_symbol = '~'
    train_word_pairs = zip(train_lemmas, train_words)
    test_word_pairs = zip(test_lemmas, test_words)
    train_aligned_pairs = common.mcmc_align(train_word_pairs, align_symbol)
    # TODO: align together?
    test_aligned_pairs = common.mcmc_align(test_word_pairs, align_symbol)
    print('finished aligning')

    # joint model: cluster the data by POS type (features); common.cluster_data_by_morph_type(feat_dicts,
    # feature_types) is the factored, per-inflection alternative
    train_cluster_to_data_indices = common.cluster_data_by_pos(train_feat_dicts)
    test_cluster_to_data_indices = common.cluster_data_by_pos(test_feat_dicts)

    # TODO: change build_model (done), train_model (in progress), predict (done), one word loss (done) etc.
    # to take the features in account

    # one argument list per cluster, consumed by the pool or by the loop below
    params = [
        [input_dim, hidden_dim, layers, cluster_index, cluster_type, train_lemmas, train_feat_dicts,
         train_words, test_lemmas, test_feat_dicts, train_cluster_to_data_indices, test_words,
         test_cluster_to_data_indices, alphabet, alphabet_index, inverse_alphabet_index, epochs,
         optimization, results_file_path, train_aligned_pairs, test_aligned_pairs, feat_index,
         feature_types, feat_input_dim, feature_alphabet]
        for cluster_index, cluster_type in enumerate(train_cluster_to_data_indices)
    ]

    cluster_count = len(train_cluster_to_data_indices)
    if not parallelize_training:
        print('now training {0} models in loop'.format(cluster_count))
        for model_args in params:
            train_cluster_model(*model_args)
    else:
        pool = Pool(4, maxtasksperchild=1)
        print('now training {0} models in parallel'.format(cluster_count))
        pool.map(train_cluster_model_wrapper, params)

    print('finished training all models')

    # evaluate best models; the backslash continues the string literal, so the leading
    # whitespace of the next line ends up inside the shell command
    evaluation_command = 'python task1_evaluate_best_joint_structured_models.py --cnn-mem 6096 --input={0} --hidden={1} --feat-input={2} \
    --epochs={3} --layers={4} --optimization={5} {6} {7} {8} {9}'.format(
        input_dim, hidden_dim, feat_input_dim, epochs, layers, optimization, train_path, test_path,
        results_file_path, sigmorphon_root_dir)
    os.system(evaluation_command)