def smtp_pipeline(config):
    """Run the full pivot-translation pipeline driven by a config file.

    Reads environment and iteration settings from *config* (a ConfigParser-style
    object), then preprocesses the two parallel corpora (src->pivot and
    pivot->target), trains both translation legs, tunes them, and finally
    scores end-to-end pivot translation quality.

    Args:
        config: ConfigParser-like object exposing get/getint/getfloat with
            "Environment Settings" and "Iteration Settings" sections.
    """
    # --- configuration -----------------------------------------------------
    path_to_moses = config.get("Environment Settings", "path_to_moses_decoder")
    mem_limit = config.getint("Environment Settings", "mem_limit")
    ncpus = config.getint("Environment Settings", "ncpus")
    ngram = config.getint("Environment Settings", "ngram")
    max_len = config.getint("Iteration Settings", "max_sentence_len")
    min_len = config.getint("Iteration Settings", "min_sentence_len")
    srcf = utilities.safe_string(config.get("Iteration Settings", "src_lang_data"))
    piv1f = utilities.safe_string(config.get("Iteration Settings", "src_piv_lang_data"))
    piv2f = utilities.safe_string(config.get("Iteration Settings", "piv_tar_lang_data"))
    tarf = utilities.safe_string(config.get("Iteration Settings", "tar_lang_data"))
    # Renamed from `train`/`test` so the Test instance below cannot shadow
    # the split ratios.
    train_split = config.getfloat("Iteration Settings", "train_split")
    test_split = config.getfloat("Iteration Settings", "test_split")
    work_dir1 = utilities.safe_string(config.get("Iteration Settings", "working_dir_first_leg"))
    work_dir2 = utilities.safe_string(config.get("Iteration Settings", "working_dir_second_leg"))

    # --- derived filenames -------------------------------------------------
    pair1, pair2 = FileDataPair(srcf, piv1f), FileDataPair(piv2f, tarf)
    raw_files = pair1.get_raw_filenames() + pair2.get_raw_filenames()
    pair1_tokenized_src, pair1_tokenized_tar = pair1.get_tokenized_filenames()
    pair2_tokenized_src, pair2_tokenized_tar = pair2.get_tokenized_filenames()
    pair1_cleansed_src, pair1_cleansed_tar = pair1.get_cleansed_filenames()
    pair2_cleansed_src, pair2_cleansed_tar = pair2.get_cleansed_filenames()

    # --- preprocessing -----------------------------------------------------
    parser = Parser(path_to_moses, mem_limit, max_len, min_len, False)
    parser.tokenize_files(raw_files)
    parser.cleanse(pair1_tokenized_src, pair1_tokenized_tar)
    parser.cleanse(pair2_tokenized_src, pair2_tokenized_tar)
    parser.split_train_tune_test(pair1_cleansed_src, pair1_cleansed_tar,
                                 pair2_cleansed_src, pair2_cleansed_tar,
                                 train_split, test_split)
    # BUG FIX: the original called match() with the test-set variables before
    # they were assigned (NameError), and passed pair2_test_tar where
    # pair1_test_tar belongs (cf. the src1/tar1/src2/tar2 pattern in main()).
    pair1_test_src, pair1_test_tar = pair1.get_test_filenames()
    pair2_test_src, pair2_test_tar = pair2.get_test_filenames()
    parser.match(pair1_test_src, pair1_test_tar, pair2_test_src, pair2_test_tar)

    # --- training ----------------------------------------------------------
    pair1_target_train_filename = pair1.get_target_train_filename()
    pair2_target_train_filename = pair2.get_target_train_filename()
    pair1_train_src, pair1_train_tar = pair1.get_train_filenames()
    pair2_train_src, pair2_train_tar = pair2.get_train_filenames()
    trainer = Train(path_to_moses, ncpus, ngram, False)
    trainer.build_language_models(pair1_target_train_filename)
    trainer.build_language_models(pair2_target_train_filename)
    trainer.train(pair1_train_src, pair1_train_tar, work_dir1)
    trainer.train(pair2_train_src, pair2_train_tar, work_dir2)

    # --- tuning ------------------------------------------------------------
    pair1_tune_src, pair1_tune_tar = pair1.get_tune_filenames()
    pair2_tune_src, pair2_tune_tar = pair2.get_tune_filenames()
    tuner = Tune(path_to_moses, ncpus, False)
    tuner.tune(pair1_tune_src, pair1_tune_tar, work_dir1)
    tuner.tune(pair2_tune_src, pair2_tune_tar, work_dir2)

    # --- evaluation --------------------------------------------------------
    # Distinct names instead of clobbering pair*_test_tar as the original did.
    pair1_eval = pair1.get_eval_filename()
    pair2_eval = pair2.get_eval_filename()
    tester = Test(path_to_moses, False)
    tester.test_pivoting_quality(pair1_eval, work_dir1, pair2_eval, work_dir2)
def main():
    """Build, tune and evaluate an es -> en -> fr pivot translation system."""
    pre = Parser(True)  # verbose preprocessing

    # Tokenize each raw corpus file.
    for raw_file in ("src/europarl-v7.es-en.es",
                     "src/europarl-v7.es-en.en",
                     "src/europarl-v7.fr-en.en",
                     "src/europarl-v7.fr-en.fr"):
        pre.tokenize(raw_file)

    # Normalize each tokenized parallel corpus.
    pre.cleanse("data/europarl-v7.es-en.es.tok", "data/europarl-v7.es-en.en.tok")
    pre.cleanse("data/europarl-v7.fr-en.en.tok", "data/europarl-v7.fr-en.fr.tok")

    # Partition the cleansed corpora into train / tune / test sets
    # (.6 train fraction, .2 test fraction).
    pre.split_train_tune_test("data/europarl-v7.es-en.es.tok.cleansed",
                              "data/europarl-v7.es-en.en.tok.cleansed",
                              "data/europarl-v7.fr-en.en.tok.cleansed",
                              "data/europarl-v7.fr-en.fr.tok.cleansed", .6, .2)

    # Align the two legs' test sets so end-to-end scoring compares like lines.
    pre.match("data/test/europarl-v7.es-en.es.tok.cleansed.test",
              "data/test/europarl-v7.es-en.en.tok.cleansed.test",
              "data/test/europarl-v7.fr-en.en.tok.cleansed.test",
              "data/test/europarl-v7.fr-en.fr.tok.cleansed.test")

    smt_trainer = Train(True)
    # Language models over each leg's target side.
    smt_trainer.build_language_models("data/train/europarl-v7.es-en.en.tok.cleansed.train")
    smt_trainer.build_language_models("data/train/europarl-v7.fr-en.fr.tok.cleansed.train")
    # Translation models: es->en first leg, en->fr second leg.
    smt_trainer.train("data/train/europarl-v7.es-en.es.tok.cleansed.train",
                      "data/train/europarl-v7.es-en.en.tok.cleansed.train",
                      "es-en.working")
    smt_trainer.train("data/train/europarl-v7.fr-en.en.tok.cleansed.train",
                      "data/train/europarl-v7.fr-en.fr.tok.cleansed.train",
                      "en-fr.working")

    # Tune both legs on the held-out tune sets.
    smt_tuner = Tune(True)
    smt_tuner.tune("data/tune/europarl-v7.es-en.es.tok.cleansed.tune",
                   "data/tune/europarl-v7.es-en.en.tok.cleansed.tune",
                   "es-en.working")
    smt_tuner.tune("data/tune/europarl-v7.fr-en.en.tok.cleansed.tune",
                   "data/tune/europarl-v7.fr-en.fr.tok.cleansed.tune",
                   "en-fr.working")

    evaluator = Test(True)
    # Interactive translator server for each leg.
    evaluator.test_translator_interactive("es-en.working")
    evaluator.test_translator_interactive("en-fr.working")
    # Per-leg translation quality on the held-out test sets.
    evaluator.test_translation_quality("data/test/europarl-v7.es-en.es.tok.cleansed.test",
                                       "data/test/europarl-v7.es-en.en.tok.cleansed.test",
                                       "es-en.working")
    evaluator.test_translation_quality("data/test/europarl-v7.fr-en.en.tok.cleansed.test",
                                       "data/test/europarl-v7.fr-en.fr.tok.cleansed.test",
                                       "en-fr.working")
    # Interactive translation through the full pivot chain.
    evaluator.test_pivoting_interactive("es-en.working", "en-fr.working")
    # End-to-end quality score over the matched test data.
    evaluator.test_pivoting_quality("data/test/europarl-v7.es-en.es.tok.cleansed.test.matched",
                                    "es-en.working",
                                    "data/test/europarl-v7.fr-en.fr.tok.cleansed.test.matched",
                                    "en-fr.working")
def evaluate_pivoting():
    """Score end-to-end pivot (es -> en -> fr) translation quality with BLEU."""
    # Create Test instance, set verbose to True to see what's happening.
    test = Test(True)
    # We can finally evaluate the translations the pivoting provides by
    # calling test_pivoting_quality and providing the source language test file,
    # the directory containing the trained translation model for the source to
    # pivot languages, the target language (true) test file and the directory
    # containing the trained translation model for the pivot to target language.
    # To obtain an accurate scoring, the data files provided MUST be matched.
    # The output is a BLEU score regarding translation quality. Generally,
    # higher scoring is better.
    test.test_pivoting_quality("data/test/europarl-v7.es-en.es.tok.cleansed.test.matched",
                               "es-en.working",
                               "data/test/europarl-v7.fr-en.fr.tok.cleansed.test.matched",
                               "en-fr.working")