def load_params(model, from_file):
    """Restore the translation table for *model* from a NumPy file.

    Reads the array saved at *from_file* (``.npy`` format, as written by
    ``save_params``) and installs it as ``model.p_f_given_e``.
    """
    log_info("Loading parameters from %s" % from_file)
    model.p_f_given_e = np.load(from_file)
def save_params(model, to_file):
    """Serialize ``model.p_f_given_e`` to *to_file* in NumPy ``.npy`` format.

    The file handle must be opened in binary mode: ``np.save`` writes raw
    bytes, and a text-mode handle (the original ``"w+"``) raises
    ``TypeError: write() argument must be str, not bytes`` on Python 3.
    """
    log_info("Saving parameters to %s" % to_file)
    # "wb" (not "w+"): np.save emits bytes and needs a binary-mode file.
    with open(to_file, "wb") as f:
        np.save(f, model.p_f_given_e)
english_validation_file_path = "data/validation/dev.e" french_testing_file_path = "data/testing/test/test.f" english_testing_file_path = "data/testing/test/test.e" french_vocab_path = "data/vocabulary/french.txt" english_vocab_path = "data/vocabulary/english.txt" validation_golden = 'data/validation/dev.wa.nonullalign' testing_golden = 'data/testing/answers/test.wa.nonullalign' # Load the vocabularies for English and French. vocab_french = Vocabulary(french_file_path, vocab_file_path=french_vocab_path, min_count=min_count, \ max_size=max_vocab_size) vocab_english = Vocabulary(english_file_path, vocab_file_path=english_vocab_path, min_count=min_count, \ max_size=max_vocab_size) # Set up the model. log_info("Setting up the model, French vocabulary size = %d, English vocabulary size = %d." % \ (len(vocab_french), len(vocab_english))) model = IBM1(french_vocab_size=len(vocab_french), english_vocab_size=len(vocab_english)) log_info("Model has been set up.") # Tokenize the French and English sentences. parallel_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_file_path, english_file_path=english_file_path) parallel_validation_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_validation_file_path, english_file_path=english_validation_file_path) parallel_testing_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_testing_file_path, english_file_path=english_testing_file_path) # Calculate the validation AER and log likelihood for the initial parameters. validation_aer = evaluate_model(model, validation_golden,
french_file_path = "data/training/small/hansards.36.2.f" if small_dataset else "data/training/hansards.36.2.f" french_validation_file_path = "data/validation/dev.f" english_file_path = "data/training/small/hansards.36.2.e" if small_dataset else "data/training/hansards.36.2.e" french_validation_file_path = "data/validation/dev.f" english_validation_file_path = "data/validation/dev.e" french_vocab_path = "data/vocabulary/french.txt" english_vocab_path = "data/vocabulary/english.txt" # Load the vocabularies for English and French. vocab_french = Vocabulary(french_file_path, vocab_file_path=french_vocab_path, min_count=min_count, \ max_size=max_vocab_size) vocab_english = Vocabulary(english_file_path, vocab_file_path=english_vocab_path, min_count=min_count, \ max_size=max_vocab_size) # Set up the model. log_info("Setting up the model, French vocabulary size = %d, English vocabulary size = %d, alpha=%f." % \ (len(vocab_french), len(vocab_english), alpha)) model = VariationalIBM1(french_vocab_size=len(vocab_french), english_vocab_size=len(vocab_english), alpha=alpha) log_info("Model has been set up.") # Tokenize the French and English sentences. parallel_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_file_path, english_file_path=english_file_path) parallel_validation_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_validation_file_path, english_file_path=english_validation_file_path) # Calculate the validation AER and log likelihood for the initial parameters. predictions = [] for french_sentence, english_sentence in parallel_validation_corpus: alignments = model.align(french_sentence, english_sentence) # Remove null alignments from predictions
english_validation_file_path = "data/validation/dev.e" french_vocab_path = "data/vocabulary/french.txt" english_vocab_path = "data/vocabulary/english.txt" validation_golden = 'data/validation/dev.wa.nonullalign' testing_golden = 'data/testing/answers/test.wa.nonullalign' french_testing_file_path = "data/testing/test/test.f" english_testing_file_path = "data/testing/test/test.e" # Load the vocabularies for English and French. vocab_french = Vocabulary(french_file_path, vocab_file_path=french_vocab_path, min_count=min_count, max_size=max_vocab_size) vocab_english = Vocabulary(english_file_path, vocab_file_path=english_vocab_path, min_count=min_count, max_size=max_vocab_size) # Set up the model. log_info("Setting up the model, French vocabulary size = %d, English vocabulary size = %d, max_jump = %d." % \ (len(vocab_french), len(vocab_english), max_jump)) model = IBM2(french_vocab_size=len(vocab_french), english_vocab_size=len(vocab_english), max_jump=max_jump, \ init=args.init) log_info("Model has been set up.") # Tokenize the French and English sentences. log_info("Loading parallel corpus from %s and %s" % (french_file_path, english_file_path)) parallel_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_file_path, english_file_path=english_file_path) parallel_validation_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_validation_file_path, english_file_path=english_validation_file_path) parallel_testing_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \ french_file_path=french_testing_file_path, english_file_path=english_testing_file_path) # Load IBM1 parameters if args.init == "ibm1":