def converged(dutch_vocab: Set[str],
              english_vocab: Set[str],
              translation_table: TranslationTable,
              counts: TranslationTable,
              convergence_factor: float,
              max_iterations: int,
              iteration: int = 0,
              verbose: bool = False) -> bool:
    """ Stop training once we have reached the maximum number of
    iterations/epochs or the convergence condition is met (negligible change
    observed in the probabilities). """
    printv("Testing convergence... ", verbose, end="")
    if iteration == max_iterations:
        # Since training takes a long while, we want to be able to limit the
        # number of iterations, then just stop and use whatever model we have
        # after max_iterations iterations.
        printv("Done.", verbose)
        return True
    if iteration != 0 and counts != {}:
        for dutch_word in dutch_vocab:
            for english_word in english_vocab:
                if abs(translation_table[dutch_word][english_word] -
                       counts[dutch_word][english_word]) > convergence_factor:
                    # Some improvement has happened somewhere, so we should
                    # continue training.
                    printv("Done.", verbose)
                    return False
        printv("Done.", verbose)
        return True
    printv("Done.", verbose)
    return False

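# Illustrative only: a minimal, self-contained sketch of the convergence
# criterion used above -- training stops once every per-pair change falls
# within convergence_factor. The tiny tables below are hypothetical and exist
# purely for this example.
def _demo_convergence_criterion() -> None:
    convergence_factor = 0.01
    previous = {"huis": {"house": 0.50, "home": 0.50}}
    current = {"huis": {"house": 0.505, "home": 0.495}}
    biggest_change = max(
        abs(current[d][e] - previous[d][e]) for d in current
        for e in current[d])
    # The largest change is 0.005 <= 0.01, so this pair of tables would count
    # as converged.
    assert biggest_change <= convergence_factor
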
def write_back_data(data: Any,
                    output_filename: str,
                    verbose: bool = False) -> None:
    """ Pickle the given data to <output_filename>.pkl. """
    printv("Writing back training data to {}.pkl...".format(output_filename),
           verbose,
           end="")
    with open(output_filename + ".pkl", "wb") as f:
        pickle.dump(data, f)
    printv("Done.", verbose)

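# For reference: `unpickle` (used throughout) is defined elsewhere in this
# project. A minimal sketch of the inverse of write_back_data could look like
# the helper below, assuming it takes the full path to the .pkl file; this is
# an assumption, not necessarily the project's exact implementation.
def _demo_unpickle(filename: str) -> Any:
    with open(filename, "rb") as f:
        return pickle.load(f)
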
parser.add_argument(
    "-x",
    "--matrix",
    help="Indicate that this is a matrix instead of a table.",
    action="store_true")  # TODO: automatically detect and act.
parser.add_argument(
    "-a",
    "--augment",
    help="Try to do some optimization and improve the translation using ad "
    "hoc methods.",
    action="store_true")
args = parser.parse_args()
verbose = args.verbose

raw_sentences = get_sentences_from_document(args.document)
normalized_sentences = clean_sentences(raw_sentences, keep_numbers=True)
printv("Done.", verbose)

if args.matrix:
    printv("Loading translations probability matrix... ", verbose, end="")
    translation_matrix = TranslationMatrix.thaw(
        args.translation_probabilities_table)
    printv("Done.", verbose)
    printv("Translating sentences... ", verbose, end="")
    translated_sentences = translate_from_matrix(normalized_sentences,
                                                 translation_matrix,
                                                 args.augment)
    printv("Done.", verbose)
else:
    printv("Loading translations probability table... ", verbose, end="")
    unpickled_data = unpickle(args.translation_probabilities_table)
    translation_table = unpickled_data["data"]
    printv("Done.", verbose)

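# The table branch presumably translates analogously to translate_from_matrix.
# Under IBM Model 1 the simplest decoding is a greedy word-by-word lookup: for
# each Dutch word, emit the English word with the highest translation
# probability. The helper below is a hypothetical sketch of that idea, not the
# project's actual translate function.
def _demo_translate_from_table(
        sentences: List[str],
        table: Dict[str, Dict[str, float]]) -> List[str]:
    translated = []
    for sentence in sentences:
        words = [
            # Fall back to the original word if it is out of vocabulary.
            max(table[dutch_word], key=table[dutch_word].get)
            if dutch_word in table else dutch_word
            for dutch_word in sentence.split()
        ]
        translated.append(" ".join(words))
    return translated
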
def train_table(dutch_sentences: List[str],
                english_sentences: List[str],
                max_iterations: int,
                convergence_factor: float,
                output_filename: str,
                resume_from_file: str,
                write_back_epoach: bool = False,
                verbose: bool = False) -> None:
    """ The engine for training the statistical machine translator based on
    IBM Model 1. Each Expectation-Maximization iteration first distributes
    each English word's count over the Dutch words of its sentence pair in
    proportion to the current translation probabilities (E-step), then
    renormalizes the collected counts per Dutch word to produce the new
    translation probabilities table (M-step), until convergence or
    max_iterations is reached. """
    printv("Determining the vocabularies... ", verbose, end="")
    english_vocab = get_vocab(english_sentences)
    dutch_vocab = get_vocab(dutch_sentences)
    printv("Done.", verbose)
    if resume_from_file:
        printv("Reloading the translation probabilities table... ",
               verbose,
               end="")
        reloaded_data = unpickle(resume_from_file)
        iteration = reloaded_data["iteration"]
        translation_table = reloaded_data["data"]
        printv("Done.", verbose)
    else:
        printv("Initializing the translation probabilities table... ",
               verbose,
               end="")
        iteration = 0
        initial_probability = 1 / len(english_vocab)
        translation_table = {
            f: {e: initial_probability
                for e in english_vocab}
            for f in dutch_vocab
        }
        printv("Done.", verbose)
    counts = {}  # type: TranslationTable
    printv("Beginning the Expectation-Maximization algorithm.", verbose)
    while not converged(dutch_vocab, english_vocab, translation_table, counts,
                        convergence_factor, max_iterations, iteration,
                        verbose):
        start_time = time.time()
        iteration += 1
        printv("Initializing the counts and totals... ", verbose, end="")
        counts = {f: {e: 0.0 for e in english_vocab} for f in dutch_vocab}
        totals = {f: 0.0 for f in dutch_vocab}
        printv("Done.", verbose)
        printv("Calculating probabilities and collecting counts... ",
               verbose,
               end="")
        for english_sentence, dutch_sentence in zip(english_sentences,
                                                    dutch_sentences):
            subtotals = defaultdict(float)  # type: Dict[str, float]
            # First pass: compute the normalization factors for the counts
            # collection stage.
            for english_word in english_sentence.split():
                for dutch_word in dutch_sentence.split():
                    subtotals[english_word] += translation_table[dutch_word][
                        english_word]
            # Second pass: collect the (fractional) counts.
            for english_word in english_sentence.split():
                for dutch_word in dutch_sentence.split():
                    amount = translation_table[dutch_word][
                        english_word] / subtotals[english_word]
                    counts[dutch_word][english_word] += amount
                    totals[dutch_word] += amount
        printv("Done.", verbose)
        printv("Updating translations probabilities table... ",
               verbose,
               end="")
        for dutch_word in dutch_vocab:
            for english_word in english_vocab:
                translation_table[dutch_word][english_word] = counts[
                    dutch_word][english_word] / totals[dutch_word]
        printv("Done.", verbose)
        # Free the (large) count structures before the next iteration. Note
        # that clearing counts here means converged() falls back to the
        # max_iterations check on the next pass.
        counts = {}
        totals = {}
        gc.collect()
        end_time = time.time()
        print("Completed iteration {} in {} seconds".format(
            iteration, end_time - start_time))
        if write_back_epoach or (iteration == max_iterations):
            write_back_data({
                "iteration": iteration,
                "data": translation_table
            }, output_filename, verbose)

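# A self-contained toy illustration of the E- and M-steps above: on the
# hypothetical two-pair corpus below, probability mass concentrates on the
# correct pairs ("huis" -> "house", "het" -> "the") within a few iterations.
def _demo_em_toy_run(iterations: int = 5) -> Dict[str, Dict[str, float]]:
    dutch = ["het huis", "het boek"]
    english = ["the house", "the book"]
    d_vocab = {w for s in dutch for w in s.split()}
    e_vocab = {w for s in english for w in s.split()}
    # Start from a uniform table, as train_table does.
    table = {d: {e: 1 / len(e_vocab) for e in e_vocab} for d in d_vocab}
    for _ in range(iterations):
        counts = {d: {e: 0.0 for e in e_vocab} for d in d_vocab}
        totals = {d: 0.0 for d in d_vocab}
        for e_sent, d_sent in zip(english, dutch):
            for e in e_sent.split():
                # E-step: split e's count over the Dutch words in proportion
                # to the current probabilities.
                subtotal = sum(table[d][e] for d in d_sent.split())
                for d in d_sent.split():
                    amount = table[d][e] / subtotal
                    counts[d][e] += amount
                    totals[d] += amount
        # M-step: renormalize the counts per Dutch word.
        table = {
            d: {e: counts[d][e] / totals[d]
                for e in e_vocab}
            for d in d_vocab
        }
    # After a couple of iterations, table["huis"]["house"] already dominates
    # table["huis"]["the"], since "house" only ever co-occurs with "huis".
    return table
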
def train_matrix(dutch_sentences: List[str],
                 english_sentences: List[str],
                 max_iterations: int,
                 output_filename: str,
                 resume_from_file: str,
                 write_back_epoach: bool = False,
                 verbose: bool = False) -> None:
    if resume_from_file:
        printv("Reloading the translation matrix instance... ",
               verbose,
               end="")
        translation_matrix = TranslationMatrix.thaw(resume_from_file)
    else:
        printv("Creating a new translation matrix instance... ",
               verbose,
               end="")
        translation_matrix = TranslationMatrix(
            list(get_vocab(dutch_sentences)),
            list(get_vocab(english_sentences)))
    printv("Done.", verbose)
    counts = [[0.0 for i in range(len(translation_matrix.english_vocab))]
              for j in range(len(translation_matrix.dutch_vocab))]
    printv("Beginning the Expectation-Maximization algorithm.", verbose)
    # To further improve speed, we just run for the specified maximum number
    # of iterations instead of testing convergence each iteration;
    # convergence testing takes too long anyway.
    while translation_matrix.iteration < max_iterations:
        start_time = time.time()
        translation_matrix.iteration += 1
        totals = [0.0] * len(translation_matrix.dutch_vocab)
        printv("Calculating probabilities and collecting counts... ",
               verbose,
               end="")
        for english_sentence, dutch_sentence in zip(english_sentences,
                                                    dutch_sentences):
            subtotals = defaultdict(float)  # type: Dict[str, float]
            # This first loop finds the subtotals, which contain the
            # normalization factors for the counts collection stage.
            for english_word in english_sentence.split():
                for dutch_word in dutch_sentence.split():
                    subtotals[
                        english_word] += translation_matrix.get_translation_probability(
                            dutch_word, english_word)
            # This loop is the counts collection stage.
            for english_word in english_sentence.split():
                for dutch_word in dutch_sentence.split():
                    di = translation_matrix.dutch_vocab_index[dutch_word]
                    ei = translation_matrix.english_vocab_index[english_word]
                    amount = translation_matrix.matrix[di][ei] / subtotals[
                        english_word]
                    counts[di][ei] += amount
                    # totals holds the normalization factors for the update
                    # stage below.
                    totals[di] += amount
        printv("Done.", verbose)
        printv("Updating translations probabilities table... ",
               verbose,
               end="")
        for dutch_word in translation_matrix.dutch_vocab:
            for english_word in translation_matrix.english_vocab:
                di = translation_matrix.dutch_vocab_index[dutch_word]
                ei = translation_matrix.english_vocab_index[english_word]
                translation_matrix.matrix[di][ei] = counts[di][ei] / totals[di]
        printv("Done.", verbose)
        # Clear counts for the next iteration.
        l1 = len(translation_matrix.dutch_vocab)
        l2 = len(translation_matrix.english_vocab)
        for i in range(l1):
            for j in range(l2):
                counts[i][j] = 0.0
        end_time = time.time()
        print("Completed iteration {} in {} seconds".format(
            translation_matrix.iteration, end_time - start_time))
        if write_back_epoach or (
                translation_matrix.iteration == max_iterations):
            printv("Freezing the translations probability matrix... ",
                   verbose,
                   end="")
            translation_matrix.freeze(output_filename)
            printv("Done.", verbose)

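# train_matrix relies on the TranslationMatrix class defined elsewhere in
# this project. Below is a minimal sketch of the interface it assumes: the
# attribute and method names come from the usage above, but the constructor
# body and the freeze persistence details are assumptions for illustration.
class _DemoTranslationMatrix:
    def __init__(self, dutch_vocab: List[str],
                 english_vocab: List[str]) -> None:
        self.dutch_vocab = dutch_vocab
        self.english_vocab = english_vocab
        # Word -> row/column lookups into the probability matrix.
        self.dutch_vocab_index = {w: i for i, w in enumerate(dutch_vocab)}
        self.english_vocab_index = {w: i for i, w in enumerate(english_vocab)}
        # Uniform initial probabilities, mirroring train_table's table setup.
        initial_probability = 1 / len(english_vocab)
        self.matrix = [[initial_probability] * len(english_vocab)
                       for _ in dutch_vocab]
        self.iteration = 0

    def get_translation_probability(self, dutch_word: str,
                                    english_word: str) -> float:
        return self.matrix[self.dutch_vocab_index[dutch_word]][
            self.english_vocab_index[english_word]]

    def freeze(self, output_filename: str) -> None:
        # Presumably serializes the instance to disk; pickling is one
        # plausible implementation.
        with open(output_filename + ".pkl", "wb") as f:
            pickle.dump(self, f)
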
help="Invert the order of translation to English -> Dutch.", action="store_true") parser.add_argument( "-x", "--matrix", help= "Generate a translation probabilities matrix (list of lists) instead of a table (dict of dicts)", action="store_true") args = parser.parse_args() training_set = int(args.percentage) if training_set not in [1, 3, 5, 10]: raise ValueError( "Invaild percentage value. Valid values: [1, 3, 5, 10].") printv("Beginning training with the {}% dataset.".format(training_set), args.verbose) dutch_sentences = unpickle( "datasets/training/dutch/dutch_{}p_5t.reduced.pkl".format( training_set)) english_sentences = unpickle( "datasets/training/english/english_{}p_5t.reduced.pkl".format( training_set)) if args.invert: english_sentences, dutch_sentences = dutch_sentences, english_sentences if args.matrix: if args.output == "translation_probabilities_table": args.output = "translation_probabilities_matrix" train_matrix(dutch_sentences, english_sentences, int(args.max_iterations), args.output,