def __init__(self, translation_model, production_model):
    """Initialise the translator from two models or from two raw strings.

    If both arguments are strings they are not stored as given. Instead,
    ``production_model`` is handed to ``EnglishModel(...)`` and both
    strings are handed to ``TranslationModel.learn_from_text(...)`` on a
    fresh ``TranslationModel``; the resulting objects are stored.
    In every other case both arguments are stored unchanged.

    Args:
        translation_model: a translation model object, or a string.
        production_model: a production model object, or a string.
    """
    # Tuning constants used elsewhere in the class (meaning not visible
    # from this constructor -- TODO confirm against their users).
    self.filter_max = 16
    # FIXME: problem with words being elided too much
    self.null_prior = 0.00007
    self.phi2_prior = 1.0

    # isinstance() rather than an exact type comparison, so that str
    # subclasses are also treated as raw text.
    given_raw_text = (isinstance(translation_model, str)
                      and isinstance(production_model, str))

    if given_raw_text:
        # Build the production model first, then train the translation
        # model on the two original strings (same order as before).
        self.production_model = EnglishModel(production_model)
        learned = TranslationModel()
        learned.learn_from_text(translation_model, production_model)
        self.translation_model = learned
    else:
        # NOTE(review): a mix of one string and one model object ends up
        # here and is stored untouched -- confirm that this is intended.
        self.translation_model = translation_model
        self.production_model = production_model
def score_fr_en_europarl(num_lines=300, num_chars=50):
    """Train on Europarl FR/EN line pairs and print the average score.

    The first ``num_lines`` aligned (English, French) line pairs are read,
    each line is cut to ``num_chars`` characters, and every fourth pair
    (index 0, 4, 8, ...) is held out for evaluation while the rest is used
    for learning. A ``TranslationModel`` is trained on the learning pairs,
    each held-out French line is translated, and the translation is scored
    against its English counterpart with ``TranslationScore``. The average
    score and the elapsed time are printed; nothing is returned.

    Args:
        num_lines: maximum number of line pairs to use (non-negative int,
            or None for no limit). The original note on this value said
            that the maximum is 300 lines at the moment.
        num_chars: maximum number of characters kept from each line.
    """
    # Local import so this block does not depend on the file's import list.
    from itertools import islice

    print("\nFR->EN Europarl:")
    tstart = time.time()

    en_lines = get_europarl_en_lines()
    fr_lines = get_europarl_fr_lines()

    en_learn_set, fr_learn_set = [], []
    en_eval_set, fr_eval_set = [], []

    # islice() works whether zip() returns a list or an iterator.
    aligned_pairs = islice(zip(en_lines, fr_lines), num_lines)
    for index, (en_line, fr_line) in enumerate(aligned_pairs):
        en_clip = en_line[:num_chars]
        fr_clip = fr_line[:num_chars]
        if index % 4 == 0:
            # Every fourth pair is held out for evaluation.
            en_eval_set.append(en_clip)
            fr_eval_set.append(fr_clip)
        else:
            en_learn_set.append(en_clip)
            fr_learn_set.append(fr_clip)

    fr_text = '\n'.join(fr_learn_set)
    en_text = '\n'.join(en_learn_set)

    trx_model = TranslationModel()
    english = EnglishModel(['austen-emma.txt'])
    trx_model.learn_from_text(fr_text, en_text)
    translator = Translator(trx_model, english)
    scorer = TranslationScore()

    scores = [
        scorer.of(translator.translate(xfr), xen)
        for xfr, xen in zip(fr_eval_set, en_eval_set)
    ]
    n = len(scores)
    # Guard against an empty evaluation set (no input lines, or
    # num_lines == 0), which previously raised ZeroDivisionError.
    avg_score = float(sum(scores)) / n if n else 0.0

    # Single-argument print() emits the same text as the former print
    # statements under Python 2 and also parses under Python 3. The double
    # spaces reproduce the separators the comma-form print inserted.
    print("lines:  {} , chars:  {}".format(num_lines, num_chars))
    print("Translation score: {:0.2f}".format(avg_score) + " (of " + str(
        n) + " comparisons)")

    tend = time.time()
    print(tdiff(tstart, tend) + " seconds elapsed.")