示例#1
0
    def __init__(self, translation_model, production_model):
        """Set up the translator from ready-made models or from raw text.

        Args:
            translation_model: Either a model object that is stored as-is,
                or a string. When both arguments are strings, this one is
                passed as the first argument to
                ``TranslationModel.learn_from_text``.
            production_model: Either a model object that is stored as-is,
                or a string. When both arguments are strings, this one is
                used to build an ``EnglishModel`` and is also passed as the
                second argument to ``learn_from_text``.

        If only one of the two arguments is a string, both are stored
        unchanged.
        """
        self.filter_max = 16
        self.null_prior = 0.00007  #FIXME: problem with words being elided too much
        self.phi2_prior = 1.0

        #FIXME: Refactor this constructor
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of str.
        if isinstance(translation_model, str) and isinstance(
                production_model, str):
            # Both arguments are text: build the models from them. The
            # EnglishModel is constructed first, as before.
            english = EnglishModel(production_model)
            learned = TranslationModel()
            learned.learn_from_text(translation_model, production_model)
            translation_model, production_model = learned, english

        self.translation_model = translation_model
        self.production_model = production_model
示例#2
0
def score_fr_en_europarl():
    print "\nFR->EN Europarl:"
    # max lines is 300 at the moment
    num_lines = 300
    num_chars = 50

    tstart = time.time()
    en_lines = get_europarl_en_lines()
    fr_lines = get_europarl_fr_lines()

    en_learn_set = []
    fr_learn_set = []
    en_eval_set = []
    fr_eval_set = []

    for index, pair in enumerate(zip(en_lines, fr_lines)[:num_lines]):
        pair0 = pair[0][:num_chars]
        pair1 = pair[1][:num_chars]
        if index % 4 == 0:
            en_eval_set.append(pair0)
            fr_eval_set.append(pair1)
        else:
            en_learn_set.append(pair0)
            fr_learn_set.append(pair1)

    fr_text = '\n'.join(fr_learn_set)
    en_text = '\n'.join(en_learn_set)

    trx_model = TranslationModel()
    english = EnglishModel(['austen-emma.txt'])
    trx_model.learn_from_text(fr_text, en_text)
    translator = Translator(trx_model, english)

    scorer = TranslationScore()
    n = 0
    score = 0
    for xfr, xen in zip(fr_eval_set, en_eval_set):
        trx_en = translator.translate(xfr)
        n += 1
        score += scorer.of(trx_en, xen)
    avg_score = float(score) / n
    print "lines: ", num_lines, ", chars: ", num_chars
    print "Translation score: {:0.2f}".format(avg_score) + " (of " + str(
        n) + " comparisons)"
    tend = time.time()
    print tdiff(tstart, tend) + " seconds elapsed."