Пример #1
0
def get_instance(ti_line):
    """Parse one JSON training-instance line and collect its word tokens.

    Observation tokens come from the ``l2_word`` of every node in
    ``ti.current_sent`` whose ``lang`` is ``'de'``.  Guess tokens are pooled,
    in this order, from ``ti.current_guesses``, the ``l2_word`` of the
    ``'en'`` nodes of ``ti.current_sent``, ``ti.past_correct_guesses`` and
    ``ti.past_guesses_for_current_sent``.  Whitespace-only strings are
    dropped and every remaining string is split on whitespace.

    Each observation token is passed to ``add_to_obs``; guess tokens are
    only returned, not registered anywhere.

    Returns a tuple ``(ti, obs_flat, guesses_flat)`` where the last two
    items are flat lists of tokens.
    """
    ti = TrainingInstance.from_dict(json.loads(ti_line))

    def _words(phrases):
        # Skip blank phrases, split the rest and flatten into one list.
        collected = []
        for phrase in phrases:
            if phrase.strip() != '':
                collected.extend(phrase.split())
        return collected

    obs_flat = _words(
        node.l2_word for node in ti.current_sent if node.lang == 'de')

    guesses_flat = _words(guess.guess for guess in ti.current_guesses)
    # The English-side words already shown in the sentence are pooled in
    # with the guesses as well.
    guesses_flat += _words(
        node.l2_word for node in ti.current_sent if node.lang == 'en')
    guesses_flat += _words(guess.guess for guess in ti.past_correct_guesses)
    guesses_flat += _words(
        guess.guess for guess in ti.past_guesses_for_current_sent)

    for obs_word in obs_flat:
        add_to_obs(obs_word)
    return ti, obs_flat, guesses_flat
Пример #2
0
def batch_predictions(training_instance,
                      theta_en_en_names, theta_en_de_names,
                      theta_en_en, theta_en_de,
                      phi_wapper, lr,
                      en_domain, de2id, en2id, d2t, qp=False):
    """Run inference on one training instance and report its predictions.

    ``training_instance`` is a JSON string that is decoded into a
    ``TrainingInstance``.  A factor graph is built for it with
    ``create_factor_graph``, initialised, and ``treelike_inference(3)`` is
    run before results are read back.

    ``phi_wapper`` (sic) is forwarded as the ``phi_wrapper`` keyword of
    ``create_factor_graph``; the parameter name is kept for existing callers.

    When ``qp`` is true the graph dump and factor distribution are skipped
    and reported as ``None``.

    Returns a list ``[p, fgs, factor_dist, (p0, p25, p50, t)]``:
    the posterior probabilities, the graph dump (a newline-joined string
    starting with a ``*SENT_ID:<id>`` line) or ``None``, the result of
    ``fg.to_dist()`` or ``None``, and the four values returned by
    ``fg.get_precision_counts()``.
    """
    ti = TrainingInstance.from_dict(json.loads(training_instance))
    sent_id = ti.current_sent[0].sent_id

    graph_kwargs = {
        'ti': ti,
        'learning_rate': lr,
        'theta_en_en_names': theta_en_en_names,
        'theta_en_de_names': theta_en_de_names,
        'theta_en_de': theta_en_de,
        'theta_en_en': theta_en_en,
        'phi_wrapper': phi_wapper,
        'en_domain': en_domain,
        'de2id': de2id,
        'en2id': en2id,
        'd2t': d2t,
    }
    fg = create_factor_graph(**graph_kwargs)

    fg.initialize()
    fg.treelike_inference(3)
    posteriors = fg.get_posterior_probs()

    graph_dump = None
    factor_dist = None
    if not qp:
        header = '*SENT_ID:' + str(sent_id)
        graph_dump = '\n'.join([header] + fg.to_string())
        factor_dist = fg.to_dist()

    # Unpacking keeps the requirement that exactly four counts come back.
    c0, c25, c50, c_last = fg.get_precision_counts()
    return [posteriors, graph_dump, factor_dist, (c0, c25, c50, c_last)]
Пример #3
0
def batch_predictions(training_instance,
                      f_en_en_theta,
                      f_en_de_theta,
                      adapt_phi_en_en,
                      adapt_phi_en_de, lr,
                      en_domain,
                      de2id,
                      en2id,
                      basic_f_en_en,
                      basic_f_en_de,
                      domain2theta):
    """Return posterior probabilities for one JSON-encoded training instance.

    The instance is decoded into a ``TrainingInstance``, a factor graph is
    created for it with ``create_factor_graph`` (thetas, phis, basic feature
    arguments, domain and id maps are forwarded unchanged), the graph is
    initialised and ``treelike_inference(3)`` is run.

    Returns whatever ``fg.get_posterior_probs()`` produces.
    """
    ti = TrainingInstance.from_dict(json.loads(training_instance))
    # sent_id is looked up but not used by anything below.
    sent_id = ti.current_sent[0].sent_id

    graph_kwargs = {
        'ti': ti,
        'learning_rate': lr,
        'theta_en_de': f_en_de_theta,
        'theta_en_en': f_en_en_theta,
        'phi_en_en': adapt_phi_en_en,
        'phi_en_de': adapt_phi_en_de,
        'basic_f_en_en': basic_f_en_en,
        'basic_f_en_de': basic_f_en_de,
        'en_domain': en_domain,
        'de2id': de2id,
        'en2id': en2id,
        'domain2theta': domain2theta,
    }
    fg = create_factor_graph(**graph_kwargs)

    fg.initialize()
    fg.treelike_inference(3)
    posteriors = fg.get_posterior_probs()
    return posteriors
Пример #4
0
def batch_sgd(training_instance,
              theta_en_en,
              theta_en_de,
              phi_en_en,
              phi_en_de, lr,
              en_domain,
              de2id,
              en2id,
              basic_f_en_en,
              basic_f_en_de,
              domain2theta):
    """Compute regularized gradients for one JSON-encoded training instance.

    The instance is decoded into a ``TrainingInstance``, a factor graph is
    built with ``create_factor_graph``, initialised, and
    ``treelike_inference(3)`` is run.  The unregularized ``en_en`` and
    ``en_de`` gradients are then read from the graph.

    For every ``(f_type, u)`` pair in ``fg.active_domains`` a copy of the
    matching gradient is regularized against ``domain2theta[f_type, u]``
    with the regularization parameter scaled by ``0.001``.  The shared
    gradients are regularized against ``fg.theta_en_en`` / ``fg.theta_en_de``
    with the unscaled parameter.

    Returns a list ``[sent_id, g_en_en, g_en_de, sample_ag]`` where
    ``sample_ag`` maps ``(f_type, u)`` to the per-domain regularized
    gradient (empty when there are no active domains).
    """
    j_ti = json.loads(training_instance)
    ti = TrainingInstance.from_dict(j_ti)
    sent_id = ti.current_sent[0].sent_id
    fg = create_factor_graph(ti=ti,
                             learning_rate=lr,
                             theta_en_de=theta_en_de,
                             theta_en_en=theta_en_en,
                             phi_en_en=phi_en_en,
                             phi_en_de=phi_en_de,
                             basic_f_en_en=basic_f_en_en,
                             basic_f_en_de=basic_f_en_de,
                             en_domain=en_domain,
                             de2id=de2id,
                             en2id=en2id,
                             domain2theta=domain2theta)

    fg.initialize()
    fg.treelike_inference(3)
    g_en_en, g_en_de = fg.get_unregularized_gradeint()

    # Read these before the loop: they are needed after it as well, and
    # binding them only inside the loop body left them undefined whenever
    # fg.active_domains was empty.
    reg_param = fg.regularization_param
    learning_rate = fg.learning_rate

    sample_ag = {}
    for f_type, u in fg.active_domains:
        # Work on a copy so the shared gradient is still untouched when it
        # is regularized below.
        g = g_en_en.copy() if f_type == 'en_en' else g_en_de.copy()
        domain_theta = domain2theta[f_type, u]
        # use a smaller regularization term for the per-domain gradients
        sample_ag[f_type, u] = apply_regularization(
            reg_param * 0.001, g, learning_rate, domain_theta)

    g_en_en = apply_regularization(
        reg_param, g_en_en, learning_rate, fg.theta_en_en)
    g_en_de = apply_regularization(
        reg_param, g_en_de, learning_rate, fg.theta_en_de)
    return [sent_id, g_en_en, g_en_de, sample_ag]
Пример #5
0
def get_instance(ti_line):
    """Decode a JSON training-instance line into tokens for tagging.

    Returns ``(ti, obs_flat, guesses_flat)``:

    * ``ti`` is the ``TrainingInstance`` built from the decoded JSON;
    * ``obs_flat`` holds the whitespace-split ``l2_word`` of every
      ``current_sent`` node whose ``lang`` is ``'de'``;
    * ``guesses_flat`` holds the whitespace-split strings taken, in order,
      from ``current_guesses``, the ``'en'`` nodes of ``current_sent``,
      ``past_correct_guesses`` and ``past_guesses_for_current_sent``.

    Strings that are empty after stripping are ignored.  Every observation
    token is also handed to ``add_to_obs``.
    """
    ti = TrainingInstance.from_dict(json.loads(ti_line))

    def _split_all(phrases):
        # Flatten the non-blank phrases into a single list of tokens.
        return [tok for text in phrases if text.strip() != ''
                for tok in text.split()]

    def _sentence_words(lang):
        return [node.l2_word for node in ti.current_sent if node.lang == lang]

    obs_flat = _split_all(_sentence_words('de'))

    guess_sources = (
        lambda: [entry.guess for entry in ti.current_guesses],
        # English words already visible in the sentence count as guesses too.
        lambda: _sentence_words('en'),
        lambda: [entry.guess for entry in ti.past_correct_guesses],
        lambda: [entry.guess for entry in ti.past_guesses_for_current_sent],
    )
    guesses_flat = []
    for fetch in guess_sources:
        guesses_flat.extend(_split_all(fetch()))

    for token in obs_flat:
        add_to_obs(token)
    return ti, obs_flat, guesses_flat
Пример #6
0
def batch_sgd(training_instance,
              theta_en_en_names, theta_en_de_names,
              theta_en_en, theta_en_de,
              phi_wrapper, lr,
              en_domain, de2id, en2id, d2t):
    """Compute gradients and posteriors for one JSON-encoded training instance.

    The instance is decoded into a ``TrainingInstance``, a factor graph is
    built with ``create_factor_graph``, initialised, and
    ``treelike_inference(3)`` is run.

    If ``options.user_adapt`` or ``options.experience_adapt`` is set, the
    unregularized gradients are read from the graph; a copy is regularized
    per ``(f_type, d)`` pair in ``fg.active_domains`` using the
    regularization parameter scaled by ``options.reg_param_ua_scale``, and
    the shared gradients are regularized with the unscaled parameter.
    Otherwise ``fg.return_gradient()`` is used and no per-domain gradients
    are produced.

    Returns a list ``[sent_id, p, g_en_en, g_en_de, sample_ag]`` where ``p``
    is ``fg.get_posterior_probs()`` and ``sample_ag`` is a dict keyed by
    ``(f_type, d)`` in the adaptation case, ``None`` otherwise.
    """
    j_ti = json.loads(training_instance)
    ti = TrainingInstance.from_dict(j_ti)
    sent_id = ti.current_sent[0].sent_id
    fg = create_factor_graph(ti=ti,
                             learning_rate=lr,
                             theta_en_en_names=theta_en_en_names,
                             theta_en_de_names=theta_en_de_names,
                             theta_en_de=theta_en_de,
                             theta_en_en=theta_en_en,
                             phi_wrapper=phi_wrapper,
                             en_domain=en_domain,
                             de2id=de2id,
                             en2id=en2id,
                             d2t=d2t)
    fg.initialize()
    fg.treelike_inference(3)
    if options.user_adapt or options.experience_adapt:
        g_en_en, g_en_de = fg.get_unregularized_gradeint()

        # Read these before the loop: they are needed after it as well, and
        # binding them only inside the loop body left them undefined
        # whenever fg.active_domains was empty.
        reg_param = fg.regularization_param
        learning_rate = fg.learning_rate
        scale_reg = float(options.reg_param_ua_scale)

        sample_ag = {}
        for f_type, d in fg.active_domains:
            # Work on a copy so the shared gradient is still untouched when
            # it is regularized below.
            g = g_en_en.copy() if f_type == 'en_en' else g_en_de.copy()
            # NOTE(review): `domain2theta` is not a parameter of this
            # function (the parameter is `d2t`); it must resolve to a
            # module-level name. Confirm this is intended rather than `d2t`.
            domain_theta = domain2theta[f_type, d]
            # per-domain gradients use the scaled regularization term
            sample_ag[f_type, d] = apply_regularization(
                reg_param * scale_reg, g, learning_rate, domain_theta)

        g_en_en = apply_regularization(
            reg_param, g_en_en, learning_rate, fg.theta_en_en)
        g_en_de = apply_regularization(
            reg_param, g_en_de, learning_rate, fg.theta_en_de)
    else:
        sample_ag = None
        g_en_en, g_en_de = fg.return_gradient()
    fg.display_timing_info()
    p = fg.get_posterior_probs()

    return [sent_id, p, g_en_en, g_en_de, sample_ag]
Пример #7
0
def batch_predictions(training_instance, f_en_en_theta, f_en_de_theta,
                      adapt_phi_en_en, adapt_phi_en_de, lr, en_domain, de2id,
                      en2id, basic_f_en_en, basic_f_en_de, domain2theta):
    """Infer posterior probabilities for a single training instance.

    ``training_instance`` is a JSON string; it is decoded and turned into a
    ``TrainingInstance``.  The remaining arguments are passed through to
    ``create_factor_graph`` (``f_*_theta`` as ``theta_*``, ``adapt_phi_*``
    as ``phi_*``, ``lr`` as ``learning_rate``).  The resulting graph is
    initialised, ``treelike_inference(3)`` is run, and the value of
    ``get_posterior_probs()`` is returned.
    """
    decoded = json.loads(training_instance)
    ti = TrainingInstance.from_dict(decoded)
    # The sentence id is read here but nothing below uses it.
    sent_id = ti.current_sent[0].sent_id

    fg = create_factor_graph(
        ti=ti,
        learning_rate=lr,
        theta_en_en=f_en_en_theta,
        theta_en_de=f_en_de_theta,
        phi_en_en=adapt_phi_en_en,
        phi_en_de=adapt_phi_en_de,
        basic_f_en_en=basic_f_en_en,
        basic_f_en_de=basic_f_en_de,
        en_domain=en_domain,
        de2id=de2id,
        en2id=en2id,
        domain2theta=domain2theta,
    )
    fg.initialize()
    fg.treelike_inference(3)
    result = fg.get_posterior_probs()
    return result
Пример #8
0
def batch_sgd(training_instance, theta_en_en, theta_en_de, phi_en_en,
              phi_en_de, lr, en_domain, de2id, en2id, basic_f_en_en,
              basic_f_en_de, domain2theta):
    """Compute regularized gradients for one JSON-encoded training instance.

    A ``TrainingInstance`` is decoded from ``training_instance``, a factor
    graph is created for it with ``create_factor_graph``, initialised, and
    ``treelike_inference(3)`` is run before the unregularized ``en_en`` and
    ``en_de`` gradients are read back.

    Each ``(f_type, u)`` pair in ``fg.active_domains`` gets its own
    regularized copy of the matching gradient, computed against
    ``domain2theta[f_type, u]`` with the regularization parameter scaled by
    ``0.001``.  The shared gradients are regularized against the graph's own
    thetas with the unscaled parameter.

    Returns a list ``[sent_id, g_en_en, g_en_de, sample_ag]``; ``sample_ag``
    is a dict keyed by ``(f_type, u)`` and is empty when the graph has no
    active domains.
    """
    j_ti = json.loads(training_instance)
    ti = TrainingInstance.from_dict(j_ti)
    sent_id = ti.current_sent[0].sent_id
    fg = create_factor_graph(ti=ti,
                             learning_rate=lr,
                             theta_en_de=theta_en_de,
                             theta_en_en=theta_en_en,
                             phi_en_en=phi_en_en,
                             phi_en_de=phi_en_de,
                             basic_f_en_en=basic_f_en_en,
                             basic_f_en_de=basic_f_en_de,
                             en_domain=en_domain,
                             de2id=de2id,
                             en2id=en2id,
                             domain2theta=domain2theta)

    fg.initialize()
    fg.treelike_inference(3)
    g_en_en, g_en_de = fg.get_unregularized_gradeint()

    # These were previously bound only inside the loop body but used after
    # it, which raised UnboundLocalError when fg.active_domains was empty.
    reg_param = fg.regularization_param
    learning_rate = fg.learning_rate

    sample_ag = {}
    for f_type, u in fg.active_domains:
        # Copy first: the shared gradient is regularized separately below.
        g = g_en_en.copy() if f_type == 'en_en' else g_en_de.copy()
        domain_theta = domain2theta[f_type, u]
        # use a smaller regularization term for the per-domain gradients
        sample_ag[f_type, u] = apply_regularization(
            reg_param * 0.001, g, learning_rate, domain_theta)

    g_en_en = apply_regularization(
        reg_param, g_en_en, learning_rate, fg.theta_en_en)
    g_en_de = apply_regularization(
        reg_param, g_en_de, learning_rate, fg.theta_en_de)
    return [sent_id, g_en_en, g_en_de, sample_ag]
Пример #9
0
    # pre_fire_en_en = sparse.csr_matrix(pre_fire_en_en)

    f_en_de_theta = np.zeros((1, len(f_en_de)))
    phi_en_de = np.random.rand(len(en_domain) * len(de_domain), len(f_en_de))
    phi_en_de[phi_en_de > 0.5] = 1.0
    phi_en_de[phi_en_de < 0.2] = 0.0

    load_times = []
    grad_times = []
    inference_times = []
    mp_times = []
    fg = None
    split_ratio = int(len(training_instances) * 0.33)
    test_instances = training_instances[:split_ratio]
    all_training_instances = training_instances[split_ratio:]
    lr = 0.1
    for t_idx, training_instance in enumerate(training_instances):
        print t_idx
        j_ti = json.loads(training_instance)
        ti = TrainingInstance.from_dict(j_ti)
        lt = time.time()
        fg = FactorGraph(
            theta_en_en=f_en_en_theta if fg is None else fg.theta_en_en,
            theta_en_de=f_en_de_theta if fg is None else fg.theta_en_de,
            phi_en_en=phi_en_en,
            phi_en_de=phi_en_de)
        fg.learning_rate = lr

        fg = load_fg(fg, ti, en_domain, de2id=de2id, en2id=en2id)
    print 'done checking', len(training_instances)
Пример #10
0
    tag_list = list(set(to.split('\n')[0].split()))
    obs_list = list(set(to.split('\n')[1].split()))
    fac_cell_2feat = {}
    feat2id = {}
    for features_fired_in_factor in factors.strip().split('FACTOR:'):
        feature_lines = features_fired_in_factor.strip().split('\n')
        fac_type = feature_lines[0].strip()
        for fl in feature_lines[1:]:
            items = fl.split()
            label1, label2 = items[0], items[1]
            fac_cell_2feat[fac_type, label1, label2] = [(f_name, 1.0) for f_name in items[2:]]
            for f_fired in items[2:]:
                feat2id[f_fired] = feat2id.get(f_fired, len(feat2id))
    return tag_list, obs_list, fac_cell_2feat, feat2id


if __name__ == '__main__':
    # Debugging entry point: decode every line of the --ti file into a
    # TrainingInstance and stop in the debugger after each one.
    opt = OptionParser()
    # insert options here
    opt.add_option('--ti', dest='training_instances', default='')
    (options, _) = opt.parse_args()
    if options.training_instances == '':
        sys.stderr.write("Usage: jython macaronic-tagger.py --ti [training instances file]\n")
        # sys.exit does not rely on the `exit` helper injected by `site`.
        sys.exit(1)
    # The context manager closes the file even when decoding a line raises;
    # the handle was previously left open.
    with codecs.open(options.training_instances, 'r', 'utf8') as ti_file:
        for line in ti_file:
            jti = json.loads(line)
            ti = TrainingInstance.from_dict(jti)
            # Deliberate breakpoint for inspecting `ti` interactively.
            pdb.set_trace()