예제 #1
0
def apply_GenMod(L_train):
    """
    Applies generative model on label matrix
    :param L_train: Label matrix
    :return: None
    """
    gen_model = GenerativeModel()
    # gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)
    gen_model.train(L_train, cardinality=3)
    # print(gen_model.weights.lf_accuracy)
    train_marginals = gen_model.marginals(L_train)
    report.append('\n#Gen Model Stats\n')
    report.append(gen_model.learned_lf_stats().to_csv(sep=' ', index=False, header=True))
    save_marginals(session, L_train, train_marginals)
예제 #2
0
def score_gen_model(predicate_resume,
                    session,
                    gen_model_name=None,
                    parallelism=16):
    if gen_model_name is None:
        model_name = "G" + predicate_resume["predicate_name"] + "Latest"
    logging.info("Stats logging")
    key_group = predicate_resume["label_group"]
    train_cids_query = get_train_cids_with_span(predicate_resume, session)
    L_train = load_ltrain(predicate_resume, session)
    gen_model = GenerativeModel()
    gen_model.load(model_name)
    gen_model.train(L_train,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_train.shape[0],
                    reg_param=1e-6)
    logging.info(gen_model.weights.lf_accuracy)
    print(gen_model.weights.lf_accuracy)
    train_marginals = gen_model.marginals(L_train)
    fig = plt.figure()
    #hist=plt.hist(train_marginals, bins=20)
    #plt.savefig("plt"+strftime("%d-%m-%Y_%H_%M_%S", gmtime())+".png", dpi=fig.dpi)
    gen_model.learned_lf_stats()
예제 #3
0
def score_lfs(predicate_resume,
              L_gold_test,
              session,
              date_time,
              parallelism=8):
    dump_file_path = "./results/" + "lfs_1_" + predicate_resume[
        "predicate_name"] + date_time + ".csv"

    key_group = predicate_resume["label_group"]
    LFs = get_labelling_functions(predicate_resume)
    labeler = LabelAnnotator(lfs=LFs)
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    L_test = labeler.apply(parallelism=parallelism,
                           cids_query=test_cids_query,
                           key_group=key_group,
                           clear=True,
                           replace_key_set=False)

    data_frame = L_test.lf_stats(session)
    print(data_frame)
    logging.info(data_frame)
    data_frame.to_csv(dump_file_path)

    gen_model = GenerativeModel()
    gen_model.train(L_test,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_test.shape[0],
                    reg_param=1e-6)

    p, r, f1 = gen_model.score(L_test, L_gold_test)
    print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))
    logging.info("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(
        p, r, f1))
    dump_file_path1 = "./results/" + "test_gen_1_" + predicate_resume[
        "predicate_name"] + date_time + ".csv"
    with open(dump_file_path1, 'w+b') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Precision", "Recall", "F1"])
        writer.writerow(
            ["{0:.3f}".format(p), "{0:.3f}".format(r), "{0:.3f}".format(f1)])

    test_marginals = gen_model.marginals(L_test)

    dump_file_path2 = "./results/" + "plt_1_" + predicate_resume[
        "predicate_name"] + date_time + ".csv"
    #plt.hist(test_marginals, bins=20)
    #plt.savefig(dump_file_path2)
    #plt.show()

    dump_file_path3 = "./results/" + "gen_2_" + predicate_resume[
        "predicate_name"] + date_time + ".csv"
    data_frame3 = gen_model.learned_lf_stats()
    data_frame3.to_csv(dump_file_path3)

    dump_file_path4 = "./results/" + "gen_3_" + predicate_resume[
        "predicate_name"] + date_time + ".csv"
    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    with open(dump_file_path4, 'w+b') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["TP", "FP", "TN", "FN"])
        writer.writerow(
            [str(len(tp)),
             str(len(fp)),
             str(len(tn)),
             str(len(fn))])

    dump_file_path5 = "./results/" + "gen_4_" + predicate_resume[
        "predicate_name"] + date_time + ".csv"
    data_frame4 = L_test.lf_stats(session, L_gold_test,
                                  gen_model.learned_lf_stats()['Accuracy'])
    data_frame4.to_csv(dump_file_path5)
print(L_train.lf_stats(session))


# generative model, training_marginals are probabilistic training labels
gen_model = GenerativeModel()
gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)


print(gen_model.weights.lf_accuracy)

train_marginals = gen_model.marginals(L_train)

plt.hist(train_marginals, bins=20)
plt.show()

print(gen_model.learned_lf_stats())


#L_dev = labeler.apply_existing()

##### writing down labels


matchpath='/Users/marcel/Documents/RECHERCHE/STUDENTS/Willeme/sqlshare-cuts-snorkel-tmp.csv'
#matchpath='/Users/marcel/Documents/RECHERCHE/STUDENTS/Willeme/check_match.csv'
with open(matchpath, mode='w') as match_file:
    match_writer = csv.writer(match_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    match_writer.writerow(['idSession','idQuery','cut'])
    i=0
    for c in session.query(pairs):
        #print('i=',i)
# In[ ]:

from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()
get_ipython().magic(
    u'time gen_model.train(L_train, epochs=10, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6, threads=50, verbose=True)'
)

# In[ ]:

get_ipython().magic(u'time train_marginals = gen_model.marginals(L_train)')

# In[ ]:

gen_model.learned_lf_stats()

# In[ ]:

plt.hist(train_marginals, bins=20)
plt.title("Training Marginals for Gibbs Sampler")
plt.show()

# # Save Training Marginals

# Save the training marginals for [Notebook 4](4.data-disc-model).

# In[ ]:

get_ipython().magic(u'time save_marginals(session, L_train, train_marginals)')
예제 #6
0
for L in [L_train_BC, L_train_BD, L_train_BD, L_train_BD]:
    ds = DependencySelector()
    deps = ds.select(L, threshold=0.1)
    len(deps)
    Ldeps.append(deps)

gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train,
                deps=deps,
                decay=0.95,
                step_size=0.1 / L_train.shape[0],
                reg_param=0.0)
train_marginals = gen_model.marginals(L_train)
plt.hist(train_marginals, bins=20)
plt.show()
gen_model.learned_lf_stats()
save_marginals(session, L_train, train_marginals)
load_external_labels(session,
                     BiomarkerCondition,
                     'Biomarker',
                     'Condition',
                     'articles/disease_gold_labels.tsv',
                     dev_cands,
                     annotator_name='gold')

L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev
L_dev = labeler.apply_existing(split=1)
_ = gen_model.score(session, L_dev, L_gold_dev)
L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])