def apply_GenMod(L_train):
    """
    Applies generative model on label matrix
    :param L_train: Label matrix
    :return: None
    """
    gen_model = GenerativeModel()
    # gen_model.train(L_train, epochs=100, decay=0.95,
    #                 step_size=0.1 / L_train.shape[0], reg_param=1e-6)
    gen_model.train(L_train, cardinality=3)
    # print(gen_model.weights.lf_accuracy)
    train_marginals = gen_model.marginals(L_train)
    report.append('\n#Gen Model Stats\n')
    report.append(gen_model.learned_lf_stats().to_csv(sep=' ', index=False, header=True))
    save_marginals(session, L_train, train_marginals)
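# A minimal usage sketch for apply_GenMod (assumptions: the training label matrix has
# already been built and can be loaded with the same load_ltrain helper used in the
# next snippet, and the module-level `report` and `session` objects are in place).
L_train = load_ltrain(predicate_resume, session)
apply_GenMod(L_train)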
def score_gen_model(predicate_resume, session, gen_model_name=None, parallelism=16):
    if gen_model_name is None:
        model_name = "G" + predicate_resume["predicate_name"] + "Latest"
    else:
        # use the explicitly provided model name (otherwise model_name would be undefined)
        model_name = gen_model_name

    logging.info("Stats logging")
    key_group = predicate_resume["label_group"]
    train_cids_query = get_train_cids_with_span(predicate_resume, session)
    L_train = load_ltrain(predicate_resume, session)

    # Load the previously saved generative model, then re-train it on L_train
    gen_model = GenerativeModel()
    gen_model.load(model_name)
    gen_model.train(L_train, epochs=100, decay=0.95,
                    step_size=0.1 / L_train.shape[0], reg_param=1e-6)

    logging.info(gen_model.weights.lf_accuracy)
    print(gen_model.weights.lf_accuracy)

    train_marginals = gen_model.marginals(L_train)
    fig = plt.figure()
    #hist=plt.hist(train_marginals, bins=20)
    #plt.savefig("plt"+strftime("%d-%m-%Y_%H_%M_%S", gmtime())+".png", dpi=fig.dpi)
    gen_model.learned_lf_stats()
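# Hedged sketch of the training step that score_gen_model presupposes: a model trained
# and persisted under the "G<predicate_name>Latest" naming convention used above.
# GenerativeModel.save() is assumed to mirror the .load() call in the function above;
# load_ltrain is the same project helper used there.
def train_and_save_gen_model(predicate_resume, session):
    L_train = load_ltrain(predicate_resume, session)
    gen_model = GenerativeModel()
    gen_model.train(L_train, epochs=100, decay=0.95,
                    step_size=0.1 / L_train.shape[0], reg_param=1e-6)
    gen_model.save("G" + predicate_resume["predicate_name"] + "Latest")
    return gen_model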
def score_lfs(predicate_resume, L_gold_test, session, date_time, parallelism=8):
    dump_file_path = "./results/" + "lfs_1_" + predicate_resume["predicate_name"] + date_time + ".csv"

    key_group = predicate_resume["label_group"]
    LFs = get_labelling_functions(predicate_resume)
    labeler = LabelAnnotator(lfs=LFs)
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    L_test = labeler.apply(parallelism=parallelism, cids_query=test_cids_query,
                           key_group=key_group, clear=True, replace_key_set=False)

    data_frame = L_test.lf_stats(session)
    print(data_frame)
    logging.info(data_frame)
    data_frame.to_csv(dump_file_path)

    gen_model = GenerativeModel()
    gen_model.train(L_test, epochs=100, decay=0.95,
                    step_size=0.1 / L_test.shape[0], reg_param=1e-6)

    p, r, f1 = gen_model.score(L_test, L_gold_test)
    print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))
    logging.info("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))

    dump_file_path1 = "./results/" + "test_gen_1_" + predicate_resume["predicate_name"] + date_time + ".csv"
    with open(dump_file_path1, 'w+b') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Precision", "Recall", "F1"])
        writer.writerow(["{0:.3f}".format(p), "{0:.3f}".format(r), "{0:.3f}".format(f1)])

    test_marginals = gen_model.marginals(L_test)

    dump_file_path2 = "./results/" + "plt_1_" + predicate_resume["predicate_name"] + date_time + ".csv"
    #plt.hist(test_marginals, bins=20)
    #plt.savefig(dump_file_path2)
    #plt.show()

    dump_file_path3 = "./results/" + "gen_2_" + predicate_resume["predicate_name"] + date_time + ".csv"
    data_frame3 = gen_model.learned_lf_stats()
    data_frame3.to_csv(dump_file_path3)

    dump_file_path4 = "./results/" + "gen_3_" + predicate_resume["predicate_name"] + date_time + ".csv"
    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    with open(dump_file_path4, 'w+b') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["TP", "FP", "TN", "FN"])
        writer.writerow([str(len(tp)), str(len(fp)), str(len(tn)), str(len(fn))])

    dump_file_path5 = "./results/" + "gen_4_" + predicate_resume["predicate_name"] + date_time + ".csv"
    data_frame4 = L_test.lf_stats(session, L_gold_test, gen_model.learned_lf_stats()['Accuracy'])
    data_frame4.to_csv(dump_file_path5)
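# Hedged example of invoking score_lfs: the gold label matrix is loaded with
# load_gold_labels (the same helper used in the last snippet) and the timestamp
# format is copied from the commented plot filename above. The annotator name and
# split index are assumptions that depend on how the gold labels were ingested.
from time import gmtime, strftime
from snorkel.annotations import load_gold_labels

L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
date_time = strftime("%d-%m-%Y_%H_%M_%S", gmtime())
score_lfs(predicate_resume, L_gold_test, session, date_time, parallelism=8)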
print(L_train.lf_stats(session))

# generative model, training marginals are probabilistic training labels
gen_model = GenerativeModel()
gen_model.train(L_train, epochs=100, decay=0.95,
                step_size=0.1 / L_train.shape[0], reg_param=1e-6)
print(gen_model.weights.lf_accuracy)

train_marginals = gen_model.marginals(L_train)
plt.hist(train_marginals, bins=20)
plt.show()
print(gen_model.learned_lf_stats())

#L_dev = labeler.apply_existing()

##### writing down labels
matchpath = '/Users/marcel/Documents/RECHERCHE/STUDENTS/Willeme/sqlshare-cuts-snorkel-tmp.csv'
#matchpath = '/Users/marcel/Documents/RECHERCHE/STUDENTS/Willeme/check_match.csv'
with open(matchpath, mode='w') as match_file:
    match_writer = csv.writer(match_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    match_writer.writerow(['idSession', 'idQuery', 'cut'])
    i = 0
    for c in session.query(pairs):
        #print('i=',i)
# In[ ]:

from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()
get_ipython().magic(
    u'time gen_model.train(L_train, epochs=10, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6, threads=50, verbose=True)'
)


# In[ ]:

get_ipython().magic(u'time train_marginals = gen_model.marginals(L_train)')


# In[ ]:

gen_model.learned_lf_stats()


# In[ ]:

plt.hist(train_marginals, bins=20)
plt.title("Training Marginals for Gibbs Sampler")
plt.show()


# # Save Training Marginals
# Save the training marginals for [Notebook 4](4.data-disc-model).

# In[ ]:

get_ipython().magic(u'time save_marginals(session, L_train, train_marginals)')
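# In[ ]:

# Hedged sketch of the first step in Notebook 4: loading the saved training marginals
# back from the database (load_marginals lives in snorkel.annotations; the split index
# is an assumption).
from snorkel.annotations import load_marginals
train_marginals = load_marginals(session, split=0)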
# Select labelling-function dependencies for each label matrix
Ldeps = []
for L in [L_train_BC, L_train_BD, L_train_BD, L_train_BD]:
    ds = DependencySelector()
    deps = ds.select(L, threshold=0.1)
    print(len(deps))
    Ldeps.append(deps)

gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, deps=deps, decay=0.95,
                step_size=0.1 / L_train.shape[0], reg_param=0.0)

train_marginals = gen_model.marginals(L_train)
plt.hist(train_marginals, bins=20)
plt.show()
gen_model.learned_lf_stats()

save_marginals(session, L_train, train_marginals)

load_external_labels(session, BiomarkerCondition, 'Biomarker', 'Condition',
                     'articles/disease_gold_labels.tsv', dev_cands, annotator_name='gold')
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

L_dev = labeler.apply_existing(split=1)
_ = gen_model.score(session, L_dev, L_gold_dev)

L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])
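# The loop above selects dependencies for every label matrix, yet the model is trained
# on L_train only. A hedged sketch of fitting one generative model per matrix instead,
# reusing the hyperparameters from the snippet above; collecting the models in a list
# is an illustrative choice, not part of the original code.
gen_models = []
for L, deps in zip([L_train_BC, L_train_BD, L_train_BD, L_train_BD], Ldeps):
    m = GenerativeModel(lf_propensity=True)
    m.train(L, deps=deps, decay=0.95, step_size=0.1 / L.shape[0], reg_param=0.0)
    gen_models.append(m)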