예제 #1
0
def load_ltrain(predicate_resume, session):
    """Load the label matrix of the training candidates for a predicate.

    A ``LabelAnnotator`` is built from the predicate's labelling functions
    and asked to ``load_matrix`` for the candidate ids returned by
    ``get_train_cids_with_span``, using the predicate's label group as
    ``key_group``.

    :param predicate_resume: mapping describing the predicate; the
        ``"label_group"`` key is read here.
    :param session: session passed to the candidate-id query helper and to
        ``load_matrix``.
    :return: whatever ``LabelAnnotator.load_matrix`` returns.
    """
    label_group = predicate_resume["label_group"]
    annotator = LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
    train_ids = get_train_cids_with_span(predicate_resume, session)
    return annotator.load_matrix(session,
                                 cids_query=train_ids,
                                 key_group=label_group)
def get_L_train(LFs, parallelism=2, split=0):
    """Apply the labelling functions to one split and return the result.

    Seeds NumPy's global random generator with 1701, runs
    ``LabelAnnotator.apply`` for the requested split, and prints the current
    time before and after that call followed by the type and shape of the
    returned matrix.

    :param LFs: labelling functions handed to ``LabelAnnotator``.
    :param parallelism: accepted but not forwarded to ``apply`` by this
        function.
    :param split: split index forwarded to ``apply``.
    :return: the matrix returned by ``LabelAnnotator.apply``.
    """
    np.random.seed(1701)
    annotator = LabelAnnotator(lfs=LFs)

    # Timestamps bracket the (potentially slow) labelling run.
    print(datetime.datetime.now())
    label_matrix = annotator.apply(split=split)
    print(datetime.datetime.now())

    print(type(label_matrix))
    print(label_matrix.shape)
    return label_matrix
예제 #3
0
def apply_LF(lf_file):
    """
    Load labeling functions and apply them to the candidates of split 0.

    Every function object found in the imported module is used as a labeling
    function. The labeling-function statistics are appended to the
    module-level ``report`` object as a space-separated table.

    :param lf_file: importable name of the python module that holds the
        labeling functions (dotted names such as ``pkg.lfs`` are supported)
    :return: L_train, the matrix returned by ``LabelAnnotator.apply``
    """
    # Local import keeps this block independent of the file-level imports.
    import importlib

    # ``__import__("pkg.mod")`` hands back the top-level package ``pkg``;
    # ``import_module`` returns the module that was actually named.
    labeling_module = importlib.import_module(lf_file)
    # getmembers takes the predicate directly and yields (name, object) pairs.
    LF_list = [func for _, func in getmembers(labeling_module, isfunction)]

    labeler = LabelAnnotator(lfs=LF_list)
    np.random.seed(1701)
    L_train = labeler.apply(split=0)

    report.append('\n#LF Stats\n')
    report.append(
        L_train.lf_stats(session).to_csv(sep=' ', index=False, header=True))
    return L_train
예제 #4
0
def predicate_candidate_labelling(predicate_resume,
                                  parallelism=1,
                                  limit=None,
                                  replace_key_set=False):
    """Apply a predicate's labelling functions to its split-0 candidates.

    Queries the ids of all ``predicate_resume["candidate_subclass"]`` rows
    whose ``split`` equals 0, applies the predicate's labelling functions to
    them with the predicate's label group as ``key_group``, then prints and
    logs the labelling-function statistics.

    :param predicate_resume: mapping providing ``"candidate_subclass"`` and
        ``"label_group"``.
    :param parallelism: forwarded to ``LabelAnnotator.apply``.
    :param limit: currently unused; no cap is applied to the candidate ids.
    :param replace_key_set: currently unused; ``apply`` is always called with
        ``clear=True`` and ``replace_key_set=True``.
    :return: None
    """
    logging.info("Starting labeling ")
    session = SnorkelSession()
    try:
        candidate_subclass = predicate_resume["candidate_subclass"]
        key_group = predicate_resume["label_group"]

        cids_query = session.query(
            candidate_subclass.id).filter(candidate_subclass.split == 0)

        LFs = get_labelling_functions(predicate_resume)
        labeler = LabelAnnotator(lfs=LFs)
        np.random.seed(1701)

        L_train = labeler.apply(parallelism=parallelism,
                                cids_query=cids_query,
                                key_group=key_group,
                                clear=True,
                                replace_key_set=True)

        # Query the statistics once and reuse them for both outputs.
        lf_stats = L_train.lf_stats(session)
        print(lf_stats)
        logging.info(lf_stats)
    finally:
        logging.info("Finished labeling ")
        # The session is created here and never handed to the caller, so it
        # is released here as well.
        session.close()
예제 #5
0
def _get_labeler(predicate_resume):
    """Build a ``LabelAnnotator`` from the predicate's labelling functions.

    :param predicate_resume: predicate description passed on to
        ``get_labelling_functions``.
    :return: the newly created ``LabelAnnotator``.
    """
    return LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
예제 #6
0
def _lfs_result_path(prefix, predicate_resume, date_time):
    """Return ``./results/<prefix><predicate_name><date_time>.csv``."""
    return ("./results/" + prefix + predicate_resume["predicate_name"] +
            date_time + ".csv")


def _open_csv_for_writing(path):
    """Open ``path`` in the write mode the running ``csv`` module expects."""
    import sys

    if sys.version_info[0] >= 3:
        # Python 3's csv writer emits str and wants newline handling disabled.
        return open(path, 'w', newline='')
    # Python 2's csv writer emits bytes.
    return open(path, 'wb')


def score_lfs(predicate_resume,
              L_gold_test,
              session,
              date_time,
              parallelism=8):
    """Score the predicate's labelling functions on the test candidates.

    Applies the labelling functions to the candidates returned by
    ``get_test_cids_with_span``, trains a ``GenerativeModel`` on the resulting
    matrix, scores it against ``L_gold_test`` and writes these files below
    ``./results/`` (each name ends in ``<predicate_name><date_time>.csv``):

    * ``lfs_1_``      - ``L_test.lf_stats(session)``
    * ``test_gen_1_`` - precision, recall and F1 of the generative model
    * ``gen_2_``      - ``gen_model.learned_lf_stats()``
    * ``gen_3_``      - sizes of the TP/FP/TN/FN groups from error analysis
    * ``gen_4_``      - ``lf_stats`` computed with the gold labels and the
      learned ``'Accuracy'`` column

    :param predicate_resume: mapping providing ``"label_group"`` and
        ``"predicate_name"``.
    :param L_gold_test: gold labels passed to ``score``, ``error_analysis``
        and ``lf_stats``.
    :param session: session used for the candidate query and the statistics.
    :param date_time: string appended to every output file name.
    :param parallelism: forwarded to ``LabelAnnotator.apply``.
    :return: None
    """
    lf_stats_path = _lfs_result_path("lfs_1_", predicate_resume, date_time)

    key_group = predicate_resume["label_group"]
    LFs = get_labelling_functions(predicate_resume)
    labeler = LabelAnnotator(lfs=LFs)
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    L_test = labeler.apply(parallelism=parallelism,
                           cids_query=test_cids_query,
                           key_group=key_group,
                           clear=True,
                           replace_key_set=False)

    data_frame = L_test.lf_stats(session)
    print(data_frame)
    logging.info(data_frame)
    data_frame.to_csv(lf_stats_path)

    gen_model = GenerativeModel()
    # The step size is divided by the number of rows of L_test.
    gen_model.train(L_test,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_test.shape[0],
                    reg_param=1e-6)

    p, r, f1 = gen_model.score(L_test, L_gold_test)
    # Format once; the same text goes to stdout and to the log.
    score_message = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(
        p, r, f1)
    print(score_message)
    logging.info(score_message)

    score_path = _lfs_result_path("test_gen_1_", predicate_resume, date_time)
    with _open_csv_for_writing(score_path) as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Precision", "Recall", "F1"])
        writer.writerow(["{0:.3f}".format(value) for value in (p, r, f1)])

    test_marginals = gen_model.marginals(L_test)

    # Histogram output of the marginals is currently disabled:
    # plt.hist(test_marginals, bins=20)
    # plt.savefig(_lfs_result_path("plt_1_", predicate_resume, date_time))
    # plt.show()

    learned_stats_path = _lfs_result_path("gen_2_", predicate_resume,
                                          date_time)
    data_frame3 = gen_model.learned_lf_stats()
    data_frame3.to_csv(learned_stats_path)

    error_counts_path = _lfs_result_path("gen_3_", predicate_resume,
                                         date_time)
    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    with _open_csv_for_writing(error_counts_path) as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["TP", "FP", "TN", "FN"])
        writer.writerow([str(len(group)) for group in (tp, fp, tn, fn)])

    gold_stats_path = _lfs_result_path("gen_4_", predicate_resume, date_time)
    data_frame4 = L_test.lf_stats(session, L_gold_test,
                                  gen_model.learned_lf_stats()['Accuracy'])
    data_frame4.to_csv(gold_stats_path)
예제 #7
0

def LF_distant_supervision(c):
    """Return 1 if the candidate's (virus, host) span pair is in
    ``known_pairs``, otherwise 0.

    :param c: candidate exposing ``virus`` and ``host`` members that each
        provide ``get_span()``.
    """
    span_pair = (c.virus.get_span(), c.host.get_span())
    if span_pair in known_pairs:
        return 1
    return 0


# All labeling functions that are handed to the annotator below.
LFs = [
    LF_detect, LF_infect, LF_isolate, LF_positive, LF_positive2, LF_misc,
    LF_v_cause_h, LF_v_h, LF_h_v, LF_other_verbs, LF_far_v_h, LF_far_h_v,
    LF_neg_h, LF_neg_assertions, LF_distant_supervision
]

# Set up the label annotator with the full LF list.
labeler = LabelAnnotator(lfs=LFs)

# -------------------------------------------

# START OF THE CROSS-VALIDATION SPLIT (performed in a loop):

# Build an array of document indexes; its length (88) should equal the number
# of documents. Inside the loop the index array is split into train, test and
# dev arrays; the sentences are added to the corresponding sets and the
# candidates are extracted.

index_array = np.arange(0, 88)

# Accumulators for the ROC analysis; mean_fpr holds 100 evenly spaced values
# from 0 to 1.
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

# for recording prec, rec, f1 scores
예제 #8
0
from utils.label_functions.compound_disease_lf import CD_LFS
#from utils.gene_gene_lf import GG_LFS


# # Label The Candidates

# Label each candidate based on the label functions provided above. This code runs with relative ease, but optimization is definitely needed as the number of label functions grows.

# In[ ]:


# Concatenate the label functions of the three groups into one list. The
# expression is parenthesised so that it may span several lines; a line that
# simply ends in ``+`` is a syntax error.
label_functions = (
    list(CG_LFS["CbG_DB"].values()) +
    list(CG_LFS["CbG_TEXT"].values()) +
    list(DG_LFS["DaG_TEXT"].values())
)

labeler = LabelAnnotator(lfs=label_functions)


# # Quickly Relabel Candidates

# Use this block to re-label candidates that have already been labeled by the process above.

# In[ ]:


# Read the training spreadsheet and collect its candidate ids as Python ints.
train_df = pd.read_excel(spreadsheet_names['train'])
train_cids = train_df.candidate_id.astype(int).tolist()
# Notebook preview of the first two rows; the value is discarded when this
# runs as a plain script.
train_df.head(2)


# In[ ]:
예제 #9
0
    tweet = Tweet(tweet=raw_text, split=split)
    session.add(tweet)

# Commit everything that has been added to the session so far.
session.commit()

print("Commit to snorkel database done...")


#writing label generator
def worker_label_generator(t):
    """Yield ``(worker_id, value)`` pairs stored for the tweet behind ``t``.

    The pairs are read from ``cand_dict[t.tweet.stable_id]`` and produced in
    the iteration order of that container.
    """
    tweet_key = t.tweet.stable_id
    for worker_id in cand_dict[tweet_key]:
        yield worker_id, cand_dict[tweet_key][worker_id]


# Fixed seed for NumPy's global random generator.
np.random.seed(1701)
# The annotator draws its labels from worker_label_generator instead of a
# list of labeling functions.
labeler = LabelAnnotator(label_generator=worker_label_generator)
L_train = labeler.apply(split=0)

print(L_train.lf_stats(session))

print("Creat training data done...")
print(" -train data shape", (L_train.shape))

print("Start to train a generative model")
gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, reg_type=2, reg_param=0.1, epochs=30)

# Print the statistics reported by the trained generative model.
print(gen_model.learned_lf_stats())

print("Train a genetive model done...!")
예제 #10
0
    LF_common_1000, LF_common_2000
]
# Labeling-function lists for the BD, BM and BT candidate types; each list
# feeds the annotator with the matching suffix below.
LFs_BD = [
    LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000,
    LF_same_thing_BD
]
LFs_BM = [
    LF_distance_far, LF_colon, LF_known_abs, LF_single_letter,
    LF_roman_numeral, LF_common_2000, LF_same_thing
]
LFs_BT = [
    LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000,
    LF_same_thing
]

# One annotator per candidate type.
labeler_BC = LabelAnnotator(lfs=LFs_BC)
labeler_BD = LabelAnnotator(lfs=LFs_BD)
labeler_BM = LabelAnnotator(lfs=LFs_BM)
labeler_BT = LabelAnnotator(lfs=LFs_BT)

# Training: apply every annotator to split 0.
L_train_BC = labeler_BC.apply(split=0)
L_train_BD = labeler_BD.apply(split=0)
L_train_BM = labeler_BM.apply(split=0)
L_train_BT = labeler_BT.apply(split=0)
# Bare names: they only display a value in an interactive session and are
# discarded when run as a script.
L_train_BC
L_train_BD
L_train_BM
L_train_BT

# Labeling Function Performance - Coverage, Overlaps, Conflicts
    BiomarkerCondition).filter(BiomarkerCondition.split == 1).count()
session.commit()

# In[ ]:

from LF import *
# Labeling functions used for the BC annotator.
LFs_BC = [
    LF_markerDatabase, LF_keyword, LF_distance, LF_abstract_titleWord,
    LF_single_letter, LF_auxpass, LF_known_abs, LF_same_thing_BC,
    LF_common_1000, LF_common_2000
]

# In[ ]:

from snorkel.annotations import LabelAnnotator
# Annotator built from the BC labeling functions.
BC_labeler = LabelAnnotator(lfs=LFs_BC)

# In[ ]:

# Apply the labeling functions to split 0 through IPython's ``time`` magic;
# the assignment to L_train_BC happens inside the magic string.
np.random.seed(1701)
get_ipython().magic(u'time L_train_BC = BC_labeler.apply(split=0)')
# Notebook display of the result; discarded outside an interactive session.
L_train_BC

# In[ ]:

# Rebind L_train_BC to the matrix returned by load_matrix for split 0.
get_ipython().magic(
    u'time L_train_BC = BC_labeler.load_matrix(session, split=0)')
L_train_BC

# In[ ]:
예제 #12
0
#candidates = [session.query(DiseaseGene).filter(DiseaseGene.id == ids).one() for ids in [19817,19818,19830,19862,19980,20001,20004]]

# Debug dump: only candidates whose first element has the parent id 14264 are
# printed. Single-argument print(...) calls produce the same output on
# Python 2 and Python 3.
for c in candidates:
    if c[0].get_parent().id != 14264:
        continue
    print(c)
    print(get_tagged_text(c))
    print(c[1].sentence.entity_cids[c[1].get_word_start()])

# # Label The Candidates

# This block of code will run through the label functions and label each candidate in the training and development groups.

# In[ ]:

# Annotator built from the labeling functions in LFs.
labeler = LabelAnnotator(f=LFs)

# ``apply`` is used for split 0 and ``apply_existing`` for splits 1 and 2;
# each call is timed with IPython's ``time`` magic and the assignments happen
# inside the magic strings.
get_ipython().magic(u'time L_train = labeler.apply(split=0)')
get_ipython().magic(u'time L_dev = labeler.apply_existing(split=1)')
get_ipython().magic(u'time L_test = labeler.apply_existing(split=2)')

# In[ ]:

featurizer = FeatureAnnotator()

# Same pattern for the features: ``apply`` on split 0, ``apply_existing`` on
# splits 1 and 2.
get_ipython().magic(u'time F_train = featurizer.apply(split=0)')
get_ipython().magic(u'time F_dev = featurizer.apply_existing(split=1)')
get_ipython().magic(u'time F_test = featurizer.apply_existing(split=2)')

# # Generate Coverage Stats
예제 #13
0
    GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])
elif edge_type == "cg":
    CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])
elif edge_type == "cd":
    CompoundDisease = candidate_subclass('CompoundDisease',
                                         ['Compound', 'Disease'])
else:
    print("Please pick a valid edge type")

# # Load the data

# Here is where we load the test dataset in conjunction with the previously trained disc models. Each algorithm will output a probability of a candidate being a true candidate.

# In[6]:

# The annotator is created with an empty LF list; in the lines below it is
# only used to call load_matrix.
labeler = LabelAnnotator(lfs=[])

# In[7]:

# Load the matrix for split 2 inside a timed IPython cell; L_test is assigned
# by the cell body.
get_ipython().run_cell_magic(u'time', u'',
                             u'L_test = labeler.load_matrix(session,split=2)')

# In[8]:

# Notebook display of the matrix shape; discarded outside an interactive
# session.
L_test.shape

# In[9]:

marginal_files = [
    "stratified_data/lstm_disease_gene_holdout/LR_data/LR_test_marginals.csv",
    "stratified_data/lstm_disease_gene_holdout/lstm_one_test_marginals.csv",
예제 #14
0
# In[ ]:

# Build the SQLite URL from the WORKINGPATH environment variable (a KeyError
# is raised if it is not set) and export it as SNORKELDB.
# NOTE(review): presumably Snorkel reads SNORKELDB to locate its database;
# confirm that it is read after this assignment.
database_str = "sqlite:///" + os.environ['WORKINGPATH'] + "/Database/epilepsy.db"
os.environ['SNORKELDB'] = database_str


session = SnorkelSession()


# # Load preprocessed data 

# To save time, this code will automatically load our labels that were generated in the previous file.

# In[ ]:

# No labeling functions are supplied (f=None); the annotator is only used to
# call load_matrix for splits 0, 1 and 2.
labeler = LabelAnnotator(f=None)

L_train = labeler.load_matrix(session,split=0)
L_dev = labeler.load_matrix(session,split=1)
L_test = labeler.load_matrix(session,split=2)


# In[ ]:

# Report the shape of each loaded label matrix. Single-argument print(...)
# calls give the same output on Python 2 and Python 3.
print("Total Data Shape:")
print(L_train.shape)
print(L_dev.shape)
print(L_test.shape)
# print("") emits a blank line on both interpreters.
print("")

print("The number of positive candiadtes (in KB) for each division:")