예제 #1
0
def load_ltrain(predicate_resume, session):
    """Load the stored training label matrix for a predicate.

    Builds a ``LabelAnnotator`` from the predicate's labelling functions and
    calls ``load_matrix`` restricted to the candidate ids returned by
    ``get_train_cids_with_span`` and to the predicate's label key group.

    :param predicate_resume: mapping describing the predicate; must contain
        the ``"label_group"`` key
    :param session: database session forwarded to the query helper and to
        ``load_matrix``
    :return: the matrix returned by ``labeler.load_matrix``
    """
    label_group = predicate_resume["label_group"]
    annotator = LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
    cids = get_train_cids_with_span(predicate_resume, session)
    return annotator.load_matrix(session,
                                 cids_query=cids,
                                 key_group=label_group)
def get_L_train(LFs, parallelism=2, split=0):
    """Apply the labelling functions to one split and return the label matrix.

    Seeds NumPy's global RNG with 1701, runs ``LabelAnnotator.apply`` on
    ``split`` and prints the current time before and after the run, followed
    by the type and shape of the result.

    :param LFs: labelling functions handed to ``LabelAnnotator``
    :param parallelism: currently unused; it is not forwarded to ``apply``
    :param split: split passed to ``labeler.apply``
    :return: the matrix returned by ``labeler.apply``
    """
    np.random.seed(1701)
    annotator = LabelAnnotator(lfs=LFs)

    print(datetime.datetime.now())
    label_matrix = annotator.apply(split=split)
    print(datetime.datetime.now())

    print(type(label_matrix))
    print(label_matrix.shape)
    return label_matrix
예제 #3
0
def apply_LF(lf_file):
    """
    Load labeling functions from a module and apply them to the train split.

    Every plain function found in the module named ``lf_file`` is treated as
    a labeling function. The LF statistics of the resulting matrix are
    appended to the module-level ``report`` list as space-separated CSV text.

    :param lf_file: importable module name holding the labeling functions
        (dotted names such as ``pkg.lfs`` resolve to the leaf module)
    :return: L_train, the matrix returned by ``labeler.apply(split=0)``
    """
    # Local import keeps the block self-contained.
    import importlib

    # import_module returns the leaf module for dotted names, whereas
    # __import__("pkg.lfs") would hand back the top-level package "pkg".
    labeling_func = importlib.import_module(lf_file)
    LF_list = [member for _, member in getmembers(labeling_func, isfunction)]
    labeler = LabelAnnotator(lfs=LF_list)
    np.random.seed(1701)
    L_train = labeler.apply(split=0)
    # Stats are computed directly on the matrix returned by apply(); no dense
    # copy of it is materialised.
    report.append('\n#LF Stats\n')
    report.append(
        L_train.lf_stats(session).to_csv(sep=' ', index=False, header=True))
    return L_train
예제 #4
0
def predicate_candidate_labelling(predicate_resume,
                                  parallelism=1,
                                  limit=None,
                                  replace_key_set=False):
    """Apply the predicate's labelling functions to its split-0 candidates.

    Queries the ids of every ``predicate_resume["candidate_subclass"]`` row
    whose ``split`` is 0, runs ``LabelAnnotator.apply`` on them with the
    predicate's label group as ``key_group``, then prints and logs the LF
    statistics of the resulting matrix.

    :param predicate_resume: mapping with at least the
        ``"candidate_subclass"`` and ``"label_group"`` keys
    :param parallelism: forwarded to ``LabelAnnotator.apply``
    :param limit: currently unused
    :param replace_key_set: currently unused; ``apply`` is always called
        with ``replace_key_set=True``
    :return: None
    """
    logging.info("Starting labeling ")
    session = SnorkelSession()
    try:
        candidate_subclass = predicate_resume["candidate_subclass"]
        key_group = predicate_resume["label_group"]

        cids_query = session.query(
            candidate_subclass.id).filter(candidate_subclass.split == 0)

        LFs = get_labelling_functions(predicate_resume)

        labeler = LabelAnnotator(lfs=LFs)
        np.random.seed(1701)

        L_train = labeler.apply(parallelism=parallelism,
                                cids_query=cids_query,
                                key_group=key_group,
                                clear=True,
                                replace_key_set=True)

        # Compute the statistics once and reuse them for both outputs.
        lf_stats = L_train.lf_stats(session)
        print(lf_stats)
        logging.info(lf_stats)

    finally:
        logging.info("Finished labeling ")
        # Release the session opened above, even when labeling failed.
        session.close()
예제 #5
0
def _result_csv_path(prefix, predicate_name, date_time):
    """Return ``./results/<prefix><predicate_name><date_time>.csv``."""
    return "./results/" + prefix + predicate_name + date_time + ".csv"


def _write_csv_rows(path, rows):
    """Write ``rows`` (an iterable of row lists) to ``path`` as CSV."""
    import sys

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; a binary handle there raises TypeError.
    if sys.version_info[0] < 3:
        handle = open(path, 'wb')
    else:
        handle = open(path, 'w', newline='')
    with handle as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(rows)


def score_lfs(predicate_resume,
              L_gold_test,
              session,
              date_time,
              parallelism=8):
    """Label the test candidates, fit a generative model and dump reports.

    Steps, in order:

    1. Apply the predicate's labelling functions to the candidates returned
       by ``get_test_cids_with_span`` and write the LF statistics to
       ``lfs_1_*.csv``.
    2. Train a ``GenerativeModel`` on that same matrix (``L_test``) and score
       it against ``L_gold_test``; precision/recall/F1 are printed, logged
       and written to ``test_gen_1_*.csv``.
    3. Write the learned LF statistics to ``gen_2_*.csv``.
    4. Write the TP/FP/TN/FN counts from ``error_analysis`` to
       ``gen_3_*.csv``.
    5. Write the LF statistics computed against the gold labels and the
       learned accuracies to ``gen_4_*.csv``.

    All files go to ``./results/``; the directory is not created here.

    :param predicate_resume: mapping with at least the ``"predicate_name"``
        and ``"label_group"`` keys
    :param L_gold_test: gold labels passed to ``score``, ``error_analysis``
        and ``lf_stats``
    :param session: database session used for labelling and statistics
    :param date_time: string appended to every output file name
    :param parallelism: forwarded to ``LabelAnnotator.apply``
    :return: None
    """
    # Looked up first so a missing key fails before any labelling work.
    predicate_name = predicate_resume["predicate_name"]
    lf_stats_path = _result_csv_path("lfs_1_", predicate_name, date_time)

    key_group = predicate_resume["label_group"]
    LFs = get_labelling_functions(predicate_resume)
    labeler = LabelAnnotator(lfs=LFs)
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    L_test = labeler.apply(parallelism=parallelism,
                           cids_query=test_cids_query,
                           key_group=key_group,
                           clear=True,
                           replace_key_set=False)

    data_frame = L_test.lf_stats(session)
    print(data_frame)
    logging.info(data_frame)
    data_frame.to_csv(lf_stats_path)

    gen_model = GenerativeModel()
    # The step size is scaled by the number of rows in L_test.
    gen_model.train(L_test,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_test.shape[0],
                    reg_param=1e-6)

    p, r, f1 = gen_model.score(L_test, L_gold_test)
    score_line = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(
        p, r, f1)
    print(score_line)
    logging.info(score_line)
    _write_csv_rows(
        _result_csv_path("test_gen_1_", predicate_name, date_time),
        [["Precision", "Recall", "F1"],
         ["{0:.3f}".format(p), "{0:.3f}".format(r), "{0:.3f}".format(f1)]])

    # Evaluated once and reused so that gen_2 and the accuracies fed into
    # gen_4 are based on the same table.
    learned_stats = gen_model.learned_lf_stats()
    learned_stats.to_csv(
        _result_csv_path("gen_2_", predicate_name, date_time))

    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    _write_csv_rows(
        _result_csv_path("gen_3_", predicate_name, date_time),
        [["TP", "FP", "TN", "FN"],
         [str(len(tp)), str(len(fp)), str(len(tn)), str(len(fn))]])

    data_frame4 = L_test.lf_stats(session, L_gold_test,
                                  learned_stats['Accuracy'])
    data_frame4.to_csv(
        _result_csv_path("gen_4_", predicate_name, date_time))
예제 #6
0

def LF_distant_supervision(c):
    """Return 1 when the (virus span, host span) pair is in ``known_pairs``.

    Returns 0 for every other candidate.
    """
    span_pair = (c.virus.get_span(), c.host.get_span())
    if span_pair in known_pairs:
        return 1
    return 0


# Every labeling function that the annotator below is built from.
LFs = [
    LF_detect, LF_infect, LF_isolate, LF_positive, LF_positive2, LF_misc,
    LF_v_cause_h, LF_v_h, LF_h_v, LF_other_verbs, LF_far_v_h, LF_far_h_v,
    LF_neg_h, LF_neg_assertions, LF_distant_supervision
]

# Set up the label annotator with the LF list above.
labeler = LabelAnnotator(lfs=LFs)

# -------------------------------------------

# START CROSS-VALIDATION SPLIT in a loop:

# Make an array of indexes (should equal the number of documents, 88). In a
# loop, split the index array into train, test, and dev arrays. The sentences
# get added to the respective train/test/dev sets and the candidates are
# extracted.

index_array = np.arange(0, 88)

# for roc; mean_fpr is a fixed grid of 100 evenly spaced points in [0, 1]
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

# for recording prec, rec, f1 scores
# NOTE(review): the statements right below do not record any scores; they
# fetch a single DiseaseGene candidate with split == 0 and pass it to LF_DEBUG.
candidates = session.query(DiseaseGene).filter(
    DiseaseGene.split == 0).limit(1).all()
LF_DEBUG(candidates[0])

# In[ ]:

# Fetch the labeling functions.
LFs = get_lfs()

# # Label The Candidates

# Label each candidate with the labeling functions defined above. This code runs with relative ease, but optimization is definitely needed as the number of labeling functions grows.

# In[ ]:

labeler = LabelAnnotator(lfs=LFs)

# Split 0 goes through apply(); the IPython ``time`` magic times the call
# and the result is bound to L_train.
cids = session.query(Candidate.id).filter(Candidate.split == 0)
get_ipython().magic(
    u'time L_train = labeler.apply(split=0, cids_query=cids, parallelism=5)')

# Split 1 goes through apply_existing() with clear=False; result in L_dev.
cids = session.query(Candidate.id).filter(Candidate.split == 1)
get_ipython().magic(
    u'time L_dev = labeler.apply_existing(split=1, cids_query=cids, parallelism=5, clear=False)'
)

# Split 2 goes through apply_existing() with clear=False; result in L_test.
cids = session.query(Candidate.id).filter(Candidate.split == 2)
get_ipython().magic(
    u'time L_test = labeler.apply_existing(split=2, cids_query=cids, parallelism=5, clear=False)'
)
예제 #8
0
from utils.label_functions.compound_disease_lf import CD_LFS
#from utils.gene_gene_lf import GG_LFS


# # Label The Candidates

# Label each candidate with the labeling functions defined above. This code runs with relative ease, but optimization is definitely needed as the number of labeling functions grows.

# In[ ]:


# Parentheses let the concatenation span several lines; a bare trailing "+"
# at the end of a line is a syntax error.
# NOTE(review): only CD_LFS is imported in the visible part of this file;
# confirm that CG_LFS and DG_LFS are in scope here.
label_functions = (list(CG_LFS["CbG_DB"].values()) +
                   list(CG_LFS["CbG_TEXT"].values()) +
                   list(DG_LFS["DaG_TEXT"].values()))

labeler = LabelAnnotator(lfs=label_functions)


# # Quickly Relabel Candidates

# Use this block to re-label candidates that have already been labeled by the above process.

# In[ ]:


# Read the 'train' spreadsheet and collect its candidate_id column as a
# list of ints.
train_df = pd.read_excel(spreadsheet_names['train'])
train_cids = train_df.candidate_id.astype(int).tolist()
# Notebook leftover: a bare expression with no effect when run as a script.
train_df.head(2)


# In[ ]:
# Load all documents (ordered by name) and all sentences.
docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()

# Collect the sentences of every document into one set.
sents = set()
for doc in docs:
    sents.update(doc.sentences)


cand_extractor.apply(sents)

print("Number of candidates:", session.query(pairs).count())


# Apply the labeling functions; apply() is called with its default arguments.
labeler = LabelAnnotator(lfs=LFs)

L_train = labeler.apply()

print(L_train.lf_stats(session))


# generative model; train_marginals (computed below) are probabilistic training labels
gen_model = GenerativeModel()
# The step size is scaled by the number of rows in L_train.
gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)


print(gen_model.weights.lf_accuracy)

train_marginals = gen_model.marginals(L_train)
# One entry per key of spreadsheet_names, loaded with
# load_candidate_dataframes.
candidate_dfs = {
    key: load_candidate_dataframes(spreadsheet_names[key])
    for key in spreadsheet_names
}

# Report the number of rows (shape[0]) of each loaded entry.
for key in candidate_dfs:
    print("Size of {} set: {}".format(key, candidate_dfs[key].shape[0]))

# In[8]:

# Labeling functions taken from the "DaG_DB" and "DaG_TEXT" groups of DG_LFS.
label_functions = (list(DG_LFS["DaG_DB"].values()) +
                   list(DG_LFS["DaG_TEXT"].values()))

if quick_load:
    labeler = LabelAnnotator(lfs=[])

    label_matricies = {
        key:
        labeler.load_matrix(session,
                            cids_query=make_cids_query(session,
                                                       candidate_dfs[key]))
        for key in candidate_dfs
    }

else:
    labeler = LabelAnnotator(lfs=label_functions)

    label_matricies = {
        key:
        label_candidates(labeler,
예제 #11
0
    LF_common_1000, LF_common_2000
]
# Labeling-function lists for the BD, BM and BT annotators below.
LFs_BD = [
    LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000,
    LF_same_thing_BD
]
LFs_BM = [
    LF_distance_far, LF_colon, LF_known_abs, LF_single_letter,
    LF_roman_numeral, LF_common_2000, LF_same_thing
]
LFs_BT = [
    LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000,
    LF_same_thing
]

# One annotator per LF list (LFs_BC is defined outside this excerpt).
labeler_BC = LabelAnnotator(lfs=LFs_BC)
labeler_BD = LabelAnnotator(lfs=LFs_BD)
labeler_BM = LabelAnnotator(lfs=LFs_BM)
labeler_BT = LabelAnnotator(lfs=LFs_BT)

# Training: apply each annotator to split 0.
L_train_BC = labeler_BC.apply(split=0)
L_train_BD = labeler_BD.apply(split=0)
L_train_BM = labeler_BM.apply(split=0)
L_train_BT = labeler_BT.apply(split=0)
# Notebook leftovers: bare expressions with no effect when run as a script.
L_train_BC
L_train_BD
L_train_BM
L_train_BT

# Labeling Function Performance - Coverage, Overlaps, Conflicts
예제 #12
0
def _get_labeler(predicate_resume):
    """Build a ``LabelAnnotator`` from the predicate's labelling functions."""
    return LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
    "Number of candidates:",
    session.query(candidate_class).filter(candidate_class.split == 0).count())
print("==============================")

# Split to pull eval candidates from
eval_split = 0

# Executing query for eval candidates (ordered by candidate id)
eval_cands = session.query(candidate_class).filter(
    candidate_class.split == eval_split).order_by(candidate_class.id).all()
print(f'Loaded {len(eval_cands)} candidates...')

# Applying LFs to the same split the candidates were pulled from
print("Applying LFs...")
from snorkel.annotations import LabelAnnotator
labeler = LabelAnnotator(lfs=LFs)
L_eval = labeler.apply(split=eval_split, parallelism=parallelism)

# defining model
from snorkel.learning import GenerativeModel
# Creating generative model
gen_model = GenerativeModel()

# defining saved weights directory and name
model_name = 'Price_Gen_20K'  # this was provided when the model was saved!
save_dir = '/dfs/scratch0/jdunnmon/data/memex-data/extractor_checkpoints/Price_Gen_20K'  # this was provided when the model was saved!

# loading the saved model under the name and directory given above
print("Loading generative model...")
gen_model.load(model_name=model_name, save_dir=save_dir, verbose=True)
    BiomarkerCondition).filter(BiomarkerCondition.split == 1).count()
session.commit()

# In[ ]:

# Star import: the LF_* names used below are expected to come from LF.
from LF import *
LFs_BC = [
    LF_markerDatabase, LF_keyword, LF_distance, LF_abstract_titleWord,
    LF_single_letter, LF_auxpass, LF_known_abs, LF_same_thing_BC,
    LF_common_1000, LF_common_2000
]

# In[ ]:

from snorkel.annotations import LabelAnnotator
BC_labeler = LabelAnnotator(lfs=LFs_BC)

# In[ ]:

# Seed NumPy's global RNG, then apply the LFs to split 0 (timed via the
# IPython ``time`` magic).
np.random.seed(1701)
get_ipython().magic(u'time L_train_BC = BC_labeler.apply(split=0)')
L_train_BC

# In[ ]:

# Rebinds L_train_BC to the matrix returned by load_matrix for split 0.
get_ipython().magic(
    u'time L_train_BC = BC_labeler.load_matrix(session, split=0)')
L_train_BC

# In[ ]:
예제 #15
0
#candidates = [session.query(DiseaseGene).filter(DiseaseGene.id == ids).one() for ids in [19817,19818,19830,19862,19980,20001,20004]]

# Show the candidates whose parent has id 14264, together with their tagged
# text and the entity_cids entry at the word start of c[1].
for c in candidates:
    if c[0].get_parent().id != 14264:
        continue
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(c)
    print(get_tagged_text(c))
    print(c[1].sentence.entity_cids[c[1].get_word_start()])

# # Label The Candidates

# This block of code will run through the label functions and label each candidate in the training, development and test groups.

# In[ ]:

# NOTE(review): this call uses the keyword ``f`` while other calls in this
# file use ``lfs``; confirm which one the installed Snorkel version expects.
labeler = LabelAnnotator(f=LFs)

# Split 0 uses apply(); splits 1 and 2 use apply_existing().
get_ipython().magic(u'time L_train = labeler.apply(split=0)')
get_ipython().magic(u'time L_dev = labeler.apply_existing(split=1)')
get_ipython().magic(u'time L_test = labeler.apply_existing(split=2)')

# In[ ]:

featurizer = FeatureAnnotator()

# Same pattern for features: apply() on split 0, apply_existing() on 1 and 2.
get_ipython().magic(u'time F_train = featurizer.apply(split=0)')
get_ipython().magic(u'time F_dev = featurizer.apply_existing(split=1)')
get_ipython().magic(u'time F_test = featurizer.apply_existing(split=2)')

# # Generate Coverage Stats
예제 #16
0
    GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])
elif edge_type == "cg":
    CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])
elif edge_type == "cd":
    CompoundDisease = candidate_subclass('CompoundDisease',
                                         ['Compound', 'Disease'])
else:
    print("Please pick a valid edge type")

# # Load the data

# Here is where we load the test dataset in conjunction with the previously trained disc models. Each algorithm will output a probability of a candidate being a true candidate.

# In[6]:

# The annotator gets no labeling functions; below it is only used to call
# load_matrix.
labeler = LabelAnnotator(lfs=[])

# In[7]:

# Load the split-2 matrix into L_test (timed via the ``time`` cell magic).
get_ipython().run_cell_magic(u'time', u'',
                             u'L_test = labeler.load_matrix(session,split=2)')

# In[8]:

# Notebook leftover: a bare expression with no effect when run as a script.
L_test.shape

# In[9]:

marginal_files = [
    "stratified_data/lstm_disease_gene_holdout/LR_data/LR_test_marginals.csv",
    "stratified_data/lstm_disease_gene_holdout/lstm_one_test_marginals.csv",
예제 #17
0
# In[ ]:

# Build a SQLite URL under $WORKINGPATH and publish it through the SNORKELDB
# environment variable before the session is created.
database_str = "sqlite:///" + os.environ['WORKINGPATH'] + "/Database/epilepsy.db"
os.environ['SNORKELDB'] = database_str


session = SnorkelSession()


# # Load preprocessed data 

# To save time, this code will automatically load our labels that were generated in the previous file.

# In[ ]:

# The annotator gets no labeling functions; below it is only used to call
# load_matrix for splits 0, 1 and 2.
labeler = LabelAnnotator(f=None)

L_train = labeler.load_matrix(session,split=0)
L_dev = labeler.load_matrix(session,split=1)
L_test = labeler.load_matrix(session,split=2)


# In[ ]:

# Parenthesised single-argument prints behave the same on Python 2 and 3;
# print("") emits the blank line that a bare ``print`` statement produced.
print("Total Data Shape:")
print(L_train.shape)
print(L_dev.shape)
print(L_test.shape)
print("")

print("The number of positive candiadtes (in KB) for each division:")
예제 #18
0
    tweet = Tweet(tweet=raw_text, split=split)
    session.add(tweet)

# Commit the pending session changes, then report it.
session.commit()

print("Commit to snorkel database done...")


#writing label generator
def worker_label_generator(t):
    """Yield ``(worker_id, value)`` pairs stored for the tweet of ``t``.

    The pairs are read from the module-level ``cand_dict``, indexed first by
    ``t.tweet.stable_id`` and then by worker id.
    """
    worker_labels = cand_dict[t.tweet.stable_id]
    for worker_id in worker_labels:
        yield worker_id, worker_labels[worker_id]


# Seed NumPy's global RNG, then build the label matrix for split 0 from the
# generator-based annotator.
np.random.seed(1701)
labeler = LabelAnnotator(label_generator=worker_label_generator)
L_train = labeler.apply(split=0)

print(L_train.lf_stats(session))

print("Creat training data done...")
print(" -train data shape", (L_train.shape))

# Train a generative model (lf_propensity enabled) on the matrix above.
print("Start to train a generative model")
gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, reg_type=2, reg_param=0.1, epochs=30)

# Print the statistics learned by the model.
print(gen_model.learned_lf_stats())

print("Train a genetive model done...!")
예제 #19
0

def LF_police_at_location_left(c):
    """Return 1 when a police tag occurs in the token window around ``c[1]``.

    Looks at the tokens returned by ``get_right_tokens(c[1], window=7)`` and
    returns 1 if any of them is in ``police_tags``, otherwise 0.
    """
    # NOTE(review): the name says "left" but the lookup uses
    # get_right_tokens; confirm which side is intended.
    nearby_tokens = get_right_tokens(c[1], window=7)
    return 1 if police_tags.intersection(nearby_tokens) else 0


# Labeling functions handed to the annotator below.
LFs = [
    LF_crime_detect, LF_location_left_window, LF_police_at_location_left,
    LF_location_left_per_vic_window
]

from snorkel.annotations import LabelAnnotator
# NOTE(review): this call uses the keyword ``f`` while other calls in this
# file use ``lfs``; confirm which one the installed Snorkel version expects.
labeler1 = LabelAnnotator(f=LFs)

# Seed NumPy's global RNG, then apply the LFs to split 0 (timed via the
# IPython ``time`` magic).
np.random.seed(1701)
get_ipython().magic(u'time L_train = labeler1.apply(split=0)')
L_train

# Rebinds L_train to the matrix returned by load_matrix for split 0.
L_train = labeler1.load_matrix(session, split=0)
L_train

for docno in session.query(Document).all():
    print(docno)
    for i in range(L_train.shape[0]):
        if (L_train[i, :].toarray()[0][0] == 1.0):
            if (session.query(LocationPer).filter(LocationPer.split == 0)
                [i].get_parent().get_parent() == docno):
                print(