def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    # Create spans and candidates
    logging.info("Starting candidate extraction")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]


    ngrams = Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher, object_matcher])

    # Skip sentences that have already been extracted
    logging.info("Counting candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    #logging.info("Delete span orphans")
    #delete_orphan_spans()
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume, session)
    elif documents_titles is not None:
        # Delete existing candidates for the test and dev documents
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume, documents_titles, split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(Sentence.id.in_(sents_query_id))


    logging.info("Counting sentences")
    sents_count=sents_query.count()
    logging.info("Sents count"+str(sents_count))
    print("Sents count"+str(sents_count))
    if sents_count > page_size:
        page=page_size
    else:
        page=sents_count
    i = 1
    while True:
        if split is None:
            set_name = "train"
            split2 = 0
        else:
            set_name = str(split)
            split2 = split

        logging.info('\tQuerying sentences from %s to %s, in set \'%s\'',
                     page * (i - 1), page * i, set_name)
        sents = sents_query.order_by(Sentence.id).slice(page * (i - 1), page * i).all()
        logging.info("Extracting")
        if not sents:
            break
        cand_extractor.apply(sents, split=split2, clear=clear, progress_bar=False,
                             parallelism=parallelism)
        logging.info('\t\tcandidates extracted for %s', CandidateSubclass.__name__)
        i += 1
        clear = False
    logging.info("Finished candidate extraction")
Example #2
# This section embeds all candidate sentences. For each sentence, we place tags around each mention, tokenize the sentence, and then match each token to its corresponding word index. Any word missing from our vocabulary receives an index of 1. Lastly, the embedded sentences are exported as a sparse dataframe.

# In[ ]:

word_dict_df = pd.read_table("output/gene_interacts_gene_word_dict.tsv")
word_dict = {word[0]: word[1] for word in word_dict_df.values.tolist()}
fixed_word_dict = {word: word_dict[word] + 2 for word in word_dict}
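
# A minimal sketch of the token -> index mapping described above (illustrative only;
# the real pipeline also tags each mention and pads to the maximum sentence length
# inside generate_embedded_df). The +2 shift in fixed_word_dict presumably reserves
# index 0 for padding and index 1 for out-of-vocabulary words.
def embed_tokens(tokens, vocab, oov_index=1):
    # map each token to its shifted vocabulary index, falling back to the OOV index
    return [vocab.get(token, oov_index) for token in tokens]

# e.g. embed_tokens("the variant disrupts brca1 binding".split(), fixed_word_dict)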

# In[ ]:

limit = 1000000
total_candidate_count = total_candidates_df.shape[0]

for offset in list(range(0, total_candidate_count, limit)):
    candidates = (session.query(GeneGene).filter(
        GeneGene.id.in_(total_candidates_df.candidate_id.astype(
            int).tolist())).offset(offset).limit(limit).all())

    max_length = total_candidates_df.sen_length.max()

    # On the first iteration, create the file
    if offset == 0:
        (generate_embedded_df(candidates,
                              fixed_word_dict,
                              max_length=max_length).to_csv(
                                  "output/all_embedded_gg_sentences.tsv",
                                  index=False,
                                  sep="\t",
                                  mode="w"))

    else:
        # Append rather than overwrite (assumed completion: the original else
        # branch is cut off here; mirrors the call above with mode="a", header=False)
        (generate_embedded_df(candidates,
                              fixed_word_dict,
                              max_length=max_length).to_csv(
                                  "output/all_embedded_gg_sentences.tsv",
                                  index=False,
                                  sep="\t",
                                  mode="a",
                                  header=False))
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism, verbose=False)


# Checking the number of parsed documents and sentences in the database.

# In[ ]:


from snorkel.models import Document, Sentence

# Printing number of docs/sentences
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())


# Separating into train, dev, and test sets

# In[11]:


from dataset_utils import create_test_train_splits

# Getting all documents parsed by Snorkel
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location', gold_dict=None, dev_frac=0.1, test_frac=0.1)
Example #4
corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'],
                               values=[
                                   'person', 'job', 'event', 'place', 'date',
                                   'time', 'product', 'email', 'phone',
                                   'quantity', 'address', 'url', 'org', 'file',
                                   'password', False
                               ])
# generating candidates.
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)

cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher],
                                    symmetric_relations=False)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)
train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all()
finder = FinderAcora()


def find(array, word):
    return [i for i, each in enumerate(array) if each == word]


def LF_product(c):
    if len(c.sensitive.get_attrib_tokens("words")) == len(
            find(c.sensitive.get_attrib_tokens("ner_tags"), "PRODUCT")):
        print "PRODUCT:" + c.sensitive.get_span()
        return "product"
Example #5
#term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters, remove to have no max
max_doc_length = None

# Setting preprocessor
print(f'Preprocessing folder: {data_loc}')
doc_preprocessor = set_preprocessor(data_source,
                                    data_loc,
                                    max_docs=max_docs,
                                    verbose=False,
                                    clean_docs=False,
                                    content_fields=['raw_content', 'url'],
                                    term=term,
                                    max_doc_length=max_doc_length)

# Setting parser and applying corpus preprocessor
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(list(doc_preprocessor),
                    parallelism=parallelism,
                    verbose=False)

# Printing number of docs/sentences
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print("==============================")

if not quick_load:
    label_matricies = {
        'train':
        label_candidates(session, (total_candidates_df.query(
            "split==0&disease_mention_count==1&gene_mention_count==1").
                                   candidate_id.values.tolist()),
                         lfs,
                         lf_names,
                         num_threads=10,
                         batch_size=50000,
                         multitask=False)
    }

# In[12]:

if not quick_load:
    # Check to make sure the label functions match up with the candidate objects
    ids = label_matricies['train'].candidate_id.head(5).tolist()
    candidate_list = session.query(DiseaseGene).filter(
        DiseaseGene.id.in_(ids)).all()
    for candidate in tqdm_notebook(candidate_list):
        correct_output = list(map(lambda fn: fn(candidate), lfs))
        test_output = label_matricies['train'].query(
            "candidate_id==@candidate.id").fillna(0).values.tolist()[0]
        for pair in zip(correct_output, test_output[:-1]):
            assert pair[0] == pair[1]

# In[13]:

if not quick_load:
    label_matricies.update({
        key:
        label_candidates(session,
                         candidate_dfs[key]['candidate_id'].values.tolist(),
                         lfs,
Example #7
# In[ ]:

category_list = np.random.choice([0, 1, 2], total_sentences, p=[0.7, 0.2, 0.1])

# In[ ]:

# Divide the sentences into train, dev and test sets

#Grab the sentences!!!
train_sens = set()
dev_sens = set()
test_sens = set()

offset = 0
category_index = 0
sql_query = session.query(Document).limit(chunk_size)

# Divide and insert into the database
while True:
    documents = list(sql_query.offset(offset).all())

    if not documents:
        break

    for doc in tqdm.tqdm(documents):
        for s in doc.sentences:

            # Stratify the data into train, dev, test
            category = category_list[category_index]
            category_index = category_index + 1
Example #8
cutoff = 300
total_candidates_df = (
    pd.read_table("../dataset_statistics/results/all_ctd_map.tsv.xz").query(
        "sen_length < 300"))
total_candidates_df.head(2)

# # Train Word Vectors

# This section trains the word vectors using the specifications described above.

# In[10]:

words_to_embed = []
candidates = (session.query(CompoundDisease).filter(
    CompoundDisease.id.in_(
        total_candidates_df.candidate_id.astype(int).tolist())).all())

# In[11]:

for cand in tqdm_notebook(candidates):
    args = [(cand[0].get_word_start(), cand[0].get_word_end(), 1),
            (cand[1].get_word_start(), cand[1].get_word_end(), 2)]
    words_to_embed.append(mark_sentence(candidate_to_tokens(cand), args))

# In[12]:

model = FastText(words_to_embed,
                 window=2,
                 negative=10,
                 iter=50,
Example #9
for corpus_name in ['Emails Development', 'Emails Test']:
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            if number_of_people(sentence) < 5:
                sentences.add(sentence)
    
    %time c = ce.extract(sentences, corpus_name + ' Candidates', session)
    session.add(c)
session.commit()

from snorkel.models import CandidateSet

train = session.query(CandidateSet).filter(CandidateSet.name == 'Emails Training Candidates').one()
dev = session.query(CandidateSet).filter(CandidateSet.name == 'Emails Development Candidates').one()

from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

%time F_train = feature_manager.create(session, c, 'Train Features')


#To load existing use ..
#%time F_train = feature_manager.load(session, train, 'Train Features')
from snorkel.annotations import LabelManager

label_manager = LabelManager()
Example #10
cand_extractor = CandidateExtractor(GenePhenoPair, [gene_ngrams, pheno_ngrams],
                                    [GM, PM],
                                    symmetric_relations=True)

print "Splitting Docs..."
pathname = 'small_data/' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/'
with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids, dev_ids, test_ids = set(sent_dicts['train']), set(
    sent_dicts['dev']), set(sent_dicts['test'])
all_ids = train_ids.union(dev_ids).union(test_ids)
# Target split sizes: 40 train / 10 dev / 10 test documents
train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set()
train_docs, dev_docs, test_docs = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()
doc_sents = dict()
for doc_num, doc in enumerate(docs):
    if len(train_docs) >= 40 and len(dev_docs) >= 10 and len(test_docs) >= 10:
        break
    doc_sents[doc_num] = set()
    for s in doc.sentences:
        all_sents.add(s)
        doc_sents[doc_num].add(s)
        name = doc.name.split('-')[0]
        if name in train_ids:
            train_docs.add(name)
            train_sents.add(s)
        elif name in dev_ids:
            dev_docs.add(name)
            dev_sents.add(s)
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    print "Started"
    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.

    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    # condition_ngrams = Ngrams(n_max=7)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)
    # medium_ngrams = Ngrams(n_max=5)
    # type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
    # # level_ngrams = Ngrams(n_max=1)
    # unit_ngrams = Ngrams(n_max=1)

    # Construct our Matchers

    # cMatcher = matchers.getConditionMatcher()
    # mMatcher = matchers.getMediumMatcher()
    # tMatcher = matchers.getTypeMatcher()
    # lMatcher = matchers.getLevelMatcher()
    # uMatcher = matchers.getUnitMatcher()

    # Building the CandidateExtractors
    # candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])
    # candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
    # candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
    # candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

    # List of Candidate Sets for each relation type: [train, dev, test]
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()
    # cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
    # cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
    # cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
    # cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

    if len(cands) == 0:
        print "No Candidates Found"
        return
    if pairing_name == 'BiomarkerCondition':
        # session.rollback()
        # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        # fix_specificity(session, BiomarkerCondition, cands_BC[1])
        # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count()
        session.commit()

    lstm = reRNN(seed=1701, n_threads=None)

    lstm.load(model_name)

    predictions = lstm.predictions(cands)
    output_file = open(output_file_name, 'wb')
    import csv
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(
        ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([
            unidecode(doc_string),
            unidecode(sentence_string),
            unidecode(cand_1_string),
            unidecode(cand_2_string), prediction
        ])
Example #12
    searcher.fit(F_dev, L_dev, n_epochs=50, rebalance=0.5, print_freq=25)
    disc_models.append(disc_model)
    w = disc_model.save_dict['w']
    f = w.read_value()
    values = f.eval(session = disc_model.session)
    weights.append(values)
    test_marginals.append(disc_model.marginals(F_test))


# # Generate Statistics After Model Training

# ## Grab the feature weights

# In[ ]:

features = session.query(FeatureKey).all()
feat_data = []
for feat, w0, w1 in zip(features,weights[0],weights[1]):
    feat_data.append([feat.name, w0[0], w1[0]])
feat_frame = pd.DataFrame(feat_data, columns= ["Feature", "Model_KB", "Model_KB_CONTEXT"])


# ## Grab the class probabilities

# In[ ]:

test_marginals[0].shape
cand_probs = []
for candidate_id in L_test.candidate_index:
    cand = session.query(Candidate).filter(Candidate.id == candidate_id).one()
    index = L_test.candidate_index[candidate_id]
Example #13
dev_set_df = (entity_level_df.query("split==10&has_sentence==1").merge(
    dev_candidate_df, on=["drugbank_id", "doid_id"]))

test_set_df = (entity_level_df.query("split==11&has_sentence==1").merge(
    test_candidate_df, on=["drugbank_id", "doid_id"]))

# In[16]:

total_candidates_df = (training_set_df.append(dev_set_df).append(
    test_set_df).drop_duplicates("candidate_id"))

# In[17]:

dev_candidates = (session.query(CompoundDisease).filter(
    CompoundDisease.id.in_(
        total_candidates_df.query("split==10").sample(
            10000, random_state=100).candidate_id.tolist())).all())
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[18]:

test_candidates = (
    session.query(CompoundDisease).filter(
        CompoundDisease.id.in_(
            total_candidates_df.query("split==11")
            # Blacklist ethanol, alcohol, and alcohol dependence:
            # sampling would otherwise pull too many sentences containing these entities
            .query("drugbank_id!='DB00898'&doid_id!='DOID:0050741'").sample(
                10000, random_state=120).candidate_id.tolist())).all())
test_df = make_sentence_df(test_candidates)
Example #14
doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv',
                                      max_docs=max_docs)

# In[5]:

from snorkel.parser import CorpusParser

corpus_parser = CorpusParser()
get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)')

# In[6]:

from snorkel.models import Document, Sentence

print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()

dict_final = {}

crimetype_murder = [
    'killed', 'kill', 'kills', 'killing', 'murder', 'shot', 'shooting',
    'convicted', 'murdered'
]
crimetype_rape = [
    'rape', 'raped', 'gangraped', 'molested', 'molestation', 'molesting',
    'harassment', 'raping'
]
crimetype_attack = [
    'hurt', 'rioting', 'injured', 'attack', 'beating up', 'attacked'
]
Example #15
                  help="the pair file [default: %default]")
parser.add_option("-s",
                  "--saved-dir",
                  dest="saved_dir",
                  help="directory to save the rank scores [default: %default]")

parser.set_defaults(
    # alternative input file: "3_signals.top20doc.csv"
    input_pair="/Users/datienguyen/Desktop/coding/data-search/exp-data/dataSEARCH/pair-store/train.csv",
    saved_dir="../../data-search/exp-data/dataSearch/pair-store/")

opts, args = parser.parse_args(sys.argv)
input_pair = opts.input_pair

session.query(Context).delete()
session.query(Candidate).delete()

values = ['positive', 'negative']
Tweet = candidate_subclass('Tweet', ['tweet'], values=values)

#item_id,worker_id,query_id,doc1,doc2,annotation
cand_dict = {}

with open(input_pair, "r") as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        #print(row)
        item_id = row[0]
        worker_id = row[1]
        anno = row[5]
Example #16
# In[ ]:

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# # Look at potential Candidates

# Use this to look at loaded candidates from a given split. The constants below are the split indices used to retrieve the training, development, and test sets.

# In[ ]:

TRAIN = 0
DEV = 1
TEST = 2

candidates = session.query(DiseaseGene).filter(
    DiseaseGene.split == TRAIN).all()
sv = SentenceNgramViewer(candidates, session)

# In[ ]:

sv

# # Label Functions

# Here is the fundamental part of the project. Below are the label functions used to give a candidate a label of 1, 0, or -1, corresponding to a correct relation, unsure, and an incorrect relation, respectively. The goal is to develop functions that can label as many candidates as possible.

# In[ ]:

gene_list = pd.read_csv('epilepsy-genes.tsv', sep="\t")

# In[ ]:
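
# A minimal illustrative labeling function using the 1 / 0 / -1 scheme described
# above. The 'genes' column name is a placeholder guess for this sketch;
# c.Gene comes from the DiseaseGene subclass defined earlier in this notebook.
known_epilepsy_genes = set(gene_list['genes'].astype(str))


def LF_in_epilepsy_gene_list(c):
    # vote 1 (correct relation) when the gene mention is in the curated list; abstain otherwise
    return 1 if c.Gene.get_span() in known_epilepsy_genes else 0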
Example #17
n_docs = 500

doc_preprocessor = TSVDocPreprocessor(
    'pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text-pattern-based labeling functions, which look for certain keywords

# Turn a list of keywords into a parenthesized regex alternation group


def ltp(x):
    return '(' + '|'.join(x) + ')'
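
# A hedged sketch of how ltp() typically feeds a keyword-pattern labeling function
# (LF_detect_keyword and its keyword list are illustrative assumptions, not part of
# this script). c.get_parent().text is the candidate's sentence text in this API.
import re


def LF_detect_keyword(c):
    detect_words = ['detect', 'detects', 'detected']
    # label 1 (positive) if any detection keyword appears in the sentence, else abstain
    return 1 if re.search(ltp(detect_words), c.get_parent().text, flags=re.I) else 0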


# --------------------------------

# Positive LFs:

detect = {'detect', 'detects', 'detected',
Example #18
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, 'Protein Training')

for name, path in [('Protein Development', 'data/protein_dev.tsv'),
                   ('Protein Test', 'data/protein_test.tsv')]:
    doc_parser.path=path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()


from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import Corpus

corpus = session.query(Corpus).filter(Corpus.name == 'Protein Training').one()
corpus

sentences = set()
for document in corpus:
    for sentence in document.sentences:
        sentences.add(sentence)

from snorkel.candidates import Ngrams
from snorkel.models import candidate_subclass
#entity = candidate_subclass('entity', ['entity1', 'entity2'])
import pandas as pd
ROOT = 'data/dicts/'
proteins   = set(pd.read_csv(ROOT + 'protein_names.csv', header=None, index_col=0, encoding='utf-8').dropna()[1])
ngrams = Ngrams(n_max=1)
from snorkel.matchers import DictionaryMatch
Example #19
        if print_stats:
            self.print_report(speaker, apologize, ask_agency, give_agency, \
                gratitude, please)

        return {
            'apologize': apologize,
            'ask_agency': ask_agency,
            'give_agency': give_agency,
            'gratitude': gratitude,
            'please': please
        }


if __name__ == '__main__':
    session = SnorkelSession()
    docs = session.query(ReconDocument)
    i = 0
    p = PolitenessExtractor()
    # The Query object returned by session.query() does not support len(), so
    # `for i in range(len(docs))` is not an option; instead, advance an index
    # until it runs past the last document (docs.count() would also give the total).

    while True:
        try:
            p.compute_score(docs[i], print_stats=False)
            i += 1
        except:
            # assumed completion of the truncated body: stop once indexing past
            # the last document raises
            break
Example #20
    for line in open(filename, "r", errors='ignore').readlines():
        if line.split("\t")[0] in docID or len(line.split("\t")) != 2:
            continue
        docID.add(line.split("\t")[0])
        fout.write(line.replace("\n", " ").strip() + "\n")

print("total docID count", len(docID))
doc_preprocessor = TSVDocPreprocessor(newfile,
                                      encoding="utf-8",
                                      max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # ,parallelism=5)

print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from collections import defaultdict
import numpy as np

session = SnorkelSession()
docs = session.query(Document).all()
sents = session.query(Sentence).all()  # get all sentences from snorkel.db
Example #21
from typing import List

from snorkel import SnorkelSession
from snorkel.models import GoldLabel, GoldLabelKey


def get_gold_labels(session: SnorkelSession,
                    annotator_name: str = "gold") -> List[GoldLabel]:
    # Look up the gold-label key for this annotator and return its labels
    ak = session.query(GoldLabelKey).filter(
        GoldLabelKey.name == annotator_name).first()
    return session.query(GoldLabel).filter(GoldLabel.key == ak).all()
LFs = [lf_preceding_half_missing_quantity, lf_preceding_half]

# Setting extraction type -- should be a subfield in your data source extractions field!
from dataset_utils import create_candidate_class
extraction_type = 'price'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Printing number of docs/sentences
from snorkel.models import Document, Sentence
print("==============================")
print(f"DB contents for {postgres_db_name}:")
print(
    "Number of candidates:",
    session.query(candidate_class).filter(candidate_class.split == 0).count())
print("==============================")

# Split to pull eval candidates from
eval_split = 0

# Executing query for eval candidates
eval_cands = session.query(candidate_class).filter(
    candidate_class.split == eval_split).order_by(candidate_class.id).all()
print(f'Loaded {len(eval_cands)} candidates...')

# Applying LFs
print("Applying LFs...")
from snorkel.annotations import LabelAnnotator
labeler = LabelAnnotator(lfs=LFs)
L_eval = labeler.apply(split=eval_split, parallelism=parallelism)
session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)


pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])


docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

sents = set()
for i,doc in enumerate(docs):
    for s in doc.sentences:
        sents.add(s)


cand_extractor.apply(sents)

print("Number of candidates:", session.query(pairs).count())


labeler = LabelAnnotator(lfs=LFs)
Example #24
def main(argv):
    parser = argparse.ArgumentParser(description='Process some arguments.')
    parser.add_argument('--dbPath',
                        type=str,
                        default=os.getcwd() + os.sep + 'snorkel.db',
                        help='the path of snorkel database')
    parser.add_argument(
        '--lfPath',
        type=str,
        default=os.getcwd() + os.sep + 'util_default.py',
        help='the path of util.py file where labelling functions were defined')

    args = parser.parse_args()

    # Connect to db, and get session

    util_module = imp.load_source("module.name", args.lfPath)
    # maps doc_id into a dict of ["Background", "Purpose", "Mechanism", "Method",
    # "Finding"]
    train_doc_breakdown_map = dict()
    test_doc_breakdown_map = dict()
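    # Illustrative shape only (hypothetical doc id and placeholder values, presumably
    # filled in by extract_and_display below):
    #   train_doc_breakdown_map["doc_001"] = {"Background": [...], "Purpose": [...],
    #                                         "Mechanism": [...], "Method": [...],
    #                                         "Finding": [...]}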

    SnorkelSession = create_session_with_conn("sqlite:///" + args.dbPath)
    session = SnorkelSession()

    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())

    sents = session.query(Sentence).all()
    n_max_corpus = 0
    for sent in sents:
        n_max_corpus = max(n_max_corpus, len(sent.words))

    print("The longest sentence has " + str(n_max_corpus) + " tokens.")

    ngrams = Ngrams(n_max=n_max_corpus)

    # from util import number_of_people

    docs = session.query(Document).all()

    train_sents = set()
    dev_sents = set()
    test_sents = set()

    for i, doc in enumerate(docs):
        for s in doc.sentences:
            if i % 10 == 8 and "cscw18" != doc.name[:6]:
                dev_sents.add(s)
            elif "cscw18" == doc.name[:
                                      6]:  # replace the earlier 10% test documents as cscw'18 annotation guideline 10 examples
                test_sents.add(s)
            elif "cscw18" != doc.name[:6]:
                train_sents.add(s)

    General, general_extractor = util_module.get_segment_class_and_matcher(
        "General", ngrams)
    general_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        general_extractor,
        General,
        "General",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)

    input("Finished general ")

    # load segment_candidate_class and corresponding_matcher, e.g. (Background, non_comma_dict_background_matcher)
    Background, background_matcher = util_module.get_segment_class_and_matcher(
        "Background", ngrams)
    background_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        background_matcher,
        Background,
        "Background",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)

    debug_sess_eval(session, Background, background_matcher)
Example #25
train_df = pd.read_excel(spreadsheet_names['train'])
train_cids = train_df.candidate_id.astype(int).tolist()
train_df.head(2)


# In[ ]:


dev_df = pd.read_excel(spreadsheet_names['dev'])
dev_df = dev_df[dev_df.curated_dsh.notnull()]
dev_cids = list(map(int, dev_df.candidate_id.values))
dev_df.head(2)


# In[ ]:


train_hand_df = pd.read_excel(spreadsheet_names['train_hand_label'])
train_hand_cids = train_hand_df[train_hand_df.curated_dsh.notnull()].candidate_id.astype(int).tolist()
train_hand_df.head(2)


# In[ ]:


for cid_list in [train_cids, train_hand_cids, dev_cids]:
    cids = session.query(CompoundGene.id).filter(CompoundGene.id.in_(cid_list))
    get_ipython().magic(u'time labeler.apply(cids_query=cids, parallelism=5)')

Example #26
# Parsing
corpus_parser = CorpusParser()

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)
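
# A hedged sketch of what enabling parallel parsing would look like (not part of this
# script): Snorkel reads its connection string from the SNORKELDB environment variable
# at import time, so with a Postgres-backed session (hypothetical URL below) the same
# calls accept a parallelism argument, e.g.
#   os.environ['SNORKELDB'] = 'postgresql://user:pass@localhost:5432/intro_db'  # set before importing snorkel
#   corpus_parser.apply(list(train_preprocessor), parallelism=4)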

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)

train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID
for i, doc in enumerate(docs):
    for s in doc.sentences:
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        elif doc.name in test_ids:
            test_sents.add(s)
        else:
            raise Exception('ID <{0}> not found in any id set'.format(
                doc.name))

#----------------------
Example #27
) as candidate_sen inner join sentence on candidate_sen.sentence_id=sentence.id
'''
candidate_sentence_df = pd.read_sql(sql, database_str).astype(
    {"entrez_gene_id": int})
candidate_sentence_df.head(2)

# In[10]:

total_candidates_df = (edge_level_df.merge(candidate_sentence_df,
                                           on=["doid_id", "entrez_gene_id"]))
total_candidates_df.head(2)

# In[11]:

dev_candidates = (session.query(DiseaseGene).filter(
    DiseaseGene.id.in_(
        total_candidates_df.query("split==1").sample(
            10000, random_state=100).candidate_id.tolist())).all())
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[12]:

test_candidates = (session.query(DiseaseGene).filter(
    DiseaseGene.id.in_(
        total_candidates_df.query("split==2").sample(
            10000, random_state=120).candidate_id.tolist())).all())
test_df = make_sentence_df(test_candidates)
test_df.head(2)

# In[13]:
Example #28
dev_set_df = (entity_level_df.query("split==7&has_sentence==1").merge(
    dev_candidate_df.astype({"entrez_gene_id": int}),
    on=["drugbank_id", "entrez_gene_id"]))

test_set_df = (entity_level_df.query("split==8&has_sentence==1").merge(
    test_candidate_df.astype({"entrez_gene_id": int}),
    on=["drugbank_id", "entrez_gene_id"]))

# In[16]:

total_candidates_df = (training_set_df.append(dev_set_df).append(test_set_df))

# In[17]:

dev_candidates = (session.query(CompoundGene).filter(
    CompoundGene.id.in_(
        total_candidates_df.query("split==7").sample(
            10000, random_state=100).candidate_id.tolist())).all())
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[18]:

test_candidates = (session.query(CompoundGene).filter(
    CompoundGene.id.in_(
        total_candidates_df.query("split==8").sample(
            10000, random_state=100).candidate_id.tolist())).all())
test_df = make_sentence_df(test_candidates)
test_df.head(2)

# In[ ]: