Python SnorkelSession.query примеры, snorkel.SnorkelSession.query Python примеры использования

Пример #1

0

Показать файл

Файл: candidate_extraction.py Проект: lorenzoranucci/sentimantic

def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    """Extract binary relation candidates from stored sentences, page by page.

    A ``CandidateExtractor`` is built for the candidate subclass described by
    ``predicate_resume`` and applied to the selected sentences in pages of at
    most ``page_size`` sentences, ordered by ``Sentence.id``.

    Args:
        predicate_resume: Mapping providing the keys ``'subject_ne'``,
            ``'object_ne'`` and ``'candidate_subclass'``.
        clear: Forwarded to ``cand_extractor.apply`` for the first page only;
            every later page is applied with ``clear=False`` so earlier pages
            are not wiped. A truthy value also disables the
            "skip already extracted sentences" filter.
        parallelism: Forwarded to ``cand_extractor.apply``.
        split: Split handed to the extractor. ``None`` means split ``0``
            (reported as ``'train'`` in the log).
        documents_titles: When given, candidates are first updated through
            ``update_candidates_by_page_titles`` and the sentences are chosen
            by ``get_sentences_ids_by_title_not_extracted``.
        limit: Optional cap on the number of sentence ids; ignored when
            ``documents_titles`` is given.
        page_size: Maximum number of sentences processed per page.
    """
    logging.info("Starting candidates extraction ")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]

    # Subject and object spans are drawn from the same n-gram space.
    ngrams = Ngrams(n_max=7)
    cand_extractor = CandidateExtractor(
        CandidateSubclass,
        [ngrams, ngrams],
        [get_matcher(subject_ne), get_matcher(object_ne)])

    # Select the sentences to process, skipping those already extracted.
    logging.info("Count candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    # NOTE(review): `> 1` leaves the filter off when exactly one candidate
    # exists -- confirm that `> 0` was not intended.
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(
            predicate_resume, session)
    elif documents_titles is not None:
        # Candidates of the given documents are reassigned (test/dev sets).
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(
            predicate_resume, documents_titles, split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(
            predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(
        Sentence.id.in_(sents_query_id))

    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sents count%s", sents_count)
    print("Sents count" + str(sents_count))

    page = min(sents_count, page_size)

    # Loop invariants: the target split and its display name never change.
    if split is None:
        set_name = "train"
        split2 = 0
    else:
        set_name = str(split)
        split2 = split

    i = 1
    while True:
        start, stop = page * (i - 1), page * i
        logging.info('\tQuering sentences from %s to %s, in set \'%s\'',
                     start, stop, set_name)
        sents = sents_query.order_by(Sentence.id).slice(start, stop).all()
        if not sents:
            # An empty page means every selected sentence has been processed.
            break
        logging.info("Extracting")
        cand_extractor.apply(sents, split=split2, clear=clear,
                             progress_bar=False, parallelism=parallelism)
        logging.info('\t\tcandidates extracted for %s',
                     CandidateSubclass.__name__)
        i += 1
        # Only the first page may clear existing candidates.
        clear = False
    logging.info("Finished candidates extraction ")

Пример #2

0

Показать файл

# This section embeds all candidate sentences. For each sentence, we place tags around each mention, tokenize the sentence, and then match each token to its corresponding word index. Any word missing from our vocabulary receives an index of 1. Lastly, the embedded sentences are exported as a sparse dataframe.

# In[ ]:

word_dict_df = pd.read_table("output/gene_interacts_gene_word_dict.tsv")
word_dict = {word[0]: word[1] for word in word_dict_df.values.tolist()}
fixed_word_dict = {word: word_dict[word] + 2 for word in word_dict}

# In[ ]:

limit = 1000000
total_candidate_count = total_candidates_df.shape[0]

for offset in list(range(0, total_candidate_count, limit)):
    candidates = (session.query(GeneGene).filter(
        GeneGene.id.in_(total_candidates_df.candidate_id.astype(
            int).tolist())).offset(offset).limit(limit).all())

    max_length = total_candidates_df.sen_length.max()

    # if first iteration create the file
    if offset == 0:
        (generate_embedded_df(candidates,
                              fixed_word_dict,
                              max_length=max_length).to_csv(
                                  "output/all_embedded_gg_sentences.tsv",
                                  index=False,
                                  sep="\t",
                                  mode="w"))

    # else append don't overwrite

Пример #3

0

Показать файл

Файл: Snorkel_Pipeline_Version_NB1.py Проект: CHTAP/extractors

from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism, verbose=False)


# Checking the number of parsed documents and sentences in the database.

# In[ ]:


from snorkel.models import Document, Sentence

# Printing number of docs/sentences
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())


# Separating into train, dev, and test sets

# In[11]:


from dataset_utils import create_test_train_splits

# Getting all documents parsed by Snorkel
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location', gold_dict=None, dev_frac=0.1, test_frac=0.1)

Пример #4

0

Показать файл

corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'],
                               values=[
                                   'person', 'job', 'event', 'place', 'date',
                                   'time', 'product', 'email', 'phone',
                                   'quantity', 'address', 'url', 'org', 'file',
                                   'password', False
                               ])
# generating candidates.
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)

cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher],
                                    symmetric_relations=False)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)
train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all()
finder = FinderAcora()


def find(array, word):
    """Return the positions in ``array`` whose element equals ``word``."""
    positions = []
    for index, item in enumerate(array):
        if item == word:
            positions.append(index)
    return positions


def LF_product(c):
    """Labeling function: tag a candidate as ``"product"``.

    The candidate is labelled when the number of word tokens in
    ``c.sensitive`` equals the number of its NER tags that are ``"PRODUCT"``.

    Args:
        c: Candidate whose ``sensitive`` span provides
            ``get_attrib_tokens`` and ``get_span``.

    Returns:
        ``"product"`` when the counts match (the span text is also printed),
        otherwise ``None``.
    """
    span = c.sensitive
    word_count = len(span.get_attrib_tokens("words"))
    product_tag_count = sum(
        1 for tag in span.get_attrib_tokens("ner_tags") if tag == "PRODUCT")
    if word_count == product_tag_count:
        # A parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print("PRODUCT:" + span.get_span())
        return "product"
    return None

Пример #5

0

Показать файл

#term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters, remove to have no max
max_doc_length = None

# Setting preprocessor
print(f'Preprocessing folder: {data_loc}')
doc_preprocessor = set_preprocessor(data_source,
                                    data_loc,
                                    max_docs=max_docs,
                                    verbose=False,
                                    clean_docs=False,
                                    content_fields=['raw_content', 'url'],
                                    term=term,
                                    max_doc_length=max_doc_length)

# Setting parser and applying corpus preprocessor
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(list(doc_preprocessor),
                    parallelism=parallelism,
                    verbose=False)

# Printing number of docs/sentences
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print("==============================")

Пример #6

0

Показать файл

Файл: single_task_experiment.py Проект: strategist922/snorkeling

        label_candidates(session, (total_candidates_df.query(
            "split==0&disease_mention_count==1&gene_mention_count==1").
                                   candidate_id.values.tolist()),
                         lfs,
                         lf_names,
                         num_threads=10,
                         batch_size=50000,
                         multitask=False)
    }

# In[12]:

if not quick_load:
    # Sanity check: re-run every label function on a handful of training
    # candidates and compare against the stored label matrix row.
    ids = label_matricies['train'].candidate_id.head(5).tolist()
    candidate_list = session.query(DiseaseGene).filter(
        DiseaseGene.id.in_(ids)).all()
    for candidate in tqdm_notebook(candidate_list):
        correct_output = [fn(candidate) for fn in lfs]
        # `@candidate.id` makes DataFrame.query read the local loop variable.
        # The scraped page had replaced this expression with an e-mail
        # obfuscation placeholder, which is not a valid query.
        test_output = label_matricies['train'].query(
            "candidate_id==@candidate.id").fillna(0).values.tolist()[0]
        # The last column is dropped before comparing -- presumably it is
        # the candidate_id column; verify against the label matrix layout.
        for expected, stored in zip(correct_output, test_output[:-1]):
            assert expected == stored

# In[13]:

if not quick_load:
    label_matricies.update({
        key:
        label_candidates(session,
                         candidate_dfs[key]['candidate_id'].values.tolist(),
                         lfs,

Пример #7

0

Показать файл

# In[ ]:

category_list = np.random.choice([0, 1, 2], total_sentences, p=[0.7, 0.2, 0.1])

# In[ ]:

# Divide the sentences into train, dev and test sets

#Grab the sentences!!!
train_sens = set()
dev_sens = set()
test_sens = set()

offset = 0
category_index = 0
sql_query = session.query(Document).limit(chunk_size)

# Divide and insert into the database
while True:
    documents = list(sql_query.offset(offset).all())

    if not documents:
        break

    for doc in tqdm.tqdm(documents):
        for s in doc.sentences:

            # Stratify the data into train, dev, test
            category = category_list[category_index]
            category_index = category_index + 1

Пример #8

0

Показать файл

cutoff = 300
total_candidates_df = (
    pd.read_table("../dataset_statistics/results/all_ctd_map.tsv.xz").query(
        "sen_length < 300"))
total_candidates_df.head(2)

# # Train Word Vectors

# This section trains the word vectors using the specifications described above.

# In[10]:

words_to_embed = []
candidates = (session.query(CompoundDisease).filter(
    CompoundDisease.id.in_(
        total_candidates_df.candidate_id.astype(int).tolist())).all())

# In[11]:

for cand in tqdm_notebook(candidates):
    args = [(cand[0].get_word_start(), cand[0].get_word_end(), 1),
            (cand[1].get_word_start(), cand[1].get_word_end(), 2)]
    words_to_embed.append(mark_sentence(candidate_to_tokens(cand), args))

# In[12]:

model = FastText(words_to_embed,
                 window=2,
                 negative=10,
                 iter=50,

Пример #9

0

Показать файл

Файл: clinton_emails.py Проект: vinodma/snorkel_projects

for corpus_name in ['Emails Development', 'Emails Test']:
    #corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            if number_of_people(sentence) < 5:
                sentences.add(sentence)
    
    %time c = ce.extract(sentences, corpus_name + ' Candidates', session)
    session.add(c)
session.commit()

from snorkel.models import CandidateSet

train = session.query(CandidateSet).filter(CandidateSet.name == 'Emails Training Candidates').one()
dev = session.query(CandidateSet).filter(CandidateSet.name == 'Emails Development Candidates').one()

from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

%time F_train = feature_manager.create(session, c, 'Train Features')


#To load existing use ..
#%time F_train = feature_manager.load(session, train, 'Train Features')						
						
from snorkel.annotations import LabelManager

label_manager = LabelManager()

Пример #10

0

Показать файл

cand_extractor = CandidateExtractor(GenePhenoPair, [gene_ngrams, pheno_ngrams],
                                    [GM, PM],
                                    symmetric_relations=True)

print "Splitting Docs..."
pathname = 'small_data/' if os.environ[
    'AGP_DATA_SIZE'] == 'small-data' else 'data/'
with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids, dev_ids, test_ids = set(sent_dicts['train']), set(
    sent_dicts['dev']), set(sent_dicts['test'])
all_ids = train_ids.union(dev_ids).union(test_ids)
# 40, 10, 10
train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set()
train_docs, dev_docs, test_docs = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()
doc_sents = dict()
for doc_num, doc in enumerate(docs):
    if len(train_docs) >= 40 and len(dev_docs) >= 10 and len(test_docs) >= 10:
        break
    doc_sents[doc_num] = set()
    for s in doc.sentences:
        all_sents.add(s)
        doc_sents[doc_num].add(s)
        name = doc.name.split('-')[0]
        if name in train_ids:
            train_docs.add(name)
            train_sents.add(s)
        elif name in dev_ids:
            dev_docs.add(name)
            dev_sents.add(s)

Пример #11

0

Показать файл

Файл: pipeline.py Проект: varun-tandon/MarkerVilleBackendAlpha

def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    """Extract candidate pairs, score them with a saved model, write a CSV.

    Args:
        candidate1: Name of the first argument of the candidate relation;
            also used as a CSV column header.
        candidate2: Name of the second argument of the candidate relation;
            also used as a CSV column header.
        pairing_name: Name of the candidate subclass to create. The value
            ``'BiomarkerCondition'`` additionally triggers
            ``add_adj_candidate_BC``.
        cand1_ngrams: ``n_max`` for the n-gram space of the first argument.
        cand2_ngrams: ``n_max`` for the n-gram space of the second argument.
        cand1Matcher: Matcher for the first argument.
        cand2Matcher: Matcher for the second argument.
        model_name: Name passed to ``reRNN.load``.
        output_file_name: Path of the CSV file to write.
        corpus_parser: Not used by this function; kept so existing callers
            keep working.

    Returns:
        None. Returns early, without creating the output file, when no
        candidates are found.
    """
    import csv
    import sys

    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print("Started")
    session = SnorkelSession()

    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    # Every sentence of every parsed document is a source of candidates.
    docs = session.query(Document).order_by(Document.name).all()
    sentences = {s for doc in docs for s in doc.sentences}

    candidate_extractor = CandidateExtractor(
        candidate_pair,
        [Ngrams(n_max=cand1_ngrams), Ngrams(n_max=cand2_ngrams)],
        [cand1Matcher, cand2Matcher])

    # All candidates of this run live in split 4; clear=True is passed so the
    # extractor starts from a clean state.
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()

    if not cands:
        print("No Candidates Found")
        return
    if pairing_name == 'BiomarkerCondition':
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        session.commit()

    lstm = reRNN(seed=1701, n_threads=None)
    lstm.load(model_name)
    predictions = lstm.predictions(cands)

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        output_file = open(output_file_name, 'wb')
    else:
        output_file = open(output_file_name, 'w', newline='')
    # The context manager closes the file even if a row fails to serialise.
    with output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(
            ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
        for cand, prediction in zip(cands, predictions):
            sentence = cand.get_parent()
            contexts = cand.get_contexts()
            # The document id is rebuilt from the document's string form by
            # dropping its first nine characters and prefixing 'PMC'.
            doc_string = 'PMC' + str(sentence.get_parent())[9:]
            csv_writer.writerow([
                unidecode(doc_string),
                unidecode(sentence.text),
                unidecode(contexts[0].get_span()),
                unidecode(contexts[1].get_span()), prediction
            ])

Пример #12

0

Показать файл

    searcher.fit(F_dev, L_dev, n_epochs=50, rebalance=0.5, print_freq=25)
    disc_models.append(disc_model)
    w = disc_model.save_dict['w']
    f = w.read_value()
    values = f.eval(session = disc_model.session)
    weights.append(values)
    test_marginals.append(disc_model.marginals(F_test))


# # Generate Statistics After Model Training

# ## Grab the feature weights

# In[ ]:

features = session.query(FeatureKey).all()
feat_data = []
for feat, w0, w1 in zip(features,weights[0],weights[1]):
    feat_data.append([feat.name, w0[0], w1[0]])
feat_frame = pd.DataFrame(feat_data, columns= ["Feature", "Model_KB", "Model_KB_CONTEXT"])


# ## Grab the class probabilities

# In[ ]:

test_marginals[0].shape
cand_probs = []
for candidate_id in L_test.candidate_index:
    cand = session.query(Candidate).filter(Candidate.id == candidate_id).one()
    index = L_test.candidate_index[candidate_id]

Пример #13

0

Показать файл

Файл: dataset_statistics.py Проект: hubayirp/snorkeling

dev_set_df = (entity_level_df.query("split==10&has_sentence==1").merge(
    dev_candidate_df, on=["drugbank_id", "doid_id"]))

test_set_df = (entity_level_df.query("split==11&has_sentence==1").merge(
    test_candidate_df, on=["drugbank_id", "doid_id"]))

# In[16]:

total_candidates_df = (training_set_df.append(dev_set_df).append(
    test_set_df).drop_duplicates("candidate_id"))

# In[17]:

dev_candidates = (session.query(CompoundDisease).filter(
    CompoundDisease.id.in_(
        total_candidates_df.query("split==10").sample(
            10000, random_state=100).candidate_id.tolist())).all())
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[18]:

test_candidates = (
    session.query(CompoundDisease).filter(
        CompoundDisease.id.in_(
            total_candidates_df.query("split==11")
            # Blacklist ethanol, alcohol and alcohol dependence:
            # sampling picks too many sentences with these entities
            .query("drugbank_id!='DB00898'&doid_id!='DOID:0050741'").sample(
                10000, random_state=120).candidate_id.tolist())).all())
test_df = make_sentence_df(test_candidates)

Пример #14

0

Показать файл

Файл: final_code.py Проект: aaditya-thakkar/Crime_Extractor

doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv',
                                      max_docs=max_docs)

# In[5]:

from snorkel.parser import CorpusParser

corpus_parser = CorpusParser()
get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)')

# In[6]:

from snorkel.models import Document, Sentence

print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()

dict_final = {}

crimetype_murder = [
    'killed', 'kill', 'kills', 'killing', 'murder', 'shot', 'shooting',
    'convicted', 'murdered'
]
crimetype_rape = [
    'rape', 'raped', 'gangraped', 'molested', 'molestation', 'molesting',
    'harassment', 'raping'
]
crimetype_attack = [
    'hurt', 'rioting', 'injured', 'attack', 'beating up', 'attacked'
]

Пример #15

0

Показать файл

Файл: ir_signals_denoising.py Проект: dkrigel/snorkel

                  help="the pair file [default: %default]")
parser.add_option("-s",
                  "--saved-dir",
                  dest="saved_dir",
                  help="directory to save the rank scores [default: %default]")

parser.set_defaults(
    input_pair=
    "/Users/datienguyen/Desktop/coding/data-search/exp-data/dataSEARCH/pair-store/train.csv"  #3_signals.top20doc.csv" #
    ,
    saved_dir="../../data-search/exp-data/dataSearch/pair-store/")

opts, args = parser.parse_args(sys.argv)
input_pair = opts.input_pair

session.query(Context).delete()
session.query(Candidate).delete()

values = ['positive', 'negative']
Tweet = candidate_subclass('Tweet', ['tweet'], values=values)

#item_id,worker_id,query_id,doc1,doc2,annotation
cand_dict = {}

with open(input_pair, "r") as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        #print(row)
        item_id = row[0]
        worker_id = row[1]
        anno = row[5]

Пример #16

0

Показать файл

Файл: 2.epilepsy-labeler.py Проект: strategist922/snorkeling

# In[ ]:

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# # Look at potential Candidates

# Use this to look at loaded candidates from a given set. The constants represent the index to retrieve the training set, development set and testing set.

# In[ ]:

TRAIN = 0
DEV = 1
TEST = 2

candidates = session.query(DiseaseGene).filter(
    DiseaseGene.split == TRAIN).all()
sv = SentenceNgramViewer(candidates, session)

# In[ ]:

sv

# # Label Functions

# Here is the fundamental part of the project. Below are the label functions that are used to give a candidate a label of 1, 0 or -1, which correspond to a correct relation, not sure, and an incorrect relation, respectively. The goal here is to develop functions that can label as many candidates as possible.

# In[ ]:

gene_list = pd.read_csv('epilepsy-genes.tsv', sep="\t")

# In[ ]:

Пример #17

0

Показать файл

Файл: cv_script_LSTM.py Проект: EricaXia/snorkel

n_docs = 500

doc_preprocessor = TSVDocPreprocessor(
    'pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams], [
                                    virus_matcher, animals_matcher], nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords

# List to parenthetical


def ltp(x):
    """Join the strings in ``x`` into a parenthesised alternation, e.g. ``(a|b)``."""
    alternatives = '|'.join(x)
    return '({})'.format(alternatives)


# --------------------------------

# Positive LFs:

detect = {'detect', 'detects', 'detected',

Пример #18

0

Показать файл

cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, 'Protein Training')

for name, path in [('Protein Development', 'data/protein_dev.tsv'),
                   ('Protein Test', 'data/protein_test.tsv')]:
    doc_parser.path=path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()


from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import Corpus

corpus = session.query(Corpus).filter(Corpus.name == 'Protein Training').one()
corpus

sentences = set()
for document in corpus:
    for sentence in document.sentences:
        sentences.add(sentence)

from snorkel.candidates import Ngrams
from snorkel.models import candidate_subclass
#entity = candidate_subclass('entity', ['entity1', 'entity2'])
import pandas as pd
ROOT = 'data/dicts/'
proteins   = set(pd.read_csv(ROOT + 'protein_names.csv', header=None, index_col=0, encoding='utf-8').dropna()[1])
ngrams = Ngrams(n_max=1)
from snorkel.matchers import DictionaryMatch

Пример #19

0

Показать файл

Файл: politeness.py Проект: chrisflucas/respect

        if print_stats:
            self.print_report(speaker, apologize, ask_agency, give_agency, \
                gratitude, please)

        return {
            'apologize': apologize,
            'ask_agency': ask_agency,
            'give_agency': give_agency,
            'gratitude': gratitude,
            'please': please
        }


if __name__ == '__main__':
    session = SnorkelSession()
    docs = session.query(ReconDocument)
    i = 0
    p = PolitenessExtractor()
    '''
		Stupid code because I don't know how to get the length 
		of the array of documents returned by the query. 

		i.e. for i in range(len(docs)): does not work because
			 len(docs) is not a thing...
	'''

    while True:
        try:
            p.compute_score(docs[i], print_stats=False)
            i += 1
        except:

Пример #20

0

Показать файл

    for line in open(filename, "r", errors='ignore').readlines():
        if line.split("\t")[0] in docID or len(line.split("\t")) != 2:
            continue
        docID.add(line.split("\t")[0])
        fout.write(line.replace("\n", " ").strip() + "\n")

print("total docID count", len(docID))
doc_preprocessor = TSVDocPreprocessor(newfile,
                                      encoding="utf-8",
                                      max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # ,parallelism=5)

print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from collections import defaultdict
import numpy as np

session = SnorkelSession()
docs = session.query(Document).all()
sents = session.query(Sentence).all()  # get all sentences from snorkel.db

Пример #21

0

Показать файл

def get_gold_labels(session: SnorkelSession,
                    annotator_name: str = "gold") -> List[dict]:
    """Fetch every gold label stored under the given annotator key.

    Looks up the first ``GoldLabelKey`` whose name is ``annotator_name`` and
    returns all ``GoldLabel`` rows whose ``key`` equals it.

    Args:
        session: Session used to run both queries.
        annotator_name: Name of the gold-label key to look up.

    Returns:
        The result of ``.all()`` on the ``GoldLabel`` query.
    """
    key_query = session.query(GoldLabelKey).filter(
        GoldLabelKey.name == annotator_name)
    # first() yields None when no key has that name; the label filter then
    # compares against None.
    annotator_key = key_query.first()
    labels_query = session.query(GoldLabel).filter(
        GoldLabel.key == annotator_key)
    return labels_query.all()

Пример #22

0

Показать файл

Файл: evaluate_price_extractor_snorkel.py Проект: CHTAP/extractors

LFs = [lf_preceding_half_missing_quantity, lf_preceding_half]

# Setting extraction type -- should be a subfield in your data source extractions field!
from dataset_utils import create_candidate_class
extraction_type = 'price'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Printing number of docs/sentences
from snorkel.models import Document, Sentence
print("==============================")
print(f"DB contents for {postgres_db_name}:")
print(
    "Number of candidates:",
    session.query(candidate_class).filter(candidate_class.split == 0).count())
print("==============================")

# Split to pull eval candidates from
eval_split = 0

# Executing query for eval candidates
eval_cands = session.query(candidate_class).filter(
    candidate_class.split == eval_split).order_by(candidate_class.id).all()
print(f'Loaded {len(eval_cands)} candidates...')

# Applying LFs
print("Applying LFs...")
from snorkel.annotations import LabelAnnotator
labeler = LabelAnnotator(lfs=LFs)
L_eval = labeler.apply(split=eval_split, parallelism=parallelism)

Пример #23

0

Показать файл

Файл: snorkelDetectsCuts.py Проект: patrickmarcel/SQLWL-segmentation

session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)


pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch=RegexMatchSpan(rgx=".*")
cs=queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])


docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

sents=set();
for i,doc in enumerate(docs):
    for s in doc.sentences:
        sents.add(s)


cand_extractor.apply(sents)

print("Number of candidates:", session.query(pairs).count())


labeler = LabelAnnotator(lfs=LFs)

Пример #24

0

Показать файл

def main(argv):
    parser = argparse.ArgumentParser(description='Process some arguments.')
    parser.add_argument('--dbPath',
                        type=str,
                        default=os.getcwd() + os.sep + 'snorkel.db',
                        help='the path of snorkel database')
    parser.add_argument(
        '--lfPath',
        type=str,
        default=os.getcwd() + os.sep + 'util_default.py',
        help='the path of util.py file where labelling functions were defined')

    args = parser.parse_args()

    # Connect to db, and get session

    util_module = imp.load_source("module.name", args.lfPath)
    train_doc_breakdown_map = dict(
    )  # maps doc_id into a dict of ["Background", "Purpose", "Mechanism", "Method", "Finding"]
    test_doc_breakdown_map = dict()

    SnorkelSession = create_session_with_conn("sqlite:///" + args.dbPath)
    session = SnorkelSession()

    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())

    sents = session.query(Sentence).all()
    n_max_corpus = 0
    for sent in sents:
        n_max_corpus = max(n_max_corpus, len(sent.words))

    print("The longest sentence has " + str(n_max_corpus) + " tokens.")

    ngrams = Ngrams(n_max=n_max_corpus)

    # from util import number_of_people

    docs = session.query(Document).all()

    train_sents = set()
    dev_sents = set()
    test_sents = set()

    for i, doc in enumerate(docs):
        for s in doc.sentences:
            if i % 10 == 8 and "cscw18" != doc.name[:6]:
                dev_sents.add(s)
            elif "cscw18" == doc.name[:
                                      6]:  # replace the earlier 10% test documents as cscw'18 annotation guideline 10 examples
                test_sents.add(s)
            elif "cscw18" != doc.name[:6]:
                train_sents.add(s)

    General, general_extractor = util_module.get_segment_class_and_matcher(
        "General", ngrams)
    general_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        general_extractor,
        General,
        "General",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)

    input("Finished general ")

    # load segment_candidate_class and corresponding_matcher, e.g. (Background, non_comma_dict_background_matcher)
    Background, background_matcher = util_module.get_segment_class_and_matcher(
        "Background", ngrams)
    background_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        background_matcher,
        Background,
        "Background",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)

    debug_sess_eval(session, Background, background_matcher)

Пример #25

0

Показать файл

train_df = pd.read_excel(spreadsheet_names['train'])
train_cids = train_df.candidate_id.astype(int).tolist()
train_df.head(2)


# In[ ]:


dev_df = pd.read_excel(spreadsheet_names['dev'])
dev_df = dev_df[dev_df.curated_dsh.notnull()]
dev_cids = list(map(int, dev_df.candidate_id.values))
dev_df.head(2)


# In[ ]:


train_hand_df = pd.read_excel(spreadsheet_names['train_hand_label'])
train_hand_cids = train_hand_df[train_hand_df.curated_dsh.notnull()].candidate_id.astype(int).tolist()
train_hand_df.head(2)


# In[ ]:


for cid_list in [train_cids, train_hand_cids, dev_cids]:
    cids = session.query(CompoundGene.id).filter(CompoundGene.id.in_(cid_list))
    get_ipython().magic(u'time labeler.apply(cids_query=cids, parallelism=5)')

Пример #26

0

Показать файл

Файл: main.py Проект: gmachiraju/markerville-backend

# Parsing
corpus_parser = CorpusParser()

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)

train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID
for i, doc in enumerate(docs):
    for s in doc.sentences:
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        elif doc.name in test_ids:
            test_sents.add(s)
        else:
            raise Exception('ID <{0}> not found in any id set'.format(
                doc.name))

#----------------------

Пример #27

0

Показать файл

) as candidate_sen inner join sentence on candidate_sen.sentence_id=sentence.id
'''
candidate_sentence_df = pd.read_sql(sql, database_str).astype(
    {"entrez_gene_id": int})
candidate_sentence_df.head(2)

# In[10]:

total_candidates_df = (edge_level_df.merge(candidate_sentence_df,
                                           on=["doid_id", "entrez_gene_id"]))
total_candidates_df.head(2)

# In[11]:

dev_candidates = (session.query(DiseaseGene).filter(
    DiseaseGene.id.in_(
        total_candidates_df.query("split==1").sample(
            10000, random_state=100).candidate_id.tolist())).all())
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[12]:

test_candidates = (session.query(DiseaseGene).filter(
    DiseaseGene.id.in_(
        total_candidates_df.query("split==2").sample(
            10000, random_state=120).candidate_id.tolist())).all())
test_df = make_sentence_df(test_candidates)
test_df.head(2)

# In[13]:

Пример #28

0

Показать файл

dev_set_df = (entity_level_df.query("split==7&has_sentence==1").merge(
    dev_candidate_df.astype({"entrez_gene_id": int}),
    on=["drugbank_id", "entrez_gene_id"]))

test_set_df = (entity_level_df.query("split==8&has_sentence==1").merge(
    test_candidate_df.astype({"entrez_gene_id": int}),
    on=["drugbank_id", "entrez_gene_id"]))

# In[16]:

total_candidates_df = (training_set_df.append(dev_set_df).append(test_set_df))

# In[17]:

dev_candidates = (session.query(CompoundGene).filter(
    CompoundGene.id.in_(
        total_candidates_df.query("split==7").sample(
            10000, random_state=100).candidate_id.tolist())).all())
dev_df = make_sentence_df(dev_candidates)
dev_df.head(2)

# In[18]:

test_candidates = (session.query(CompoundGene).filter(
    CompoundGene.id.in_(
        total_candidates_df.query("split==8").sample(
            10000, random_state=100).candidate_id.tolist())).all())
test_df = make_sentence_df(test_candidates)
test_df.head(2)

# In[ ]:

Python SnorkelSession.query примеры использования