#entity = candidate_subclass('entity', ['entity1', 'entity2']) import pandas as pd ROOT = 'data/dicts/' proteins = set(pd.read_csv(ROOT + 'protein_names.csv', header=None, index_col=0, encoding='utf-8').dropna()[1]) ngrams = Ngrams(n_max=1) from snorkel.matchers import DictionaryMatch longest_match_only = True dict_proteins = DictionaryMatch(d=proteins, ignore_case=True, longest_match_only=longest_match_only) #misc_matcher = MiscMatcher(longest_match_only=True) from snorkel.candidates import CandidateExtractor ce = CandidateExtractor(entity, [ngrams, ngrams], [dict_proteins, dict_proteins], symmetric_relations=False, nested_relations=False, self_relations=False) %time c = ce.extract(sentences, 'Protein1 Training Candidates', session) for corpus_name in ['Protein Development']: corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one() sentences = set() for document in corpus: for sentence in document.sentences: sentences.add(sentence) %time c = ce.extract(sentences, 'Protein1 Development Candidates', session) session.add(c) session.commit()
ngrams = Ngrams(n_max=3) from snorkel.matchers import PersonMatcher from snorkel.matchers import OrganizationMatcher person_matcher = PersonMatcher(longest_match_only=True) org_matcher = OrganizationMatcher(longest_match_only=True) from snorkel.candidates import CandidateExtractor ce = CandidateExtractor(Title, [ngrams, ngrams], [person_matcher, org_matcher], symmetric_relations=False, nested_relations=False, self_relations=False) %time c = ce.extract(sentences, 'Emails Training Candidates', session) print "Number of candidates:", len(c) session.add(c) session.commit() for corpus_name in ['Emails Development', 'Emails Test']: #corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one() sentences = set() for document in corpus: for sentence in document.sentences: if number_of_people(sentence) < 5: sentences.add(sentence) %time c = ce.extract(sentences, corpus_name + ' Candidates', session) session.add(c)