def setUpClass(cls):
    """Load the fixtures shared by this test class from the bundled ``spouses.db``.

    Sets ``cls.session``, ``cls.train_marginals``, the train/dev/test candidate
    lists, the three feature matrices and the dev/test gold labels.

    HACK: this creates a session to a different DB after Snorkel has already
    been imported. It does not work in general because, e.g., the UDF
    constructor uses Snorkel's new_sessionmaker on different processes. In
    general, the connection should still be set via the SNORKELDB environment
    variable.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    db_url = os.path.join('sqlite:///' + here, 'spouses.db')
    make_session = sessionmaker(bind=create_engine(db_url))
    cls.session = make_session()

    Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

    cls.train_marginals = load_marginals(cls.session, split=0)

    def candidates_in(split):
        # Every Spouse candidate of one split, ordered by id.
        in_split = cls.session.query(Spouse).filter(Spouse.split == split)
        return in_split.order_by(Spouse.id).all()

    cls.train_cands = candidates_in(0)
    cls.dev_cands = candidates_in(1)
    cls.test_cands = candidates_in(2)

    # Each candidate is featurized as 10 floats. The first five are between
    # -.25 and 1 if the class label is True and between -1 and .25 if False.
    # The remaining five are between -1 and 1.
    def features_for(split):
        return load_feature_matrix(cls.session, split=split, coerce_int=False)

    cls.F_train = features_for(0)
    cls.F_dev = features_for(1)
    cls.F_test = features_for(2)

    def gold_for(split):
        return load_gold_labels(cls.session, annotator_name='gold', split=split)

    cls.L_gold_dev = gold_for(1)
    cls.L_gold_test = gold_for(2)
help="directory to save the rank scores [default: %default]") parser.set_defaults( input_pair= "/Users/datienguyen/Desktop/coding/data-search/exp-data/dataSEARCH/pair-store/train.csv" #3_signals.top20doc.csv" # , saved_dir="../../data-search/exp-data/dataSearch/pair-store/") opts, args = parser.parse_args(sys.argv) input_pair = opts.input_pair session.query(Context).delete() session.query(Candidate).delete() values = ['positive', 'negative'] Tweet = candidate_subclass('Tweet', ['tweet'], values=values) #item_id,worker_id,query_id,doc1,doc2,annotation cand_dict = {} with open(input_pair, "r") as myFile: reader = csv.reader(myFile) for row in reader: #print(row) item_id = row[0] worker_id = row[1] anno = row[5] if item_id not in cand_dict: cand_dict[item_id] = {} cand_dict[item_id][worker_id] = anno
password = "******" dbname = "pubmeddb" #Path subject to change for different os database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(username, password, dbname) os.environ['SNORKELDB'] = database_str from snorkel import SnorkelSession session = SnorkelSession() # In[3]: from snorkel.models import candidate_subclass, Candidate CompoundDisease = candidate_subclass('CompoundDisease', ['Compound', 'Disease']) # In[4]: from utils.notebook_utils.dataframe_helper import write_candidates_to_excel, make_sentence_df # ## Load and Merge DataFrames # In[5]: edge_level_df = ( pd.read_table("input/compound_treats_disease.tsv.xz")
from snorkel.models import candidate_subclass from .dictionaries import * from .helper import make_regex candidate_indices = [ 8, 14, 15, 20, 24 ] # Indices for reading the corresponding column in the "comments_bewertungen_new_ids.csv" File # Candidates Intelligenz_BB3c = candidate_subclass('Intelligenz', ['signal_word']) WirkungaufBRD_BB3i = candidate_subclass('WirkungaufBRD', ['signal_word']) Implikation_BB4 = candidate_subclass( 'Implikation', ['signal_word'] ) # Gehören, sollten, etc. Abfrage mit Reflexiven Personalpronomen, Indefinite Pronomen, Imperative, kein Verb Beschimpfung_BB6a = candidate_subclass('Beschimpfung', ['signal_word']) Entmenschlichung_BB6e = candidate_subclass('Entmenschlichung', ['signal_word']) # Regexes intelligenz_regex = make_regex(intelligenz_signal_words) wirkungaufbrd_regex = make_regex(wirkungaufbrd_signal_words) implikation_regex = make_regex(implikation_signal_words) beschimpfung_regex = make_regex(beschimpfung_signal_words, offense_signal_words, refugee_related_signal_words, negative_signal_words) entmenschlichung_regex = make_regex(entmenschlichung_signal_words, animal_signal_words) candidate_classes = [ Intelligenz_BB3c, WirkungaufBRD_BB3i, Implikation_BB4, Beschimpfung_BB6a,
# SNORKELDB is set before the snorkel import below.
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

session = SnorkelSession()

# In[3]:

from snorkel.learning.pytorch.rnn.rnn_base import mark_sentence
from snorkel.learning.pytorch.rnn.utils import candidate_to_tokens
from snorkel.models import Candidate, candidate_subclass

# In[4]:

CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])

# # Compound Binds Gene
# This section loads the dataframe that contains all compound binds gene
# candidate sentences and their respective dataset assignments.

# In[5]:

# `cutoff` is read by name from the query string below (``@cutoff``).
cutoff = 300
total_candidates_df = pd.read_table(
    "../dataset_statistics/data/all_cbg_candidates.tsv.xz")
# Keep only the rows whose sen_length is below the cutoff.
total_candidates_df = total_candidates_df.query("sen_length < @cutoff")
total_candidates_df.head(2)

# # Train Word Vectors
from snorkel import SnorkelSession

session = SnorkelSession()

# In[3]:

from snorkel.learning.pytorch.rnn.utils import candidate_to_tokens
from snorkel.models import Candidate, candidate_subclass

# In[4]:

GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])

# In[5]:

# Read the candidate table, then order its rows by candidate_id.
total_candidates_df = pd.read_table("input/all_gig_candidates.tsv.xz")
total_candidates_df = total_candidates_df.sort_values("candidate_id")
total_candidates_df.head(2)

# In[6]:
crimetype = 'Drugs related Crime' break print(doc, date, crimetype) dict_final[doc] = { 'docno': doc, 'date': date, 'crimetype': crimetype, 'location': [] } # In[2]: # In[7]: from snorkel.models import candidate_subclass LocationPer = candidate_subclass('LocationPer', ['location', 'person']) # Location = candidate_subclass('Location', ['location']) # In[8]: from snorkel.candidates import Ngrams, CandidateExtractor from snorkel.matchers import PersonMatcher, LocationMatcher ngrams = Ngrams(n_max=3) person_matcher = PersonMatcher(longest_match_only=True) location_matcher = LocationMatcher(longest_match_only=True) cand_extractor = CandidateExtractor(LocationPer, [ngrams, ngrams], [person_matcher, location_matcher], symmetric_relations=False) # cand_extractor2 = CandidateExtractor(Location,
# Earlier labeling-function selections, kept for reference.
#LFs=[LF_edit_index, LF_jackard_index, LF_Common_Tables_Index] # best so far
#LFs=[LF_edit_index,LF_jackard_index,LF_cosine_index,LF_Common_Tables_Index,LF_common_fragment_index]
#LFs=[LF_edit_index,LF_jackard_index,LF_cosine_index,LF_common_fragment_index]
#LFs = [LF_recall_projections2, LF_recall_selections2,LF_recall_tables2, LF_edit_index, LF_jackard_index,LF_common_fragment_index, LF_Common_Tables_Index]
LFs = [LF_recall_projections2, LF_edit_index, LF_jackard_index]

##### snorkeling
session = SnorkelSession()

# Parse the TSV documents at `path` with the Spacy parser.
doc_preprocessor = TSVDocPreprocessor(path)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

# Single-span candidate class; the matcher pattern ".*" accepts any text.
pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

# Gather the sentences of every document into one set, then extract
# candidates from all of them in a single call.
sents = set()
for doc in docs:
    sents.update(doc.sentences)

cand_extractor.apply(sents)
# Notebook export: the lines starting with `%` are IPython magics, so this
# chunk only runs under IPython/Jupyter.

# NOTE(review): RangeParameter and LogReg are used here but only imported
# further down in this chunk -- presumably the cells were exported out of
# order; confirm against the notebook.
reg_param = RangeParameter('mu', 1e-8, 1e-2, step=1, log_base=10)
disc_model = LogReg()

%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import CandidateSet

from snorkel.models import candidate_subclass

from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

entity = candidate_subclass('entity', ['entity1', 'entity2'])

# Look up the development candidate set by name; `.one()` requires exactly
# one matching row.
dev = session.query(CandidateSet).filter(CandidateSet.name == 'Protein1 Development Candidates').one()

%time F_dev = feature_manager.update(session, dev, 'Train1 Features', False)

from snorkel.annotations import LabelManager

label_manager = LabelManager()

L_gold_dev = label_manager.load(session, dev, "Sotera User")

# NOTE(review): this set is named 'Protein Development Candidates' while
# `dev` above uses 'Protein1 Development Candidates' -- confirm the two
# different names are intentional.
gold_dev_set = session.query(CandidateSet).filter(CandidateSet.name == 'Protein Development Candidates').one()

from snorkel.learning import LogReg
from snorkel.learning_utils import RandomSearch, ListParameter, RangeParameter

# Search-space definitions for the 'n_iter' and 'rate' parameters.
iter_param = ListParameter('n_iter', [250, 500, 1000, 2000])
rate_param = RangeParameter('rate', 1e-4, 1e-2, step=0.75, log_base=10)
from snorkel.candidates import Ngrams, CandidateExtractor from snorkel.viewer import SentenceNgramViewer from snorkel.annotations import LabelAnnotator, load_gold_labels, FeatureAnnotator, save_marginals, load_marginals from snorkel.learning import SparseLogisticRegression, GenerativeModel, RandomSearch from snorkel.learning.structure import DependencySelector from snorkel.learning.utils import MentionScorer # from snorkel.contrib.rnn import reRNN import matchers import LF from candidate_adjective_fixer import * from load_external_annotations_new import load_external_labels session = SnorkelSession() BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition']) # Helper functions # In[ ]: #------------------ # Helper Functions #------------------ def grabCandidates(extractor, schema): # Candidate Counts for k, sents in enumerate([train_sents, dev_sents, test_sents]): extractor.apply(sents, split=k, clear=False) print "Number of candidates: ", session.query(schema).filter(
# Notebook export: `%time` is an IPython magic, so this chunk only runs under
# IPython/Jupyter. `cp`, `session`, `name` and `number_of_people` are defined
# outside this chunk.
%time corpus = cp.parse_corpus(session, name)

session.commit()

# Keep only the sentences for which number_of_people() returns less than 5.
sentences = set()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
            sentences.add(sentence)

from snorkel.models import candidate_subclass

# NOTE(review): the Python name is `Title` but the candidate class is
# registered as 'Person_Org' -- confirm the variable name is intended.
Title = candidate_subclass('Person_Org', ['person1', 'organization'])

from snorkel.candidates import Ngrams

ngrams = Ngrams(n_max=3)

from snorkel.matchers import PersonMatcher
from snorkel.matchers import OrganizationMatcher

person_matcher = PersonMatcher(longest_match_only=True)
org_matcher = OrganizationMatcher(longest_match_only=True)

from snorkel.candidates import CandidateExtractor
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    """Extract candidate pairs from all stored sentences, score them, write a CSV.

    Candidates are extracted from every sentence of every Document in the
    Snorkel database into split 4 (clearing that split first), scored with a
    saved reRNN model, and written one row per candidate.

    Args:
        candidate1: Name of the first span of the relation; also the third
            CSV column header.
        candidate2: Name of the second span of the relation; also the fourth
            CSV column header.
        pairing_name: Name of the candidate subclass to create. The value
            'BiomarkerCondition' additionally triggers add_adj_candidate_BC.
        cand1_ngrams: ``n_max`` for the first span's Ngrams space.
        cand2_ngrams: ``n_max`` for the second span's Ngrams space.
        cand1Matcher: Matcher applied to the first span.
        cand2Matcher: Matcher applied to the second span.
        model_name: Name passed to ``reRNN.load``.
        output_file_name: Path of the CSV file to write.
        corpus_parser: Not used by this function; kept so existing callers
            keep working.

    Returns:
        None. When no candidates are found the function returns early and no
        output file is created.
    """
    # Split number reserved for this prediction run.
    prediction_split = 4

    # Single-argument print in call form behaves the same on Python 2 and 3.
    print("Started")
    session = SnorkelSession()

    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    # Every sentence of every document, documents visited in name order.
    sentences = set()
    for doc in session.query(Document).order_by(Document.name).all():
        sentences.update(doc.sentences)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)
    candidate_extractor = CandidateExtractor(
        candidate_pair,
        [cand_1_ngrams, cand_2_ngrams],
        [cand1Matcher, cand2Matcher])

    candidate_extractor.apply(sentences, split=prediction_split, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == prediction_split).order_by(
            candidate_pair.id).all()
    session.commit()

    if not cands:
        print("No Candidates Found")
        return

    if pairing_name == 'BiomarkerCondition':
        # NOTE(review): `cands` is not re-queried after this call, so anything
        # it adds to the database is not scored below -- confirm intended.
        add_adj_candidate_BC(session, candidate_pair, cands, prediction_split)
    # NOTE(review): the original indentation of this commit was lost; it is
    # kept unconditional, which is a no-op when nothing is pending.
    session.commit()

    lstm = reRNN(seed=1701, n_threads=None)
    lstm.load(model_name)
    predictions = lstm.predictions(cands)

    import csv

    # The `with` block closes the file even if a row fails to serialize; the
    # original never closed it. Binary mode is what the Python 2 csv module
    # expects; on Python 3 this would be open(..., 'w', newline='').
    with open(output_file_name, 'wb') as output_file:
        csvWriter = csv.writer(output_file)
        csvWriter.writerow(
            ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
        for cand, prediction in zip(cands, predictions):
            sentence = cand.get_parent()
            # Drops the first 9 characters of the document's str() form and
            # prefixes 'PMC'.
            doc_string = 'PMC' + str(sentence.get_parent())[9:]
            first_span, second_span = cand.get_contexts()[:2]
            csvWriter.writerow([
                unidecode(doc_string),
                unidecode(sentence.text),
                unidecode(first_span.get_span()),
                unidecode(second_span.get_span()),
                prediction,
            ])
# Make the sibling snorkel checkout importable before the imports below.
sys.path.insert(1, '../snorkel')

from snorkel import SnorkelSession
from snorkel.matchers import DictionaryMatch
from final_candidates import GM, PM
from snorkel.candidates import Ngrams, CandidateSpace, CandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

SPLIT_ON_DOCS = False
ALL_DOCS = True  # if true, create train dev and test. if false, push everything to dev cands.

session = SnorkelSession()

GenePhenoPair = candidate_subclass('GenePhenoPair2', ['gene', 'pheno'])
gene_ngrams = Ngrams(n_max=5)
pheno_ngrams = Ngrams(n_max=10)
cand_extractor = CandidateExtractor(
    GenePhenoPair,
    [gene_ngrams, pheno_ngrams],
    [GM, PM],
    symmetric_relations=True)

print("Splitting Docs...")

# The data directory is chosen by the AGP_DATA_SIZE environment variable,
# which must be set (a missing variable raises KeyError).
if os.environ['AGP_DATA_SIZE'] == 'small-data':
    pathname = 'small_data/'
else:
    pathname = 'data/'

with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)

# One id set per split, plus the union of all three.
train_ids, dev_ids, test_ids = (
    set(sent_dicts[part]) for part in ('train', 'dev', 'test'))
all_ids = train_ids | dev_ids | test_ids
rule_regex_search_before_A, rule_regex_search_before_B, ) # A ContextSpace defines the "space" of all candidates we even potentially consider; in this case we use the Ngrams subclass, and look for all n-grams up to 7 words long session = SnorkelSession() doc_preprocessor = TSVDocPreprocessor('/Users/fanglinchen/Desktop/PersonalDataStack/DeepScrub/DeepScrub/algorithms/input.tsv', max_docs=350) corpus_parser = CorpusParser(parser=Spacy()) corpus_parser.apply(doc_preprocessor) Sensitive = candidate_subclass('Sensitive', ['sensitive'], values = ['person', 'job', 'event', 'place', 'date', 'time', 'product', 'email', 'phone', 'quantity', 'address', 'url', 'org', 'file', 'password', False]) # generating candidates. ngrams = Ngrams(n_max=6) ngramMatcher = NgramMatcher(longest_match_only = False) cand_extractor = CandidateExtractor( Sensitive, [ngrams], [ngramMatcher], symmetric_relations=False ) sents = session.query(Sentence).all() cand_extractor.apply(sents, split=0)
experiment_name = '_exp3' # experiment_name = '25similar' print "Setting up variables & DB connection for experiment:\n" print "*******************\n%s\n*******************" % experiment_name # point to appropriate DBs, pickle files etc. pkl_paths = 'pickles/%s/' % experiment_name path_candidate_dict_pkl = pkl_paths + 'candidate_dict.pickle' # TODO rename that path_pubmed_ids_pkl = pkl_paths + 'pubmed_ids.pickle' path_base_learners = pkl_paths + 'base_learner_predictions' # TODO create dirs # Shortcuts to connect to database, initialize candidate subclass and return snorkel session import os #TODO: set experiment_name and restructure dir os.environ['SNORKELDB'] = 'postgres:///snorkel' + experiment_name from snorkel import SnorkelSession session = SnorkelSession() from snorkel.models import Document, Sentence import matplotlib.pyplot as plt from snorkel.annotations import save_marginals from snorkel.models import Candidate, candidate_subclass REGULATOR = candidate_subclass('REGULATOR', ['Chemical', 'Gene']) print "\nSnorkel session connected to: ", os.environ['SNORKELDB']
virus_list.remove(word) # ------------------------------------------ # START SNORKEL SESSION session = SnorkelSession() n_docs = 500 doc_preprocessor = TSVDocPreprocessor( 'pdfs_big.tsv', max_docs=n_docs) # new files (88 papers) corpus_parser = CorpusParser(parser=Spacy()) corpus_parser.apply(doc_preprocessor, count=n_docs) VirusHost = candidate_subclass('VirusHost', ['virus', 'host']) ngrams = Ngrams(n_max=10) virus_matcher = DictionaryMatch(d=virus_list) animals_matcher = DictionaryMatch(d=animals_list) cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams], [ virus_matcher, animals_matcher], nested_relations=True) docs = session.query(Document).order_by(Document.name).all() # Text Pattern based labeling functions, which look for certain keywords # List to parenthetical def ltp(x):
from snorkel.annotations import FeatureAnnotator, LabelAnnotator, save_marginals
from snorkel.learning import GenerativeModel
from snorkel.learning.utils import MentionScorer
from snorkel.models import Candidate, FeatureKey, candidate_subclass
from snorkel.utils import get_as_dict
from tree_structs import corenlp_to_xmltree
from treedlib import compile_relation_feature_generator

# In[ ]:

# Selects which candidate class the chain below defines:
# "dg", "gg", "cg" or "cd".
edge_type = "dg"

# In[ ]:

# Each branch binds a differently named class, so only the name matching
# `edge_type` exists afterwards. An unrecognized value only prints a message;
# no class is defined and execution continues.
if edge_type == "dg":
    DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])
elif edge_type == "gg":
    GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])
elif edge_type == "cg":
    CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])
elif edge_type == "cd":
    CompoundDisease = candidate_subclass('CompoundDisease', ['Compound', 'Disease'])
else:
    print("Please pick a valid edge type")

# # Load preprocessed data
# This code will load the label matrix that was generated in the previous
# notebook ([Notebook 2](2.data-labeler.ipynb)). **Disclaimer**: this block
# might break, which means that the installed snorkel is still running its old
# code. The problem with the old code is that sqlalchemy will attempt to load
# all the labels into memory. That is fine while the number of labels is
# small, but it does not scale when the number of labels grows exponentially.
# The good news is that there is a pull request to fix this issue.
# [Check it out here!](https://github.com/HazyResearch/snorkel/pull/789)

# In[ ]:
docs_per_bucket = args.docs_per_bucket sents_split = defaultdict(lambda: []) for ind, doc in enumerate(docs): bucket = int(ind / docs_per_bucket) for s in doc.sentences: sents_split[bucket] += [s] print("Number of buckets: (should have around ~100 buckets??)", len(sents_split)) from snorkel.models import candidate_subclass from snorkel.candidates import Ngrams, CandidateExtractor from snorkel.matchers import * import datetime Unigram = candidate_subclass('Unigram', ['unigram_cue'], values=['PP', 'MN', 'NULL']) ngrams = Ngrams(n_max=1) ngram_matcher = NgramMatcher() unigram_segment_extractor = CandidateExtractor(Unigram, [ngrams], [ngram_matcher]) # from snorkel.lf_helpers import * from snorkel.annotations import LabelAnnotator # from LF.util_common_default_categorical import purpose_LFs,mechanism_LFs,null_LFs from LF.util_common_default_categorical_onset_1026 import * # purpose_LFs,mechanism_LFs,null_LFs print("total LF count", len(purpose_LFs + mechanism_LFs + null_LFs), "unique count", len(set(purpose_LFs + mechanism_LFs + null_LFs)), "purpose_LFs",
# SNORKELDB is set before the snorkel import below.
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

session = SnorkelSession()

# In[3]:

from snorkel.learning.pytorch.rnn.rnn_base import mark_sentence
from snorkel.learning.pytorch.rnn.utils import candidate_to_tokens
from snorkel.models import Candidate, candidate_subclass

# In[4]:

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# # Disease Associates Gene
# This section loads the dataframe that contains all disease associates gene
# candidate sentences and their respective dataset assignments.

# In[5]:

# `cutoff` is read by name from the query string below (``@cutoff``).
cutoff = 300
total_candidates_df = pd.read_table(
    "../dataset_statistics/data/all_dg_candidates_map.tsv.xz")
# Keep only the rows whose sen_length is below the cutoff.
total_candidates_df = total_candidates_df.query("sen_length < @cutoff")
total_candidates_df.head(2)

# # Embed All Disease Gene Sentences
if doc.name in train_ids: train_sents.add(s) elif doc.name in dev_ids: dev_sents.add(s) elif doc.name in test_ids: test_sents.add(s) else: raise Exception('ID <{0}> not found in any id set'.format( doc.name)) #---------------------- # Candidate Extraction #---------------------- # Defining the Candidate Schemas BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition']) BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug']) BiomarkerMedium = candidate_subclass('BiomarkerMedium', ['biomarker', 'medium']) # N-grams: the probabilistic search space of our entities biomarker_ngrams = Ngrams(n_max=1) condition_ngrams = Ngrams(n_max=7) drug_ngrams = Ngrams(n_max=5) medium_ngrams = Ngrams(n_max=5) type_ngrams = Ngrams(n_max=5) # <--- Q: should we cut these down? # Construct our Matchers bMatcher = matchers.getBiomarkerMatcher() cMatcher = matchers.getDiseaseMatcher() dMatcher = matchers.getDrugMatcher()