def reload_external_labels(session: SnorkelSession, input_file: Union[str, Path], annotator_name: str = "gold"):
    Education = get_candidate_class()
    with open(str(input_file), "r") as f:
        lbls = ujson.load(f)

    for lbl in lbls:
        # Check whether the label already exists, in case this cell was already executed
        context_stable_ids = "~~".join((lbl['person'], lbl['organization']))
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=lbl['value']))

    # Commit the session
    session.commit()

    # Reload annotator labels for the dev (1) and test (2) splits
    reload_annotator_labels(session, Education, annotator_name,
                            split=1, filter_label_split=False)
    reload_annotator_labels(session, Education, annotator_name,
                            split=2, filter_label_split=False)
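# Hedged usage sketch: the JSON file is expected to hold a list of dicts
# with 'person', 'organization', and 'value' keys (see the loop above);
# the path below is a placeholder, not taken from the original code.
#
#     session = SnorkelSession()
#     reload_external_labels(session, "data/gold_labels.json", annotator_name="gold")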
def create_collection(predicate_resume, split):
    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]

    # Guard against an invalid split; the original fell through and ran
    # the query anyway, so we return early instead.
    if split is None or split not in (1, 2):
        print("No split selected")
        logging.error("No split selected")
        return None

    cids_query = session.query(
        CandidateSubclass.id).filter(CandidateSubclass.split == split)

    brat = BratAnnotator(session, CandidateSubclass, encoding='utf-8')
    collection_name = get_collection_name(predicate_resume, split)
    brat.init_collection(collection_name, cid_query=cids_query)
    return collection_name
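# Hedged usage sketch: predicate_resume is a plain dict; the keys below are
# the ones this module reads elsewhere, with placeholder values (the
# candidate subclass name is hypothetical).
#
#     predicate_resume = {
#         "candidate_subclass": SpouseCandidate,
#         "label_group": 0,
#         "predicate_name": "spouse",
#     }
#     collection_name = create_collection(predicate_resume, split=1)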
def learn_generative(y_data):
    """
    Uses Snorkel to learn a generative model of the relative accuracies
    of the LFs. It learns one generative model per class and combines
    them into a set of noisy labels.
    """
    # One bucket of label vectors per class (13 classes)
    labels = [[] for _ in range(13)]
    for ex in y_data:
        for i in range(13):
            label_i = [int(vote[i]) for vote in ex]
            labels[i].append(np.array(label_i))
    # list(...) needed under Python 3, where map() returns an iterator
    labels = np.array([np.array(x) for x in labels])

    n_labels = []
    n_stats = []
    for i, class_lbl in enumerate(labels):
        print("learning generative model for label: {}".format(i))
        session = SnorkelSession()
        gen_model = GenerativeModel()
        gen_model.train(class_lbl,
                        epochs=100,
                        decay=0.95,
                        step_size=0.1 / class_lbl.shape[0],
                        reg_param=1e-6,
                        cardinality=2)
        train_marginals = gen_model.marginals(csr_matrix(class_lbl))
        n_labels.append(train_marginals)
        n_stats.append(gen_model.learned_lf_stats())

    for i, stats in enumerate(n_stats):
        stats.to_csv("./results/lf_stats/" + int_to_label[i],
                     sep=',', encoding='utf-8')
    return np.array(n_labels).T
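# Hedged input sketch: y_data is an iterable of examples, each example a
# list of LF votes, each vote a length-13 vector (one entry per class).
# The sizes below are illustrative only, not from the original pipeline.
#
#     n_lfs = 5
#     example = [[1] * 13 for _ in range(n_lfs)]   # one example's LF votes
#     noisy = learn_generative([example] * 20)     # -> marginals, shape (20, 13)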
def main():
    import os

    from snorkel import SnorkelSession
    from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
    from utils import TaggerOneTagger

    session = SnorkelSession()

    # The following line is for testing only (it selects the small corpus
    # on CI). Feel free to ignore it.
    file_path = 'data/CDR.BioC.small.xml' if 'CI' in os.environ else 'data/CDR.BioC.xml'

    doc_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                               doc='.//document',
                                               text='.//passage/text/text()',
                                               id='.//id/text()')

    tagger_one = TaggerOneTagger()
    corpus_parser = CorpusParser(fn=tagger_one.tag)
    corpus_parser.apply(list(doc_preprocessor)[:100])  # parsed result saved in session
    return doc_preprocessor, corpus_parser, session
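# Hedged follow-up sketch: inspecting what the parse wrote to the session.
# Document and Sentence are standard snorkel.models classes; the counts
# depend on the corpus, so no numbers are assumed here.
#
#     from snorkel.models import Document, Sentence
#     doc_preprocessor, corpus_parser, session = main()
#     print("Documents:", session.query(Document).count())
#     print("Sentences:", session.query(Sentence).count())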
def main(args):
    session = SnorkelSession()

    # ---------------------------------------
    # 1: Split into blocks
    # ---------------------------------------
    split_pubtator_corpus(args.input_file, split_size=args.split_size)

    # ---------------------------------------
    # 2: Parse documents
    # ---------------------------------------
    filelist = glob.glob("{}.splits_{}/*".format(args.input_file, args.split_size))

    # Iterate through the splits
    start_ts = time()
    for fp in filelist:
        split_ts = time()
        doc_preprocessor = PubTatorDocPreprocessor(fp)
        parser = Spacy() if args.parser == "spacy" else StanfordCoreNLPServer()
        corpus_parser = CorpusParser(parser=parser)
        corpus_parser.apply(doc_preprocessor, parallelism=args.num_procs, clear=False)
        # Time each split from its own start; the original subtracted a
        # timestamp taken just before the print, which always gave ~0.
        print("Split completed in [%s]" % (time() - split_ts,))

    # pubtator_tags = PubTatorTagProcessor()
    # for fp in filelist:
    #     # load entity tags
    #     pubtator_tags.load_data(session, fp)

    print("\nDONE in [%s]" % (time() - start_ts,))
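# Hedged driver sketch: how main(args) might be wired up from the command
# line. Flag names mirror the attributes read above; the defaults are
# placeholders, not taken from the original script.
if __name__ == "__main__":
    import argparse
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--input_file", type=str, required=True)
    argparser.add_argument("--split_size", type=int, default=50000)
    argparser.add_argument("--parser", type=str, default="spacy")
    argparser.add_argument("--num_procs", type=int, default=1)
    main(argparser.parse_args())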
def __init__(self, name, version=0.1):
    """
    Create the DB connection.

    :param name: database name, also used as the output directory
    :param version: version tag for this database
    """
    self.session = SnorkelSession()
    self.name = name
    self.version = version
    # Start from a clean output DB on every run
    if os.path.isfile(self.name + '/output.db'):
        os.remove(self.name + '/output.db')
    self.conn = sqlite3.connect(self.name + '/output.db')
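# Hedged usage sketch; the enclosing class name "OutputDB" is hypothetical,
# since only __init__ is shown, and the directory "my_run/" must exist.
#
#     db = OutputDB("my_run")
#     db.conn.execute("CREATE TABLE IF NOT EXISTS triples (s TEXT, p TEXT, o TEXT)")
#     db.conn.commit()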
def predicate_candidate_labelling(predicate_resume,
                                  parallelism=1,
                                  limit=None,
                                  replace_key_set=False):
    logging.info("Starting labeling")
    session = SnorkelSession()
    try:
        candidate_subclass = predicate_resume["candidate_subclass"]
        key_group = predicate_resume["label_group"]

        cids_query = session.query(
            candidate_subclass.id).filter(candidate_subclass.split == 0)

        ## Skip candidates already extracted
        #alreadyExistsGroup = session.query(LabelKey).filter(LabelKey.group == key_group).count() > 0
        #if alreadyExistsGroup:
        #    cids_query = get_train_cids_not_labeled(predicate_resume, session)
        #if limit is not None:
        #    cids_query = cids_query.filter(candidate_subclass.id < limit)

        LFs = get_labelling_functions(predicate_resume)
        labeler = LabelAnnotator(lfs=LFs)
        np.random.seed(1701)

        ## On the first run, or when adding a new labeling function,
        ## replace_key_set needs to be set to True.
        #if not replace_key_set:
        #    replace_key_set = not alreadyExistsGroup
        L_train = labeler.apply(parallelism=parallelism,
                                cids_query=cids_query,
                                key_group=key_group,
                                clear=True,
                                replace_key_set=True)
        print(L_train.lf_stats(session))
        logging.info(L_train.lf_stats(session))
    finally:
        logging.info("Finished labeling")
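# Hedged follow-up sketch: once applied, the label matrix can be reloaded
# from the database in a later session via the LabelAnnotator API.
#
#     labeler = LabelAnnotator(lfs=get_labelling_functions(predicate_resume))
#     L_train = labeler.load_matrix(session, split=0)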
def train_disc_model(predicate_resume, parallelism=8):
    logging.info("Start training disc")
    session = SnorkelSession()

    train_cids_query = get_train_cids_with_marginals_and_span(predicate_resume, session)
    logging.info("Loading marginals")
    train_marginals = load_marginals(session, split=0, cids_query=train_cids_query)

    train_kwargs = {
        'lr': 0.01,
        'dim': 50,
        'n_epochs': 10,
        'dropout': 0.25,
        'print_freq': 1,
        'max_sentence_length': 100
    }

    logging.info("Querying train cands")
    candidate_subclass = predicate_resume["candidate_subclass"]
    train_cands = session.query(candidate_subclass) \
        .filter(candidate_subclass.split == 0) \
        .order_by(candidate_subclass.id).all()
    #train_cands = get_train_cands_with_marginals_and_span(predicate_resume, session).all()

    logging.info("Querying dev cands")
    dev_cands = get_dev_cands_with_span(predicate_resume, session).all()
    logging.info("Querying gold labels")
    L_gold_dev = get_gold_dev_matrix(predicate_resume, session)

    logging.info("Training")
    lstm = reRNN(seed=1701, n_threads=int(parallelism))
    lstm.train(train_cands, train_marginals, **train_kwargs)
    logging.info("Saving")
    _save_model(predicate_resume, lstm)

    # Test the model
    test_cands = session.query(candidate_subclass) \
        .filter(candidate_subclass.split == 2) \
        .order_by(candidate_subclass.id).all()
    L_gold_test = get_gold_test_matrix(predicate_resume, session)
    p, r, f1 = lstm.score(test_cands, L_gold_test)
    print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))
    logging.info("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))
    lstm.save_marginals(session, test_cands)
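# Note (hedged): dev_cands and L_gold_dev are queried above but never passed
# to the trainer. Snorkel's noise-aware model trainers accept a dev set for
# monitoring, so one plausible variant would be:
#
#     lstm.train(train_cands, train_marginals,
#                X_dev=dev_cands, Y_dev=L_gold_dev, **train_kwargs)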
def extract_triples(predicate_resume, disc_model_name=None):
    date_time = strftime("%Y-%m-%d_%H_%M_%S", gmtime())
    session = SnorkelSession()

    if disc_model_name is None:
        disc_model_name = "D" + predicate_resume["predicate_name"] + "Latest"

    test_cands_query = get_test_cids_with_span(predicate_resume, session)
    test_cands = test_cands_query.all()

    lstm = reRNN()
    logging.info("Loading marginals")
    lstm.load(disc_model_name)
    predictions = lstm.predictions(test_cands)

    dump_file_path = "./results/triples_" + predicate_resume["predicate_name"] + date_time + ".csv"

    subject_type = predicate_resume["subject_type"]
    object_type = predicate_resume["object_type"]
    subject_type_end = subject_type.split('/')[-1]
    object_type_end = object_type.split('/')[-1]

    # Text mode with newline='' is what the csv module expects on Python 3
    with open(dump_file_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # The original header ("text", "marginal", "prediction") did not
        # match the rows written below, which are (subject, predicate, object)
        writer.writerow(["subject", "predicate", "object"])
        for i, c in enumerate(test_cands):
            if predictions[i] == 1:
                subject_span = getattr(c, "subject").get_span()
                object_span = getattr(c, "object").get_span()
                subject_uri = get_dbpedia_node(subject_span, subject_type_end)
                object_uri = get_dbpedia_node(object_span, object_type_end)
                predicate_uri = predicate_resume["predicate_URI"]
                if subject_uri is not None and object_uri is not None:
                    writer.writerow([str(subject_uri),
                                     str(predicate_uri),
                                     str(object_uri)])
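# Hedged usage sketch: run after train_disc_model has saved a model; the
# default model name is "D<predicate_name>Latest" (see above), and accepted
# triples land under ./results/. The keys read here from predicate_resume
# are predicate_name, predicate_URI, subject_type, and object_type.
#
#     extract_triples(predicate_resume)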
def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):
    logging.info("Corpus parsing start")
    session = SnorkelSession()
    corpus_parser = CorpusParser(parser=Spacy())

    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]

    i = 0
    for filename in onlyfiles:
        if filename.endswith(".xml"):
            print(filename)
            doc_preprocessor = XMLMultiDocPreprocessor(path=dumps_folder_path + filename,
                                                       doc='.//doc',
                                                       text='./text()',
                                                       id='./@title')
            # Only clear the corpus for the first file parsed
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor,
                                    clear=clear,
                                    parallelism=parallelism)
            except IntegrityError:
                print("Already parsed " + filename)
                logging.error("Already parsed " + filename)
            i = i + 1

    #logging.debug("Documents: %d", session.query(Document).count())
    #logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
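# Hedged usage sketch: parse a dump directory (here the function's own
# default path), clearing any previously parsed corpus on the first file.
#
#     parse_wikipedia_dump(clear=True, parallelism=8)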
# In[ ]:

import os

# Set up the environment
username = "******"
password = "******"
dbname = "pubmeddb"

# Path subject to change for different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

session = SnorkelSession()

# In[ ]:

from snorkel.annotations import LabelAnnotator, load_marginals
from snorkel.annotations import load_gold_labels
from snorkel.learning.pytorch import LSTM
from snorkel.models import Candidate, FeatureKey, candidate_subclass

# In[ ]:

edge_type = "dg"

# In[ ]:

if edge_type == "dg":
    # The original cell was truncated here; this body mirrors the
    # DiseaseGene definition used in the sibling notebooks below.
    DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])
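# In[ ]:

# Hedged next-step sketch (not in the original cells): with the imports
# above, the dev-split gold labels could be loaded like this. The annotator
# name 'gold' is Snorkel's conventional default, not taken from the notebook.
# L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)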
import os

import pandas as pd
import tqdm
from snorkel import SnorkelSession
from snorkel.candidates import PretaggedCandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
from snorkel.viewer import SentenceNgramViewer

# In[ ]:

# Set up the environment
database_str = "sqlite:///" + os.environ['WORKINGPATH'] + "/Database/epilepsy.db"
os.environ['SNORKELDB'] = database_str

session = SnorkelSession()

# # Parse the Pubmed Abstracts

# The code below is designed to read and parse data gathered from PubTator.
# PubTator outputs its annotated text in XML format, so that is the standard
# file format we are going to use.

# In[ ]:

working_path = os.environ['WORKINGPATH']
xml_parser = XMLMultiDocPreprocessor(path=working_path + '/Database/epilepsy_data.xml',
                                     doc='.//document',
                                     text='.//passage/text/text()',
                                     id='.//id/text()')

# In[ ]:
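# Hedged next-step sketch (not in the original cells): feed the XML
# preprocessor through the CorpusParser so Documents and Sentences are
# written to the Snorkel database.
# corpus_parser = CorpusParser()
# corpus_parser.apply(xml_parser)
# print("Documents:", session.query(Document).count())
# print("Sentences:", session.query(Sentence).count())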
import os

import tqdm

# In[ ]:

# Set up the environment
username = "******"
password = "******"
dbname = "pubmeddb"

# Path subject to change for different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

session = SnorkelSession()

# In[ ]:

from snorkel.annotations import FeatureAnnotator, LabelAnnotator
from snorkel.features import get_span_feats
from snorkel.models import Candidate, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

# In[ ]:

edge_type = "dg"
debug = False

# In[ ]:
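# Hedged next-step sketch (not in the original cells): featurize the
# train-split candidates with the FeatureAnnotator imported above; the
# parallelism choice is a placeholder.
# featurizer = FeatureAnnotator(f=get_span_feats)
# F_train = featurizer.apply(split=0, parallelism=1 if debug else 5)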
import os

import pandas as pd
import seaborn as sns

sns.set(rc={'figure.figsize': (12, 6), "font.size": 17})

# In[2]:

# Set up the environment
username = "******"
password = "******"
dbname = "pubmeddb"

# Path subject to change for different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

session = SnorkelSession()

# In[3]:

from snorkel.models import candidate_subclass, Candidate

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# In[4]:

from utils.notebook_utils.dataframe_helper import write_candidates_to_excel, make_sentence_df

# ## Load and Merge DataFrames

# In[5]:

edge_level_df = pd.read_csv("input/disease_associates_gene.tsv.xz", sep="\t")
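# In[6]:

# Hedged next-step sketch (not in the original cells): a quick sanity check
# on the loaded edge-level dataframe before merging.
# print(edge_level_df.shape)
# edge_level_df.head()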
# In[ ]:

import os

# Set up the environment
username = "******"
password = "******"
dbname = "pubmeddb"

# Path subject to change for different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

session = SnorkelSession()

# In[ ]:

from snorkel.annotations import LabelAnnotator
from snorkel.models import Candidate, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

# In[ ]:

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])
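# In[ ]:

# Hedged next-step sketch (not in the original cells): browse a sample of
# DiseaseGene candidates in the SentenceNgramViewer imported above; the
# sample size is a placeholder.
# candidates = session.query(DiseaseGene).limit(100).all()
# sv = SentenceNgramViewer(candidates, session)
# sv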
import json
import os
import random

import numpy as np

with open('/dfs/scratch1/jdunnmon/data/memex-data/config/config.json') as fl:
    config = json.load(fl)

# Change directory to the code area
os.chdir(config['homedir'])

# For PostgreSQL
postgres_db_name = os.path.split(args['file'])[-1].split('.')[0]
os.environ['SNORKELDB'] = os.path.join(config['postgres_location'],
                                       postgres_db_name)
print(f"Env: {os.environ['SNORKELDB']}")

# Start Snorkel session
from snorkel import SnorkelSession

session = SnorkelSession()

# Set parallelism
parallelism = config['parallelism']

# Set random seed
seed = config['seed']
random.seed(seed)
np.random.seed(seed)

# Set data source: options are 'content.tsv', 'memex_jsons', 'es'
data_source = config['data_source']

# Set max number of docs to ingest
max_docs = config['max_docs']
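# Hedged sketch of the expected config.json shape, inferred from the keys
# read above (values are placeholders, not the original configuration):
#
#     {
#         "homedir": "/path/to/code",
#         "postgres_location": "postgresql://user:pass@localhost:5432",
#         "parallelism": 8,
#         "seed": 1701,
#         "data_source": "content.tsv",
#         "max_docs": 100000
#     }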