def main():
    import os

    from snorkel import SnorkelSession
    from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
    from utils import TaggerOneTagger

    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.
    file_path = 'data/CDR.BioC.small.xml' if 'CI' in os.environ else 'data/CDR.BioC.xml'

    doc_preprocessor = XMLMultiDocPreprocessor(
        path=file_path,
        doc='.//document',
        text='.//passage/text/text()',
        id='.//id/text()'
    )

    tagger_one = TaggerOneTagger()
    corpus_parser = CorpusParser(fn=tagger_one.tag)
    corpus_parser.apply(list(doc_preprocessor)[:100])  # parsed result saved in session
    return doc_preprocessor, corpus_parser, session
def main(args):
    session = SnorkelSession()

    # ---------------------------------------
    # 1: Split into blocks
    # ---------------------------------------
    split_pubtator_corpus(args.input_file, split_size=args.split_size)

    # ---------------------------------------
    # 2: Parse documents
    # ---------------------------------------
    filelist = glob.glob("{}.splits_{}/*".format(args.input_file, args.split_size))

    # Iterate through the splits
    start_ts = time()
    for fp in filelist:
        doc_preprocessor = PubTatorDocPreprocessor(fp)
        parser = Spacy() if args.parser == "spacy" else StanfordCoreNLPServer()
        corpus_parser = CorpusParser(parser=parser)
        corpus_parser.apply(doc_preprocessor, parallelism=args.num_procs, clear=False)
        end_ts = time()
        print("Split completed in [%s]" % (end_ts - start_ts,))

    # pubtator_tags = PubTatorTagProcessor()
    # for fp in filelist:
    #     # load entity tags
    #     pubtator_tags.load_data(session, fp)

    print("\nDONE in [%s]" % (time() - start_ts,))
def parse_corpus(to_process_file):
    file_path = to_process_file
    doc_preprocessor = XMLMultiDocPreprocessor(
        path=file_path,
        doc='.//Article',
        text='./text/text()',
        id='./article-id/text()'
    )
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(list(doc_preprocessor))
    return corpus_parser
def doc_creation(df_features, session):
    # Write the subset to a .csv and convert it to a .tsv file
    df_features.to_csv('dataset.csv', header=False)
    # csv.writer(open('dataset.tsv', 'w+'), delimiter=' ').writerows(csv.reader(open("dataset.csv")))
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
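# Note: the CSV-to-TSV conversion above is commented out, so 'dataset.tsv' may not
# exist when TSVDocPreprocessor runs. TSVDocPreprocessor reads one document per line
# in the form "doc_name<TAB>doc_text". A minimal sketch of writing that file directly,
# assuming df_features has an index usable as the document name and a text column
# named 'text' (hypothetical column name):
def write_tsv(df_features, out_path='dataset.tsv'):
    with open(out_path, 'w') as f:
        for doc_name, row in df_features.iterrows():
            # flatten tabs/newlines so each document stays on a single TSV line
            text = str(row['text']).replace('\t', ' ').replace('\n', ' ')
            f.write('{}\t{}\n'.format(doc_name, text))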
def docs_to_sentences():
    import os
    import multiprocessing

    # Must set SNORKELDB before importing SnorkelSession
    from snorkel import SnorkelSession
    from snorkel.parser import TSVDocPreprocessor
    from snorkel.parser import CorpusParser
    from snorkel.models import Document, Sentence

    session = SnorkelSession()
    pathname = 'small_data/data_400.tsv' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/full_pp.tsv'
    doc_preprocessor = TSVDocPreprocessor(pathname)
    corpus_parser = CorpusParser()
    corpus_parser.apply(doc_preprocessor, parallelism=multiprocessing.cpu_count())
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
def doc_parse(path):
    """
    Loads a TSV file and parses it into Snorkel contexts.

    :param path: Path to TSV file
    :return: None
    """
    try:
        doc_preprocessor = TSVDocPreprocessor(path, encoding=u'utf-8', max_docs=2500)
        corpus_parser = CorpusParser()
        corpus_parser.apply(doc_preprocessor)
        print("Documents:", session.query(Document).count())
        print("Sentences:", session.query(Sentence).count())
    except Exception as e:
        print('Error loading TSV file:', e)
def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):
    logging.info("Corpus parsing start")
    session = SnorkelSession()
    corpus_parser = CorpusParser(parser=Spacy())
    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]
    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print(file)
            doc_preprocessor = XMLMultiDocPreprocessor(
                path=dumps_folder_path + file,
                doc='.//doc',
                text='./text()',
                id='./@title'
            )
            # Only clear the database (if requested) for the first file
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor, clear=clear, parallelism=parallelism)
            except IntegrityError:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i = i + 1
    # logging.debug("Documents: %d", session.query(Document).count())
    # logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
            continue
        docID.add(line.split("\t")[0])
        fout.write(line.replace("\n", " ").strip() + "\n")
print("total docID count", len(docID))

doc_preprocessor = TSVDocPreprocessor(newfile, encoding="utf-8", max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # , parallelism=5
print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from collections import defaultdict
import numpy as np

session = SnorkelSession()
docs = session.query(Document).all()
sents = session.query(Sentence).all()  # get all sentences from snorkel.db

docs_per_bucket = 150
for word in dont_want2:
    if word in virus_list:
        virus_list.remove(word)

# ------------------------------------------
# START SNORKEL SESSION
session = SnorkelSession()

n_docs = 500
doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(
    VirusHost, [ngrams, ngrams], [virus_matcher, animals_matcher],
    nested_relations=True
)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords
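# A minimal sketch of one such keyword labeling function (hypothetical keyword and
# name; in this version of Snorkel an LF is a plain Python function mapping a
# candidate to a label in {-1, 0, 1}, and the candidate's source sentence is
# assumed reachable via get_parent()):
def LF_infection_keyword(c):
    # Label the VirusHost candidate positive if its sentence mentions infection
    return 1 if 'infect' in c.get_parent().text.lower() else 0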
# term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters; remove to have no max
max_doc_length = None

# Setting preprocessor
print(f'Preprocessing folder: {data_loc}')
doc_preprocessor = set_preprocessor(
    data_source, data_loc,
    max_docs=max_docs,
    verbose=False,
    clean_docs=False,
    content_fields=['raw_content', 'url'],
    term=term,
    max_doc_length=max_doc_length
)

# Setting parser and applying corpus preprocessor
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism, verbose=False)

# Printing number of docs/sentences
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print("==============================")
##### LIST OF LF FUNCTIONS TO CHECK
LFs = [LF_edit_index, LF_recall_projections2, LF_jackard_index]
# LFs = [LF_edit_index, LF_jackard_index]

##### snorkeling
session = SnorkelSession()
doc_preprocessor = TSVDocPreprocessor(path)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
# print(sentences)

sents = set()
for i, doc in enumerate(docs):
    for s in doc.sentences:
dg_tagger = Tagger(grouped)


# In[ ]:


corpus_parser = CorpusParser(fn=dg_tagger.tag)
document_chunk = []
for document in tqdm.tqdm(xml_parser.generate()):
    document_chunk.append(document)

    # Chunk the data because snorkel cannot scale properly
    if len(document_chunk) >= 5e4:
        corpus_parser.apply(document_chunk, parallelism=5, clear=False)
        document_chunk = []

# If the generator is exhausted and there are still documents left to parse
if len(document_chunk) > 0:
    corpus_parser.apply(document_chunk, parallelism=5, clear=False)
    document_chunk = []


# # Get each candidate relation

# After parsing the above abstracts, the next step in this pipeline is to extract candidates from all the tagged sentences. A pair of mentions is considered a candidate if both mentions occur in the same sentence. For this pilot study, we are only considering the following candidate relationships: Disease-Gene, Gene-Gene, Compound-Gene, Compound-Disease. In conjunction with extracting candidates, this part of the pipeline also stratifies each sentence into three different categories: Train (70%), Dev (20%), and Test (10%). These set categories will be used in subsequent notebooks ([3](3.data-gen-model.ipynb), [4](4.data-disc-model.ipynb), [5](5.data-analysis.ipynb)) for training and testing the machine learning algorithms. A generic sketch of this extraction step follows below.

# In[ ]:


chunk_size = 2e5
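# A minimal, generic sketch of the candidate-extraction step described above.
# Illustrative only: the dictionary matchers and tiny term lists are assumptions
# made for this sketch; the actual pipeline extracts candidates from pre-tagged
# entity mentions rather than dictionary lookups.

# In[ ]:


from snorkel import SnorkelSession
from snorkel.models import candidate_subclass, Sentence
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import DictionaryMatch

session = SnorkelSession()

# One of the relation types mentioned above
DiseaseGene = candidate_subclass('DiseaseGene', ['disease', 'gene'])

# Hypothetical term lists, just to make the sketch self-contained
disease_matcher = DictionaryMatch(d=['breast cancer', 'asthma'])
gene_matcher = DictionaryMatch(d=['BRCA1', 'TP53'])
ngrams = Ngrams(n_max=5)

cand_extractor = CandidateExtractor(
    DiseaseGene, [ngrams, ngrams], [disease_matcher, gene_matcher]
)

# Extract candidates from all parsed sentences into split 0 (train)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)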
dev_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()'
)

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()'
)

# Parsing
corpus_parser = CorpusParser()
# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)
train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)

train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID
for i, doc in enumerate(docs):
    for s in doc.sentences:
        if doc.name in train_ids:
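# The assignment loop above is cut off in this snippet. A sketch of the full
# pattern, assuming it follows the standard Snorkel tutorial split assignment
# over the train_ids/dev_ids/test_ids sets loaded above:
train_sents, dev_sents, test_sents = set(), set(), set()
for doc in docs:
    for s in doc.sentences:
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        elif doc.name in test_ids:
            test_sents.add(s)
        else:
            raise Exception('Document {} not found in any id set'.format(doc.name))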