def filter_only_full_text(corpus):
    newcorpus = Dataset()
    for docid, document in corpus.documents.items():
        if is_full_text(document):
            newcorpus.documents[docid] = document
    return newcorpus
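# A minimal usage sketch for filter_only_full_text (illustrative only, not part of the
# original module): it assumes the get_corpus helper used by get_corpora below and an
# example corpus name such as 'nala'.
def _usage_sketch_filter_only_full_text():
    corpus = get_corpus('nala')  # assumed corpus name, for illustration only
    full_text_corpus = filter_only_full_text(corpus)
    # Only documents for which is_full_text(document) returned True are kept.
    print(len(full_text_corpus.documents), 'full-text documents kept')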
def filter(self, documents):
    pycrf = PyCRFSuite(self.binary_model)

    for pmid, doc in documents:
        dataset = Dataset()
        dataset.documents[pmid] = doc

        self.pipeline.execute(dataset)
        self.labeler.label(dataset)
        pycrf.tag(dataset, MUT_CLASS_ID)
        PostProcessing().process(dataset)
        ExclusiveNLDefiner().define(dataset)

        total_nl_mentions = []
        for part in doc:
            # print(part.annotations)
            print_verbose('predicted_annotations:', part.predicted_annotations)

            nl_mentions = [
                (ann.text, ann.subclass, ann.confidence)
                for ann in part.predicted_annotations
                if ann.subclass != 0 and ann.confidence <= self.threshold
            ]
            total_nl_mentions += nl_mentions

        if any(total_nl_mentions):
            print('nl mentions', json.dumps(total_nl_mentions, indent=4))
            yield pmid, doc
        else:
            print_verbose('nothing found')
def get_corpora(names, only_class_id=None):
    dataset = Dataset()
    for name in names.split(','):
        dataset.extend_dataset(get_corpus(name, only_class_id=only_class_id))
    return dataset
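# A minimal usage sketch for get_corpora (illustrative only, not part of the original
# module): corpus names are passed as one comma-separated string; 'nala,IDP4' and the
# use of MUT_CLASS_ID here are assumptions for the example.
def _usage_sketch_get_corpora():
    merged = get_corpora('nala,IDP4', only_class_id=MUT_CLASS_ID)  # assumed names / class id
    print(len(merged.documents), 'documents in the merged dataset')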
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    # TEXT = "123 45678"
    # POS  = "012345678"
    # ANN1 = " X       "
    # ANN2 = "     XXX "
    # PAR1 = "XXX      "
    # PAR2 = "    XXXXX"

    part1 = Part('123')
    part2 = Part('45678')
    ann1 = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=1, text='2', confidence=0)
    ann2 = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=1, text='567', confidence=1)
    ann1.subclass = 0
    ann2.subclass = 2
    part1.annotations.append(ann1)
    part2.annotations.append(ann2)
    cls.doc.parts['s1h1'] = part1
    cls.doc.parts['s2p1'] = part2

    doc2 = Document()
    doc3 = Document()
    doc3.parts['someid'] = Part('marmor stein und eisen')

    cls.dataset2 = Dataset()
    cls.dataset2.documents['newid'] = doc3
    cls.dataset2.documents['testid'] = doc2
def test_DocumentLevelRelationEvaluator_parts_irrelevant(self):
    evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

    dataset = Dataset()
    doc_1 = Document()
    dataset.documents['doc_1'] = doc_1

    part_1 = Part('_irrelevant_ PART *1*')
    doc_1.parts['part_1'] = part_1

    part_2 = Part('_irrelevant_ PART *2*')
    doc_1.parts['part_2'] = part_2

    part_1.relations = [
        Relation(STUB_R_ID_1,
                 Entity(STUB_E_ID_1, 0, "TOOL"),
                 Entity(STUB_E_ID_2, 0, "maynard")),
    ]
    # -
    part_2.predicted_relations = [
        Relation(STUB_R_ID_1,
                 Entity(STUB_E_ID_2, 0, "maynard"),
                 Entity(STUB_E_ID_1, 0, "TOOL")),
    ]

    self._apply_pipeline(dataset)

    # ---

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)

    self.assertEqual(evaluation.tp, 1)
    self.assertEqual(evaluation.fn, 0)
    self.assertEqual(evaluation.fp, 0)

    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 1.0)
def read(self):
    """
    :returns: nalaf.structures.data.Dataset
    """
    xmls = []
    if os.path.isdir(self.path):
        xmls = [
            os.path.join(root, file)
            for root, _, files in os.walk(self.path)
            for file in files
            if file.startswith('medline') and file.endswith('xml')
        ]
    elif self.path.startswith('medline') and self.path.endswith('xml'):
        xmls = [self.path]

    dataset = Dataset()
    for xml in xmls:
        for child in ET.parse(xml).getroot():
            pmid = next(child.iter('PMID')).text

            document = Document()
            article = next(child.iter('Article'))

            title = next(article.iter('ArticleTitle')).text
            document.parts['title'] = Part(title, is_abstract=False)

            try:
                abstract = next(article.iter('AbstractText')).text
                document.parts['abstract'] = Part(abstract)
            except StopIteration:
                pass

            dataset.documents[pmid] = document

    return dataset
def read(self):
    """
    :returns: nalaf.structures.data.Dataset
    """
    dataset = Dataset()

    with open(self.corpus_file, encoding='utf-8') as file:
        for row in file:
            columns = row.split("\t")

            docid = columns[0]
            typ = columns[1]
            start = columns[2]
            end = columns[3]
            entity_text = columns[7]

            class_id = None
            if typ == 'Mutation':
                class_id = self.mut_class_id
            elif typ == 'AminoacidResidue':
                class_id = self.residue_class_id

            if class_id:
                document = dataset.documents.get(docid, Document())

                part = Part(entity_text)
                document.parts[typ + '|' + start + '|' + end] = part
                part.annotations.append(Entity(class_id, int(start), entity_text))

                dataset.documents[docid] = document

    return dataset
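# Illustrative (made-up) row layout for the tab-separated corpus file consumed by
# read() above; only the columns the reader actually uses are shown as meaningful,
# the middle columns are placeholders:
#
#   col 0: docid        e.g. 1234567
#   col 1: type         'Mutation' or 'AminoacidResidue'
#   col 2: start        e.g. 10
#   col 3: end          e.g. 21
#   col 7: entity text  e.g. 'p.Val600Glu'
#
# Example line: "1234567\tMutation\t10\t21\t_\t_\t_\tp.Val600Glu"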
def read_file(a_file, filename, dataset=None, whole_basename_as_docid=False):
    if dataset is None:
        dataset = Dataset()

    soup = BeautifulSoup(a_file, "html.parser")
    document = Document()

    for part in soup.find_all(id=re.compile('^s')):
        if re.match(r'^s[3-9]', part['id']):
            is_abstract = False
        else:
            is_abstract = True
        document.parts[part['id']] = Part(str(part.string), is_abstract=is_abstract)

    doc_id = os.path.basename(filename).replace('.plain.html', '').replace('.html', '').replace('.xml', '')
    if not whole_basename_as_docid and '-' in doc_id:
        doc_id = doc_id.split('-')[-1]

    dataset.documents[doc_id] = document
    return dataset
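# A minimal usage sketch for read_file (illustrative only): the file name is made up,
# and HTMLReader is assumed to be the class this static helper belongs to (it is
# called as HTMLReader.read_file by the directory readers further below).
def _usage_sketch_read_file():
    filename = 'IDP4-12345678.plain.html'  # hypothetical file
    with open(filename, 'rb') as f:
        dataset = HTMLReader.read_file(f, filename)
    # With whole_basename_as_docid=False (the default) and a '-' in the basename,
    # the doc id becomes the token after the last '-', i.e. '12345678'.
    print(list(dataset.documents.keys()))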
def __is_acceptable(self, doc_id, doc, annotators):
    if len(annotators) == 1:
        return True

    from itertools import combinations
    from nalaf.structures.data import Dataset
    from nalaf.learning.evaluators import MentionLevelEvaluator
    import math

    agreement = []
    for first, second in combinations(annotators, 2):
        data = Dataset()
        data.documents[doc_id] = doc

        AnnJsonAnnotationReader(first).annotate(data)
        AnnJsonAnnotationReader(second, is_predicted=True).annotate(data)

        results = MentionLevelEvaluator().evaluate(data)
        if not math.isnan(results[-1]):
            agreement.append(results[-1])

    # clean the doc from any annotations we added to calculate agreement
    for part in doc.parts.values():
        part.annotations = []
        part.predicted_annotations = []

    return agreement and sum(agreement) / len(agreement) >= self.iaa_threshold
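# Worked illustration of the acceptance rule above (hypothetical numbers): with three
# annotators there are three pairwise comparisons; if MentionLevelEvaluator yields
# final scores of [0.92, 0.85, 0.88], their mean is about 0.883, and the document is
# accepted iff that mean is >= self.iaa_threshold.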
def benchmark_nala(member1, member2):
    itrs = []

    # Read the IAA iterations in blocks so that the plain documents
    # are not deleted by the AnnJsonAnnotationReaders
    for itr in IterationRound.all():
        if itr.is_IAA():
            dataset = itr.read(read_annotations=False)

            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member1),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False).annotate(dataset)

            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member2),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False,
                is_predicted=True).annotate(dataset)

            itrs.append(dataset)
            dataset = None

    # Then merge the IAA iterations
    all_itrs_dataset = Dataset()
    for itr_dataset in itrs:
        all_itrs_dataset.extend_dataset(itr_dataset)

    ExclusiveNLDefiner().define(all_itrs_dataset)

    return (all_itrs_dataset,
            MentionLevelEvaluator(subclass_analysis=True).evaluate(all_itrs_dataset))
def _create_basic_dataset(self):
    dataset = Dataset()

    doc_1 = Document()
    part_1 = Part('_irrelevant_')
    dataset.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1

    self._apply_pipeline(dataset)

    return (dataset, part_1)
def test_DocumentLevelRelationEvaluator_default_entities_case_irrelevant(self):
    evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

    dataset = Dataset()
    doc_1 = Document()
    part_1 = Part('_irrelevant_')
    dataset.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1

    part_1.relations = [
        Relation(STUB_R_ID_1,
                 Entity(STUB_E_ID_1, 0, "TOOL"),
                 Entity(STUB_E_ID_2, 0, "maynard")),
    ]
    # -
    part_1.predicted_relations = [
        # empty
    ]

    self._apply_pipeline(dataset)

    # -

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)
    self.assertEqual(evaluation.tp, 0)
    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 0.0)

    # ---

    part_1.predicted_relations = [
        Relation(STUB_R_ID_1,
                 Entity(STUB_E_ID_1, 0, "TOOL"),
                 Entity(STUB_E_ID_2, 0, "maynard")),
    ]

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)
    self.assertEqual(evaluation.tp, 1)
    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 1.0)

    # -

    part_1.predicted_relations = [
        Relation(STUB_R_ID_1,
                 Entity(STUB_E_ID_1, 0, "tool"),
                 Entity(STUB_E_ID_2, 0, "MAYNARD")),
    ]

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)
    self.assertEqual(evaluation.tp, 1)
    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 1.0)
def read(self):
    """
    :returns: nalaf.structures.data.Dataset
    """
    dataset = Dataset()
    with DownloadArticle() as da:
        for pmid, doc in da.download(self.pmids):
            dataset.documents[pmid] = doc
    return dataset
def setUp(self):
    part = Part('Make making made. Try tried tries.')
    part.sentences = [
        [Token('Make', 0), Token('making', 5), Token('made', 12)],
        [Token('Try', 18), Token('tried', 22), Token('tries', 28)]
    ]

    self.dataset = Dataset()
    self.dataset.documents['doc_1'] = Document()
    self.dataset.documents['doc_1'].parts['part_1'] = part

    self.generator = PorterStemFeatureGenerator()
def generate_abstracts(self, list_of_pmids):
    """
    Generates a list of documents using PMIDs and the RESTful interface of tmTools.
    Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"
    :param list_of_pmids: list of PMID strings
    :return nalaf.structures.data.Dataset: dataset
    """
    url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'
    url_converter = 'http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'

    # load cache.json if it exists
    if os.path.exists('cache.json'):
        with open('cache.json', 'r', encoding='utf-8') as f:
            tm_var = json.load(f)
    else:
        tm_var = {}

    for pmid in list_of_pmids:
        if pmid not in tm_var:  # only download PMIDs not already fetched from tmTools
            req = requests.get(url_tmvar.format(pmid))
            try:
                tm_var[pmid] = req.json()
            except ValueError:
                pass

    # cache the tmVar annotations so we don't pull them every time
    with open('cache.json', 'w') as file:
        json.dump(tm_var, file, indent=4)

    dataset = Dataset()
    for doc_id in list_of_pmids:
        if doc_id in tm_var:
            doc = Document()
            text = tm_var[doc_id]['text']
            part = Part(text)

            denotations = tm_var[doc_id]['denotations']
            annotations = []
            for deno in denotations:
                ann = Entity(
                    class_id=self.mut_class_id,
                    offset=int(deno['span']['begin']),
                    text=text[deno['span']['begin']:deno['span']['end']])
                annotations.append(ann)

            # NOTE: should the annotations from tmVar go to predicted_annotations or annotations?
            part.annotations = annotations
            doc.parts['abstract'] = part
            dataset.documents[doc_id] = doc

    return dataset
def read(self):
    """
    :returns: nalaf.structures.data.Dataset
    """
    part = Part(self.string)
    document = Document()
    dataset = Dataset()
    dataset.documents['doc_1'] = document
    document.parts['part_1'] = part
    return dataset
class TestSimpleFeatureGenerator(unittest.TestCase):

    def setUp(self):
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        part.sentences = [
            [Token('Word1', 0), Token('word2', 6), Token('word3', 12)],
            [Token('Word4', 19), Token('word5', 25), Token('word6', 31)]
        ]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()

    def test_simple_generate(self):
        self.simple_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]
        expected = iter([
            {'word[0]': 'Word1'},
            {'word[0]': 'word2'},
            {'word[0]': 'word3'},
            {'word[0]': 'Word4'},
            {'word[0]': 'word5'},
            {'word[0]': 'word6'},
        ])

        for feature in features:
            self.assertEqual(feature, next(expected))

    def test_sentence_generate(self):
        self.sentence_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]
        expected = iter([
            {'BOS[0]': 1},
            {},
            {'EOS[0]': 1},
            {'BOS[0]': 1},
            {},
            {'EOS[0]': 1},
        ])

        for feature in features:
            self.assertEqual(feature, next(expected))
def setUpClass(cls):
    text1 = "Flowers in the Rain. Are absolutely marvellous. Though i would say this text is stupid. Cheers!"
    part1 = Part(text1)
    doc = Document()
    doc.parts['firstpart'] = part1
    dataset = Dataset()
    dataset.documents['firstdocument'] = doc

    NLTKSplitter().split(dataset)
    # TmVarTokenizer().tokenize(dataset)

    cls.data = dataset
    cls.testpart = dataset.documents['firstdocument'].parts['firstpart']
def setUp(self):
    part = Part('Make making made. Try tried tries.')
    part.sentences = [
        [Token('Make', 0), Token('making', 5), Token('made', 12)],
        [Token('Try', 18), Token('tried', 22), Token('tries', 28)]
    ]

    self.dataset = Dataset()
    self.dataset.documents['doc_1'] = Document()
    self.dataset.documents['doc_1'].parts['part_1'] = part

    for token in self.dataset.tokens():
        token.features['a'] = 'a'
        token.features['b'] = 'b'
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    cls.part = Part('Here is a random sentence for the benefit of your mamma')
    cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=10, text='random sentence', confidence=0)
    cls.part.annotations.append(cls.entity)
    cls.doc.parts['s1h1'] = cls.part

    # Apply through pipeline
    NLTKSplitter().split(cls.dataset)
    NLTK_TOKENIZER.tokenize(cls.dataset)
    nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(nlp)
    cls.parser.parse(cls.dataset)

    # cls.part.percolate_tokens_to_entities()

    cls.sentence = cls.part.sentences[0]
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

    entities = [
        # Sent 1
        Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
        # Sent 2
        Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
        # Sent 3
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
        # Sent 4
    ]

    for e in entities:
        part1.annotations.append(e)

    cls.doc.parts['s1h1'] = part1

    cls.splitter = NLTKSplitter()
    cls.tokenizer = NLTK_TOKENIZER

    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)

    # assert False, str(list(cls.dataset.sentences()))
    assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
def read(self):
    """
    Read the tab-separated corpus file, create one Document per row
    (first column: document id, second column: abstract text), and
    return a Dataset containing every document parsed.

    :returns: nalaf.structures.data.Dataset
    """
    dataset = Dataset()
    with open(self.corpus_file, encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            document = Document()
            document.parts['abstract'] = Part(row[1])
            dataset.documents[row[0]] = document
    return dataset
def setUp(self):
    part = Part('Word1 word2 word3. Word4 word5 word6.')
    part.sentences = [
        [Token('Word1', 0), Token('word2', 6), Token('word3', 12)],
        [Token('Word4', 19), Token('word5', 25), Token('word6', 31)]
    ]

    self.dataset = Dataset()
    self.dataset.documents['doc_1'] = Document()
    self.dataset.documents['doc_1'].parts['part_1'] = part

    self.simple_generator = SimpleFeatureGenerator()
    self.sentence_generator = SentenceMarkerFeatureGenerator()
def setUpClass(cls):
    cls.dataset = Dataset()

    doc1 = Document()
    cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1
    for s in TEST_SENTENCES_SINGLE_ROOT:
        part = Part(s)
        doc1.parts[s] = part

    doc2 = Document()
    cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2
    for s in TEST_SENTENCES_MULTI_ROOT:
        part = Part(s)
        doc2.parts[s] = part

    cls.nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(cls.nlp)
    cls.splitter = NLTKSplitter()
    cls.tokenizer = GenericTokenizer(
        lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)
    cls.parser.parse(cls.dataset)

    cls.computed_sentences = []
    for sentence in cls.dataset.sentences():
        dist, then = compute_shortest_paths(sentence)
        cls.computed_sentences.append((dist, then, sentence))
def __read_directory_localfs(self):
    dataset = Dataset()

    filenames = (glob.glob(str(self.path + "/**/*.html"), recursive=True) +
                 glob.glob(str(self.path + "/**/*.xml"), recursive=True))

    for filename in filenames:
        dataset = self.__read_file_path_localfs(filename, dataset)

    return dataset
def __read_file_path_localfs(self, filename, dataset=None):
    if dataset is None:
        dataset = Dataset()

    with open(filename, 'rb') as a_file:
        HTMLReader.read_file(a_file, filename, dataset, self.whole_basename_as_docid)

    return dataset
def __read_directory_hdfs(self):
    dataset = Dataset()

    filenames = walk_hdfs_directory(
        self.hdfs_client, self.path,
        lambda fname: fname.endswith(".html") or fname.endswith(".xml"))

    for filename in filenames:
        dataset = self.__read_file_path_hdfs(filename, dataset)

    return dataset
def __read_file_path_hdfs(self, filename, dataset=None):
    if dataset is None:
        dataset = Dataset()

    with self.hdfs_client.read(filename) as reader:
        HTMLReader.read_file(reader, filename, dataset, self.whole_basename_as_docid)

    return dataset
def setUpClass(cls):
    cls.dataset = Dataset()
    doc = Document()
    part = Part('This is one sentence. This is another one.\n This is the third one; here continues.')
    cls.dataset.documents['doc_1'] = doc
    doc.parts['part_1'] = part
def read(self):
    """
    :returns: nalaf.structures.data.Dataset
    """
    dataset = Dataset()

    if os.path.isdir(self.path):
        for filename in glob.glob(self.path + '/*.txt'):
            doc_id, doc = self.__process_file(filename)
            dataset.documents[doc_id] = doc
    else:
        if os.path.splitext(self.path)[-1] == '.txt':
            doc_id, doc = self.__process_file(self.path)
            dataset.documents[doc_id] = doc
        else:
            raise Exception('not a .txt file extension')

    return dataset