def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

    entities = [
        # Sent 1
        Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
        # Sent 2
        Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
        # Sent 3
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
        # Sent 4
    ]

    for e in entities:
        part1.annotations.append(e)

    cls.doc.parts['s1h1'] = part1

    cls.splitter = NLTKSplitter()
    cls.tokenizer = NLTK_TOKENIZER
    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)

    # assert False, str(list(cls.dataset.sentences()))
    assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    # TEXT = "123 45678"
    # POS = "012345678"
    # ANN1 = " X "
    # ANN2 = " XXX "
    # PAR1 = "XXX "
    # PAR2 = " XXXXX"

    cls.part = Part('Here is a random sentence for the benefit of your mamma')
    cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=10, text='random sentence', confidence=0)
    cls.part.annotations.append(cls.entity)
    cls.doc.parts['s1h1'] = cls.part

    # Apply through pipeline
    NLTKSplitter().split(cls.dataset)
    NLTK_TOKENIZER.tokenize(cls.dataset)

    nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(nlp)
    cls.parser.parse(cls.dataset)

    # cls.part.percolate_tokens_to_entities()

    cls.sentence = cls.part.sentences[0]
def setUpClass(cls):
    cls.dataset = Dataset()

    doc1 = Document()
    cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1
    for s in TEST_SENTENCES_SINGLE_ROOT:
        part = Part(s)
        doc1.parts[s] = part

    doc2 = Document()
    cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2
    for s in TEST_SENTENCES_MULTI_ROOT:
        part = Part(s)
        doc2.parts[s] = part

    cls.nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(cls.nlp)
    cls.splitter = NLTKSplitter()
    cls.tokenizer = GenericTokenizer(lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)
    cls.parser.parse(cls.dataset)

    cls.computed_sentences = []
    for sentence in cls.dataset.sentences():
        dist, then = compute_shortest_paths(sentence)
        cls.computed_sentences.append((dist, then, sentence))
def __init__(self, splitter=None, tokenizer=None, feature_generators=None):
    if not splitter:
        splitter = NLTKSplitter()
    if not tokenizer:
        tokenizer = TmVarTokenizer()
    if feature_generators is None:
        feature_generators = [SimpleFeatureGenerator(), PorterStemFeatureGenerator(),
                              WindowFeatureGenerator((-3, -2, -1, 1, 2, 3), ['stem[0]'])]

    if isinstance(splitter, Splitter):
        self.splitter = splitter
    else:
        raise TypeError('not an instance that implements Splitter')

    if isinstance(tokenizer, Tokenizer):
        self.tokenizer = tokenizer
    else:
        raise TypeError('not an instance that implements Tokenizer')

    if hasattr(feature_generators, '__iter__'):
        for index, feature_generator in enumerate(feature_generators):
            if not isinstance(feature_generator, FeatureGenerator):
                raise TypeError('not an instance that implements FeatureGenerator at index {}'.format(index))
        self.feature_generators = feature_generators
    elif isinstance(feature_generators, FeatureGenerator):
        self.feature_generators = [feature_generators]
    else:
        raise TypeError('not an instance or iterable of instances that implements FeatureGenerator')
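# --- Usage sketch (illustrative addition, not part of the original source) ---
# Assuming this __init__ belongs to a preprocessing-pipeline class (named
# `PrepareDatasetPipeline` below only for illustration), the validation above means
# construction either succeeds with the NLTK splitter / TmVar tokenizer / default
# feature generators, or raises TypeError for anything that does not implement
# Splitter, Tokenizer, or FeatureGenerator:
#
#     pipeline = PrepareDatasetPipeline()  # NLTKSplitter + TmVarTokenizer + default feature generators
#     pipeline = PrepareDatasetPipeline(splitter=NLTKSplitter(),
#                                       tokenizer=TmVarTokenizer(),
#                                       feature_generators=SimpleFeatureGenerator())
#     # PrepareDatasetPipeline(splitter="not a splitter")  -> TypeError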
def _apply_pipeline(self, dataset):
    # Apply through pipeline
    NLTKSplitter().split(dataset)
    NLTK_TOKENIZER.tokenize(dataset)
    # nlp = get_spacy_nlp_english(load_parser=False)
    # cls.parser = SpacyParser(nlp)
    # cls.parser.parse(cls.dataset)
    return dataset
def setUp(self):
    self.dataset = StringReader('some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
    NLTKSplitter().split(self.dataset)
    TmVarTokenizer().tokenize(self.dataset)

    part = list(self.dataset.parts())[0]
    part.annotations.append(Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
    part.annotations.append(Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))
def test_split(self):
    NLTKSplitter().split(self.dataset)

    sentences_ = []
    for document in self.dataset.documents.values():
        for part in document.parts.values():
            sentences_ += part.sentences_

    expected = ['This is one sentence.', 'This is another one.',
                'This is the third one; here continues.']
    self.assertEqual(sentences_, expected)
def setUpClass(cls):
    text1 = "Flowers in the Rain. Are absolutely marvellous. Though i would say this text is stupid. Cheers!"
    part1 = Part(text1)
    doc = Document()
    doc.parts['firstpart'] = part1
    dataset = Dataset()
    dataset.documents['firstdocument'] = doc

    NLTKSplitter().split(dataset)
    # TmVarTokenizer().tokenize(dataset)

    cls.data = dataset
    cls.testpart = dataset.documents['firstdocument'].parts['firstpart']
def test_generate_patterns_245(self):
    dataset = StringReader('token c.A436C token').read()
    NLTKSplitter().split(dataset)
    TmVarTokenizer().tokenize(dataset)
    TmVarDictionaryFeatureGenerator().generate(dataset)

    # Compare values with `!=` rather than `is not`: identity checks against string literals are unreliable
    token_features = [{key: value for key, value in token.features.items() if value != 'O'}
                      for token in dataset.tokens()]

    self.assertEqual(token_features[0], {})
    self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
    self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
    self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
    self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
    self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
    self.assertEqual(token_features[6], {})
def test_tag(self):
    # TODO Question: is this the proper way (with predicts_classes)?
    GNormPlusGeneTagger().tag(self.data, uniprot=True)
    NLTKSplitter().split(self.data)
    TmVarTokenizer().tokenize(self.data)
    StubSameSentenceRelationExtractor(PRO_CLASS_ID, MUT_CLASS_ID, PRO_REL_MUT_CLASS_ID).annotate(self.data)

    self.assertEqual(len([x for x in self.data.annotations() if x.class_id == PRO_CLASS_ID]), 0)
    self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 2)
    self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)

    self.data.purge_false_relationships()
    self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)

    del self.data.documents['15878741'].parts['abstract'].annotations[0]
    self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 1)

    self.data.purge_false_relationships()
    self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
    if assumed_tokens_words is None:
        assumed_tokens_words = entity_sentence.split(' ')

    # Create dataset
    dataset = StringReader(entity_sentence).read()
    part = next(dataset.parts())
    entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=0, text=entity_sentence)
    part.annotations.append(entity)

    # Apply through pipeline
    NLTKSplitter().split(dataset)
    NLTK_TOKENIZER.tokenize(dataset)
    self.parser.parse(dataset)

    # Rest
    sentences = part.sentences
    assert len(sentences) == 1
    sentence = sentences[0]

    assert len(assumed_tokens_words) == len(sentence)
    for (assumed_token_word, actual_token) in zip(assumed_tokens_words, sentence):
        assert assumed_token_word == actual_token.word

    part.compute_tokens_depth()
    roots = Part.get_sentence_roots(sentence)
    for r in roots:
        self._assert_depth_eq(r, 0)

    part.set_entities_head_tokens()

    return (dataset, sentence, entity, roots)
def setUpClass(cls):
    # create a sample dataset to test
    cls.dataset = Dataset()

    doc_id1 = Document()
    doc_id1.parts['t1'] = Part('This title blows your mind')

    text = str('This magic only exists in your dreams. To become reality, you have to work at it. '
               'Thr is only available with the residue threonine and a mutation, '
               'though things can change positions '
               'when adding some more replacements. Between me being sorry '
               'and you being an insertion.')
    doc_id1.parts['p1'] = Part(text.replace('\n', ''))

    cls.dataset.documents['doc_id1'] = doc_id1

    NLTKSplitter().split(cls.dataset)
    TmVarTokenizer().tokenize(cls.dataset)

    cls.feature = NLMentionFeatureGenerator(thr=4)
    cls.feature.generate(dataset=cls.dataset)
from nala.utils.corpora import get_corpus
from nalaf.preprocessing.spliters import NLTKSplitter
from nalaf.preprocessing.tokenizers import TmVarTokenizer
from nalaf.features.embeddings import BinarizedWordEmbeddingsFeatureGenerator

data = get_corpus('nala_training_1')

NLTKSplitter().split(data)
TmVarTokenizer().tokenize(data)

BinarizedWordEmbeddingsFeatureGenerator(
    '/home/abojchevski/projects/nala/nala/data/word_embeddings_2016-03-28/word_embeddings.model'
).generate(data)

for token in data.tokens():
    print(token.features, token.end)
def __init__(self):
    self.data = get_corpus('IDP4+')
    NLTKSplitter().split(self.data)
    TmVarTokenizer().tokenize(self.data)
def filter(self, documents, min_found=1, use_nala=False):
    """
    :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
    """

    _progress = 1
    _start_time = time.time()
    _total_time = 0

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]

    last_found = 0
    crf = PyCRFSuite(self.location_binary_model)

    # counter_to_stop_for_caching = 0

    for pmid, doc in documents:
        # if any part of the document contains any of the keywords, yield that document

        # if counter_to_stop_for_caching > 400:
        #     break
        # counter_to_stop_for_caching += 1
        # print(counter_to_stop_for_caching)

        part_offset = 0
        data_tmp = Dataset()
        data_tmp.documents[pmid] = doc
        data_nala = deepcopy(data_tmp)
        NLTKSplitter().split(data_tmp)
        # data_tmvar = TmVarTagger().generate_abstracts([pmid])

        if use_nala:
            self.pipeline.execute(data_nala)
            self.labeler.label(data_nala)
            crf.tag(data_nala, MUT_CLASS_ID)
            PostProcessing().process(data_nala)
            ExclusiveNLDefiner().define(data_nala)

        used_regexs = {}

        positive_sentences = 0
        for i, x in enumerate(doc.parts):
            # print("Part", i)
            sent_offset = 0
            cur_part = doc.parts.get(x)
            sentences = cur_part.sentences_

            for sent in sentences:
                sent_length = len(sent)
                new_text = sent.lower()
                new_text = re.sub('[\./\\-(){}\[\],%]', ' ', new_text)
                # new_text = re.sub('\W+', ' ', new_text)

                found_in_sentence = False

                for i, reg in enumerate(self.patterns):
                    _lasttime = time.time()  # time start var
                    match = reg.search(new_text)

                    # debug bottleneck patterns
                    _time_current_reg = time.time() - _lasttime  # time end var
                    _pattern_calls += 1  # pattern calls already occurred
                    _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                    if _time_reg_pattern_total > 0:
                        _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg time spent per pattern call

                    # todo create pattern performance eval for descending amount of recognized patterns
                    # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                    #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                    # if _time_max_pattern < _time_current_reg:
                    #     _time_max_pattern = _time_current_reg
                    #     _low_performant_pattern = reg.pattern
                    #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                    # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                    #     if _time_current_reg > _time_avg_per_pattern * 10:
                    #         # print(_time_avg_per_pattern, _time_current_reg)
                    #         f.write("BAD_PATTERN\n")
                    #         f.write(sent + "\n")
                    #         f.write(new_text + "\n")

                    if match:
                        # if pmid in data_tmvar.documents:
                        #     anti_doc = data_tmvar.documents.get(pmid)
                        nala_doc = data_nala.documents.get(pmid)

                        start = part_offset + sent_offset + match.span()[0]
                        end = part_offset + sent_offset + match.span()[1]
                        # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                        # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                        if reg.pattern in used_regexs:
                            used_regexs[reg.pattern] += 1
                        else:
                            used_regexs[reg.pattern] = 1

                        print(color.PURPLE + new_text.replace(
                            match.group(),
                            color.BOLD + color.DARKCYAN + color.UNDERLINE + match.group() + color.END + color.PURPLE) + color.END)

                        if not found_in_sentence:
                            positive_sentences += 1
                            found_in_sentence = True

                        # if not anti_doc.overlaps_with_mention(start, end) \
                        #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                        #     _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _e_array[_e_result] += 1
                        #     _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _i_array[_i_result] += 1
                        #     # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                        #     # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #     last_found += 1
                        #     found_in_sentence = True
                        # else:
                        #     # if nala not used, only tmvar considered
                        #     if not anti_doc.overlaps_with_mention(start, end):
                        #         _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _e_array[_e_result] += 1
                        #         _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _i_array[_i_result] += 1
                        #         # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                        #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #         last_found += 1
                        #         found_in_sentence = True

                        if use_nala:
                            nala_found_mention = nala_doc.overlaps_with_mention(start, end, annotated=False)
                            if nala_found_mention:
                                print_verbose(nala_found_mention)
                                if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                    yield pmid, doc

                    if _lasttime - time.time() > 1:
                        print_verbose('time intensive regex', i)

                sent_offset += 2 + sent_length

                # for per-sentence positives
                if found_in_sentence:
                    positive_sentences += 1

            part_offset += sent_offset

        if use_nala:
            for part in nala_doc:
                for ann in part.predicted_annotations:
                    if ann.subclass > 0:
                        print_verbose(part.text[:ann.offset] + color.BOLD + ann.text + color.END
                                      + part.text[ann.offset + len(ann.text):])
                        positive_sentences += min_found

        _old_time = _start_time
        _start_time = time.time()
        _one_time = _start_time - _old_time

        if _one_time > 0.3 and positive_sentences > min_found:
            _progress += 1
            _total_time += _one_time

        _time_per_doc = _total_time / _progress
        print_verbose("PROGRESS: {:.2f} secs ETA per one positive document:"
                      " {:.2f} secs".format(_total_time, _time_per_doc))

        print_debug('used regular expressions:', json.dumps(used_regexs, indent=4))

        if positive_sentences >= min_found:
            last_found = 0
            print_verbose('YEP', pmid)
            yield pmid, doc
        else:
            print_verbose('NOPE', pmid)
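# --- Usage sketch (illustrative addition, not part of the original source) ---
# `filter` is a generator over (pmid, Document) pairs: it re-yields only documents whose
# sentences match at least `min_found` of the regex patterns, or, with use_nala=True,
# that contain a low-confidence predicted NL mention. A caller might consume it lazily;
# the names `document_filter`, `candidate_documents`, and `selected` below are placeholders:
#
#     selected = Dataset()
#     for pmid, doc in document_filter.filter(candidate_documents, min_found=1, use_nala=False):
#         selected.documents[pmid] = doc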