def process_document(self, document: Document, params: Dict[str, Any]):
    """Parses sentences containing both a term and a negation trigger,
    labeling their dependencies and universal POS tags.

    Sentences that cannot possibly yield a negation (no terms, or no
    triggers) are skipped entirely to avoid unnecessary parsing work.
    """
    terms_index_name = params.get('terms_index', 'umls_terms')
    terms = document.labels[terms_index_name]
    negation_triggers = document.labels['negation_triggers']

    # Collect only the sentences worth parsing, keeping the sentence
    # labels and their texts in lockstep.
    kept_sentences = []
    kept_texts = []
    for sentence in document.labels['sentences']:
        if len(terms.inside(sentence)) == 0:
            continue
        if len(negation_triggers.inside(sentence)) == 0:
            continue
        kept_sentences.append(sentence)
        kept_texts.append(sentence.text)

    stanza_doc = self.nlp(kept_texts)

    all_deps = []
    all_upos_tags = []
    for sentence, stanza_sentence in zip(kept_sentences, stanza_doc.sentences):
        deps, upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
        all_deps.extend(deps)
        all_upos_tags.extend(upos_tags)

    document.add_labels('dependencies', all_deps)
    document.add_labels('upos_tags', all_upos_tags)
def process_document(self, document: mtap.Document, params: Dict[str, Any]):
    """Demonstrates the three supported shapes for label references:
    a map of strings to labels, a list of labels, and direct fields.
    """
    referenced = [mtap.GenericLabel(i, i + 1) for i in range(4)]

    # references can be a map of strings to labels
    with document.get_labeler('map_references') as label_map_references:
        label_map_references(0, 4, ref={'a': referenced[0],
                                        'b': referenced[1],
                                        'c': referenced[2],
                                        'd': referenced[3]})

    # references can be a list of labels
    with document.get_labeler('list_references') as label_list_references:
        label_list_references(0, 2, ref=referenced[:2])
        label_list_references(2, 3, ref=referenced[2:])

    # references can be direct
    with document.get_labeler('references') as label_references:
        label_references(0, 2, a=referenced[0], b=referenced[1])
        label_references(2, 3, a=referenced[2], b=referenced[3])

    # Referenced labels don't need to be added via "addLabels" or
    # "Labeler.close" before label indices that reference them: the
    # Document delays uploading any label indices to the server until
    # the referenced labels have been added.
    document.add_labels('referenced', referenced)
def copy_document(event: Event,
                  source_document_name: str,
                  target_document_name: str,
                  index_names: Sequence[str] = ...):
    """Copies one document to another on the same event.

    Parameters
    ----------
    event: Event
        The event.
    source_document_name: str
        The source document name.
    target_document_name: str
        The target document name.
    index_names: Sequence[str]
        If specified will only copy the specified label indices, by default
        all indices will be copied.
    """
    source = event.documents[source_document_name]
    target = Document(target_document_name, text=source.text)
    event.add_document(target)
    # ``...`` (Ellipsis) is the sentinel for "copy every index".
    if index_names is ...:
        index_names = [info.index_name
                       for info in source.get_label_indices_info()]
    for name in index_names:
        source_index = source.get_label_index(name)
        target.add_labels(name, source_index, distinct=source_index.distinct)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Detects negex trigger phrases in every sentence and labels each
    one (with its tags) at document-level offsets."""
    label_trigger = document.get_labeler('negation_triggers')
    with label_trigger:
        for sentence in document.get_label_index('sentences'):
            offset = sentence.start_index
            # Trigger offsets come back sentence-relative; shift them
            # into document coordinates before labeling.
            for begin, end, tags in self.negex.detect_negex_triggers(sentence.text):
                label_trigger(offset + begin, offset + end, tags=tags)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Parses every sentence with stanza and labels the resulting
    dependency graph and universal POS tags on the document.

    Dependencies are processed as a worklist: an arc whose head label has
    not been created yet is pushed back to the front of the queue and
    retried after its head has been materialized, so ``head`` references
    always point at already-created labels.
    """
    sentences = document.labels['sentences']
    sentences_text = []
    for sentence in sentences:
        sentences_text.append(sentence.text)
    # Parse all sentence texts in a single stanza call.
    stanza_doc = self.nlp(sentences_text)
    all_deps = []
    all_upos_tags = []
    for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
        # Maps stanza word id -> the GenericLabel created for that word,
        # so later arcs can link to their head's label.
        dependencies = {}
        stanza_dependencies = stanza_sentence.dependencies
        stanza_dependencies = list(stanza_dependencies)
        i = 0
        while len(stanza_dependencies) > 0:
            # Guard against a malformed graph (e.g. a cycle) that would
            # otherwise spin forever re-queueing the same arcs.
            i += 1
            if i > MAX_ITER:
                raise ValueError(
                    'Maximum Iterations reached while processing dependency graph.')
            head, deprel, dep = stanza_dependencies.pop()
            head_id = int(head.id)
            if head_id == 0:
                # id 0 is stanza's synthetic root; the root word has no head label.
                head_dep_label = None
            else:
                try:
                    head_dep_label = dependencies[head_id]
                except KeyError:
                    # Head not created yet: defer this arc and try it again
                    # after the rest of the worklist has been processed.
                    stanza_dependencies.insert(0, (head, deprel, dep))
                    continue
            # Convert stanza's sentence-local character offsets into
            # document offsets (stanza offsets are relative to the first
            # token of the stanza sentence).
            token_begin = sentence.start_index + dep.parent.start_char - stanza_sentence.tokens[
                0].start_char
            token_end = sentence.start_index + dep.parent.end_char - stanza_sentence.tokens[
                0].start_char
            dep_label = GenericLabel(token_begin, token_end, head=head_dep_label,
                                     deprel=deprel)
            # Pre-seed the reverse (head -> dependents) reference list.
            dep_label.reference_cache['dependents'] = []
            dependencies[int(dep.id)] = dep_label
            if head_dep_label is not None:
                head_dep_label.dependents.append(dep_label)
            all_deps.append(dep_label)
        # UPOS tags are labeled per word, using the same offset conversion.
        for word in stanza_sentence.words:
            token = word.parent
            token_begin = sentence.start_index + token.start_char - stanza_sentence.tokens[
                0].start_char
            token_end = sentence.start_index + token.end_char - stanza_sentence.tokens[
                0].start_char
            all_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))
    document.add_labels('dependencies', all_deps)
    document.add_labels('upos_tags', all_upos_tags)
def test_labeler_distinct_and_type_id_raises(mocker):
    """``get_labeler`` must raise ``ValueError`` when both ``distinct``
    and a ``label_adapter`` are supplied.

    Setup is deliberately kept OUTSIDE the ``pytest.raises`` block so
    that an unrelated failure while constructing the mock client, event,
    or document cannot be mistaken for the expected ``ValueError``.
    """
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event)
    # Only the call under test is inside the raises context.
    with pytest.raises(ValueError):
        document.get_labeler('index', distinct=True,
                             label_adapter=DistinctGenericLabelAdapter)
def test_yml_serializer():
    """Round-trips an event through the YAML serializer and verifies the
    metadata, document text, and all three label indices (including the
    index whose labels reference labels in another index) survive."""
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    one = label(start_index=0, end_index=5, x=10)
    two = label(start_index=6, end_index=10, x=15)
    document.add_labels('one', [one, two])
    # 'two' holds labels that reference labels in index 'one'.
    referencing = [label(start_index=0, end_index=25, a='b', b=one),
                   label(start_index=26, end_index=42, a='c', b=two)]
    document.add_labels('two', referencing)
    distinct_index = [label(start_index=0, end_index=10, foo=True),
                      label(start_index=11, end_index=15, foo=False)]
    document.add_labels('three', distinct_index, distinct=True)
    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        deserialized = YamlSerializer.file_to_event(tf)
        assert deserialized.event_id == event.event_id
        assert deserialized.metadata['foo'] == 'bar'
        doc = deserialized.documents['plaintext']
        assert doc.text == document.text
        assert doc.labels['one'] == [one, two]
        assert doc.labels['two'] == [
            label(start_index=0, end_index=25, a='b', b=one),
            label(start_index=26, end_index=42, a='c', b=two)
        ]
        assert doc.labels['three'] == [
            label(start_index=0, end_index=10, foo=True),
            label(start_index=11, end_index=15, foo=False)
        ]
def process_document(self, document: Document, params: Dict[str, Any]):
    """Runs the negex dependency-based check over each sentence's terms,
    labeling the spans that are found to be negated."""
    terms_index_name = params.get('terms_index', 'umls_terms')
    label_negated = document.get_labeler('negated')
    terms = document.get_label_index(terms_index_name)
    triggers = document.labels['negation_triggers']
    deps = document.get_label_index('dependencies')
    upos_tags = document.get_label_index('upos_tags')
    with label_negated:
        for sentence in document.get_label_index('sentences'):
            sentence_terms = terms.inside(sentence)
            sentence_triggers = triggers.inside(sentence)
            # Sentences without any trigger can't contain a negation.
            if len(sentence_triggers) == 0:
                continue
            negations, _ = self.negex.check_sentence(
                sentence_terms, sentence_triggers, deps, upos_tags)
            for begin, end in negations:
                label_negated(begin, end)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Runs negex over every sentence, labeling both negated term spans
    and the trigger phrases responsible for them."""
    terms_index_name = params.get('terms_index', 'umls_terms')
    label_negated = document.get_labeler('negated')
    label_trigger = document.get_labeler('negation_trigger')
    terms = document.get_label_index(terms_index_name)
    with label_negated, label_trigger:
        for sentence in document.get_label_index('sentences'):
            offset = sentence.start_index
            # negex expects term offsets relative to the sentence start.
            relative_terms = []
            for term in terms.inside(sentence):
                relative_terms.append((term.start_index - offset,
                                       term.end_index - offset))
            negations, triggers = self.negex.check_sentence(
                sentence.text, relative_terms)
            # Shift results back into document coordinates.
            for begin, end in negations:
                label_negated(offset + begin, offset + end)
            for begin, end in triggers:
                label_trigger(offset + begin, offset + end)
def test_copy_document():
    """CopyDocument should create the target document with all label
    indices copied from the source document."""
    event = Event()
    source = Document(document_name='first',
                      text='The quick brown fox jumped over the lazy dog.')
    event.add_document(source)
    words = [(0, 3, 'The'), (4, 9, 'quick'), (10, 15, 'brown')]
    with source.get_labeler('some_index') as label:
        for begin, end, word in words:
            label(begin, end, word=word)
    CopyDocument('first', 'second').process(event, {})
    copied = event.documents['second']
    assert copied is not None
    assert copied.labels['some_index'] == [
        GenericLabel(begin, end, word=word) for begin, end, word in words
    ]
def test_add_labels_distinct(mocker):
    """add_labels with distinct=True should upload the labels to the
    client and return a distinct label index."""
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event)
    labels = [
        GenericLabel(0, 10, document=document, x=1),
        GenericLabel(11, 15, document=document, x=2),
        GenericLabel(16, 20, document=document, x=3)
    ]
    result = document.add_labels('index', labels, distinct=True)
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    assert result == labels
    assert result.distinct
def process_document(self, document: Document, params: Dict[str, Any]):
    """Parses all sentences with stanza in one batch and labels the
    resulting dependencies and universal POS tags."""
    sentences = document.labels['sentences']
    stanza_doc = self.nlp([sentence.text for sentence in sentences])
    all_deps = []
    all_upos_tags = []
    for sentence, stanza_sentence in zip(sentences, stanza_doc.sentences):
        deps, upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
        all_deps.extend(deps)
        all_upos_tags.extend(upos_tags)
    document.add_labels('dependencies', all_deps)
    document.add_labels('upos_tags', all_upos_tags)
def build_doc(self, event):
    """Builds a 'plaintext' document on *event* from this object's text,
    attaching the pos_tags and sentences as distinct indices."""
    doc = Document(document_name='plaintext', text=' '.join(self.text))
    event.add_document(doc)
    doc.add_labels('pos_tags', self.tags, distinct=True)
    doc.add_labels('sentences', self.sentences, distinct=True)
    return doc
def test_labeler_distinct(mocker):
    """Labels created through a distinct labeler should be uploaded with
    the distinct adapter and retrievable from the document."""
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event)
    spans = [(0, 10, 1), (11, 15, 2), (16, 20, 3)]
    with document.get_labeler('index', distinct=True) as add_generic_label:
        for begin, end, value in spans:
            add_generic_label(begin, end, x=value)
    expected = [GenericLabel(begin, end, document=document, x=value)
                for begin, end, value in spans]
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=expected,
                                         adapter=DistinctGenericLabelAdapter)
    assert document.get_label_index('index') == expected
def test_add_labels_not_distinct(mocker):
    """add_labels without distinct should upload the labels and yield a
    non-distinct index."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event)
    labels = [GenericLabel(0, 10, document=document, x=1),
              GenericLabel(11, 15, document=document, x=2),
              GenericLabel(16, 20, document=document, x=3)]
    document.add_labels('index', labels)
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    stored = document.labels['index']
    assert stored == labels
    assert not stored.distinct
def test_labeler_distinct(mocker):
    """A distinct labeler should upload its labels with the distinct
    generic adapter and expose them via document.labels."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event)
    spans = [(0, 10, 1), (11, 15, 2), (16, 20, 3)]
    with document.get_labeler('index', distinct=True) as add_generic_label:
        for begin, end, value in spans:
            add_generic_label(begin, end, x=value)
    expected = [GenericLabel(begin, end, document=document, x=value)
                for begin, end, value in spans]
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=expected,
                                         adapter=DISTINCT_GENERIC_ADAPTER)
    assert document.labels['index'] == expected
def main(args=None):
    """Reads PTB-formatted files from a folder and pushes each through a
    pipeline of the PTB reader and the TnT trainer services."""
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_FOLDER',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--source-name', metavar='DOCUMENT_NAME',
                        default='source',
                        help='What document to dump the PTB text into.')
    parser.add_argument('--target-name', metavar='DOCUMENT_NAME',
                        default='plaintext',
                        help='What document to the plaintext and annotations into.')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader', metavar='READER', default=None,
                        help='The address of the PTB Reader.')
    parser.add_argument('--tnt-trainer', metavar='TRAINER', default=None,
                        help='The address of the TnT trainer.')
    conf = parser.parse_args(args)

    reader = RemoteProcessor('ptb-reader', address=conf.ptb_reader,
                             params={'source_document_name': conf.source_name,
                                     'target_document_name': conf.target_name})
    trainer = RemoteProcessor('biomedicus-tnt-trainer', address=conf.tnt_trainer,
                              params={'document_name': conf.target_name})
    with EventsClient(address=conf.events) as client, \
            Pipeline(reader, trainer) as pipeline:
        for path in Path(conf.input).rglob(conf.glob):
            print('Reading:', path)
            with path.open('r') as reader_file:
                text = reader_file.read()
            with Event(event_id=path.name, client=client) as event:
                event.add_document(Document(conf.source_name, text=text))
                pipeline.run(event)
def process_document(self, document: mtap.Document,
                     params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Counts the occurrences of 'a' and 'b' in the document text and
    labels the counts; always returns ``{'answer': 42}``."""
    if params['do_work']:
        with self.started_stopwatch('fetch_time'):
            text = document.text
        # Insertion order of the dict preserves the 'a'-then-'b'
        # labeling order.
        counts = {letter: text.count(letter) for letter in 'ab'}
        with document.get_labeler('mtap.examples.letter_counts') as label_letter_count:
            for letter, count in counts.items():
                label_letter_count(start_index=0,
                                   end_index=len(document.text),
                                   letter=letter,
                                   count=count)
    return {'answer': 42}
def main(args=None):
    """Reads PTB-formatted files, runs them through the PTB reader, and
    serializes each resulting event to a JSON file in the output folder."""
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_DIR',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('output', metavar='OUTPUT_DIR',
                        help='A folder to write the json files to.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader', metavar='READER', default=None,
                        help='The address of the PTB Reader.')
    conf = parser.parse_args(args)

    reader = RemoteProcessor('ptb-reader', address=conf.ptb_reader,
                             params={'source_document_name': 'source',
                                     'target_document_name': 'gold',
                                     'pos_tags_index': 'gold_tags'})
    with EventsClient(address=conf.events) as client, \
            Pipeline(
                reader,
                LocalProcessor(
                    SerializationProcessor(JsonSerializer, output_dir=conf.output),
                    component_id='serializer',
                    client=client)
            ) as pipeline:
        for path in Path(conf.input).rglob(conf.glob):
            print('Reading:', path)
            with path.open('r') as reader_file:
                text = reader_file.read()
            with Event(event_id=path.name, client=client) as event:
                event.add_document(Document('source', text=text))
                pipeline.run(event)
def test_sentences_unknown_character(bi_lstm_model):
    """The sentence detector should start the sentence after the unknown
    bullet character rather than failing on it."""
    doc = Document('plaintext',
                   text='• Sentence which contains unknown character.')
    bi_lstm_model.process_document(doc, {})
    assert doc.get_label_index('sentences') == [GenericLabel(2, 44)]
# Copyright 2019 Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Hello world tutorial pipeline."""
import sys

if __name__ == '__main__':
    from mtap import Document, Event, EventsClient, Pipeline, RemoteProcessor

    # argv[1] is the events service address, argv[2] the hello processor.
    events_address = sys.argv[1]
    processor_address = sys.argv[2]
    with EventsClient(address=events_address) as client, \
            Pipeline(
                RemoteProcessor(processor_id='hello', address=processor_address)
            ) as pipeline:
        with Event(event_id='1', client=client) as event:
            document = Document(document_name='name', text='YOUR NAME')
            event.add_document(document)
            pipeline.run(document)
            for label in document.get_label_index('hello'):
                print(label.response)
def test_text_from_document():
    """A label's ``text`` property should slice the document's text by
    the label's span."""
    document = Document('plaintext', text='This is text.')
    span = GenericLabel(5, 7, document=document)
    assert span.text == 'is'
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest

from mtap import GenericLabel, Location, Document
from mtap.data._label_indices import presorted_label_index, label_index

# Module-level document all fixture labels are anchored to.
document = Document('plaintext', text='blah')


@pytest.fixture
def tested():
    """A presorted label index of overlapping, nested, and duplicate
    spans, each tagged with its ordinal ``i`` field."""
    spans = [(0, 5), (0, 7), (2, 6), (6, 7), (6, 8), (9, 10), (9, 13), (9, 13)]
    return presorted_label_index([
        GenericLabel(begin, end, document=document, i=index)
        for index, (begin, end) in enumerate(spans)
    ])
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest from mtap import GenericLabel, Document document = Document('plaintext', text='foo bar') def test_get_repr(): label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0) rep = repr(label) assert rep.startswith("GenericLabel(0, 20, ") assert 'a="x"' in rep or "a='x'" in rep assert 'y=20' in rep assert 'z=20.0' in rep def test_get_attr(): label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Predicts sentence boundaries over the document text and labels
    each predicted span as a distinct 'sentences' index."""
    predictions = predict_text(self.model, self.input_mapper,
                               document.text, self.device)
    with document.get_labeler('sentences', distinct=True) as add_sentence:
        for begin, end in predictions:
            add_sentence(begin, end)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Runs sentence prediction in the worker pool (blocking until done)
    and labels the resulting spans as a distinct 'sentences' index."""
    spans = self.pool.apply(predict_sentences_async, args=(document.text, ))
    with document.get_labeler('sentences', distinct=True) as add_sentence:
        for begin, end in spans:
            add_sentence(begin, end)
def process_document(self, document: Document, params: Dict[str, Any]):
    """Detects sentences in the document text and labels their spans."""
    spans = get_sentences(document.text)
    with document.get_labeler('sentences') as label_sentence:
        for begin, end in spans:
            label_sentence(begin, end)
def test_yml_serializer():
    """Serializes an event to YAML and checks the raw YAML structure
    (event id, metadata, document text, and all three label indices)."""
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    document.add_labels('one', [label(start_index=0, end_index=5, x=10),
                                label(start_index=6, end_index=10, x=15)])
    document.add_labels('two', [label(start_index=0, end_index=25, a='b'),
                                label(start_index=26, end_index=42, a='c')])
    document.add_labels('three',
                        [label(start_index=0, end_index=10, foo=True),
                         label(start_index=11, end_index=15, foo=False)],
                        distinct=True)
    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        serialized = yaml.load(tf, Loader=Loader)
        assert serialized['event_id'] == '1'
        assert serialized['metadata']['foo'] == 'bar'
        doc = serialized['documents']['plaintext']
        assert doc['text'] == 'Some text.'
        indices = doc['label_indices']
        assert len(indices) == 3
        assert indices['one'] == {
            'json_labels': [{'start_index': 0, 'end_index': 5, 'x': 10},
                            {'start_index': 6, 'end_index': 10, 'x': 15}],
            'distinct': False
        }
        assert indices['two'] == {
            'json_labels': [{'start_index': 0, 'end_index': 25, 'a': 'b'},
                            {'start_index': 26, 'end_index': 42, 'a': 'c'}],
            'distinct': False
        }
        assert indices['three'] == {
            'json_labels': [{'start_index': 0, 'end_index': 10, 'foo': True},
                            {'start_index': 11, 'end_index': 15, 'foo': False}],
            'distinct': True
        }