Example #1
    def process_document(self, document: Document, params: Dict[str, Any]):
        terms_index_name = params.get('terms_index', 'umls_terms')
        terms = document.labels[terms_index_name]
        negation_triggers = document.labels['negation_triggers']

        all_deps = []
        all_upos_tags = []
        sentences = []
        sentence_texts = []
        for sentence in document.labels['sentences']:
            if len(terms.inside(sentence)) == 0 or len(
                    negation_triggers.inside(sentence)) == 0:
                continue
            sentences.append(sentence)
            sentence_texts.append(sentence.text)

        stanza_doc = self.nlp(sentence_texts)
        for (sentence, stanza_sentence) in zip(sentences,
                                               stanza_doc.sentences):
            sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(
                sentence, stanza_sentence)
            all_deps.extend(sentence_deps)
            all_upos_tags.extend(sentence_upos_tags)

        document.add_labels('dependencies', all_deps)
        document.add_labels('upos_tags', all_upos_tags)
Example #2
    def process_document(self, document: mtap.Document,
                         params: Dict[str, Any]):
        referenced = [
            mtap.GenericLabel(0, 1),
            mtap.GenericLabel(1, 2),
            mtap.GenericLabel(2, 3),
            mtap.GenericLabel(3, 4)
        ]

        # references can be a map of strings to labels
        with document.get_labeler('map_references') as label_map_references:
            label_map_references(0,
                                 4,
                                 ref={
                                     'a': referenced[0],
                                     'b': referenced[1],
                                     'c': referenced[2],
                                     'd': referenced[3]
                                 })

        # references can be a list of labels
        with document.get_labeler('list_references') as label_list_references:
            label_list_references(0, 2, ref=[referenced[0], referenced[1]])
            label_list_references(2, 3, ref=[referenced[2], referenced[3]])

        # references can be direct
        with document.get_labeler('references') as label_references:
            label_references(0, 2, a=referenced[0], b=referenced[1])
            label_references(2, 3, a=referenced[2], b=referenced[3])

        # Referenced labels don't need to be added via "add_labels" or "Labeler.close" before
        # the label indices that reference them: the Document delays uploading any label index
        # to the server until the labels it references have been added.
        document.add_labels('referenced', referenced)
Example #3
def copy_document(event: Event,
                  source_document_name: str,
                  target_document_name: str,
                  index_names: Sequence[str] = ...):
    """Copies one document to another on the same event.

    Parameters
    ----------
    event: Event
        The event.
    source_document_name: str
        The source document name.
    target_document_name: str
        The target document name.
    index_names: Sequence[str]
        If specified, only the given label indices will be copied; by default all indices are
        copied.
    """
    source_document = event.documents[source_document_name]
    target_document = Document(target_document_name, text=source_document.text)
    event.add_document(target_document)
    if index_names is ...:
        info = source_document.get_label_indices_info()
        index_names = [i.index_name for i in info]
    for index_name in index_names:
        index = source_document.get_label_index(index_name)
        target_document.add_labels(index_name, index, distinct=index.distinct)
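
Below is a brief usage sketch of copy_document. The event, document names, and the 'tokens' index are illustrative assumptions (a local, in-memory event rather than one attached to an events service), not part of the original example.

from mtap import Document, Event, GenericLabel

# Hypothetical setup: an event holding a source document with one label index.
event = Event(event_id='example')
source = Document('plaintext', text='The quick brown fox.')
event.add_document(source)
source.add_labels('tokens', [GenericLabel(0, 3, word='The')])

# Copy only the 'tokens' index into a new document named 'copy'.
copy_document(event, 'plaintext', 'copy', index_names=['tokens'])

# Omitting index_names copies every label index on the source document.
copy_document(event, 'plaintext', 'copy_all')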
Example #4
    def process_document(self, document: Document, params: Dict[str, Any]):
        label_trigger = document.get_labeler('negation_triggers')
        with label_trigger:
            for sentence in document.get_label_index('sentences'):
                triggers = self.negex.detect_negex_triggers(sentence.text)
                for start_index, end_index, tags in triggers:
                    label_trigger(sentence.start_index + start_index,
                                  sentence.start_index + end_index,
                                  tags=tags)
Example #5
    def process_document(self,
                         document: Document,
                         params: Dict[str, Any]):
        sentences = document.labels['sentences']

        sentences_text = []
        for sentence in sentences:
            sentences_text.append(sentence.text)

        stanza_doc = self.nlp(sentences_text)

        all_deps = []
        all_upos_tags = []
        for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
            dependencies = {}
            stanza_dependencies = stanza_sentence.dependencies
            stanza_dependencies = list(stanza_dependencies)
            i = 0
            # Pop dependencies off the end of the list; if a dependency's head hasn't been
            # labeled yet, push it back to the front and retry it after its head is processed.
            while len(stanza_dependencies) > 0:
                i += 1
                if i > MAX_ITER:
                    raise ValueError(
                        'Maximum Iterations reached while processing dependency graph.')
                head, deprel, dep = stanza_dependencies.pop()
                head_id = int(head.id)
                if head_id == 0:
                    head_dep_label = None
                else:
                    try:
                        head_dep_label = dependencies[head_id]
                    except KeyError:
                        stanza_dependencies.insert(0, (head, deprel, dep))
                        continue

                token_begin = (sentence.start_index + dep.parent.start_char
                               - stanza_sentence.tokens[0].start_char)
                token_end = (sentence.start_index + dep.parent.end_char
                             - stanza_sentence.tokens[0].start_char)
                dep_label = GenericLabel(token_begin, token_end, head=head_dep_label, deprel=deprel)
                dep_label.reference_cache['dependents'] = []
                dependencies[int(dep.id)] = dep_label
                if head_dep_label is not None:
                    head_dep_label.dependents.append(dep_label)
                all_deps.append(dep_label)

            for word in stanza_sentence.words:
                token = word.parent
                token_begin = (sentence.start_index + token.start_char
                               - stanza_sentence.tokens[0].start_char)
                token_end = (sentence.start_index + token.end_char
                             - stanza_sentence.tokens[0].start_char)
                all_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))

        document.add_labels('dependencies', all_deps)
        document.add_labels('upos_tags', all_upos_tags)
Example #6
def test_labeler_distinct_and_type_id_raises(mocker):
    with pytest.raises(ValueError):
        client = mocker.Mock(EventsClient)
        event = Event(event_id='1', client=client)
        document = Document(
            document_name='plaintext',
            text='The quick brown fox jumped over the lazy dog.',
            event=event)
        document.get_labeler('index',
                             distinct=True,
                             label_adapter=DistinctGenericLabelAdapter)
Example #7
def test_yml_serializer():
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    one = label(start_index=0, end_index=5, x=10)
    two = label(start_index=6, end_index=10, x=15)
    document.add_labels('one', [one, two])
    document.add_labels('two', [label(start_index=0, end_index=25, a='b', b=one),
                                label(start_index=26, end_index=42, a='c', b=two)])
    document.add_labels('three', [
        label(start_index=0, end_index=10, foo=True),
        label(start_index=11, end_index=15, foo=False)
    ], distinct=True)

    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        e = YamlSerializer.file_to_event(tf)

    assert e.event_id == event.event_id
    assert e.metadata['foo'] == 'bar'
    d = e.documents['plaintext']
    assert d.text == document.text
    index_one = d.labels['one']
    assert index_one == [one, two]
    index_two = d.labels['two']
    assert index_two == [label(start_index=0, end_index=25, a='b', b=one),
                         label(start_index=26, end_index=42, a='c', b=two)]
    index_three = d.labels['three']
    assert index_three == [label(start_index=0, end_index=10, foo=True),
                           label(start_index=11, end_index=15, foo=False)]
Example #8
    def process_document(self, document: Document, params: Dict[str, Any]):
        terms_index_name = params.get('terms_index', 'umls_terms')
        label_negated = document.get_labeler('negated')
        terms = document.get_label_index(terms_index_name)
        triggers = document.labels['negation_triggers']
        deps = document.get_label_index('dependencies')
        upos_tags = document.get_label_index('upos_tags')
        with label_negated:
            for sentence in document.get_label_index('sentences'):
                sentence_terms = terms.inside(sentence)
                sentence_triggers = triggers.inside(sentence)
                if len(sentence_triggers) > 0:
                    negations, _ = self.negex.check_sentence(
                        sentence_terms, sentence_triggers, deps, upos_tags)
                    for start_index, end_index in negations:
                        label_negated(start_index, end_index)
Example #9
    def process_document(self, document: Document, params: Dict[str, Any]):
        terms_index_name = params.get('terms_index', 'umls_terms')
        label_negated = document.get_labeler('negated')
        label_trigger = document.get_labeler('negation_trigger')
        terms = document.get_label_index(terms_index_name)
        with label_negated, label_trigger:
            for sentence in document.get_label_index('sentences'):
                sentence_terms = [(t.start_index - sentence.start_index,
                                   t.end_index - sentence.start_index)
                                  for t in terms.inside(sentence)]
                negations, triggers = self.negex.check_sentence(
                    sentence.text, sentence_terms)
                for start_index, end_index in negations:
                    label_negated(sentence.start_index + start_index,
                                  sentence.start_index + end_index)
                for start_index, end_index in triggers:
                    label_trigger(sentence.start_index + start_index,
                                  sentence.start_index + end_index)
Example #10
def test_copy_document():
    e = Event()
    doc = Document(document_name='first',
                   text='The quick brown fox jumped over the lazy dog.')
    e.add_document(doc)
    with doc.get_labeler('some_index') as label:
        label(0, 3, word='The')
        label(4, 9, word='quick')
        label(10, 15, word='brown')
    processor = CopyDocument('first', 'second')
    processor.process(e, {})
    second = e.documents['second']
    assert second is not None
    assert second.labels['some_index'] == [
        GenericLabel(0, 3, word='The'),
        GenericLabel(4, 9, word='quick'),
        GenericLabel(10, 15, word='brown')
    ]
Example #11
def test_add_labels_distinct(mocker):
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    labels = [
        GenericLabel(0, 10, document=document, x=1),
        GenericLabel(11, 15, document=document, x=2),
        GenericLabel(16, 20, document=document, x=3)
    ]
    l2 = document.add_labels('index', labels, distinct=True)
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    assert l2 == labels
    assert l2.distinct
Example #12
    def process_document(self, document: Document, params: Dict[str, Any]):
        sentences = document.labels['sentences']

        sentences_text = []
        for sentence in sentences:
            sentences_text.append(sentence.text)

        stanza_doc = self.nlp(sentences_text)

        all_deps = []
        all_upos_tags = []
        for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
            sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(
                sentence, stanza_sentence)
            all_deps.extend(sentence_deps)
            all_upos_tags.extend(sentence_upos_tags)

        document.add_labels('dependencies', all_deps)
        document.add_labels('upos_tags', all_upos_tags)
Example #13
    def build_doc(self, event):
        text = ' '.join(self.text)
        d = Document(document_name='plaintext', text=text)
        event.add_document(d)
        d.add_labels('pos_tags', self.tags, distinct=True)
        d.add_labels('sentences', self.sentences, distinct=True)
        return d
Example #14
def test_labeler_distinct(mocker):
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    with document.get_labeler('index', distinct=True) as add_generic_label:
        add_generic_label(0, 10, x=1)
        add_generic_label(11, 15, x=2)
        add_generic_label(16, 20, x=3)
    labels = [
        GenericLabel(0, 10, document=document, x=1),
        GenericLabel(11, 15, document=document, x=2),
        GenericLabel(16, 20, document=document, x=3)
    ]
    label_adapter = DistinctGenericLabelAdapter
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=label_adapter)
    assert document.get_label_index('index') == labels
Example #15
def test_add_labels_not_distinct(mocker):
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    labels = [
        GenericLabel(0, 10, document=document, x=1),
        GenericLabel(11, 15, document=document, x=2),
        GenericLabel(16, 20, document=document, x=3)
    ]
    document.add_labels('index', labels)
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    l2 = document.labels['index']
    assert l2 == labels
    assert not l2.distinct
Example #16
def test_labeler_distinct(mocker):
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    with document.get_labeler('index', distinct=True) as add_generic_label:
        add_generic_label(0, 10, x=1)
        add_generic_label(11, 15, x=2)
        add_generic_label(16, 20, x=3)
    labels = [
        GenericLabel(0, 10, document=document, x=1),
        GenericLabel(11, 15, document=document, x=2),
        GenericLabel(16, 20, document=document, x=3)
    ]
    label_adapter = DISTINCT_GENERIC_ADAPTER
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=label_adapter)
    assert document.labels['index'] == labels
Example #17
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('input',
                        metavar='INPUT_FOLDER',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--source-name',
                        metavar='DOCUMENT_NAME',
                        default='source',
                        help='What document to dump the PTB text into.')
    parser.add_argument(
        '--target-name',
        metavar='DOCUMENT_NAME',
        default='plaintext',
        help='What document to dump the plaintext and annotations into.')
    parser.add_argument('--events',
                        metavar='EVENTS',
                        default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader',
                        metavar='READER',
                        default=None,
                        help='The address of the PTB Reader.')
    parser.add_argument('--tnt-trainer',
                        metavar='TRAINER',
                        default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)
    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('ptb-reader',
                            address=args.ptb_reader,
                            params={
                                'source_document_name': args.source_name,
                                'target_document_name': args.target_name
                            }),
            RemoteProcessor('biomedicus-tnt-trainer',
                            address=args.tnt_trainer,
                            params={'document_name':
                                    args.target_name})) as pipeline:
        for f in Path(args.input).rglob(args.glob):
            print('Reading:', f)
            with f.open('r') as r:
                text = r.read()
            with Event(event_id=f.name, client=client) as event:
                d = Document(args.source_name, text=text)
                event.add_document(d)
                pipeline.run(event)
Example #18
    def process_document(self,
                         document: mtap.Document,
                         params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        if params['do_work']:
            with self.started_stopwatch('fetch_time'):
                text = document.text

            a_count = text.count('a')
            b_count = text.count('b')

            with document.get_labeler('mtap.examples.letter_counts') as label_letter_count:
                label_letter_count(start_index=0, end_index=len(document.text), letter='a',
                                   count=a_count)
                label_letter_count(start_index=0, end_index=len(document.text), letter='b',
                                   count=b_count)

        return {'answer': 42}
Example #19
def main(args=None):
    parser = ArgumentParser()
    parser.add_argument('input',
                        metavar='INPUT_DIR',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('output',
                        metavar='OUTPUT_DIR',
                        help='A folder to write the json files to.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--events',
                        metavar='EVENTS',
                        default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader',
                        metavar='READER',
                        default=None,
                        help='The address of the PTB Reader.')
    args = parser.parse_args(args)
    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('ptb-reader',
                            address=args.ptb_reader,
                            params={
                                'source_document_name': 'source',
                                'target_document_name': 'gold',
                                'pos_tags_index': 'gold_tags'
                            }),
            LocalProcessor(SerializationProcessor(JsonSerializer,
                                                  output_dir=args.output),
                           component_id='serializer',
                           client=client)) as pipeline:
        for f in Path(args.input).rglob(args.glob):
            print('Reading:', f)
            with f.open('r') as r:
                text = r.read()
            with Event(event_id=f.name, client=client) as event:
                d = Document('source', text=text)
                event.add_document(d)
                pipeline.run(event)
Example #20
def test_sentences_unknown_character(bi_lstm_model):
    document = Document('plaintext',
                        text='• Sentence which contains unknown character.')
    bi_lstm_model.process_document(document, {})
    assert document.get_label_index('sentences') == [GenericLabel(2, 44)]
Example #21
# Copyright 2019 Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Hello world tutorial pipeline."""
import sys

if __name__ == '__main__':
    from mtap import Document, Event, EventsClient, Pipeline, RemoteProcessor

    with EventsClient(address=sys.argv[1]) as client, \
            Pipeline(
                RemoteProcessor(processor_id='hello', address=sys.argv[2])
            ) as pipeline:
        with Event(event_id='1', client=client) as event:
            document = Document(document_name='name', text='YOUR NAME')
            event.add_document(document)
            pipeline.run(document)
            index = document.get_label_index('hello')
            for label in index:
                print(label.response)
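
The pipeline above assumes a 'hello' processor is already serving at the address in sys.argv[2]. A minimal sketch of such a processor, modeled on the MTAP hello-world tutorial, is shown below; the exact import locations and the greeting text are assumptions and may vary by MTAP version.

import mtap


@mtap.processor('hello')
class HelloProcessor(mtap.DocumentProcessor):
    def process_document(self, document, params):
        # Label the whole document with a greeting in the 'response' field,
        # which the pipeline script above reads and prints.
        with document.get_labeler('hello') as add_hello:
            text = document.text
            add_hello(0, len(text), response='Hello ' + text + '!')


if __name__ == '__main__':
    mtap.run_processor(HelloProcessor())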
Example #22
def test_text_from_document():
    d = Document('plaintext', text='This is text.')
    assert GenericLabel(5, 7, document=d).text == 'is'
Example #23
import pytest

from mtap import GenericLabel, Location, Document
from mtap.data._label_indices import presorted_label_index, label_index

document = Document('plaintext', text='blah')


@pytest.fixture
def tested():
    return presorted_label_index([
        GenericLabel(0, 5, document=document, i=0),
        GenericLabel(0, 7, document=document, i=1),
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(9, 10, document=document, i=5),
        GenericLabel(9, 13, document=document, i=6),
        GenericLabel(9, 13, document=document, i=7),
    ])
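
The example ends with the fixture. A sketch of a test that might consume it follows; the particular query and expected results are illustrative assumptions about LabelIndex.inside (used in several of the other examples here), not the file's original assertions.

def test_inside(tested):
    # Labels fully contained in the span [2, 8): (2, 6), (6, 7), and (6, 8).
    assert tested.inside(2, 8) == [
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(6, 8, document=document, i=4),
    ]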
Example #24
import pytest

from mtap import GenericLabel, Document

document = Document('plaintext', text='foo bar')


def test_get_repr():
    label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0)

    rep = repr(label)
    assert rep.startswith("GenericLabel(0, 20, ")
    assert 'a="x"' in rep or "a='x'" in rep
    assert 'y=20' in rep
    assert 'z=20.0' in rep


def test_get_attr():
    label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0)
    # Assumed completion of the truncated snippet: attributes echo the constructor kwargs.
    assert label.a == "x" and label.y == 20 and label.z == 20.0
Example #25
    def process_document(self, document: Document, params: Dict[str, Any]):
        with document.get_labeler('sentences', distinct=True) as add_sentence:
            for start, end in predict_text(self.model, self.input_mapper,
                                           document.text, self.device):
                add_sentence(start, end)
Example #26
    def process_document(self, document: Document, params: Dict[str, Any]):
        text = document.text
        result = self.pool.apply(predict_sentences_async, args=(text,))
        with document.get_labeler('sentences', distinct=True) as add_sentence:
            for start, end in result:
                add_sentence(start, end)
Example #27
    def process_document(self, document: Document, params: Dict[str, Any]):
        with document.get_labeler('sentences') as sentence_labeler:
            for start, end in get_sentences(document.text):
                sentence_labeler(start, end)
Example #28
def test_yml_serializer():
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    document.add_labels('one', [
        label(start_index=0, end_index=5, x=10),
        label(start_index=6, end_index=10, x=15)
    ])
    document.add_labels('two', [
        label(start_index=0, end_index=25, a='b'),
        label(start_index=26, end_index=42, a='c')
    ])
    document.add_labels('three', [
        label(start_index=0, end_index=10, foo=True),
        label(start_index=11, end_index=15, foo=False)
    ], distinct=True)

    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        o = yaml.load(tf, Loader=Loader)

    assert o['event_id'] == '1'
    assert o['metadata']['foo'] == 'bar'
    d = o['documents']['plaintext']
    assert d['text'] == 'Some text.'
    assert len(d['label_indices']) == 3
    assert d['label_indices']['one'] == {
        'json_labels': [{
            'start_index': 0,
            'end_index': 5,
            'x': 10
        }, {
            'start_index': 6,
            'end_index': 10,
            'x': 15
        }],
        'distinct': False
    }
    assert d['label_indices']['two'] == {
        'json_labels': [{
            'start_index': 0,
            'end_index': 25,
            'a': 'b'
        }, {
            'start_index': 26,
            'end_index': 42,
            'a': 'c'
        }],
        'distinct': False
    }
    assert d['label_indices']['three'] == {
        'json_labels': [{
            'start_index': 0,
            'end_index': 10,
            'foo': True
        }, {
            'start_index': 11,
            'end_index': 15,
            'foo': False
        }],
        'distinct': True
    }