예제 #1
0
def test_yml_serializer():
    """Round-trip an event through the YAML serializer and verify equality."""
    event = Event(event_id='1')
    event.metadata['foo'] = "bar"
    document = Document('plaintext', text='Some text.')
    event.add_document(document)
    first = label(start_index=0, end_index=5, x=10)
    second = label(start_index=6, end_index=10, x=15)
    document.add_labels('one', [first, second])
    document.add_labels(
        'two',
        [label(start_index=0, end_index=25, a='b', b=first),
         label(start_index=26, end_index=42, a='c', b=second)])
    document.add_labels(
        'three',
        [label(start_index=0, end_index=10, foo=True),
         label(start_index=11, end_index=15, foo=False)],
        distinct=True)

    with TemporaryFile('w+') as tf:
        YamlSerializer.event_to_file(event, tf)
        tf.flush()
        tf.seek(0)
        deserialized = YamlSerializer.file_to_event(tf)

    assert deserialized.event_id == event.event_id
    assert deserialized.metadata['foo'] == 'bar'
    doc = deserialized.documents['plaintext']
    assert doc.text == document.text
    assert doc.labels['one'] == [first, second]
    assert doc.labels['two'] == [
        label(start_index=0, end_index=25, a='b', b=first),
        label(start_index=26, end_index=42, a='c', b=second)
    ]
    assert doc.labels['three'] == [
        label(start_index=0, end_index=10, foo=True),
        label(start_index=11, end_index=15, foo=False)
    ]
예제 #2
0
파일: copy_document.py 프로젝트: nlpie/mtap
def copy_document(event: mtap.Event,
                  source_document_name: str,
                  target_document_name: str,
                  index_names: typing.Sequence[str] = ...):
    """Copies one document to another on the same event.

    Parameters
    ----------
    event: Event
        The event.
    source_document_name: str
        The source document name.
    target_document_name: str
        The target document name.
    index_names: Sequence[str]
        If specified, only these label indices are copied; by default every
        index on the source document is copied.
    """
    source = event.documents[source_document_name]
    target = mtap.Document(target_document_name, text=source.text)
    event.add_document(target)
    # ``...`` is the sentinel for "copy everything".
    names = list(source.labels) if index_names is ... else index_names
    for name in names:
        index = source.labels[name]
        target.add_labels(name, index, distinct=index.distinct)
예제 #3
0
 def source():
     """Generate a plaintext Document for every .txt file under input_dir."""
     for txt_path in input_dir.rglob('*.txt'):
         with txt_path.open('r', errors='replace') as reader:
             contents = reader.read()
         event = Event(event_id=str(txt_path.relative_to(input_dir)),
                       client=default_pipeline.events_client)
         yield event.create_document('plaintext', contents)
예제 #4
0
def test_event_to_dict_include_label_text():
    """event_to_dict(include_label_text=True) attaches each label's text."""
    event = Event()
    doc = event.create_document('plaintext', text)
    doc.add_labels('sentences', [label(0, 117)])
    doc.add_labels('tokens', [label(s, e) for s, e in tokens])

    serialized = event_to_dict(event, include_label_text=True)
    doc_dict = serialized['documents']['plaintext']
    sentence_labels = doc_dict['label_indices']['sentences']['json_labels']
    assert sentence_labels[0]['_text'] == text
    token_labels = doc_dict['label_indices']['tokens']['json_labels']
    for i, token in enumerate(token_labels):
        start, end = tokens[i]
        assert token['_text'] == text[start:end]
예제 #5
0
파일: test_metrics.py 프로젝트: nlpie/mtap
def test_print_debug_all():
    """print_debug='all' writes both false-positive and false-negative context."""
    event = Event()
    doc = event.create_document(
        'test', 'The quick brown fox jumps over the lazy dog.')
    with doc.get_labeler('target') as label_target:
        label_target(16, 19)
    with doc.get_labeler('tested') as label_tested:
        label_tested(10, 15)

    debug_out = StringIO()
    metric = FirstTokenConfusion(print_debug='all', debug_handle=debug_out)
    metric.update(doc, doc.labels['tested'], doc.labels['target'])
    expected = ('False Positives\n'
                'The quick {brown} fox jumps over the lazy dog.\n\n'
                'False Negatives\n'
                'The quick brown {fox} jumps over the lazy dog.\n\n')
    assert debug_out.getvalue() == expected
예제 #6
0
def test_run_concurrently_with_failure(mocker):
    """run_multithread raises ValueError once failures exceed max_failures."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(
            LocalProcessor(Processor('1'), component_id='processor1'),
            LocalProcessor(Processor('2'), component_id='processor2'),
            LocalProcessor(Processor('3'), component_id='processor3'),
            events_client=client
    ) as pipeline:
        good = [Event(event_id=str(i), client=client) for i in range(7)]
        failing = [Event(event_id='fail_' + str(i), client=client)
                   for i in range(4)]
        with pytest.raises(ValueError):
            pipeline.run_multithread(good + failing, show_progress=False,
                                     max_failures=2)
예제 #7
0
def test_java_references(python_events, java_references_processor):
    """Label references produced by the Java processor deserialize correctly."""
    with EventsClient(address=python_events) as client, Pipeline(
        RemoteProcessor('mtap-java-reference-labels-example-processor',
                        address=java_references_processor)
    ) as pipeline:
        with Event(event_id='1', client=client) as event:
            document = event.create_document('plaintext', 'abcd')
            pipeline.run(document)

            # Pairwise (a, b) references.
            references = document.labels['references']
            assert (references[0].a, references[0].b) == (GenericLabel(0, 1),
                                                          GenericLabel(1, 2))
            assert (references[1].a, references[1].b) == (GenericLabel(2, 3),
                                                          GenericLabel(3, 4))

            # Map-valued reference: one key per character of the document.
            map_references = document.labels['map_references']
            assert map_references[0].ref == {
                key: GenericLabel(i, i + 1) for i, key in enumerate('abcd')
            }

            # List-valued references.
            list_references = document.labels['list_references']
            assert list_references[0].ref == [GenericLabel(0, 1),
                                              GenericLabel(1, 2)]
            assert list_references[1].ref == [GenericLabel(2, 3),
                                              GenericLabel(3, 4)]
예제 #8
0
def main(args=None):
    """Print each sentence and its dependency triples for one input file.

    Parameters
    ----------
    args: Sequence[str], optional
        Command-line arguments; defaults to sys.argv.
    """
    parser = ArgumentParser()
    parser.add_argument('--events-service')
    parser.add_argument('--sentences-service')
    parser.add_argument('--dependencies-service')
    parser.add_argument('input_file')
    ns = parser.parse_args(args)

    with EventsClient(address=ns.events_service) as client, \
            Pipeline(
                RemoteProcessor('biomedicus-sentences', address=ns.sentences_service),
                RemoteProcessor('biomedicus-dependencies', address=ns.dependencies_service)
            ) as pipeline:
        with open(ns.input_file, 'r') as in_f:
            document_text = in_f.read()
        with Event(event_id=Path(ns.input_file).name,
                   client=client) as event:
            document = event.create_document('plaintext', document_text)
            pipeline.run(document)
            dependencies = document.labels['dependencies']
            for sentence in document.labels['sentences']:
                print(sentence.text)
                print('\n')
                for dep in dependencies.inside(sentence):
                    head = dep.head
                    head_text = 'ROOT' if head is None else head.text
                    print((dep.text, dep.deprel, head_text))
                print('\n')
예제 #9
0
def main(args=None):
    """Train a TnT POS model from a GENIA XML corpus.

    Parses the GENIA XML file, builds one event per article containing its
    title and abstract sentences, and runs each through the TnT trainer.

    Parameters
    ----------
    args: Sequence[str], optional
        Command-line arguments; defaults to sys.argv.
    """
    parser = ArgumentParser()
    parser.add_argument('input',
                        metavar='INPUT_FILE',
                        help='The input GENIA XML file.')
    parser.add_argument('--events',
                        metavar='EVENTS',
                        default=None,
                        help='The address of the events service.')
    parser.add_argument('--tnt-trainer',
                        metavar='TRAINER',
                        default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)
    etree = ElementTree.parse(args.input)
    # Renamed from ``set`` to avoid shadowing the builtin.
    root = etree.getroot()
    with EventsClient(args.events) as client, Pipeline(
            RemoteProcessor('biomedicus-tnt-trainer',
                            address=args.tnt_trainer)) as pipeline:
        for article in root.findall('article'):
            # Renamed from ``id`` to avoid shadowing the builtin; the first
            # child of <articleinfo> holds the article identifier.
            article_id = list(article.find('articleinfo'))[0].text
            with Event(article_id, client) as event:
                db = DocumentBuilder()
                for sentence in (article.find('title').findall('sentence')
                                 + article.find('abstract').findall('sentence')):
                    db.add_sentence(sentence)
                d = db.build_doc(event)
                pipeline.run(d)
예제 #10
0
def main(args=None):
    """Run the BioMedICUS pipeline over a directory and serialize JSON output.

    Parameters
    ----------
    args: Sequence[str], optional
        Command-line arguments; defaults to sys.argv.
    """
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--events")
    parser.add_argument("--tagger")
    parser.add_argument("--sentences")
    parser.add_argument("--acronyms")
    parser.add_argument("--norms")
    parser.add_argument("--concepts")
    ns = parser.parse_args(args)

    input_dir = Path(ns.input_directory)
    with EventsClient(address=ns.events) as client, Pipeline(
            RemoteProcessor('biomedicus-sentences', address=ns.sentences),
            RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
            RemoteProcessor('biomedicus-acronyms', address=ns.acronyms),
            RemoteProcessor('biomedicus-concepts', address=ns.concepts),
            LocalProcessor(
                SerializationProcessor(JsonSerializer,
                                       output_dir=ns.output_directory),
                component_id='serialize',
                client=client)) as pipeline:
        for txt_path in input_dir.glob("**/*.txt"):
            print("READING FILE:", str(txt_path))
            with txt_path.open('r') as reader:
                contents = reader.read()
            with Event(event_id=txt_path.stem, client=client) as event:
                document = event.create_document("plaintext", text=contents)
                pipeline.run(document)

        pipeline.print_times()
예제 #11
0
 def process(self, event: Event, params: Dict[str, Any]):
     """Mark an event as processed; raise for event ids containing 'fail'."""
     self.seen = self.seen + 1
     if 'fail' in event.event_id:
         raise ValueError("fail")
     time.sleep(0.001)
     metadata = event.metadata
     metadata[self.identifier] = 'True'
     metadata['processor'] = self.identifier
     self.processed = self.processed + 1
예제 #12
0
 def process_text(self,
                  text: str,
                  *,
                  event_id: str = None) -> ProcessingResult:
     """Run the pipeline over a single text inside a fresh event.

     NOTE(review): ``event_id`` is implicitly optional (defaults to None);
     the annotation would more precisely be ``Optional[str]``.
     """
     with Event(event_id=event_id, client=self.events_client) as event:
         doc = event.create_document('plaintext', text=text)
         result = self.pipeline.run(doc)
     return result
예제 #13
0
 def create_document(self):
     """Yield a plaintext document carrying the gold annotation indices."""
     with Event(client=self.client) as event:
         doc = event.create_document('plaintext', self.txt)
         # Attach each gold index in the same order as before.
         for index_name, labels in (('gold_dependencies', self.all_deps),
                                    ('sentences', self.sentences),
                                    ('pos_tags', self.pos_tags),
                                    ('norm_forms', self.norms)):
             doc.add_labels(index_name, labels)
         yield doc
예제 #14
0
def test_time_result():
    """Timing info records at least the processor's simulated work time."""
    proc = Processor()
    with Pipeline(
            LocalProcessor(proc, component_id='test_processor', client=None)
    ) as pipeline:
        timing = pipeline.run(Event())[0].timing_info
        assert timing['process_method'] >= timedelta(seconds=0.001)
예제 #15
0
def test_copy_document():
    """CopyDocument copies text and label indices to the target document."""
    event = Event()
    doc = Document(document_name='first',
                   text='The quick brown fox jumped over the lazy dog.')
    event.add_document(doc)
    words = [(0, 3, 'The'), (4, 9, 'quick'), (10, 15, 'brown')]
    with doc.get_labeler('some_index') as labeler:
        for start, end, word in words:
            labeler(start, end, word=word)
    CopyDocument('first', 'second').process(event, {})
    copied = event.documents['second']
    assert copied is not None
    assert copied.labels['some_index'] == [
        GenericLabel(start, end, word=word) for start, end, word in words
    ]
예제 #16
0
def test_labeler_distinct_and_type_id_raises(mocker):
    """get_labeler rejects distinct=True combined with a label adapter."""
    with pytest.raises(ValueError):
        event = Event(event_id='1', client=mocker.Mock(EventsClient))
        doc = Document(document_name='plaintext',
                       text='The quick brown fox jumped over the lazy dog.',
                       event=event)
        doc.get_labeler('index', distinct=True,
                        label_adapter=DistinctGenericLabelAdapter)
예제 #17
0
 def source():
     """Generate documents for .txt files, skipping any in skip_documents."""
     for txt_path in input_directory.rglob('*.txt'):
         relative = str(txt_path.relative_to(input_directory))
         if relative in skip_documents:
             continue
         with txt_path.open('r') as reader:
             contents = reader.read()
         with Event(event_id=relative,
                    client=pipeline.events_client,
                    only_create_new=True) as event:
             yield event.create_document('plaintext', contents)
예제 #18
0
def main(args=None):
    """Label gold concepts from a CSV onto MiPACQ source documents, then run
    sentences/tagger/acronyms over them and serialize the results with pickle.

    Parameters
    ----------
    args: Sequence[str], optional
        Command-line arguments; defaults to sys.argv.
    """
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("concepts_csv", metavar="PATH_TO_CONCEPTS_CSV")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    parser.add_argument("--sentences")
    parser.add_argument("--tagger")
    parser.add_argument("--acronyms")
    parser.add_argument("--events")

    ns = parser.parse_args(args)

    print('Reading concepts csv...')
    # identifier -> list of (start, end, cui) gold concept spans.
    concepts = {}
    with open(ns.concepts_csv, 'r') as f:
        # Iterate the file directly instead of materializing readlines().
        for line in f:
            splits = line.split(',')
            # NOTE(review): column 0 is treated as the end offset and
            # column 1 as the start offset — confirm against the CSV schema.
            end = splits[0]
            start = splits[1]
            cui = splits[5]
            identifier = splits[6]
            # setdefault replaces the try/except-KeyError accumulation.
            concepts.setdefault(identifier, []).append((start, end, cui))

    print('Reading mipacq source files...')
    with EventsClient(address=ns.events) as client, \
            Pipeline(
                RemoteProcessor('biomedicus-sentences', address=ns.sentences),
                RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
                RemoteProcessor('biomedicus-acronyms', address=ns.acronyms),
                LocalProcessor(SerializationProcessor(PickleSerializer,
                                                      output_dir=ns.output_directory),
                               component_id='serialize',
                               client=client)
            ) as pipeline:
        for path in Path(ns.input_directory).glob('**/*.source'):
            # The stem prefix before the first '-' keys into the concepts CSV.
            identifier = path.stem.split('-')[0]
            doc_concepts = concepts.get(identifier)
            if doc_concepts is None:
                # No gold concepts for this document; skip it.
                continue
            with Event(event_id=identifier, client=client) as event:
                with path.open('r') as f:
                    text = f.read()
                document = event.create_document('plaintext', text)
                with document.get_labeler('gold_concepts') as label_concept:
                    for start, end, cui in doc_concepts:
                        label_concept(start, end, cui=cui)
                pipeline.run(document)
예제 #19
0
def rtf_source(input_directory: Path, extension_glob: str,
               events_client: EventsClient):
    """Yield events carrying the raw RTF bytes of each matching file."""
    input_directory = Path(input_directory)
    for rtf_path in input_directory.rglob(extension_glob):
        with rtf_path.open('rb', errors=None) as reader:
            rtf_bytes = reader.read()
        with Event(event_id=str(rtf_path.relative_to(input_directory)),
                   client=events_client,
                   only_create_new=True) as event:
            event.binaries['rtf'] = rtf_bytes
            yield event
예제 #20
0
 def provide(self, consume: Callable[[Union[Document, Event]],
                                     None]):
     """Feed up to conf.limit plaintext documents to the consumer callback."""
     for count, txt_path in enumerate(input_dir.rglob('*.txt'), start=1):
         if count > conf.limit:
             break
         with txt_path.open('r', errors='replace') as reader:
             contents = reader.read()
         relative = str(txt_path.relative_to(input_dir))
         with Event(event_id=relative,
                    client=default_pipeline.events_client,
                    only_create_new=True) as event:
             consume(event.create_document('plaintext', contents))
예제 #21
0
 def on_created(self, event: FileSystemEvent):
     """Watchdog hook: package a newly created RTF file into an mtap event.

     Parameters
     ----------
     event: FileSystemEvent
         The filesystem event describing the created path.
     """
     if event.is_directory:
         return
     # BUG FIX: ``src_path`` is an attribute/property on watchdog's
     # FileSystemEvent, not a method; calling it raised TypeError.  The
     # sibling handler in this codebase already accesses it without a call.
     src_path = event.src_path
     if fnmatch.fnmatch(src_path, self.extension_glob):
         path = Path(src_path)
         with path.open('rb', errors=None) as f:
             rtf = f.read()
         relative = str(path.relative_to(self.input_directory))
         # Renamed from ``event`` to stop shadowing the handler parameter.
         with Event(event_id=relative,
                    client=self.events_client,
                    only_create_new=True) as mtap_event:
             mtap_event.binaries['rtf'] = rtf
             self.consume(mtap_event)
예제 #22
0
def test_time_result(mocker):
    """Component results record the processor's process_method duration."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(
            LocalProcessor(Processor(), component_id='test_processor'),
            events_client=client
    ) as pipeline:
        result = pipeline.run(Event())
        elapsed = result.component_results[0].timing_info['process_method']
        assert elapsed >= timedelta(seconds=0.001)
예제 #23
0
파일: test_metrics.py 프로젝트: nlpie/mtap
def test_fields():
    """Accuracy restricted to field 'x' scores half the labels correct."""
    with Event(event_id='1') as event:
        doc = event.create_document('test', 'This is some text.')
        with doc.get_labeler('tested') as tested:
            for start, end, x, y in [(0, 5, 1, 3), (6, 10, 3, 4)]:
                tested(start, end, x=x, y=y)
        with doc.get_labeler('target') as target:
            for start, end, x, y in [(0, 5, 1, 5), (6, 10, 2, 6)]:
                target(start, end, x=x, y=y)

        accuracy = Accuracy(fields=['x'])
        metrics = Metrics(accuracy, tested='tested', target='target')
        metrics.process_document(doc, params={})
        assert abs(accuracy.value - 0.5) < 1e-6
예제 #24
0
파일: test_metrics.py 프로젝트: nlpie/mtap
def test_any():
    """Accuracy in 'any' mode counts a hit if any tested label matches."""
    with Event(event_id='1') as event:
        doc = event.create_document('test', 'This is some text.')
        with doc.get_labeler('tested') as tested:
            for x_value in (1, 3):
                tested(0, 5, x=x_value)
        with doc.get_labeler('target') as target:
            target(0, 5, x=1)
            target(6, 10, x=2)

        accuracy = Accuracy(mode='any')
        metrics = Metrics(accuracy, tested='tested', target='target')
        metrics.process_document(doc, params={})
        assert abs(accuracy.value - 0.5) < 1e-6
예제 #25
0
 def on_created(self, event: FileSystemEvent):
     """Watchdog hook: process a newly created text file into a document."""
     if event.is_directory:
         return
     src_path = event.src_path
     if not fnmatch.fnmatch(src_path, self.extension_glob):
         return
     print('Processing: ' + src_path)
     path = Path(src_path)
     with path.open('r', errors=None) as reader:
         txt = reader.read()
     relative = str(path.relative_to(self.input_directory))
     with Event(event_id=relative,
                client=self.events_client,
                only_create_new=True) as mtap_event:
         doc = mtap_event.create_document(self.document_name, txt)
         self.consume(doc)
예제 #26
0
def test_run_concurrently(mocker):
    """run_multithread completes cleanly over ten events."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    client.instance_id = 0
    with Pipeline(
            LocalProcessor(Processor('1'), component_id='processor1'),
            LocalProcessor(Processor('2'), component_id='processor2'),
            LocalProcessor(Processor('3'), component_id='processor3'),
            events_client=client
    ) as pipeline:
        pipeline.events_client = client
        pipeline.run_multithread([Event() for _ in range(10)],
                                 show_progress=False)
예제 #27
0
def test_run_multi(mocker):
    """Each event's result contains one entry per pipeline component."""
    client = mocker.Mock(EventsClient)
    client.get_all_document_names.return_value = ['plaintext']
    client.get_all_metadata.return_value = {}
    components = [
        LocalProcessor(Processor('1'), component_id='processor1', client=client),
        LocalProcessor(Processor('2'), component_id='processor2', client=client),
        LocalProcessor(Processor('3'), component_id='processor3', client=client),
    ]
    with Pipeline(*components) as pipeline:
        results = pipeline.run_multithread([Event() for _ in range(10)],
                                           progress=False)
        assert all(len(result) == 3 for result in results)
예제 #28
0
def main(args=None):
    """Run PTB documents through the PTB reader and train a TnT model.

    Each PTB file is loaded into a source document; the ptb-reader processor
    writes the plaintext and annotations into the target document, which the
    TnT trainer then consumes.

    Parameters
    ----------
    args: Sequence[str], optional
        Command-line arguments; defaults to sys.argv.
    """
    parser = ArgumentParser()
    parser.add_argument('input',
                        metavar='INPUT_FOLDER',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--source-name',
                        metavar='DOCUMENT_NAME',
                        default='source',
                        help='What document to dump the PTB text into.')
    parser.add_argument(
        '--target-name',
        metavar='DOCUMENT_NAME',
        default='plaintext',
        # FIX: help string was missing its verb ("to the plaintext ... into").
        help='What document to dump the plaintext and annotations into.')
    parser.add_argument('--events',
                        metavar='EVENTS',
                        default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader',
                        metavar='READER',
                        default=None,
                        help='The address of the PTB Reader.')
    parser.add_argument('--tnt-trainer',
                        metavar='TRAINER',
                        default=None,
                        help='The address of the TnT trainer.')
    ns = parser.parse_args(args)
    with EventsClient(address=ns.events) as client, Pipeline(
            RemoteProcessor('ptb-reader',
                            address=ns.ptb_reader,
                            params={
                                'source_document_name': ns.source_name,
                                'target_document_name': ns.target_name
                            }),
            RemoteProcessor('biomedicus-tnt-trainer',
                            address=ns.tnt_trainer,
                            params={'document_name':
                                    ns.target_name})) as pipeline:
        for ptb_file in Path(ns.input).rglob(ns.glob):
            print('Reading:', ptb_file)
            with ptb_file.open('r') as reader:
                source_text = reader.read()
            with Event(event_id=ptb_file.name, client=client) as event:
                event.add_document(Document(ns.source_name, text=source_text))
                pipeline.run(event)
예제 #29
0
파일: test_metrics.py 프로젝트: nlpie/mtap
def test_begin_token_precision_recall_f1():
    """First-token confusion: two of three label starts match each way."""
    with Event() as event:
        doc = event.create_document(
            'test', 'The quick brown fox jumps over the lazy dog.')
        with doc.get_labeler('tested') as label_tested:
            for span in [(0, 9), (10, 19), (20, 44)]:
                label_tested(*span)
        with doc.get_labeler('target') as label_target:
            for span in [(0, 19), (20, 30), (31, 44)]:
                label_target(*span)

        metric = FirstTokenConfusion()
        metric.update(doc, doc.labels['tested'], doc.labels['target'])
        expected = 2 / 3
        assert metric.precision == expected
        assert metric.recall == expected
        assert metric.f1 == expected
예제 #30
0
def main(args=None):
    """Read text from stdin, detect sentences, and print them with timings.

    Parameters
    ----------
    args: Sequence[str], optional
        Command-line arguments; defaults to sys.argv.
    """
    parser = ArgumentParser()
    parser.add_argument('--events-service', default='localhost:10100')
    parser.add_argument('--sentences-service', default='localhost:10102')
    ns = parser.parse_args(args)
    with Pipeline(
            RemoteProcessor(
                'biomedicus-sentences',
                address=ns.sentences_service)) as pipeline, EventsClient(
                    address=ns.events_service) as events_client:
        text = sys.stdin.read()
        with Event(client=events_client) as event:
            doc = event.create_document('plaintext', text)
            result = pipeline.run(doc)
            for sentence in doc.get_label_index('sentences'):
                print('S: "', sentence.text, '"')
            timing = result[0].timing_info
            for key, value in timing.items():
                print('{}: {}'.format(key, value))