Пример #1
0
class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    Instances work as context managers: leaving the ``with`` block closes the
    pipeline and, when this object created the events client itself, the
    client as well.

    Attributes
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.
        close_client (bool): True when the events client was created here from
            ``conf.events_address`` and is therefore closed on exit.

    """
    def __init__(self, conf: PipelineConf, *, events_client: EventsClient = None):
        """Builds the processing pipeline described by ``conf``.

        Args:
            conf: Supplies the processor identifiers and addresses, the events
                address, the discovery flag and the serializer settings.
            events_client: An existing events client to use. When given, it is
                not closed by this object.

        Raises:
            ValueError: If neither ``events_client`` nor ``conf.events_address``
                is provided.
        """
        conf.populate_addresses()
        if events_client is not None:
            # Caller-owned client: use it, but leave closing it to the caller.
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        # (processor id, address) pairs in the order the processors are run.
        stages = [
            (conf.sentences_id, conf.sentences_address),
            (conf.section_headers_id, conf.section_headers_address),
            (conf.tagger_id, conf.tagger_address),
            (conf.acronyms_id, conf.acronyms_address),
            (conf.concepts_id, conf.concepts_address),
            (conf.negation_id, conf.negation_address),
            (conf.selective_dependencies_id, conf.selective_dependencies_address),
            (conf.deepen_id, conf.deepen_address)
        ]
        if conf.use_discovery:
            # Addresses are deliberately omitted when discovery is enabled.
            processors = [RemoteProcessor(identifier) for identifier, _ in stages]
        else:
            processors = [RemoteProcessor(identifier, address=addr)
                          for identifier, addr in stages]
        self.pipeline = Pipeline(*processors)

        if conf.serializer is not None:
            serialization_proc = SerializationProcessor(get_serializer(conf.serializer),
                                                        conf.output_directory,
                                                        include_label_text=conf.include_label_text)
            ser_comp = LocalProcessor(serialization_proc, component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self, text: str, *, event_id: str = None) -> ProcessingResult:
        """Runs the pipeline on a piece of text.

        A new event is created with a single ``'plaintext'`` document holding
        ``text``; the event context is exited before this method returns.

        Args:
            text: The document text to process.
            event_id: Identifier passed to the created event.

        Returns:
            Whatever ``self.pipeline.run`` returns for the document.
        """
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            f = self.pipeline.run(document)
        return f

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The client must be released even when closing the pipeline fails,
        # otherwise a self-created events client would leak.
        try:
            self.pipeline.close()
        finally:
            if self.close_client:
                self.events_client.close()
Пример #2
0
class DefaultPipeline:
    """Pipeline of remote processors with an optional serialization step.

    The remote processors are, in order: sentences, tagger, acronyms, concepts
    and negation. Instances work as context managers: leaving the ``with``
    block closes the pipeline and, when this object created the events client
    itself, the client as well.

    Attributes
        events_client: The events client used by the pipeline.
        pipeline: The pipeline used to process documents.
        close_client (bool): True when the events client was created here from
            ``conf.events_address`` and is therefore closed on exit.

    """
    def __init__(self,
                 conf: DefaultPipelineConf,
                 *,
                 events_client: EventsClient = None):
        """Builds the processing pipeline described by ``conf``.

        Args:
            conf: Supplies the processor identifiers and addresses, the events
                address, the discovery flag, the thread count and the
                serializer settings.
            events_client: An existing events client to use. When given, it is
                not closed by this object.

        Raises:
            ValueError: If neither ``events_client`` nor ``conf.events_address``
                is provided.
        """
        if events_client is not None:
            # Caller-owned client: use it, but leave closing it to the caller.
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        # (processor id, address) pairs in the order the processors are run.
        stages = [(conf.sentences_id, conf.sentences_address),
                  (conf.tagger_id, conf.tagger_address),
                  (conf.acronyms_id, conf.acronyms_address),
                  (conf.concepts_id, conf.concepts_address),
                  (conf.negation_id, conf.negation_address)]
        if conf.use_discovery:
            # Addresses are deliberately omitted when discovery is enabled.
            processors = [RemoteProcessor(identifier) for identifier, _ in stages]
        else:
            processors = [RemoteProcessor(identifier, address=addr)
                          for identifier, addr in stages]
        self.pipeline = Pipeline(*processors, n_threads=conf.threads)

        if conf.serializer is not None:
            serialization_proc = SerializationProcessor(
                get_serializer(conf.serializer),
                conf.output_directory,
                include_label_text=conf.include_label_text)
            ser_comp = LocalProcessor(serialization_proc,
                                      component_id='serializer',
                                      client=self.events_client)
            self.pipeline.append(ser_comp)

    def process_text(self,
                     text: str,
                     *,
                     event_id: str = None) -> ProcessingResult:
        """Runs the pipeline on a piece of text.

        A new event is created with a single ``'plaintext'`` document holding
        ``text``; the event context is exited before this method returns.

        Args:
            text: The document text to process.
            event_id: Identifier passed to the created event.

        Returns:
            Whatever ``self.pipeline.run`` returns for the document.
        """
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            f = self.pipeline.run(document)
        return f

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The client must be released even when closing the pipeline fails,
        # otherwise a self-created events client would leak.
        try:
            self.pipeline.close()
        finally:
            if self.close_client:
                self.events_client.close()
Пример #3
0
class DefaultPipeline:
    """The biomedicus default pipeline for processing clinical documents.

    This variant runs only the sentences and tagger processors. Instances work
    as context managers: leaving the ``with`` block closes the pipeline and,
    when this object created the events client itself, the client as well.

    Attributes
        events_client (mtap.EventsClient): An MTAP events client used by the pipeline.
        pipeline (mtap.Pipeline): An MTAP pipeline to use to process documents.
        close_client (bool): True when the events client was created here from
            ``conf.events_address`` and is therefore closed on exit.

    """
    def __init__(self,
                 conf: PipelineConf,
                 *,
                 events_client: EventsClient = None):
        """Builds the processing pipeline described by ``conf``.

        Args:
            conf: Supplies the processor identifiers and addresses, the events
                address and the discovery flag.
            events_client: An existing events client to use. When given, it is
                not closed by this object.

        Raises:
            ValueError: If neither ``events_client`` nor ``conf.events_address``
                is provided.
        """
        conf.populate_addresses()
        if events_client is not None:
            # Caller-owned client: use it, but leave closing it to the caller.
            self.close_client = False
            self.events_client = events_client
        elif conf.events_address is not None:
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            raise ValueError("Events client or address not specified.")

        # (processor id, address) pairs in the order the processors are run.
        stages = [(conf.sentences_id, conf.sentences_address),
                  (conf.tagger_id, conf.tagger_address)]
        if conf.use_discovery:
            # Addresses are deliberately omitted when discovery is enabled.
            processors = [RemoteProcessor(identifier) for identifier, _ in stages]
        else:
            processors = [RemoteProcessor(identifier, address=addr)
                          for identifier, addr in stages]
        self.pipeline = Pipeline(*processors)

    def process_text(self,
                     text: str,
                     *,
                     event_id: str = None) -> ProcessingResult:
        """Runs the pipeline on a piece of text.

        A new event is created with a single ``'plaintext'`` document holding
        ``text``; the event context is exited before this method returns.

        Args:
            text: The document text to process.
            event_id: Identifier passed to the created event.

        Returns:
            Whatever ``self.pipeline.run`` returns for the document.
        """
        with Event(event_id=event_id, client=self.events_client) as event:
            document = event.create_document('plaintext', text=text)
            f = self.pipeline.run(document)
        return f

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The client must be released even when closing the pipeline fails,
        # otherwise a self-created events client would leak.
        try:
            self.pipeline.close()
        finally:
            if self.close_client:
                self.events_client.close()
Пример #4
0
def test_modification_detector_performance(events_service, modification_detector_service,
                                           test_results):
    """Scores the negation processor on the pickled i2b2 2010 events.

    Every ``*.pickle`` file below ``$BIOMEDICUS_TEST_DATA/negation/i2b2_2010``
    is loaded as an event and its ``'plaintext'`` document is run through the
    remote ``biomedicus-negation`` processor followed by a local metrics
    processor. Overall precision/recall/F1 and mean timings are printed and
    stored in ``test_results['biomedicus-modification']``.
    """
    data_root = Path(os.environ['BIOMEDICUS_TEST_DATA'])
    input_dir = data_root / 'negation' / 'i2b2_2010'
    confusion = metrics.FirstTokenConfusion()
    metrics_processor = metrics.Metrics(confusion, tested='negated', target='i2b2concepts',
                                        target_filter=is_negated)
    with EventsClient(address=events_service) as client:
        negation = RemoteProcessor('biomedicus-negation', address=modification_detector_service,
                                   params={'terms_index': 'i2b2concepts'})
        scorer = LocalProcessor(metrics_processor, component_id='metrics', client=client)
        with Pipeline(negation, scorer) as pipeline:
            for test_file in input_dir.glob('**/*.pickle'):
                with PickleSerializer.file_to_event(test_file, client=client) as event:
                    results = pipeline.run(event.documents['plaintext'])
                    event_id = event.event_id
                    metrics_result = results.component_result('metrics')
                    f1 = metrics_result.result_dict['first_token_confusion']['f1']
                    negation_result = results.component_result('biomedicus-negation')
                    elapsed = negation_result.timing_info['process_method']
                    print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                        event_id, f1, elapsed))

            print('Overall Precision:', confusion.precision)
            print('Overall Recall:', confusion.recall)
            print('Overall F1:', confusion.f1)
            pipeline.print_times()
            timing_info = pipeline.processor_timer_stats('biomedicus-negation').timing_info
            summary = {}
            summary['Gold Standard'] = "2010 i2b2-VA"
            summary['Precision'] = confusion.precision
            summary['Recall'] = confusion.recall
            summary['F1'] = confusion.f1
            summary['Per-Document Mean Remote Call Duration'] = str(
                timing_info['remote_call'].mean)
            summary['Per-Document Mean Process Method Duration'] = str(
                timing_info['process_method'].mean)
            test_results['biomedicus-modification'] = summary
Пример #5
0
def run_rtf_to_text_pipeline(config: Namespace):
    """Runs the RTF-to-text pipeline, or writes out its default configuration.

    When ``config.write_config`` is set, the bundled
    ``rtf_to_text_pipeline.yml`` is copied into the current working directory
    and nothing else is done. Otherwise the pipeline is loaded from
    ``config.config`` (falling back to the bundled file), a local
    ``write_text`` processor writing into ``config.output_directory`` is
    appended, and every file under ``config.input_directory`` matching
    ``config.extension_glob`` is processed with ``run_multithread``.

    Args:
        config: Parsed arguments; the attributes read are ``write_config``,
            ``config``, ``workers``, ``output_directory``, ``input_directory``,
            ``extension_glob`` and ``max_failures``.
    """
    default_config = str(Path(__file__).parent / 'rtf_to_text_pipeline.yml')
    if config.write_config:
        print('Copying from "{}" to "{}"'.format(
            default_config, str(Path.cwd() / 'rtf_to_text_pipeline.yml')))
        shutil.copy2(default_config, 'rtf_to_text_pipeline.yml')
        return

    config_file = config.config
    if config_file is None:
        config_file = default_config

    workers = config.workers
    if workers is None:
        # os.cpu_count() returns None when the count cannot be determined;
        # treat that as a single CPU instead of failing on ``None // 2``.
        workers = max((os.cpu_count() or 1) // 2, 1)

    with Pipeline.from_yaml_file(config_file) as pipeline:
        pipeline += [
            LocalProcessor(WritePlaintext(Path(config.output_directory)),
                           component_id='write_text')
        ]

        input_directory = Path(config.input_directory)

        source = rtf_source(input_directory, config.extension_glob,
                            pipeline.events_client)
        # Count the matching files up front so the run knows the total.
        total = sum(1 for _ in input_directory.rglob(config.extension_glob))

        pipeline.run_multithread(source,
                                 workers=workers,
                                 total=total,
                                 max_failures=config.max_failures)
        pipeline.print_times()
Пример #6
0
def main(args=None):
    """Processes every ``*.txt`` file under INPUT_DIR and serializes the result.

    Each file becomes an event (event id = file stem) with a ``plaintext``
    document that is run through the sentences, TnT tagger, acronyms and
    concepts remote processors and a local JSON serialization processor
    writing to OUTPUT_DIR. Processor timings are printed at the end.

    Args:
        args: Optional argument list; passed straight to ``parse_args``.
    """
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    for option in ("--events", "--tagger", "--sentences", "--acronyms",
                   "--norms", "--concepts"):
        parser.add_argument(option)
    args = parser.parse_args(args)

    input_dir = Path(args.input_directory)
    with EventsClient(address=args.events) as client:
        processors = [
            RemoteProcessor('biomedicus-sentences', address=args.sentences),
            RemoteProcessor('biomedicus-tnt-tagger', address=args.tagger),
            RemoteProcessor('biomedicus-acronyms', address=args.acronyms),
            RemoteProcessor('biomedicus-concepts', address=args.concepts),
            LocalProcessor(
                SerializationProcessor(JsonSerializer, output_dir=args.output_directory),
                component_id='serialize',
                client=client),
        ]
        with Pipeline(*processors) as pipeline:
            for path in input_dir.glob("**/*.txt"):
                print("READING FILE:", str(path))
                contents = path.read_text()
                with Event(event_id=path.stem, client=client) as event:
                    pipeline.run(event.create_document("plaintext", text=contents))

            pipeline.print_times()
Пример #7
0
def main(args=None):
    """Feeds the articles of a GENIA XML file to the TnT trainer processor.

    For each ``article`` element an event is created whose id is the text of
    the first child of ``articleinfo``. The ``sentence`` elements of the title
    and of the abstract are added to a ``DocumentBuilder``, and the built
    document is run through the ``biomedicus-tnt-trainer`` remote processor.

    Args:
        args: Optional argument list; passed straight to ``parse_args``.
    """
    parser = ArgumentParser()
    parser.add_argument('input', metavar='INPUT_FILE',
                        help='The input GENIA XML file.')
    parser.add_argument('--events', metavar='EVENTS', default=None,
                        help='The address of the events service.')
    parser.add_argument('--tnt-trainer', metavar='TRAINER', default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)

    root = ElementTree.parse(args.input).getroot()
    with EventsClient(args.events) as client:
        trainer = RemoteProcessor('biomedicus-tnt-trainer', address=args.tnt_trainer)
        with Pipeline(trainer) as pipeline:
            for article in root.findall('article'):
                article_id = list(article.find('articleinfo'))[0].text
                with Event(article_id, client) as event:
                    builder = DocumentBuilder()
                    title_sentences = article.find('title').findall('sentence')
                    abstract_sentences = article.find('abstract').findall('sentence')
                    # Title sentences first, then the abstract's.
                    for sentence in title_sentences + abstract_sentences:
                        builder.add_sentence(sentence)
                    pipeline.run(builder.build_doc(event))
Пример #8
0
    def __init__(self,
                 conf_path: Union[str, Path],
                 output_directory: Union[str, Path],
                 *,
                 events_address: Optional[str] = None,
                 events_client: EventsClient = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        """Loads a pipeline from a YAML file and optionally adds a serializer.

        Args:
            conf_path: Path of the YAML pipeline configuration.
            output_directory: Directory handed to the serialization processor.
            events_address: Address for a newly created events client. The
                strings ``'None'``, ``'none'``, ``'null'`` and ``''`` are
                treated as no address.
            events_client: Existing events client to use instead of creating
                one; in that case ``close_client`` is set to False.
            serializer: Serializer name looked up with ``get_serializer``;
                ``None`` or the string ``'None'`` disables serialization.
            include_label_text: Forwarded to the serialization processor.
        """
        # Textual placeholders for "no address" collapse to None.
        if events_address in ('None', 'none', 'null', ''):
            events_address = None

        owns_client = events_client is None
        self.close_client = owns_client
        if owns_client:
            self.events_client = EventsClient(address=events_address)
        else:
            self.events_client = events_client

        self.pipeline = Pipeline.from_yaml_file(conf_path)

        # Nothing more to do when no serializer was requested.
        if serializer is None or serializer == 'None':
            return

        serialization_proc = SerializationProcessor(
            get_serializer(serializer),
            output_directory,
            include_label_text=include_label_text)
        self.pipeline.append(
            LocalProcessor(serialization_proc,
                           component_id='serializer',
                           client=self.events_client))
Пример #9
0
def test_serialization():
    """A pipeline survives a pickle round trip with its settings intact.

    Checks the name, events address, multiprocessing config and both remote
    processors on the unpickled copy.
    """
    expected_processors = [
        ('processor-1', 'localhost:1234'),
        ('processor-2', 'localhost:5678'),
    ]
    mp_config = MpConfig(
        max_failures=3,
        show_progress=False,
        workers=12,
        read_ahead=4,
        close_events=False
    )
    original = Pipeline(
        *[RemoteProcessor(processor_id=processor_id, address=address)
          for processor_id, address in expected_processors],
        name='mtap-test-pipeline',
        events_address='localhost:123',
        mp_config=mp_config,
    )

    restored = pickle.loads(pickle.dumps(original))

    assert restored.name == 'mtap-test-pipeline'
    assert restored.events_address == 'localhost:123'
    assert restored.mp_config.max_failures == 3
    assert not restored.mp_config.show_progress
    assert restored.mp_config.workers == 12
    assert restored.mp_config.read_ahead == 4
    assert not restored.mp_config.close_events
    assert len(restored) == 2
    for index, (processor_id, address) in enumerate(expected_processors):
        component = restored[index]
        assert component.processor_id == processor_id
        assert component.address == address
Пример #10
0
def test_tnt_performance(events_service, pos_tags_service, test_results):
    """Scores the TnT tagger on pickled events and requires accuracy > 0.9.

    Every ``*.pickle`` file below ``$BIOMEDICUS_TEST_DATA/pos_tags`` is loaded
    as an event whose ``'gold'`` document is run through the remote tagger and
    a local metrics processor comparing ``pos_tags`` against ``gold_tags``.
    Accuracy and mean timings are stored in ``test_results['TnT Pos Tagger']``.
    """
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'pos_tags'
    accuracy = Accuracy()
    with EventsClient(address=events_service) as client:
        tagger = RemoteProcessor(processor_id='biomedicus-tnt-tagger',
                                 address=pos_tags_service,
                                 params={'token_index': 'gold_tags'})
        scorer = LocalProcessor(
            Metrics(accuracy, tested='pos_tags', target='gold_tags'),
            component_id='metrics')
        with Pipeline(tagger, scorer, events_client=client) as pipeline:
            for test_file in input_dir.glob('**/*.pickle'):
                event = PickleSerializer.file_to_event(test_file, client=client)
                with event:
                    results = pipeline.run(event.documents['gold'])
                    metrics_result = results.component_result('metrics')
                    print('Accuracy for event - ', event.event_id, ':',
                          metrics_result.result_dict['accuracy'])

            print('Accuracy:', accuracy.value)
            pipeline.print_times()
            tagger_stats = pipeline.processor_timer_stats('biomedicus-tnt-tagger')
            timing_info = tagger_stats.timing_info
            test_results['TnT Pos Tagger'] = {
                'Accuracy': accuracy.value,
                'Remote Call Duration': str(timing_info['remote_call'].mean),
                'Process Method Duration': str(timing_info['process_method'].mean)
            }
            assert accuracy.value > 0.9
Пример #11
0
def test_dependencies(events_service, dependencies_service, test_results):
    """Measures UAS/LAS of the dependency parser on pickled test events.

    Every ``*.pickle`` file below ``$BIOMEDICUS_TEST_DATA/dependencies`` is
    loaded as an event and its ``'plaintext'`` document is run through the
    remote ``biomedicus-dependencies`` processor and a local metrics processor
    comparing ``dependencies`` against ``gold_dependencies``. The scores and
    mean timings are stored in ``test_results['biomedicus-dependencies']``.
    """
    test_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'dependencies'
    uas = Accuracy('UAS', equivalence_test=uas_equal)
    las = Accuracy('LAS', equivalence_test=las_equal)
    with EventsClient(address=events_service) as client:
        parser = RemoteProcessor(processor_id='biomedicus-dependencies',
                                 address=dependencies_service)
        scorer = LocalProcessor(
            Metrics(uas, las, tested='dependencies', target='gold_dependencies'),
            component_id='accuracy', client=client)
        with Pipeline(parser, scorer) as pipeline:
            for test_file in test_dir.glob('**/*.pickle'):
                with PickleSerializer.file_to_event(test_file, client=client) as event:
                    results = pipeline.run(event.documents['plaintext'])
                    accuracy_dict = results.component_result('accuracy').result_dict
                    document_uas = accuracy_dict['UAS']
                    document_las = accuracy_dict['LAS']
                    print('Results for document: UAS: {}. LAS: {}.'.format(
                        document_uas, document_las))

    # Summary is produced after the client and pipeline contexts have exited.
    print('UAS:', uas.value)
    print('LAS:', las.value)
    parser_stats = pipeline.processor_timer_stats('biomedicus-dependencies')
    timing_info = parser_stats.timing_info
    test_results['biomedicus-dependencies'] = {
        'UAS': uas.value,
        'LAS': las.value,
        'Corpus': "MiPACQ converted to UD from PTB test set",
        'Remote Call Duration': str(timing_info['remote_call'].mean),
        'Process Method Duration': str(timing_info['process_method'].mean)
    }
Пример #12
0
def test_concepts_performance(events_service, concepts_service, test_results):
    """Scores concept detection on pickled events and requires recall > 0.6.

    Recall is measured by testing ``umls_concepts`` against ``gold_concepts``;
    precision by the reverse comparison. Both compare the ``cui`` field in
    ``'any'`` mode. Scores and mean timings are stored in
    ``test_results['Concepts']``.
    """
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'concepts'
    recall = Accuracy(name='recall', mode='any', fields=['cui'])
    precision = Accuracy(name='precision', mode='any', fields=['cui'])
    with EventsClient(address=events_service) as client:
        processors = [
            RemoteProcessor(processor_id='biomedicus-concepts', address=concepts_service),
            LocalProcessor(Metrics(recall, tested='umls_concepts', target='gold_concepts'),
                           component_id='metrics'),
            LocalProcessor(Metrics(precision, tested='gold_concepts', target='umls_concepts'),
                           component_id='metrics_reverse'),
        ]
        with Pipeline(*processors, events_client=client) as pipeline:
            for test_file in input_dir.glob('**/*.pickle'):
                with PickleSerializer.file_to_event(test_file, client=client) as event:
                    pipeline.run(event.documents['plaintext'])

    # Summary is produced after the client and pipeline contexts have exited.
    print('Precision:', precision.value)
    print('Recall:', recall.value)
    concepts_stats = pipeline.processor_timer_stats('biomedicus-concepts')
    timing_info = concepts_stats.timing_info
    test_results['Concepts'] = {
        'Precision': precision.value,
        'Recall': recall.value,
        'Remote Call Duration': str(timing_info['remote_call'].mean),
        'Process Method Duration': str(timing_info['process_method'].mean)
    }
    assert recall.value > 0.6
Пример #13
0
def main(args=None):
    """Prints the sentences of a text file together with their dependencies.

    The file is run through the sentences and dependencies remote processors.
    For every sentence its text is printed, followed by one
    ``(text, deprel, head text)`` tuple per dependency inside the sentence;
    ``'ROOT'`` stands in for the head text when a dependency has no head.

    Args:
        args: Optional argument list; passed straight to ``parse_args``.
    """
    parser = ArgumentParser()
    for option in ('--events-service', '--sentences-service', '--dependencies-service'):
        parser.add_argument(option)
    parser.add_argument('input_file')
    conf = parser.parse_args(args)

    with EventsClient(address=conf.events_service) as client:
        processors = [
            RemoteProcessor('biomedicus-sentences', address=conf.sentences_service),
            RemoteProcessor('biomedicus-dependencies', address=conf.dependencies_service),
        ]
        with Pipeline(*processors) as pipeline:
            txt = Path(conf.input_file).read_text()
            with Event(event_id=Path(conf.input_file).name, client=client) as event:
                document = event.create_document('plaintext', txt)
                pipeline.run(document)
                for sentence in document.labels['sentences']:
                    print(sentence.text)
                    print('\n')
                    in_sentence = document.labels['dependencies'].inside(sentence)
                    for dependency in in_sentence:
                        word = dependency.text
                        relation = dependency.deprel
                        head = dependency.head
                        if head is not None:
                            head_text = head.text
                        else:
                            head_text = 'ROOT'
                        print((word, relation, head_text))
                    print('\n')
Пример #14
0
def test_sentence_performance(events_service, sentences_service, test_results):
    """Scores sentence detection on JSON events and requires F1 > 0.85.

    Every ``*.json`` file below ``$BIOMEDICUS_TEST_DATA/sentences`` is loaded
    as an event and its ``'plaintext'`` document is run through the remote
    sentences processor and a local metrics processor comparing ``sentences``
    against ``Sentence``. Scores and mean timings are stored in
    ``test_results['Sentences']``.
    """
    input_dir = Path(os.environ['BIOMEDICUS_TEST_DATA']) / 'sentences'

    confusion = metrics.FirstTokenConfusion()
    with EventsClient(address=events_service) as client:
        detector = RemoteProcessor(processor_id='biomedicus-sentences',
                                   address=sentences_service)
        scorer = LocalProcessor(
            metrics.Metrics(confusion, tested='sentences', target='Sentence'),
            component_id='metrics',
            client=client)
        with Pipeline(detector, scorer) as pipeline:
            for test_file in input_dir.glob('**/*.json'):
                with JsonSerializer.file_to_event(test_file, client=client) as event:
                    results = pipeline.run(event.documents['plaintext'])
                    event_id = event.event_id
                    # Index 1 is the metrics component, index 0 the remote processor.
                    f1 = results[1].results['first_token_confusion']['f1']
                    elapsed = results[0].timing_info['process_method']
                    print('F1 for event - "{}": {:0.3f} - elapsed: {}'.format(
                        event_id, f1, elapsed))

            print('Overall Precision:', confusion.precision)
            print('Overall Recall:', confusion.recall)
            print('Overall F1:', confusion.f1)
            pipeline.print_times()
            timing_info = pipeline.processor_timer_stats()[0].timing_info
            summary = {}
            summary['Precision'] = confusion.precision
            summary['Recall'] = confusion.recall
            summary['F1'] = confusion.f1
            summary['Remote Call Duration'] = str(timing_info['remote_call'].mean)
            summary['Process Method Duration'] = str(timing_info['process_method'].mean)
            test_results['Sentences'] = summary
            assert confusion.f1 > 0.85
Пример #15
0
def test_java_references(python_events, java_references_processor):
    """Checks the reference labels produced by the Java example processor.

    A four-character document ``'abcd'`` is processed and the ``references``,
    ``map_references`` and ``list_references`` label indices are compared with
    labels covering the single-character spans (0, 1) through (3, 4).
    """
    with EventsClient(address=python_events) as client:
        processor = RemoteProcessor('mtap-java-reference-labels-example-processor',
                                    address=java_references_processor)
        with Pipeline(processor) as pipeline:
            with Event(event_id='1', client=client) as event:
                document = event.create_document('plaintext', 'abcd')
                pipeline.run(document)

                references = document.labels['references']
                first, second = references[0], references[1]
                assert first.a == GenericLabel(0, 1)
                assert first.b == GenericLabel(1, 2)
                assert second.a == GenericLabel(2, 3)
                assert second.b == GenericLabel(3, 4)

                map_references = document.labels['map_references']
                expected_map = {}
                expected_map['a'] = GenericLabel(0, 1)
                expected_map['b'] = GenericLabel(1, 2)
                expected_map['c'] = GenericLabel(2, 3)
                expected_map['d'] = GenericLabel(3, 4)
                assert map_references[0].ref == expected_map

                list_references = document.labels['list_references']
                expected_lists = [
                    [GenericLabel(0, 1), GenericLabel(1, 2)],
                    [GenericLabel(2, 3), GenericLabel(3, 4)],
                ]
                assert list_references[0].ref == expected_lists[0]
                assert list_references[1].ref == expected_lists[1]
Пример #16
0
def test_time_result():
    """The first component result reports at least 1 ms of process time."""
    local = LocalProcessor(Processor(), component_id='test_processor', client=None)
    with Pipeline(local) as pipeline:
        first_result = pipeline.run(Event())[0]
        minimum = timedelta(seconds=0.001)
        assert first_result.timing_info['process_method'] >= minimum
def test_acronyms_performance(events_service, acronyms_service, test_results):
    """Scores acronym detection and expansion on JSON test events.

    Every ``*.json`` file below ``$BIOMEDICUS_PHI_TEST_DATA/acronyms`` is
    loaded as an event and its ``'plaintext'`` document is run through the
    remote acronyms processor and three local metrics processors. Requires
    top-sense accuracy > 0.4, any-sense accuracy > 0.4 and detection
    recall > 0.65; detection precision is reported but not asserted. Results
    are stored in ``test_results['acronyms']``.
    """
    input_dir = Path(os.environ['BIOMEDICUS_PHI_TEST_DATA']) / 'acronyms'
    expansion = ['expansion']
    top_score_accuracy = Accuracy(name='top_score_accuracy', fields=expansion)
    any_accuracy = Accuracy(name='any_accuracy', mode='any', fields=expansion)
    detection_recall = Accuracy(name='detection_recall', mode='location',
                                fields=expansion)
    detection_precision = Accuracy(name='detection_precision', mode='location',
                                   fields=expansion)
    with EventsClient(address=events_service) as client:
        processors = [
            RemoteProcessor(processor_id='biomedicus-acronyms',
                            address=acronyms_service),
            LocalProcessor(
                Metrics(top_score_accuracy, detection_recall,
                        tested='acronyms', target='gold_acronyms'),
                component_id='top_score_metrics',
                client=client),
            # Tested and target are swapped here to obtain precision.
            LocalProcessor(
                Metrics(detection_precision,
                        tested='gold_acronyms', target='acronyms'),
                component_id='top_score_reverse',
                client=client),
            LocalProcessor(
                Metrics(any_accuracy,
                        tested='all_acronym_senses', target='gold_acronyms'),
                component_id='all_senses_metrics',
                client=client),
        ]
        with Pipeline(*processors) as pipeline:
            for test_file in input_dir.glob('**/*.json'):
                with JsonSerializer.file_to_event(test_file, client=client) as event:
                    pipeline.run(event.documents['plaintext'])

            print('Top Sense Accuracy:', top_score_accuracy.value)
            print('Any Sense Accuracy:', any_accuracy.value)
            print('Detection Recall:', detection_recall.value)
            print('Detection Precision:', detection_precision.value)
            pipeline.print_times()
            acronym_stats = pipeline.processor_timer_stats('biomedicus-acronyms')
            timing_info = acronym_stats.timing_info
            summary = {}
            summary['Top sense accuracy'] = top_score_accuracy.value
            summary['Any sense accuracy'] = any_accuracy.value
            summary['Detection Recall'] = detection_recall.value
            summary['Detection Precision'] = detection_precision.value
            summary['Remote Call Duration'] = str(timing_info['remote_call'].mean)
            summary['Process Method Duration'] = str(timing_info['process_method'].mean)
            test_results['acronyms'] = summary
            assert top_score_accuracy.value > 0.4
            assert any_accuracy.value > 0.4
            assert detection_recall.value > 0.65
Пример #18
0
def main(args=None):
    """Attaches gold concepts to source documents and serializes the events.

    The concepts CSV is read into a mapping from document identifier (column
    6) to ``(start, end, cui)`` tuples taken from columns 1, 0 and 5. Every
    ``*.source`` file under INPUT_DIR whose identifier (the file stem up to
    the first ``'-'``) has concepts gets a ``gold_concepts`` label index, is
    run through the sentences, TnT tagger and acronyms remote processors, and
    is pickled into OUTPUT_DIR. Files without concepts are skipped.

    Args:
        args: Optional argument list; passed straight to ``parse_args``.
    """
    parser = ArgumentParser()
    parser.add_argument("input_directory", metavar="INPUT_DIR")
    parser.add_argument("concepts_csv", metavar="PATH_TO_CONCEPTS_CSV")
    parser.add_argument("output_directory", metavar="OUTPUT_DIR")
    for option in ("--sentences", "--tagger", "--acronyms", "--events"):
        parser.add_argument(option)

    ns = parser.parse_args(args)

    print('Reading concepts csv...')
    concepts = {}
    with open(ns.concepts_csv, 'r') as f:
        for line in f:
            fields = line.split(',')
            # The end offset precedes the start offset in the file.
            end, start = fields[0], fields[1]
            cui, identifier = fields[5], fields[6]
            # NOTE(review): start/end stay strings exactly as read from the
            # CSV - confirm the labeler accepts them without int conversion.
            concepts.setdefault(identifier, []).append((start, end, cui))

    print('Reading mipacq source files...')
    with EventsClient(address=ns.events) as client:
        processors = [
            RemoteProcessor('biomedicus-sentences', address=ns.sentences),
            RemoteProcessor('biomedicus-tnt-tagger', address=ns.tagger),
            RemoteProcessor('biomedicus-acronyms', address=ns.acronyms),
            LocalProcessor(
                SerializationProcessor(PickleSerializer, output_dir=ns.output_directory),
                component_id='serialize',
                client=client),
        ]
        with Pipeline(*processors) as pipeline:
            for path in Path(ns.input_directory).glob('**/*.source'):
                identifier = path.stem.split('-')[0]
                doc_concepts = concepts.get(identifier)
                if doc_concepts is None:
                    # No gold concepts for this document.
                    continue
                with Event(event_id=identifier, client=client) as event:
                    text = path.read_text()
                    document = event.create_document('plaintext', text)
                    with document.get_labeler('gold_concepts') as label_concept:
                        for start, end, cui in doc_concepts:
                            label_concept(start, end, cui=cui)
                    pipeline.run(document)
Пример #19
0
    def __init__(self,
                 conf: PipelineConf,
                 *,
                 events_client: EventsClient = None):
        """Builds the processing pipeline described by ``conf``.

        Args:
            conf: Supplies the processor identifiers and addresses, the events
                address, the discovery flag and the serializer settings.
            events_client: An existing events client to use; when given,
                ``close_client`` is set to False.

        Raises:
            ValueError: If neither ``events_client`` nor ``conf.events_address``
                is provided.
        """
        conf.populate_addresses()
        if events_client is None:
            if conf.events_address is None:
                raise ValueError("Events client or address not specified.")
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            self.close_client = False
            self.events_client = events_client

        # (processor id, address) pairs in the order the processors are run.
        stages = [
            (conf.sentences_id, conf.sentences_address),
            (conf.section_headers_id, conf.section_headers_address),
            (conf.tagger_id, conf.tagger_address),
            (conf.acronyms_id, conf.acronyms_address),
            (conf.concepts_id, conf.concepts_address),
            (conf.negation_id, conf.negation_address),
            (conf.selective_dependencies_id, conf.selective_dependencies_address),
            (conf.deepen_id, conf.deepen_address),
        ]
        if conf.use_discovery:
            # Addresses are left out when discovery is enabled.
            processors = [RemoteProcessor(identifier) for identifier, _ in stages]
        else:
            processors = [RemoteProcessor(identifier, address=addr)
                          for identifier, addr in stages]
        self.pipeline = Pipeline(*processors)

        if conf.serializer is None:
            return
        serialization_proc = SerializationProcessor(
            get_serializer(conf.serializer),
            conf.output_directory,
            include_label_text=conf.include_label_text)
        self.pipeline.append(
            LocalProcessor(serialization_proc,
                           component_id='serializer',
                           client=self.events_client))
Пример #20
0
def test_time_result(mocker):
    """Run one event through a single local processor and check its timing.

    The 'process_method' timing of the first component result must be at
    least one millisecond.
    """
    events = mocker.Mock(EventsClient)
    events.get_local_instance.return_value = events
    events.get_all_document_names.return_value = ['plaintext']
    events.get_all_metadata.return_value = {}
    events.instance_id = 0

    minimum = timedelta(seconds=0.001)
    component = LocalProcessor(Processor(), component_id='test_processor')
    with Pipeline(component, events_client=events) as pipeline:
        outcome = pipeline.run(Event())
        elapsed = outcome.component_results[0].timing_info['process_method']
        assert elapsed >= minimum
Пример #21
0
def test_load_from_config():
    """Load ``pipeline.yml`` from this file's directory and verify its fields."""
    config_file = Path(__file__).parent / 'pipeline.yml'
    loaded = Pipeline.from_yaml_file(config_file)

    # Top-level pipeline settings.
    assert loaded.name == 'mtap-test-pipeline'
    assert loaded.events_address == 'localhost:123'

    # Multiprocessing settings.
    mp_config = loaded.mp_config
    assert mp_config.max_failures == 3
    assert not mp_config.show_progress
    assert mp_config.workers == 12
    assert mp_config.read_ahead == 4
    assert not mp_config.close_events

    # Configured components, in order.
    expected_components = [
        ('processor-1', 'localhost:1234'),
        ('processor-2', 'localhost:5678'),
    ]
    assert len(loaded) == 2
    for position, (processor_id, address) in enumerate(expected_components):
        component = loaded[position]
        assert component.processor_id == processor_id
        assert component.address == address
Пример #22
0
    def __init__(self,
                 conf: PipelineConf,
                 *,
                 events_client: EventsClient = None):
        """Assemble a two-processor pipeline (sentences, tagger) from ``conf``.

        Args:
            conf: Pipeline configuration. ``populate_addresses()`` is called on
                it before any identifier or address is read.
            events_client: Events client to use. When given, ``close_client``
                is set to ``False``. Otherwise a client is created from
                ``conf.events_address`` and ``close_client`` is set to
                ``True``.

        Raises:
            ValueError: If neither ``events_client`` nor
                ``conf.events_address`` is available.
        """
        conf.populate_addresses()
        if events_client is None:
            if conf.events_address is None:
                raise ValueError("Events client or address not specified.")
            self.close_client = True
            self.events_client = EventsClient(address=conf.events_address)
        else:
            self.close_client = False
            self.events_client = events_client

        # (identifier, address) pairs, in the order they are handed to Pipeline.
        components = (
            (conf.sentences_id, conf.sentences_address),
            (conf.tagger_id, conf.tagger_address),
        )
        if conf.use_discovery:
            # With discovery enabled only the identifier is passed along.
            processors = [RemoteProcessor(proc_id) for proc_id, _ in components]
        else:
            processors = [
                RemoteProcessor(proc_id, address=proc_address)
                for proc_id, proc_address in components
            ]
        self.pipeline = Pipeline(*processors)
Пример #23
0
def run_themes_pipeline(input_directory, annotations_directory,
                        output_directory):
    """Run the palliative-themes pipeline over the files in a directory.

    The pipeline consists of the remote 'biomedicus-sentences' processor at
    localhost:50300 followed by three local processors:
    AttachPalliativeThemesProcessor, CoalescePalliativeThemesProcessor and a
    SerializationProcessor using JsonSerializer.

    Args:
        input_directory: Passed to FilesInDirectoryProcessingSource as the
            directory to read from.
        annotations_directory: Passed to AttachPalliativeThemesProcessor.
        output_directory: Passed to SerializationProcessor.
    """
    events_address = 'localhost:50100'
    components = [
        RemoteProcessor('biomedicus-sentences', address='localhost:50300'),
        LocalProcessor(AttachPalliativeThemesProcessor(annotations_directory)),
        LocalProcessor(CoalescePalliativeThemesProcessor()),
        LocalProcessor(SerializationProcessor(JsonSerializer, output_directory)),
    ]
    with Pipeline(*components, events_address=events_address) as pipeline:
        files = FilesInDirectoryProcessingSource(pipeline.events_client,
                                                 input_directory)
        pipeline.run_multithread(files, workers=8)
Пример #24
0
def test_run_concurrently(mocker):
    """Push ten events through three local processors with run_multithread."""
    events_client = mocker.Mock(EventsClient)
    events_client.get_local_instance.return_value = events_client
    events_client.get_all_document_names.return_value = ['plaintext']
    events_client.get_all_metadata.return_value = {}
    events_client.instance_id = 0

    components = [
        LocalProcessor(Processor(str(number)), component_id=f'processor{number}')
        for number in (1, 2, 3)
    ]
    with Pipeline(*components, events_client=events_client) as pipeline:
        pipeline.events_client = events_client
        batch = [Event() for _ in range(10)]
        pipeline.run_multithread(batch, show_progress=False)
Пример #25
0
def test_run_multi(mocker):
    """Each of ten events must yield a result of length three."""
    events_client = mocker.Mock(EventsClient)
    events_client.get_all_document_names.return_value = ['plaintext']
    events_client.get_all_metadata.return_value = {}

    # Build all processors first, then wrap each one in a local component.
    numbers = (1, 2, 3)
    processors = [Processor(str(number)) for number in numbers]
    components = [
        LocalProcessor(processor,
                       component_id=f'processor{number}',
                       client=events_client)
        for number, processor in zip(numbers, processors)
    ]
    with Pipeline(*components) as pipeline:
        batch = [Event() for _ in range(10)]
        outcomes = pipeline.run_multithread(batch, progress=False)
        for outcome in outcomes:
            assert len(outcome) == 3
Пример #26
0
def test_run_concurrently_with_failure(mocker):
    """run_multithread must raise ValueError for this batch with max_failures=2.

    The batch holds seven events with ids '0'..'6' followed by four events
    with ids 'fail_0'..'fail_3'.
    """
    events_client = mocker.Mock(EventsClient)
    events_client.get_local_instance.return_value = events_client
    events_client.get_all_document_names.return_value = ['plaintext']
    events_client.get_all_metadata.return_value = {}
    events_client.instance_id = 0

    components = [
        LocalProcessor(Processor(str(number)), component_id=f'processor{number}')
        for number in (1, 2, 3)
    ]
    with Pipeline(*components, events_client=events_client) as pipeline:
        event_ids = [str(i) for i in range(7)]
        event_ids += ['fail_' + str(i) for i in range(4)]
        batch = [
            Event(event_id=event_id, client=events_client)
            for event_id in event_ids
        ]
        with pytest.raises(ValueError) as exc_info:
            pipeline.run_multithread(batch, show_progress=False, max_failures=2)
Пример #27
0
def main(args=None):
    """Feed PTB files through the 'ptb-reader' and 'biomedicus-tnt-trainer' processors.

    Every file under the input folder matching the glob (searched recursively)
    is read, stored as the source document of a new event, and run through the
    two-processor pipeline.

    Args:
        args: Command-line arguments to parse. ``None`` lets argparse fall
            back to ``sys.argv``.
    """
    parser = ArgumentParser()
    parser.add_argument('input',
                        metavar='INPUT_FOLDER',
                        help='A folder containing PTB formatted documents.')
    parser.add_argument('--glob', metavar='GLOB', default='*.mrg')
    parser.add_argument('--source-name',
                        metavar='DOCUMENT_NAME',
                        default='source',
                        help='What document to dump the PTB text into.')
    # The original help text was missing its verb ("to the plaintext ...").
    parser.add_argument(
        '--target-name',
        metavar='DOCUMENT_NAME',
        default='plaintext',
        help='What document to write the plaintext and annotations into.')
    parser.add_argument('--events',
                        metavar='EVENTS',
                        default=None,
                        help='The address of the events service.')
    parser.add_argument('--ptb-reader',
                        metavar='READER',
                        default=None,
                        help='The address of the PTB Reader.')
    parser.add_argument('--tnt-trainer',
                        metavar='TRAINER',
                        default=None,
                        help='The address of the TnT trainer.')
    args = parser.parse_args(args)
    with EventsClient(address=args.events) as client, Pipeline(
            RemoteProcessor('ptb-reader',
                            address=args.ptb_reader,
                            params={
                                'source_document_name': args.source_name,
                                'target_document_name': args.target_name
                            }),
            RemoteProcessor('biomedicus-tnt-trainer',
                            address=args.tnt_trainer,
                            params={'document_name':
                                    args.target_name})) as pipeline:
        for f in Path(args.input).rglob(args.glob):
            print('Reading:', f)
            # read_text() opens in text mode, reads everything and closes the
            # file, exactly like the previous open('r')/read() pair.
            text = f.read_text()
            # NOTE(review): the event id is the bare file name, so same-named
            # files in different sub-folders share an id -- confirm this is
            # acceptable for the events service.
            with Event(event_id=f.name, client=client) as event:
                d = Document(args.source_name, text=text)
                event.add_document(d)
                pipeline.run(event)
Пример #28
0
def test_normalization(events_service, normalization_processor):
    """Run the normalizer on a pickled event and spot-check four norm forms.

    Labels in the 'norm_forms' index whose text is one of the known words
    must carry the expected normalized form; other labels are not checked.
    """
    expected_norms = {
        "according": "accord",
        "expressing": "express",
        "receiving": "receive",
        "days": "day",
    }
    event_file = Path(__file__).parent / '97_95.pickle'
    with EventsClient(address=events_service) as client, \
            Pipeline(RemoteProcessor(processor_id='biomedicus_normalizer',
                                     address=normalization_processor)) as pipeline, \
            PickleSerializer.file_to_event(event_file, client=client) as event:
        document = event.documents['plaintext']
        pipeline.run(document)
        for label in document.get_label_index('norm_forms'):
            expected = expected_norms.get(label.text)
            if expected is not None:
                assert label.norm == expected
Пример #29
0
def main(args=None):
    """Detect sentences in text read from stdin and print them with timings.

    Args:
        args: Command-line arguments to parse. ``None`` lets argparse fall
            back to ``sys.argv``.
    """
    parser = ArgumentParser()
    service_defaults = (
        ('--events-service', 'localhost:10100'),
        ('--sentences-service', 'localhost:10102'),
    )
    for flag, default_address in service_defaults:
        parser.add_argument(flag, default=default_address)
    conf = parser.parse_args(args)

    sentences_processor = RemoteProcessor('biomedicus-sentences',
                                          address=conf.sentences_service)
    with Pipeline(sentences_processor) as pipeline, \
            EventsClient(address=conf.events_service) as events_client:
        text = sys.stdin.read()
        with Event(client=events_client) as event:
            doc = event.create_document('plaintext', text)
            result = pipeline.run(doc)
            for found in doc.get_label_index('sentences'):
                print('S: "', found.text, '"')
            # Timing entries of the first (only) pipeline component.
            for timer_name, duration in result[0].timing_info.items():
                print(f'{timer_name}: {duration}')
Пример #30
0
    def __init__(self,
                 conf_path: Union[str, Path],
                 output_directory: Union[str, Path],
                 *,
                 events_addresses: Optional[str] = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        """Load a pipeline from a YAML file and optionally add a serializer.

        Args:
            conf_path: Path of the YAML file handed to
                ``Pipeline.from_yaml_file``.
            output_directory: Passed to the serialization processor.
            events_addresses: When not ``None``, assigned to the loaded
                pipeline's ``events_address``.
            serializer: Name passed to ``get_serializer``. ``None`` or the
                literal string ``'None'`` means no serializer is appended.
            include_label_text: Forwarded to the serialization processor.
        """
        self.pipeline = Pipeline.from_yaml_file(conf_path)
        if events_addresses is not None:
            self.pipeline.events_address = events_addresses

        # The literal string 'None' is treated the same as no serializer.
        serializer_name = None if serializer == 'None' else serializer
        if serializer_name is not None:
            writer = SerializationProcessor(
                get_serializer(serializer_name),
                output_directory,
                include_label_text=include_label_text)
            writer_component = LocalProcessor(writer,
                                              component_id='serializer')
            self.pipeline.append(writer_component)