예제 #1
0
    def __init__(self,
                 conf_path: Union[str, Path],
                 output_directory: Union[str, Path],
                 *,
                 events_address: Optional[str] = None,
                 events_client: EventsClient = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        """Load a pipeline from a YAML file and optionally add a serializer.

        Args:
            conf_path: Path of the YAML file handed to
                ``Pipeline.from_yaml_file``.
            output_directory: Passed to ``SerializationProcessor``;
                presumably where serialized output is written (confirm
                against that class).
            events_address: Address given to a newly created
                ``EventsClient`` when no ``events_client`` is supplied. The
                strings 'None', 'none', 'null' and '' are treated as ``None``.
            events_client: Existing client to use. When given,
                ``close_client`` is set to ``False``; otherwise a client is
                created here and ``close_client`` is set to ``True``.
            serializer: Name passed to ``get_serializer``. ``None`` or the
                string 'None' means no serializer component is appended.
            include_label_text: Forwarded to ``SerializationProcessor``.
        """
        # Textual placeholders for "no address" are folded into a real None.
        if events_address in ('None', 'none', 'null', ''):
            events_address = None

        # A client is only created here when the caller did not supply one;
        # close_client records that fact.
        self.close_client = events_client is None
        if self.close_client:
            events_client = EventsClient(address=events_address)
        self.events_client = events_client

        self.pipeline = Pipeline.from_yaml_file(conf_path)

        # The literal string 'None' is treated the same as a missing value.
        serializer = None if serializer == 'None' else serializer
        if serializer is not None:
            writer = SerializationProcessor(
                get_serializer(serializer),
                output_directory,
                include_label_text=include_label_text,
            )
            self.pipeline.append(
                LocalProcessor(writer,
                               component_id='serializer',
                               client=self.events_client)
            )
예제 #2
0
def run_rtf_to_text_pipeline(config: Namespace):
    """Run the RTF-to-text pipeline using parsed command-line options.

    Attributes read from ``config``:

    * ``write_config``: when truthy, copy the default
      ``rtf_to_text_pipeline.yml`` (located next to this module) into the
      current working directory and return without running anything.
    * ``config``: path of the pipeline YAML file; the default file is used
      when this is ``None``.
    * ``workers``: worker count for ``run_multithread``; when ``None``, half
      of the detected CPU count is used, with a minimum of 1.
    * ``input_directory`` / ``extension_glob``: directory and glob pattern
      passed to ``rtf_source`` and used to count the matching paths.
    * ``output_directory``: passed to ``WritePlaintext``.
    * ``max_failures``: forwarded to ``run_multithread``.

    Args:
        config: Parsed arguments carrying the attributes listed above.
    """
    config_name = 'rtf_to_text_pipeline.yml'
    default_config = str(Path(__file__).parent / config_name)
    if config.write_config:
        # Compute the destination once so the message and the copy agree.
        destination = str(Path.cwd() / config_name)
        print('Copying from "{}" to "{}"'.format(default_config, destination))
        shutil.copy2(default_config, destination)
        return

    config_file = config.config
    if config_file is None:
        config_file = default_config

    workers = config.workers
    if workers is None:
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to 1 so the integer division cannot raise TypeError.
        workers = max((os.cpu_count() or 1) // 2, 1)

    with Pipeline.from_yaml_file(config_file) as pipeline:
        # Append the plaintext writer as the last pipeline component.
        pipeline += [
            LocalProcessor(WritePlaintext(Path(config.output_directory)),
                           component_id='write_text')
        ]

        input_directory = Path(config.input_directory)

        source = rtf_source(input_directory, config.extension_glob,
                            pipeline.events_client)
        # Count the matching paths up front; the result is passed as `total`.
        total = sum(1 for _ in input_directory.rglob(config.extension_glob))

        pipeline.run_multithread(source,
                                 workers=workers,
                                 total=total,
                                 max_failures=config.max_failures)
        pipeline.print_times()
예제 #3
0
def test_load_from_config():
    """Check the values of a pipeline loaded from ``pipeline.yml``.

    The YAML file is resolved relative to this test module.
    """
    config_path = Path(__file__).parent / 'pipeline.yml'
    pipeline = Pipeline.from_yaml_file(config_path)

    # Top-level pipeline settings.
    assert pipeline.name == 'mtap-test-pipeline'
    assert pipeline.events_address == 'localhost:123'

    # Multiprocessing settings.
    mp_config = pipeline.mp_config
    assert mp_config.max_failures == 3
    assert not mp_config.show_progress
    assert mp_config.workers == 12
    assert mp_config.read_ahead == 4
    assert not mp_config.close_events

    # Components, in order: (processor_id, address).
    expected_components = [
        ('processor-1', 'localhost:1234'),
        ('processor-2', 'localhost:5678'),
    ]
    assert len(pipeline) == len(expected_components)
    for index, (processor_id, address) in enumerate(expected_components):
        component = pipeline[index]
        assert component.processor_id == processor_id
        assert component.address == address
예제 #4
0
    def __init__(self,
                 conf_path: Union[str, Path],
                 output_directory: Union[str, Path],
                 *,
                 events_addresses: Optional[str] = None,
                 serializer: Optional[str] = None,
                 include_label_text: bool = False):
        """Load a pipeline from a YAML file and optionally add a serializer.

        Args:
            conf_path: Path of the YAML file handed to
                ``Pipeline.from_yaml_file``.
            output_directory: Passed to ``SerializationProcessor``;
                presumably where serialized output is written (confirm
                against that class).
            events_addresses: When not ``None``, assigned to the loaded
                pipeline's ``events_address`` attribute.
            serializer: Name passed to ``get_serializer``. ``None`` or the
                string 'None' means no serializer component is appended.
            include_label_text: Forwarded to ``SerializationProcessor``.
        """
        self.pipeline = Pipeline.from_yaml_file(conf_path)
        # Only overwrite the pipeline's events address when one was supplied.
        if events_addresses is not None:
            self.pipeline.events_address = events_addresses

        # The literal string 'None' is treated the same as a missing value.
        serializer = None if serializer == 'None' else serializer
        if serializer is not None:
            writer = SerializationProcessor(
                get_serializer(serializer),
                output_directory,
                include_label_text=include_label_text,
            )
            self.pipeline.append(
                LocalProcessor(writer, component_id='serializer')
            )