def __init__(self,
             conf_path: Union[str, Path],
             output_directory: Union[str, Path],
             *,
             events_address: Optional[str] = None,
             events_client: EventsClient = None,
             serializer: Optional[str] = None,
             include_label_text: bool = False):
    """Load a pipeline from YAML and optionally append a serializer step.

    Args:
        conf_path: Path of the YAML file handed to
            ``Pipeline.from_yaml_file``.
        output_directory: Passed to the ``SerializationProcessor`` when a
            serializer is requested.
        events_address: Address used to build an ``EventsClient`` when no
            client is supplied. The strings ``'None'``, ``'none'``,
            ``'null'`` and ``''`` are treated the same as ``None``.
        events_client: Existing client to use instead of creating one.
        serializer: Name passed to ``get_serializer``. ``None`` or the
            string ``'None'`` means no serializer component is added.
        include_label_text: Forwarded to the ``SerializationProcessor``.
    """
    # Placeholder strings all collapse to "no address".
    if events_address in ('None', 'none', 'null', ''):
        events_address = None

    # close_client is True only when the client is constructed here.
    # NOTE(review): presumably consulted on shutdown -- confirm in the class.
    self.close_client = events_client is None
    if self.close_client:
        events_client = EventsClient(address=events_address)
    self.events_client = events_client

    self.pipeline = Pipeline.from_yaml_file(conf_path)

    if serializer not in (None, 'None'):
        writer = SerializationProcessor(
            get_serializer(serializer),
            output_directory,
            include_label_text=include_label_text,
        )
        self.pipeline.append(
            LocalProcessor(writer,
                           component_id='serializer',
                           client=self.events_client)
        )
def run_rtf_to_text_pipeline(config: Namespace):
    """Run the RTF-to-text pipeline, or write out its default configuration.

    Args:
        config: Parsed arguments. The attributes read here are
            ``write_config``, ``config``, ``workers``, ``output_directory``,
            ``input_directory``, ``extension_glob`` and ``max_failures``.

    When ``config.write_config`` is truthy the bundled
    ``rtf_to_text_pipeline.yml`` is copied into the current working directory
    and nothing else happens. Otherwise the pipeline is loaded from
    ``config.config`` (falling back to the bundled file), a ``WritePlaintext``
    component is added, and every file under ``config.input_directory``
    matching ``config.extension_glob`` is processed.
    """
    config_name = 'rtf_to_text_pipeline.yml'
    default_config = str(Path(__file__).parent / config_name)

    if config.write_config:
        # One destination value feeds both the message and the copy so the
        # two can never disagree.
        destination = Path.cwd() / config_name
        print('Copying from "{}" to "{}"'.format(default_config,
                                                 str(destination)))
        shutil.copy2(default_config, str(destination))
        return

    config_file = config.config
    if config_file is None:
        config_file = default_config

    workers = config.workers
    if workers is None:
        # os.cpu_count() returns None when the CPU count cannot be
        # determined; fall back to 1 instead of failing on ``None // 2``.
        workers = max((os.cpu_count() or 1) // 2, 1)

    with Pipeline.from_yaml_file(config_file) as pipeline:
        pipeline += [
            LocalProcessor(WritePlaintext(Path(config.output_directory)),
                           component_id='write_text')
        ]

        input_directory = Path(config.input_directory)
        source = rtf_source(input_directory, config.extension_glob,
                            pipeline.events_client)
        # Count the matching files up front; passed to run_multithread as
        # ``total``.
        total = sum(1 for _ in input_directory.rglob(config.extension_glob))

        pipeline.run_multithread(source, workers=workers, total=total,
                                 max_failures=config.max_failures)
        pipeline.print_times()
def test_load_from_config():
    """Check every setting read from the ``pipeline.yml`` next to this file."""
    yaml_path = Path(__file__).parent / 'pipeline.yml'
    pipeline = Pipeline.from_yaml_file(yaml_path)

    # Top-level pipeline settings.
    assert pipeline.name == 'mtap-test-pipeline'
    assert pipeline.events_address == 'localhost:123'

    # Multiprocessing configuration.
    assert pipeline.mp_config.max_failures == 3
    assert not pipeline.mp_config.show_progress
    assert pipeline.mp_config.workers == 12
    assert pipeline.mp_config.read_ahead == 4
    assert not pipeline.mp_config.close_events

    # Components, in declaration order.
    expected_components = (
        ('processor-1', 'localhost:1234'),
        ('processor-2', 'localhost:5678'),
    )
    assert len(pipeline) == 2
    for position, (processor_id, address) in enumerate(expected_components):
        assert pipeline[position].processor_id == processor_id
        assert pipeline[position].address == address
def __init__(self,
             conf_path: Union[str, Path],
             output_directory: Union[str, Path],
             *,
             events_addresses: Optional[str] = None,
             serializer: Optional[str] = None,
             include_label_text: bool = False):
    """Load a pipeline from YAML and optionally append a serializer step.

    Args:
        conf_path: Path of the YAML file handed to
            ``Pipeline.from_yaml_file``.
        output_directory: Passed to the ``SerializationProcessor`` when a
            serializer is requested.
        events_addresses: When not ``None``, assigned to the loaded
            pipeline's ``events_address`` attribute.
        serializer: Name passed to ``get_serializer``. ``None`` or the
            string ``'None'`` means no serializer component is added.
        include_label_text: Forwarded to the ``SerializationProcessor``.
    """
    self.pipeline = Pipeline.from_yaml_file(conf_path)

    if events_addresses is not None:
        self.pipeline.events_address = events_addresses

    # Nothing more to set up when no serializer was requested.
    if serializer is None or serializer == 'None':
        return

    writer = SerializationProcessor(
        get_serializer(serializer),
        output_directory,
        include_label_text=include_label_text,
    )
    self.pipeline.append(LocalProcessor(writer, component_id='serializer'))