def test_get_job_name(self) -> None:
  self.assertEqual(
      beam_tables.get_job_name('base.scan_echo', False),
      'write-base-scan-echo')
  self.assertEqual(
      beam_tables.get_job_name('base.scan_discard', True),
      'append-base-scan-discard')
  self.assertEqual(
      beam_tables.get_job_name('laplante.scan_http', False),
      'write-laplante-scan-http')
  self.assertEqual(
      beam_tables.get_job_name('laplante.scan_https', True),
      'append-laplante-scan-https')
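
# Sketch only, not the module's actual code: an implementation of get_job_name
# that is consistent with the assertions above. Incremental loads get an
# 'append-' prefix, full reloads a 'write-' prefix, and the '.' and '_' in the
# table name are replaced with dashes (Dataflow job names are limited to
# lowercase letters, digits, and dashes).
def _get_job_name_sketch(table_name: str, incremental_load: bool) -> str:
  prefix = 'append-' if incremental_load else 'write-'
  return prefix + table_name.replace('.', '-').replace('_', '-')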
def run_parallel_pipelines(runner: beam_tables.ScanDataBeamPipelineRunner,
                           dataset: str,
                           scan_types: List[str],
                           incremental_load: bool,
                           start_date: Optional[datetime.date] = None,
                           end_date: Optional[datetime.date] = None) -> bool:
  """Runs beam pipelines for different scan types in parallel.

  Args:
    runner: ScanDataBeamPipelineRunner to run pipelines
    dataset: dataset name to write to, like 'prod' or 'laplante'
    scan_types: list of scan types to run, like ['echo', 'http']
    incremental_load: boolean. If true, only load the latest new data;
      if false, reload all data.
    start_date: date object, only files after or at this date will be read.
      Mostly only used during development.
    end_date: date object, only files at or before this date will be read.
      Mostly only used during development.

  Returns:
    True on success.

  Raises:
    Exception: if any of the pipelines fail or don't finish.
  """
  with concurrent.futures.ThreadPoolExecutor() as pool:
    futures = []
    for scan_type in scan_types:
      table_name = beam_tables.get_table_name(dataset, scan_type,
                                              beam_tables.BASE_TABLE_NAME)
      job_name = beam_tables.get_job_name(table_name, incremental_load)
      future = pool.submit(runner.run_beam_pipeline, scan_type,
                           incremental_load, job_name, table_name, start_date,
                           end_date)
      futures.append(future)

    finished, pending = concurrent.futures.wait(
        futures, return_when=concurrent.futures.FIRST_EXCEPTION)

    # Surface any exceptions raised inside the individual pipelines.
    for future in finished:
      future.result()

    if pending:
      raise Exception('Some pipelines failed to finish: ', pending,
                      'finished: ', finished)
    return True
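
# Minimal usage sketch, not part of the module: a development run of the echo
# and discard pipelines in parallel over a one-week window. The 'laplante'
# dataset name and the date range are illustrative assumptions; `runner` must
# already be a fully configured beam_tables.ScanDataBeamPipelineRunner.
def _example_parallel_run(
    runner: beam_tables.ScanDataBeamPipelineRunner) -> None:
  run_parallel_pipelines(
      runner,
      dataset='laplante',
      scan_types=['echo', 'discard'],
      incremental_load=False,
      start_date=datetime.date(2020, 1, 1),
      end_date=datetime.date(2020, 1, 7))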