def datafiles_to_db_by_source(**kwargs):
    """ Populate tables with source files """
    if not any(kwargs.values()):
        ctx = click.get_current_context()
        click.echo(ctx.get_help())
    else:
        manager = PipelineManager(storage=get_source_folder(), dbi=app.dbi)
        for _arg, pipeline_info_list in arg_to_pipeline_config_list.items():
            arg = _arg.replace(".", "__")
            if kwargs['all'] or kwargs[arg]:
                for pipeline, sub_dir, options in pipeline_info_list:
                    required_flag = options.get('required_flag', None)
                    if not required_flag or kwargs.get(required_flag, False):
                        manager.pipeline_register(
                            pipeline=pipeline,
                            sub_directory=sub_dir,
                            force=kwargs['force'],
                            continue_transform=kwargs['continue'],
                            products=kwargs['products'],
                            **options,
                        )
        manager.pipeline_process_all()
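# Illustrative sketch only -- not taken from the source. It shows the shape of
# arg_to_pipeline_config_list that datafiles_to_db_by_source iterates over: a
# dict keyed by dotted CLI option names, each holding a list of
# (pipeline, sub_directory, options) tuples. ExamplePipeline, the paths and the
# 'include_summary' flag below are hypothetical placeholders.
class ExamplePipeline:
    """Placeholder pipeline used only to make this sketch self-contained."""
    def __init__(self, dbi=None):
        self.dbi = dbi


arg_to_pipeline_config_list = {
    'example.source': [
        (ExamplePipeline(), 'example/source', {}),
        (ExamplePipeline(), 'example/summary', {'required_flag': 'include_summary'}),
    ],
}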
def test_pipeline_register(self):
    manager = Manager()
    manager.pipeline_register(
        'test_pipeline',
        pipeline_id='fake_pipeline',
        custom_parameter=True,
        force=True,
        unpack=True,
        trigger_dataflow_dag=True,
    )
    assert len(manager._pipelines) == 1
    self.assert_pipeline_config(
        manager, 'fake_pipeline', True, 'test_pipeline', None, True, True
    )
def test_transform_pipeline_process(self, app_with_db):
    manager = Manager(dbi=app_with_db.dbi)
    manager._pipelines['fake_pipeline'] = PipelineConfig(
        pipeline=FakePipeline(1234),
        sub_directory=None,
        force=False,
        unpack=False,
        trigger_dataflow_dag=False,
    )
    manager.pipeline_process('fake_pipeline')

    actual_data_file_registry_list = DatafileRegistryModel.query.all()
    assert len(actual_data_file_registry_list) == 1
    actual_data_file_registry = actual_data_file_registry_list[0]
    assert actual_data_file_registry.state == DatafileState.PROCESSED.value
    assert actual_data_file_registry.error_message is None
    assert actual_data_file_registry.file_name is None
    # exact equality with utcnow() assumes the test clock is frozen (e.g. via freezegun)
    assert actual_data_file_registry.created_timestamp == datetime.utcnow()
    assert actual_data_file_registry.updated_timestamp == datetime.utcnow()
    assert actual_data_file_registry.source == '1234'
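# Hypothetical stand-in for the FakePipeline stub the tests above rely on; the
# real implementation lives elsewhere in the test suite. Only the behaviour the
# tests exercise is sketched: a constructor argument that surfaces as
# DatafileRegistryModel.source ('1234'), an id, and a processing step that can
# be told to fail. Attribute and method names here are assumptions.
class FakePipeline:
    def __init__(self, source, raise_processing_exception=False):
        self.id = f'fake_pipeline_{source}'
        self.source = str(source)
        self.raise_processing_exception = raise_processing_exception

    def process(self, *args, **kwargs):
        if self.raise_processing_exception:
            raise Exception('fake exception')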
def test_pipeline_register_when_already_registered(self):
    manager = Manager()
    manager.pipeline_register('test_pipeline', pipeline_id='fake_pipeline')
    with pytest.raises(ValueError):
        manager.pipeline_register('test_pipeline', pipeline_id='fake_pipeline')
    self.assert_pipeline_config(
        manager, 'fake_pipeline', False, 'test_pipeline', None, True, False
    )
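# A guess at the assert_pipeline_config helper used by the two registration
# tests above; the parameter order (manager, pipeline_id, force, pipeline,
# sub_directory, unpack, trigger_dataflow_dag) is inferred from the call sites
# and is an assumption, not the source definition.
def assert_pipeline_config(
    self, manager, pipeline_id, force, pipeline, sub_directory, unpack, trigger_dataflow_dag
):
    config = manager._pipelines[pipeline_id]
    assert config.pipeline == pipeline
    assert config.sub_directory == sub_directory
    assert config.force is force
    assert config.unpack is unpack
    assert config.trigger_dataflow_dag is trigger_dataflow_dag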
def test_pipeline_process(
    self,
    mock_read_files,
    mock_get_file_names,
    app_with_db,
    raise_exception,
    expected_state,
    expected_error_message,
):
    def read_files(*args, **kwargs):
        yield ['fake_file.txt']

    mock_get_file_names.return_value = ['fake_file.txt']
    mock_read_files.side_effect = read_files

    bucket = app_with_db.config['s3']['bucket_url']
    source_folder = os.path.join(bucket, app_with_db.config['s3']['datasets_folder'])
    manager = Manager(storage=source_folder, dbi=app_with_db.dbi)
    manager._pipelines['fake_pipeline'] = PipelineConfig(
        pipeline=FakePipeline(1234, raise_processing_exception=raise_exception),
        sub_directory='/tmp/fake_pipeline',
        force=False,
        unpack=False,
        trigger_dataflow_dag=False,
    )
    manager.pipeline_process('fake_pipeline')

    actual_data_file_registry_list = DatafileRegistryModel.query.all()
    assert len(actual_data_file_registry_list) == 1
    actual_data_file_registry = actual_data_file_registry_list[0]
    assert actual_data_file_registry.state == expected_state
    assert actual_data_file_registry.error_message == expected_error_message
    assert actual_data_file_registry.file_name == 'fake_file.txt'
    assert actual_data_file_registry.created_timestamp == datetime.utcnow()
    assert actual_data_file_registry.updated_timestamp == datetime.utcnow()
    assert actual_data_file_registry.source == '1234'
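# test_pipeline_process above (and test_pipeline_process_all below) receive
# their mock_* arguments from @mock.patch decorators not shown in this excerpt.
# The self-contained example below, patching arbitrary os functions, only
# illustrates the ordering rule those signatures depend on: stacked patches are
# applied bottom-up, so the innermost decorator supplies the first mock argument.
from unittest import mock
import os


@mock.patch('os.getpid')   # outer decorator -> second mock argument
@mock.patch('os.getcwd')   # inner decorator -> first mock argument
def show_patch_order(mock_getcwd, mock_getpid):
    mock_getcwd.return_value = '/fake/dir'
    mock_getpid.return_value = 1234
    return os.getcwd(), os.getpid()


assert show_patch_order() == ('/fake/dir', 1234)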
def test_pipeline_process_all(self, mock_pipeline_process):
    mock_pipeline_process.return_value = None
    manager = Manager()
    manager._pipelines['fake_pipeline'] = None
    manager._pipelines['fake_pipeline_2'] = None
    manager.pipeline_process_all()
    assert mock_pipeline_process.call_args_list[0][0][0] == 'fake_pipeline'
    assert mock_pipeline_process.call_args_list[1][0][0] == 'fake_pipeline_2'
def test_pipeline_remove(self, pipeline):
    manager = Manager()
    fake_pipeline = FakePipeline(0)
    manager._pipelines[getattr(pipeline, 'id', pipeline)] = fake_pipeline
    manager.pipeline_remove(pipeline)
    # pipelines are keyed by id, so check removal by id whether a pipeline
    # object or a bare id string was passed in
    assert getattr(pipeline, 'id', pipeline) not in manager._pipelines
def test_pipeline_get(self, pipeline, expected_result):
    manager = Manager()
    fake_pipeline = FakePipeline(0)
    manager._pipelines[getattr(pipeline, 'id', pipeline)] = fake_pipeline
    assert manager.pipeline_get(pipeline) == fake_pipeline
def test_to_pipeline_id(self, pipeline, expected_result):
    manager = Manager()
    assert manager._to_pipeline_id(pipeline) == expected_result