def test_checkpoint_function_sample_transformer():
    X = np.arange(20, dtype=int).reshape(10, 2)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    offset = 3
    oracle = X + offset

    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        features_dir = os.path.join(d, "features")

        transformer = mario.wrap(
            [FunctionTransformer, "sample", "checkpoint"],
            func=_offset_add_func,
            kw_args=dict(offset=offset),
            validate=True,
            model_path=model_path,
            features_dir=features_dir,
        )

        features = transformer.transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, True)

        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, True)
        _assert_delayed_samples(features)

        # remove all files and call fit_transform again
        shutil.rmtree(d)
        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, True)

    # test when both model_path and features_dir are None
    transformer = mario.wrap(
        [FunctionTransformer, "sample", "checkpoint"],
        func=_offset_add_func,
        kw_args=dict(offset=offset),
        validate=True,
    )
    features = transformer.transform(samples)
    _assert_all_close_numpy_array(oracle, [s.data for s in features])

    # test when only features_dir is set, together with a custom hash function
    with tempfile.TemporaryDirectory() as dir_name:
        transformer = mario.wrap(
            [FunctionTransformer, "sample", "checkpoint"],
            func=_offset_add_func,
            kw_args=dict(offset=offset),
            validate=True,
            features_dir=dir_name,
            hash_fn=hash_string,
        )
        features = transformer.transform(samples)

        # Checking that the hash directory name can be cast to an integer
        assert isinstance(int(features[0]._load.args[0].split("/")[-2]), int)
        _assert_all_close_numpy_array(oracle, [s.data for s in features])
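# The tests in this module rely on helpers defined elsewhere alongside them.
# Below are minimal sketches of two of those helpers, inferred only from how
# they are called here (the actual implementations may differ):
def _offset_add_func(X, offset=1):
    # Used through FunctionTransformer(kw_args=dict(offset=...)): shifts every
    # value by `offset`, which is what the `oracle = X + offset` checks expect.
    return X + offset


def _assert_all_close_numpy_array(oracle, result):
    # Compares the collected sample data against the expected array.
    np.testing.assert_allclose(np.asarray(oracle), np.asarray(result))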
def _build_transformer(path, i):
    features_dir = os.path.join(path, f"transformer{i}")
    estimator = mario.wrap(
        [DummyTransformer, "sample", "checkpoint"], i=i, features_dir=features_dir
    )
    return estimator
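# A plausible sketch of DummyTransformer, inferred from the tests that use
# `_build_transformer`: a pipeline of `offset` such transformers must map X to
# X + offset, so each instance adds 1; the `i` argument only distinguishes
# checkpoint directories. The real class lives in the test utilities and may
# differ.
from sklearn.base import BaseEstimator, TransformerMixin


class DummyTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, i=0, **kwargs):
        super().__init__(**kwargs)
        self.i = i

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        return [x + 1 for x in X]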
def _run(dask_enabled):
    X = np.ones(shape=(10, 2), dtype=int)
    samples_transform = mario.SampleSet(
        [mario.Sample(data, key=str(i)) for i, data in enumerate(X)], key="1"
    )
    offset = 2
    oracle = X + offset

    with tempfile.TemporaryDirectory() as d:
        pipeline = Pipeline(
            [(f"{i}", _build_transformer(d, i)) for i in range(offset)]
        )
        if dask_enabled:
            pipeline = mario.wrap(["dask"], pipeline)
            transformed_samples = pipeline.transform([samples_transform]).compute(
                scheduler="single-threaded"
            )
        else:
            transformed_samples = pipeline.transform([samples_transform])

        _assert_all_close_numpy_array(
            oracle,
            [s.data for sample_set in transformed_samples for s in sample_set],
        )
        assert np.all([len(s) == 10 for s in transformed_samples])
def _run(dask_enabled):
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    samples_transform = [
        mario.Sample(data, key=str(i + 10)) for i, data in enumerate(X)
    ]
    oracle = X + 2

    with tempfile.TemporaryDirectory() as d:
        fitter = ("0", _build_estimator(d, 0))
        transformer = ("1", _build_transformer(d, 1))
        pipeline = Pipeline([fitter, transformer])
        if dask_enabled:
            pipeline = mario.wrap(["dask"], pipeline, fit_tag="GPU", npartitions=1)
            pipeline = pipeline.fit(samples)
            tags = mario.dask_tags(pipeline)

            assert len(tags) == 1, tags
            transformed_samples = pipeline.transform(samples_transform)
            transformed_samples = transformed_samples.compute(
                scheduler="single-threaded"
            )
        else:
            pipeline = pipeline.fit(samples)
            transformed_samples = pipeline.transform(samples_transform)

        _assert_all_close_numpy_array(oracle, [s.data for s in transformed_samples])
def test_mod_4hz():
    """Loading and running the mod-4hz annotator."""
    # Test setup and config
    annotator = bob.bio.base.load_resource("mod-4hz", "annotator")
    assert isinstance(annotator, bob.bio.spear.annotator.Mod_4Hz)

    # Read input
    rate, wav = _wav()

    # Test the VAD annotator
    annotator = bob.bio.spear.annotator.Mod_4Hz()
    _compare(
        annotator.transform_one(wav, sample_rate=rate),
        pkg_resources.resource_filename(
            "bob.bio.spear.test", "data/vad_mod_4hz.hdf5"
        ),
    )

    # Test the processing of Sample objects and tags of annotator transformer
    wrapped_annotator = wrap(["sample"], annotator)
    samples = [Sample(data=wav, rate=rate)]
    # Attribute `rate` should be passed as `sample_rate` argument of transform (tags)
    result = wrapped_annotator.transform(samples)
    # Annotations should be in attribute `annotations` of result samples (tags)
    _compare(
        result[0].annotations,
        pkg_resources.resource_filename(
            "bob.bio.spear.test", "data/vad_mod_4hz.hdf5"
        ),
    )
def wrap_sample_preprocessor(
    preprocessor,
    transform_extra_arguments=(("annotations", "annotations"),),
    **kwargs,
):
    """
    Wraps :any:`bob.bio.base.preprocessor.Preprocessor` with
    :any:`bob.pipelines.wrappers.SampleWrapper`

    .. warning::
       This wrapper doesn't checkpoint data

    Parameters
    ----------

    preprocessor: :any:`bob.bio.base.preprocessor.Preprocessor`
       Instance of :any:`bob.bio.base.transformers.PreprocessorTransformer` to be wrapped

    transform_extra_arguments: [tuple]
        Same behavior as ``transform_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`
    """
    transformer = PreprocessorTransformer(preprocessor)
    return mario.wrap(
        ["sample"],
        transformer,
        transform_extra_arguments=transform_extra_arguments,
    )
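# A hedged usage sketch for `wrap_sample_preprocessor`; `MyPreprocessor` is a
# hypothetical legacy preprocessor, not part of the code above:
#
#   transformer = wrap_sample_preprocessor(MyPreprocessor())
#   samples = [mario.Sample(image, annotations=annotations, key="0")]
#   preprocessed = transformer.transform(samples)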
def _run(dask_enabled):
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    samples_transform = [
        mario.Sample(data, key=str(i + 10)) for i, data in enumerate(X)
    ]
    oracle = X + 2

    with tempfile.TemporaryDirectory() as d:
        fitter = ("0", _build_estimator(d, 0))
        transformer = ("1", _build_transformer(d, 1))
        pipeline = Pipeline([fitter, transformer])
        if dask_enabled:
            dask_client = _get_local_client()
            pipeline = mario.wrap(["dask"], pipeline)
            pipeline = pipeline.fit(samples)
            transformed_samples = pipeline.transform(samples_transform).compute(
                scheduler=dask_client
            )
        else:
            pipeline = pipeline.fit(samples)
            transformed_samples = pipeline.transform(samples_transform)

        _assert_all_close_numpy_array(oracle, [s.data for s in transformed_samples])
def test_checkpoint_fittable_sample_transformer():
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    oracle = X + 1

    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        features_dir = os.path.join(d, "features")

        transformer = mario.wrap(
            [DummyWithFit, "sample", "checkpoint"],
            model_path=model_path,
            features_dir=features_dir,
        )
        assert not mario.utils.is_estimator_stateless(transformer)

        features = transformer.fit(samples).transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, False)

        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, False)
        _assert_delayed_samples(features)

        # remove all files and call fit_transform again
        shutil.rmtree(d)
        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, False)
def video_wrap_skpipeline(sk_pipeline):
    """
    This function takes a `sklearn.Pipeline` and wraps each estimator inside of
    it with :any:`bob.bio.video.transformer.VideoWrapper`
    """
    for i, name, estimator in sk_pipeline._iter():
        # 1. Unwrap the estimator: if it is sample-wrapped, take
        #    `estimator.estimator`
        transformer = (
            estimator.estimator if hasattr(estimator, "estimator") else estimator
        )

        # 2. Wrap it with VideoWrapper
        transformer = VideoWrapper(transformer)

        # 3. Sample-wrap it again
        transformer = wrap(
            ["sample"],
            transformer,
            fit_extra_arguments=estimator.fit_extra_arguments,
            transform_extra_arguments=estimator.transform_extra_arguments,
        )
        sk_pipeline.steps[i] = (name, transformer)

    return sk_pipeline
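# A hedged usage sketch for `video_wrap_skpipeline`; `MyFrameTransformer` is a
# hypothetical per-frame estimator, not part of the code above:
#
#   sk_pipeline = make_pipeline(wrap(["sample"], MyFrameTransformer()))
#   video_pipeline = video_wrap_skpipeline(sk_pipeline)
#   # each step now processes every frame of a video sample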
def annotate(database, groups, annotator, output_dir, dask_client, **kwargs):
    """Annotates a database.

    The annotations are written in JSON format and can be read back using
    :any:`read_annotation_file` (``annotation_type='json'``)
    """
    log_parameters(logger)

    # Allows passing of Sample objects as parameters
    annotator = wrap(["sample"], annotator, output_attribute="annotations")

    # Will save the annotations in the `data` fields to a json file
    annotator = wrap(
        ["checkpoint"],
        annotator,
        features_dir=output_dir,
        extension=".json",
        save_func=save_json,
        load_func=load_json,
        sample_attribute="annotations",
    )

    # Allows reception of Dask Bags
    annotator = wrap(["dask"], annotator)

    # Transformer that splits the samples into several Dask Bags
    to_dask_bags = ToDaskBag(npartitions=50)

    logger.debug("Retrieving samples from database.")
    samples = database.all_samples(groups)

    # Sets the scheduler to local if no dask_client is specified
    if dask_client is not None:
        scheduler = dask_client
    else:
        scheduler = "single-threaded"

    # Splits the samples list into bags
    dask_bags = to_dask_bags.transform(samples)

    logger.info(f"Saving annotations in {output_dir}.")
    logger.info(f"Annotating {len(samples)} samples...")
    annotator.transform(dask_bags).compute(scheduler=scheduler)

    logger.info("All annotations written.")
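# Minimal sketches of the JSON helpers passed to the checkpoint wrapper above
# (the real `save_json`/`load_json` live in this package and may differ, e.g.
# in how they handle numpy types):
import json


def save_json(annotations, path):
    # Serializes the annotations dictionary into the checkpoint file.
    with open(path, "w") as f:
        json.dump(annotations, f)


def load_json(path):
    # Reads the annotations back from the checkpoint file.
    with open(path, "r") as f:
        return json.load(f)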
def test_dask_checkpoint_transform_pipeline():
    X = np.ones(shape=(10, 2), dtype=int)
    samples_transform = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    with tempfile.TemporaryDirectory() as d:
        bag_transformer = mario.ToDaskBag()
        estimator = mario.wrap(
            ["dask"], _build_transformer(d, 0), transform_tag="CPU"
        )
        X_tr = estimator.transform(bag_transformer.transform(samples_transform))
        assert len(mario.dask_tags(estimator)) == 1
        assert len(X_tr.compute(scheduler="single-threaded")) == 10
def embedding_transformer(
    cropped_image_size,
    embedding,
    cropped_positions,
    fixed_positions=None,
    color_channel="rgb",
    annotator=None,
    **kwargs,
):
    """
    Creates a pipeline composed of a FaceCropper and an Embedding extractor.
    This transformer is suited for Facenet-based architectures

    .. warning::
       This will resize images to the requested `cropped_image_size`
    """
    face_cropper, transform_extra_arguments = make_cropper(
        cropped_image_size=cropped_image_size,
        cropped_positions=cropped_positions,
        fixed_positions=fixed_positions,
        color_channel=color_channel,
        annotator=annotator,
        **kwargs,
    )

    # Support None and "passthrough" Estimators
    if embedding is not None and type(embedding) is not str:
        embedding = wrap(["sample"], embedding)

    transformer = Pipeline(
        [
            (
                "cropper",
                wrap(
                    ["sample"],
                    face_cropper,
                    transform_extra_arguments=transform_extra_arguments,
                ),
            ),
            ("embedding", embedding),
        ]
    )

    return transformer
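# A hedged usage sketch for `embedding_transformer`; the embedding instance and
# the eye positions are hypothetical placeholders:
#
#   transformer = embedding_transformer(
#       cropped_image_size=(160, 160),
#       embedding=my_embedding_estimator,  # any sklearn-style transformer
#       cropped_positions={"leye": (60, 110), "reye": (60, 50)},
#   )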
def load(annotation_type, fixed_positions=None):
    transform_extra_arguments = (("annotations", "annotations"),)

    transformer = make_pipeline(
        wrap(
            ["sample"],
            ToGray(),
            transform_extra_arguments=transform_extra_arguments,
        ),
        wrap(
            ["sample"],
            FunctionTransformer(lambda X: [x.flatten() for x in X]),
        ),
    )

    algorithm = Distance()

    return PipelineSimple(transformer, algorithm)
def test_resample():
    """Resample using the transformer."""
    audio_path = resource_filename("bob.bio.spear.test", "data/sample.wav")
    audio_n_samples = 77760
    audio_sample_rate = 16000
    sample = Sample(data=audio_path, channel=None, rate=audio_sample_rate)
    pipeline = make_pipeline(
        PathToAudio(), wrap(["sample"], Resample(audio_sample_rate // 2))
    )
    results = pipeline.transform([sample])[0]
    assert results.data.shape == (audio_n_samples // 2,), results.data.shape
def test_failing_sample_transformer():
    X = np.zeros(shape=(10, 2))
    samples = [mario.Sample(data) for data in X]
    expected = np.full_like(X, 2, dtype=object)
    expected[::2] = None
    expected[1::4] = None
    transformer = Pipeline(
        [
            ("1", mario.wrap([HalfFailingDummyTransformer, "sample"])),
            ("2", mario.wrap([HalfFailingDummyTransformer, "sample"])),
        ]
    )
    features = transformer.transform(samples)

    features = [f.data for f in features]
    assert len(expected) == len(
        features
    ), f"Expected: {len(expected)} but got: {len(features)}"
    assert all(
        (e == f).all() for e, f in zip(expected, features)
    ), f"Expected: {expected} but got: {features}"

    samples = [mario.Sample(data) for data in X]
    expected = [None] * X.shape[0]
    transformer = Pipeline(
        [
            ("1", mario.wrap([FullFailingDummyTransformer, "sample"])),
            ("2", mario.wrap([FullFailingDummyTransformer, "sample"])),
        ]
    )
    features = transformer.transform(samples)

    features = [f.data for f in features]
    assert len(expected) == len(
        features
    ), f"Expected: {len(expected)} but got: {len(features)}"
    assert all(
        e == f for e, f in zip(expected, features)
    ), f"Expected: {expected} but got: {features}"
def test_fittable_sample_transformer():
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data) for data in X]

    # Mixing up with an object
    transformer = mario.wrap([DummyWithFit, "sample"])
    features = transformer.fit(samples).transform(samples)
    _assert_all_close_numpy_array(X + 1, [s.data for s in features])

    features = transformer.fit_transform(samples)
    _assert_all_close_numpy_array(X + 1, [s.data for s in features])
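# A plausible sketch of DummyWithFit, inferred from the assertions above: after
# fitting it must map X to X + 1, and it must be stateful (see the
# `is_estimator_stateless` check in the checkpoint test). The real class lives
# in the test utilities and may differ.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted


class DummyWithFit(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        # Remember a trivial "model" so the estimator is not stateless.
        self.model_ = 1
        return self

    def transform(self, X):
        check_is_fitted(self, "model_")
        return [x + self.model_ for x in X]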
def _build_estimator(path, i):
    base_dir = os.path.join(path, f"transformer{i}")
    os.makedirs(base_dir, exist_ok=True)
    model_path = os.path.join(base_dir, "model.pkl")
    features_dir = os.path.join(base_dir, "features")

    transformer = mario.wrap(
        [DummyWithFit, "sample", "checkpoint"],
        model_path=model_path,
        features_dir=features_dir,
    )
    return transformer
def check_valid_pipeline(pipeline_simple):
    """
    Applies some checks to the PipelineSimple
    """

    # CHECKING THE TRANSFORMER
    # Checking if it's a Scikit Pipeline or an estimator
    if isinstance(pipeline_simple.transformer, Pipeline):
        # Checking if all steps are sample-wrapped; if not, wrap them
        # (re-assign the wrapped estimator, since `wrap` returns a new object)
        for i, (name, p) in enumerate(pipeline_simple.transformer.steps):
            if (
                not is_instance_nested(p, "estimator", SampleWrapper)
                and type(p) is not str
                and p is not None
            ):
                pipeline_simple.transformer.steps[i] = (name, wrap(["sample"], p))

    # In this case it can be a simple estimator: check whether it is
    # sample-wrapped and, if not, wrap it
    elif is_instance_nested(pipeline_simple.transformer, "estimator", BaseEstimator):
        if not is_instance_nested(
            pipeline_simple.transformer, "estimator", SampleWrapper
        ):
            pipeline_simple.transformer = wrap(
                ["sample"], pipeline_simple.transformer
            )

    else:
        raise ValueError(
            f"pipeline_simple.transformer should be an instance of either "
            f"`sklearn.pipeline.Pipeline` or `sklearn.base.BaseEstimator`, "
            f"not {pipeline_simple.transformer}"
        )

    # Checking the Biometric algorithm
    if not isinstance(pipeline_simple.biometric_algorithm, BioAlgorithm):
        raise ValueError(
            f"pipeline_simple.biometric_algorithm should be an instance of "
            f"`BioAlgorithm`, not {pipeline_simple.biometric_algorithm}"
        )

    return True
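# A hedged sketch of the `is_instance_nested` semantics relied on above (the
# real helper lives in bob.pipelines and may differ): it follows the named
# attribute through nested wrappers and reports whether any level is an
# instance of the given class.
def _is_instance_nested_sketch(instance, attribute, klass):
    if isinstance(instance, klass):
        return True
    if hasattr(instance, attribute):
        return _is_instance_nested_sketch(
            getattr(instance, attribute), attribute, klass
        )
    return False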
def wrap_checkpoint_preprocessor(
    preprocessor,
    features_dir=None,
    transform_extra_arguments=(("annotations", "annotations"),),
    load_func=None,
    save_func=None,
    extension=".hdf5",
):
    """
    Wraps :any:`bob.bio.base.preprocessor.Preprocessor` with
    :any:`bob.pipelines.wrappers.CheckpointWrapper` and :any:`bob.pipelines.wrappers.SampleWrapper`

    Parameters
    ----------

    preprocessor: :any:`bob.bio.base.preprocessor.Preprocessor`
       Instance of :any:`bob.bio.base.transformers.PreprocessorTransformer` to be wrapped

    features_dir: str
       Features directory to be checkpointed (see :any:`bob.pipelines.CheckpointWrapper`).

    extension : str, optional
        Extension of preprocessed files (see :any:`bob.pipelines.CheckpointWrapper`).

    load_func : None, optional
        Function that loads data to be preprocessed.
        The default is :any:`bob.bio.base.preprocessor.Preprocessor.read_data`

    save_func : None, optional
        Function that saves preprocessed data.
        The default is :any:`bob.bio.base.preprocessor.Preprocessor.write_data`

    transform_extra_arguments: [tuple]
        Same behavior as ``transform_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`
    """
    transformer = PreprocessorTransformer(preprocessor)
    return mario.wrap(
        ["sample", "checkpoint"],
        transformer,
        load_func=load_func or preprocessor.read_data,
        save_func=save_func or preprocessor.write_data,
        features_dir=features_dir,
        transform_extra_arguments=transform_extra_arguments,
        extension=extension,
    )
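# A hedged usage sketch for `wrap_checkpoint_preprocessor`; `MyPreprocessor`
# and the directory are hypothetical:
#
#   transformer = wrap_checkpoint_preprocessor(
#       MyPreprocessor(), features_dir="/tmp/preprocessed"
#   )
#   features = transformer.transform(samples)  # results cached under features_dir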
def test_function_sample_transformer():
    X = np.zeros(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data) for data in X]

    transformer = mario.wrap(
        [FunctionTransformer, "sample"],
        func=_offset_add_func,
        kw_args=dict(offset=3),
        validate=True,
    )

    features = transformer.transform(samples)
    _assert_all_close_numpy_array(X + 3, [s.data for s in features])

    features = transformer.fit_transform(samples)
    _assert_all_close_numpy_array(X + 3, [s.data for s in features])
def run_experiment(dataset):
    def linearize(X):
        X = np.asarray(X)
        return np.reshape(X, (X.shape[0], -1))

    # Testing it in a real recognition system
    transformer = wrap(["sample"], make_pipeline(FunctionTransformer(linearize)))
    pipeline_simple = PipelineSimple(transformer, Distance())
    return pipeline_simple(
        dataset.background_model_samples(),
        dataset.references(),
        dataset.probes(),
    )
def wrap_sample_extractor(
    extractor,
    fit_extra_arguments=None,
    transform_extra_arguments=None,
    model_path=None,
    **kwargs,
):
    """
    Wraps :any:`bob.bio.base.extractor.Extractor` with
    :any:`bob.pipelines.wrappers.SampleWrapper`

    Parameters
    ----------

    extractor: :any:`bob.bio.base.extractor.Extractor`
       Instance of :any:`bob.bio.base.transformers.ExtractorTransformer` to be wrapped

    fit_extra_arguments: [tuple], optional
        Same behavior as ``fit_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`

    transform_extra_arguments: [tuple], optional
        Same behavior as ``transform_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`

    model_path: str
        Path to `extractor_file` in :any:`bob.bio.base.extractor.Extractor`
    """
    extractor_file = (
        os.path.join(model_path, "Extractor.hdf5") if model_path is not None else None
    )
    transformer = ExtractorTransformer(extractor, model_path=extractor_file)
    (
        transform_extra_arguments,
        fit_extra_arguments,
    ) = _prepare_extractor_sample_args(
        extractor, transform_extra_arguments, fit_extra_arguments
    )
    return mario.wrap(
        ["sample"],
        transformer,
        transform_extra_arguments=transform_extra_arguments,
        fit_extra_arguments=fit_extra_arguments,
        **kwargs,
    )
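# A hedged usage sketch for `wrap_sample_extractor`; `MyExtractor` and the
# model directory are hypothetical:
#
#   transformer = wrap_sample_extractor(MyExtractor(), model_path="/tmp/model")
#   features = transformer.transform(samples)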
import bob.pipelines as mario

pipeline = mario.wrap(["dask"], pipeline)  # noqa
def wrap_bob_legacy(
    bob_object,
    dir_name,
    fit_extra_arguments=None,
    transform_extra_arguments=None,
    dask_it=False,
    **kwargs,
):
    """
    Wraps either :any:`bob.bio.base.preprocessor.Preprocessor` or
    :any:`bob.bio.base.extractor.Extractor` with
    :any:`sklearn.base.TransformerMixin`,
    :any:`bob.pipelines.wrappers.CheckpointWrapper` and
    :any:`bob.pipelines.wrappers.SampleWrapper`

    Parameters
    ----------

    bob_object: object
        Instance of :any:`bob.bio.base.preprocessor.Preprocessor` or
        :any:`bob.bio.base.extractor.Extractor`

    dir_name: str
        Directory name for the checkpoints

    fit_extra_arguments: [tuple]
        Same behavior as ``fit_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`

    transform_extra_arguments: [tuple]
        Same behavior as ``transform_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`

    dask_it: bool
        If True, the transformer is additionally wrapped into a dask graph
    """

    if isinstance(bob_object, Preprocessor):
        transformer = wrap_checkpoint_preprocessor(
            bob_object,
            features_dir=os.path.join(dir_name, "preprocessor"),
            **kwargs,
        )
    elif isinstance(bob_object, Extractor):
        transformer = wrap_checkpoint_extractor(
            bob_object,
            features_dir=os.path.join(dir_name, "extractor"),
            model_path=dir_name,
            fit_extra_arguments=fit_extra_arguments,
            transform_extra_arguments=transform_extra_arguments,
            **kwargs,
        )
    else:
        raise ValueError(
            "`bob_object` should be an instance of `Preprocessor` or `Extractor`"
        )

    if dask_it:
        transformer = mario.wrap(["dask"], transformer)

    return transformer
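# A hedged usage sketch for `wrap_bob_legacy`; `MyExtractor` and the directory
# are hypothetical:
#
#   transformer = wrap_bob_legacy(
#       MyExtractor(),
#       dir_name="/tmp/checkpoints",
#       dask_it=True,  # additionally wrap into a dask graph
#   )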
def wrap_checkpoint_extractor(
    extractor,
    features_dir=None,
    fit_extra_arguments=None,
    transform_extra_arguments=None,
    load_func=None,
    save_func=None,
    extension=".hdf5",
    model_path=None,
    **kwargs,
):
    """
    Wraps :any:`bob.bio.base.extractor.Extractor` with
    :any:`bob.pipelines.wrappers.CheckpointWrapper` and :any:`bob.pipelines.wrappers.SampleWrapper`

    Parameters
    ----------

    extractor: :any:`bob.bio.base.extractor.Extractor`
       Instance of :any:`bob.bio.base.transformers.ExtractorTransformer` to be wrapped

    features_dir: str
       Features directory to be checkpointed (see :any:`bob.pipelines.CheckpointWrapper`).

    extension : str, optional
        Extension of extracted files (see :any:`bob.pipelines.CheckpointWrapper`).

    load_func : None, optional
        Function that loads extracted features.
        The default is :any:`bob.bio.base.extractor.Extractor.read_feature`

    save_func : None, optional
        Function that saves extracted features.
        The default is :any:`bob.bio.base.extractor.Extractor.write_feature`

    fit_extra_arguments: [tuple]
        Same behavior as ``fit_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`

    transform_extra_arguments: [tuple], optional
        Same behavior as ``transform_extra_arguments`` in :any:`bob.pipelines.wrappers.SampleWrapper`

    model_path: str
        See :any:`ExtractorTransformer`.
    """
    extractor_file = (
        os.path.join(model_path, "Extractor.hdf5") if model_path is not None else None
    )
    model_file = (
        os.path.join(model_path, "Extractor.pkl") if model_path is not None else None
    )
    transformer = ExtractorTransformer(extractor, model_path=extractor_file)
    (
        transform_extra_arguments,
        fit_extra_arguments,
    ) = _prepare_extractor_sample_args(
        extractor, transform_extra_arguments, fit_extra_arguments
    )
    return mario.wrap(
        ["sample", "checkpoint"],
        transformer,
        load_func=load_func or extractor.read_feature,
        save_func=save_func or extractor.write_feature,
        model_path=model_file,
        features_dir=features_dir,
        transform_extra_arguments=transform_extra_arguments,
        fit_extra_arguments=fit_extra_arguments,
        **kwargs,
    )
# Kmeans machine used for GMM initialization
kmeans_trainer = KMeansMachine(
    n_clusters=n_gaussians,
    max_iter=25,
    convergence_threshold=0.0,
    init_max_iter=5,
    oversampling_factor=128,
)

# Algorithm used for enrollment and scoring, trained first as a Transformer.
bioalgorithm = GMM(
    n_gaussians=n_gaussians,
    max_fitting_steps=25,
    enroll_iterations=1,
    convergence_threshold=0.0,  # disabled, so max_fitting_steps is the stopping criterion
    k_means_trainer=kmeans_trainer,
    random_state=2,
)

# Transformer part of PipelineSimple
transformer = Pipeline(
    [
        ("annotator", wrap(["sample"], Mod_4Hz())),
        ("extractor", wrap(["sample"], Cepstral())),
        ("algorithm_trainer", wrap(["sample"], bioalgorithm)),
    ]
)

# PipelineSimple instance used by `execute_pipeline_simple` or the `pipeline simple` command
pipeline = PipelineSimple(transformer, bioalgorithm)
    k_means_trainer=KMeansMachine(
        n_clusters=256,
        max_iter=2,
        random_state=SEED,
        init_max_iter=5,
        oversampling_factor=64,
    ),
    return_stats_in_transform=True,
)

bioalgorithm = ISV(
    # ISV parameters
    r_U=50,
    random_state=SEED,
    em_iterations=2,
    enroll_iterations=1,
    # GMM parameters
    ubm=ubm,
)

transformer = Pipeline(
    [
        ("annotator", Energy_2Gauss()),
        ("extractor", Cepstral()),
        ("ubm", ubm),
        ("reference_id_encoder", ReferenceIdEncoder()),
        ("isv", bioalgorithm),
    ]
)

transformer = wrap(["sample"], transformer)

pipeline = PipelineSimple(transformer, bioalgorithm)
def annotate_samples(
    samples, reader, make_key, annotator, output_dir, dask_client, **kwargs
):
    """Annotates a list of samples.

    This command is very similar to ``bob bio annotate`` except that it works
    without a database interface. You must provide a list of samples as well as
    two functions:

        def reader(sample):
            # Loads data from a sample.
            # for example: data = bob.io.base.load(sample)
            # data will be given to the annotator
            return data

        def make_key(sample):
            # Creates a unique str identifier for this sample.
            # for example: return str(sample)
    """
    log_parameters(logger, ignore=("samples",))

    # Allows passing of Sample objects as parameters
    annotator = wrap(["sample"], annotator, output_attribute="annotations")

    # Will save the annotations in the `data` fields to a json file
    annotator = wrap(
        ["checkpoint"],
        annotator,
        features_dir=output_dir,
        extension=".json",
        save_func=save_json,
        load_func=load_json,
        sample_attribute="annotations",
    )

    # Allows reception of Dask Bags
    annotator = wrap(["dask"], annotator)

    # Transformer that splits the samples into several Dask Bags
    to_dask_bags = ToDaskBag(npartitions=50)

    # Sets the scheduler to local if no dask_client is specified
    if dask_client is not None:
        scheduler = dask_client
    else:
        scheduler = "single-threaded"

    # Converts samples into a list of DelayedSample objects
    samples_obj = [
        DelayedSample(
            load=functools.partial(reader, s),
            key=make_key(s),
        )
        for s in samples
    ]

    # Splits the samples list into bags
    dask_bags = to_dask_bags.transform(samples_obj)

    logger.info(f"Saving annotations in {output_dir}.")
    logger.info(f"Annotating {len(samples_obj)} samples...")
    annotator.transform(dask_bags).compute(scheduler=scheduler)

    logger.info("All annotations written.")
# Kmeans machine used for GMM initialization
kmeans_trainer = KMeansMachine(
    n_clusters=n_gaussians,
    max_iter=25,
    convergence_threshold=0.0,
    init_max_iter=5,
    oversampling_factor=64,
)

# Algorithm used for enrollment and scoring, trained first as a Transformer.
bioalgorithm = GMM(
    n_gaussians=n_gaussians,
    max_fitting_steps=25,
    enroll_iterations=1,
    convergence_threshold=0.0,  # disabled, so max_fitting_steps is the stopping criterion
    k_means_trainer=kmeans_trainer,
    random_state=2,
)

# Transformer part of PipelineSimple
transformer = Pipeline(
    [
        ("annotator", wrap(["sample"], Energy_2Gauss())),
        ("extractor", wrap(["sample"], Cepstral())),
        ("algorithm_trainer", wrap(["sample"], bioalgorithm)),
    ]
)

# PipelineSimple instance used by `execute_pipeline_simple` or the `pipeline simple` command
pipeline = PipelineSimple(transformer, bioalgorithm)
def test_failing_checkpoint_transformer():
    X = np.zeros(shape=(10, 2))
    samples = [mario.Sample(data, key=i) for i, data in enumerate(X)]
    expected = np.full_like(X, 2)
    expected[::2] = None
    expected[1::4] = None
    expected = list(expected)

    with tempfile.TemporaryDirectory() as d:
        features_dir_1 = os.path.join(d, "features_1")
        features_dir_2 = os.path.join(d, "features_2")
        transformer = Pipeline(
            [
                (
                    "1",
                    mario.wrap(
                        [HalfFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_1,
                    ),
                ),
                (
                    "2",
                    mario.wrap(
                        [HalfFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_2,
                    ),
                ),
            ]
        )
        features = transformer.transform(samples)

        np_features = np.array(
            [
                np.full(X.shape[1], np.nan) if f.data is None else f.data
                for f in features
            ]
        )
        assert len(expected) == len(
            np_features
        ), f"Expected: {len(expected)} but got: {len(np_features)}"
        assert np.allclose(
            expected, np_features, equal_nan=True
        ), f"Expected: {expected} but got: {np_features}"

    samples = [mario.Sample(data, key=i) for i, data in enumerate(X)]
    expected = [None] * X.shape[0]
    with tempfile.TemporaryDirectory() as d:
        features_dir_1 = os.path.join(d, "features_1")
        features_dir_2 = os.path.join(d, "features_2")
        transformer = Pipeline(
            [
                (
                    "1",
                    mario.wrap(
                        [FullFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_1,
                    ),
                ),
                (
                    "2",
                    mario.wrap(
                        [FullFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_2,
                    ),
                ),
            ]
        )
        features = transformer.transform(samples)
        assert len(expected) == len(
            features
        ), f"Expected: {len(expected)} but got: {len(features)}"
        assert all(
            e == f.data for e, f in zip(expected, features)
        ), f"Expected: {expected} but got: {features}"