def _run(dask_enabled):
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    samples_transform = [
        mario.Sample(data, key=str(i + 10)) for i, data in enumerate(X)
    ]
    oracle = X + 2

    with tempfile.TemporaryDirectory() as d:
        fitter = ("0", _build_estimator(d, 0))
        transformer = ("1", _build_transformer(d, 1))

        pipeline = Pipeline([fitter, transformer])
        if dask_enabled:
            pipeline = mario.wrap(
                ["dask"], pipeline, fit_tag="GPU", npartitions=1
            )
            pipeline = pipeline.fit(samples)
            tags = mario.dask_tags(pipeline)
            assert len(tags) == 1, tags
            transformed_samples = pipeline.transform(samples_transform)
            transformed_samples = transformed_samples.compute(
                scheduler="single-threaded"
            )
        else:
            pipeline = pipeline.fit(samples)
            transformed_samples = pipeline.transform(samples_transform)

        _assert_all_close_numpy_array(
            oracle, [s.data for s in transformed_samples]
        )

def _run(dask_enabled):
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    samples_transform = [
        mario.Sample(data, key=str(i + 10)) for i, data in enumerate(X)
    ]
    oracle = X + 2

    with tempfile.TemporaryDirectory() as d:
        fitter = ("0", _build_estimator(d, 0))
        transformer = ("1", _build_transformer(d, 1))

        pipeline = Pipeline([fitter, transformer])
        if dask_enabled:
            dask_client = _get_local_client()
            pipeline = mario.wrap(["dask"], pipeline)
            pipeline = pipeline.fit(samples)
            transformed_samples = pipeline.transform(
                samples_transform
            ).compute(scheduler=dask_client)
        else:
            pipeline = pipeline.fit(samples)
            transformed_samples = pipeline.transform(samples_transform)

        _assert_all_close_numpy_array(
            oracle, [s.data for s in transformed_samples]
        )

def _run(dask_enabled):
    X = np.ones(shape=(10, 2), dtype=int)
    samples_transform = mario.SampleSet(
        [mario.Sample(data, key=str(i)) for i, data in enumerate(X)], key="1"
    )
    offset = 2
    oracle = X + offset

    with tempfile.TemporaryDirectory() as d:
        pipeline = Pipeline(
            [(f"{i}", _build_transformer(d, i)) for i in range(offset)]
        )
        if dask_enabled:
            pipeline = mario.wrap(["dask"], pipeline)
            transformed_samples = pipeline.transform(
                [samples_transform]
            ).compute(scheduler="single-threaded")
        else:
            transformed_samples = pipeline.transform([samples_transform])

        _assert_all_close_numpy_array(
            oracle,
            [s.data for sample_set in transformed_samples for s in sample_set],
        )
        assert np.all([len(s) == 10 for s in transformed_samples])

def test_checkpoint_fittable_sample_transformer():
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    oracle = X + 1

    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        features_dir = os.path.join(d, "features")

        transformer = mario.wrap(
            [DummyWithFit, "sample", "checkpoint"],
            model_path=model_path,
            features_dir=features_dir,
        )
        assert not mario.utils.is_estimator_stateless(transformer)

        features = transformer.fit(samples).transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, False)

        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, False)
        _assert_delayed_samples(features)

        # remove all files and call fit_transform again
        shutil.rmtree(d)
        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, False)

def test_checkpoint_fittable_pipeline():
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    samples_transform = [
        mario.Sample(data, key=str(i + 10)) for i, data in enumerate(X)
    ]
    oracle = X + 3

    with tempfile.TemporaryDirectory() as d:
        pipeline = Pipeline(
            [(f"{i}", _build_estimator(d, i)) for i in range(2)]
        )
        pipeline.fit(samples)
        transformed_samples = pipeline.transform(samples_transform)
        _assert_all_close_numpy_array(
            oracle, [s.data for s in transformed_samples]
        )

def test_pca():
    # Test wrapped into a Sample
    X = np.random.rand(100, 10)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]

    # fit
    n_components = 2
    estimator = mario.transformers.SamplePCA(n_components=n_components)
    estimator = estimator.fit(samples)

    # https://scikit-learn.org/stable/modules/generated/sklearn.utils.validation.check_is_fitted.html
    assert check_is_fitted(estimator, "n_components_") is None

    # transform
    samples_tr = estimator.transform(samples)
    assert samples_tr[0].data.shape == (n_components,)

    # Test checkpointing
    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        estimator = mario.transformers.CheckpointSamplePCA(
            n_components=n_components, features_dir=d, model_path=model_path
        )

        # fit
        estimator = estimator.fit(samples)
        assert check_is_fitted(estimator, "n_components_") is None
        assert os.path.exists(model_path)

        # transform
        samples_tr = estimator.transform(samples)
        assert samples_tr[0].data.shape == (n_components,)
        assert os.path.exists(os.path.join(d, samples_tr[0].key + ".h5"))

def test_linearize():
    def _assert(Xt, oracle):
        assert np.allclose(Xt, oracle), (Xt, oracle)

    X = np.zeros(shape=(10, 10, 10))
    oracle = X.reshape((10, -1))

    # Test the transformer only
    transformer = mario.transformers.Linearize()
    X_tr = transformer.transform(X)
    _assert(X_tr, oracle)

    # Test wrapped into a Sample
    samples = [mario.Sample(x, key=f"{i}") for i, x in enumerate(X)]
    transformer = mario.transformers.SampleLinearize()
    X_tr = transformer.transform(samples)
    _assert([s.data for s in X_tr], oracle)

    # Test checkpoint
    with tempfile.TemporaryDirectory() as d:
        transformer = mario.transformers.CheckpointSampleLinearize(
            features_dir=d
        )
        X_tr = transformer.transform(samples)
        _assert([s.data for s in X_tr], oracle)
        assert os.path.exists(os.path.join(d, "1.h5"))

def test_checkpoint_function_sample_transformer():
    X = np.arange(20, dtype=int).reshape(10, 2)
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    offset = 3
    oracle = X + offset

    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        features_dir = os.path.join(d, "features")

        transformer = mario.wrap(
            [FunctionTransformer, "sample", "checkpoint"],
            func=_offset_add_func,
            kw_args=dict(offset=offset),
            validate=True,
            model_path=model_path,
            features_dir=features_dir,
        )

        features = transformer.transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, True)

        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, True)
        _assert_delayed_samples(features)

        # remove all files and call fit_transform again
        shutil.rmtree(d)
        features = transformer.fit_transform(samples)
        _assert_checkpoints(features, oracle, model_path, features_dir, True)

    # test when both model_path and features_dir are None
    transformer = mario.wrap(
        [FunctionTransformer, "sample", "checkpoint"],
        func=_offset_add_func,
        kw_args=dict(offset=offset),
        validate=True,
    )
    features = transformer.transform(samples)
    _assert_all_close_numpy_array(oracle, [s.data for s in features])

    # test when only features_dir is set, together with a custom hash function
    with tempfile.TemporaryDirectory() as dir_name:
        transformer = mario.wrap(
            [FunctionTransformer, "sample", "checkpoint"],
            func=_offset_add_func,
            kw_args=dict(offset=offset),
            validate=True,
            features_dir=dir_name,
            hash_fn=hash_string,
        )
        features = transformer.transform(samples)
        # Check that the hash directory in the checkpoint path casts to an integer
        assert isinstance(int(features[0]._load.args[0].split("/")[-2]), int)
        _assert_all_close_numpy_array(oracle, [s.data for s in features])

def test_dask_checkpoint_transform_pipeline():
    X = np.ones(shape=(10, 2), dtype=int)
    samples_transform = [
        mario.Sample(data, key=str(i)) for i, data in enumerate(X)
    ]
    with tempfile.TemporaryDirectory() as d:
        bag_transformer = mario.ToDaskBag()
        estimator = mario.wrap(
            ["dask"], _build_transformer(d, 0), transform_tag="CPU"
        )
        X_tr = estimator.transform(bag_transformer.transform(samples_transform))
        assert len(mario.dask_tags(estimator)) == 1
        assert len(X_tr.compute(scheduler="single-threaded")) == 10

def test_failing_sample_transformer():
    X = np.zeros(shape=(10, 2))
    samples = [mario.Sample(data) for data in X]
    expected = np.full_like(X, 2, dtype=object)
    expected[::2] = None
    expected[1::4] = None

    transformer = Pipeline(
        [
            ("1", mario.wrap([HalfFailingDummyTransformer, "sample"])),
            ("2", mario.wrap([HalfFailingDummyTransformer, "sample"])),
        ]
    )
    features = transformer.transform(samples)
    features = [f.data for f in features]
    assert len(expected) == len(
        features
    ), f"Expected: {len(expected)} but got: {len(features)}"
    assert all(
        (e == f).all() for e, f in zip(expected, features)
    ), f"Expected: {expected} but got: {features}"

    samples = [mario.Sample(data) for data in X]
    expected = [None] * X.shape[0]
    transformer = Pipeline(
        [
            ("1", mario.wrap([FullFailingDummyTransformer, "sample"])),
            ("2", mario.wrap([FullFailingDummyTransformer, "sample"])),
        ]
    )
    features = transformer.transform(samples)
    features = [f.data for f in features]
    assert len(expected) == len(
        features
    ), f"Expected: {len(expected)} but got: {len(features)}"
    assert all(
        e == f for e, f in zip(expected, features)
    ), f"Expected: {expected} but got: {features}"

def test_fittable_sample_transformer():
    X = np.ones(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data) for data in X]

    # Mixing up with an object
    transformer = mario.wrap([DummyWithFit, "sample"])
    features = transformer.fit(samples).transform(samples)
    _assert_all_close_numpy_array(X + 1, [s.data for s in features])

    features = transformer.fit_transform(samples)
    _assert_all_close_numpy_array(X + 1, [s.data for s in features])

def test_function_sample_transformer():
    X = np.zeros(shape=(10, 2), dtype=int)
    samples = [mario.Sample(data) for data in X]

    transformer = mario.wrap(
        [FunctionTransformer, "sample"],
        func=_offset_add_func,
        kw_args=dict(offset=3),
        validate=True,
    )
    features = transformer.transform(samples)
    _assert_all_close_numpy_array(X + 3, [s.data for s in features])

    features = transformer.fit_transform(samples)
    _assert_all_close_numpy_array(X + 3, [s.data for s in features])

def _build_toy_samples(delayed=False):
    X = np.ones(shape=(10, 5), dtype=int)

    if delayed:

        def _load(index, attr):
            if attr == "data":
                return X[index]
            if attr == "key":
                return str(index)

        samples = [
            mario.DelayedSample(
                partial(_load, i, "data"),
                delayed_attributes=dict(key=partial(_load, i, "key")),
            )
            for i in range(len(X))
        ]
    else:
        samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    return X, samples

def _build_iris_dataset(shuffle=False, delayed=False):
    iris = datasets.load_iris()
    X = iris.data
    keys = [str(k) for k in range(len(X))]

    if delayed:

        def _load(index, attr):
            if attr == "data":
                return X[index]
            if attr == "key":
                return str(index)
            if attr == "target":
                return iris.target[index]

        samples = [
            mario.DelayedSample(
                partial(_load, i, "data"),
                delayed_attributes=dict(
                    key=partial(_load, i, "key"),
                    target=partial(_load, i, "target"),
                ),
            )
            for i in range(len(X))
        ]
    else:
        samples = [
            mario.Sample(x, target=y, key=k)
            for x, y, k in zip(iris.data, iris.target, keys)
        ]

    meta = xr.DataArray(X[0], dims=("feature",))
    dataset = mario.xr.samples_to_dataset(
        samples, meta=meta, npartitions=3, shuffle=shuffle
    )
    return dataset

def test_failing_checkpoint_transformer():
    X = np.zeros(shape=(10, 2))
    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    expected = np.full_like(X, 2)
    expected[::2] = None
    expected[1::4] = None
    expected = list(expected)

    with tempfile.TemporaryDirectory() as d:
        features_dir_1 = os.path.join(d, "features_1")
        features_dir_2 = os.path.join(d, "features_2")
        transformer = Pipeline(
            [
                (
                    "1",
                    mario.wrap(
                        [HalfFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_1,
                    ),
                ),
                (
                    "2",
                    mario.wrap(
                        [HalfFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_2,
                    ),
                ),
            ]
        )
        features = transformer.transform(samples)

        np_features = np.array(
            [
                np.full(X.shape[1], np.nan) if f.data is None else f.data
                for f in features
            ]
        )
        assert len(expected) == len(
            np_features
        ), f"Expected: {len(expected)} but got: {len(np_features)}"
        assert np.allclose(
            expected, np_features, equal_nan=True
        ), f"Expected: {expected} but got: {np_features}"

    samples = [mario.Sample(data, key=str(i)) for i, data in enumerate(X)]
    expected = [None] * X.shape[0]
    with tempfile.TemporaryDirectory() as d:
        features_dir_1 = os.path.join(d, "features_1")
        features_dir_2 = os.path.join(d, "features_2")
        transformer = Pipeline(
            [
                (
                    "1",
                    mario.wrap(
                        [FullFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_1,
                    ),
                ),
                (
                    "2",
                    mario.wrap(
                        [FullFailingDummyTransformer, "sample", "checkpoint"],
                        features_dir=features_dir_2,
                    ),
                ),
            ]
        )
        features = transformer.transform(samples)
        assert len(expected) == len(
            features
        ), f"Expected: {len(expected)} but got: {len(features)}"
        assert all(
            e == f.data for e, f in zip(expected, features)
        ), f"Expected: {expected} but got: {features}"