Пример #1
0
    def _run(dask_enabled):
        """Fit a two-step pipeline and compare its output against ``X + 2``.

        The pipeline chains ``_build_estimator(d, 0)`` and
        ``_build_transformer(d, 1)``, both given the same temporary directory.
        When ``dask_enabled`` is true the pipeline is wrapped with
        ``mario.wrap(["dask"], ...)`` using ``fit_tag="GPU"`` and one
        partition, exactly one dask tag is expected after fitting, and the
        result is computed with the single-threaded scheduler.
        """
        X = np.ones(shape=(10, 2), dtype=int)
        oracle = X + 2
        samples = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]
        # Samples to transform use keys 10..19, distinct from the fit keys.
        samples_transform = [
            mario.Sample(row, key=str(idx + 10)) for idx, row in enumerate(X)
        ]

        with tempfile.TemporaryDirectory() as d:
            steps = [
                ("0", _build_estimator(d, 0)),
                ("1", _build_transformer(d, 1)),
            ]
            pipeline = Pipeline(steps)

            if not dask_enabled:
                fitted = pipeline.fit(samples)
                transformed_samples = fitted.transform(samples_transform)
            else:
                wrapped = mario.wrap(
                    ["dask"], pipeline, fit_tag="GPU", npartitions=1
                )
                fitted = wrapped.fit(samples)
                tags = mario.dask_tags(fitted)
                assert len(tags) == 1, tags

                lazy_result = fitted.transform(samples_transform)
                transformed_samples = lazy_result.compute(
                    scheduler="single-threaded"
                )

            produced = [s.data for s in transformed_samples]
            _assert_all_close_numpy_array(oracle, produced)
Пример #2
0
    def _run(dask_enabled):
        """Fit a two-step pipeline and compare its output against ``X + 2``.

        The steps come from ``_build_estimator(d, 0)`` and
        ``_build_transformer(d, 1)``. With ``dask_enabled`` the pipeline is
        wrapped with ``mario.wrap(["dask"], ...)`` and the transformed result
        is computed using the client returned by ``_get_local_client()`` as
        the scheduler.
        """
        X = np.ones(shape=(10, 2), dtype=int)
        oracle = X + 2
        samples = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]
        # Samples to transform use keys 10..19, distinct from the fit keys.
        samples_transform = [
            mario.Sample(row, key=str(idx + 10)) for idx, row in enumerate(X)
        ]

        with tempfile.TemporaryDirectory() as d:
            steps = [
                ("0", _build_estimator(d, 0)),
                ("1", _build_transformer(d, 1)),
            ]
            pipeline = Pipeline(steps)

            if not dask_enabled:
                fitted = pipeline.fit(samples)
                transformed_samples = fitted.transform(samples_transform)
            else:
                dask_client = _get_local_client()
                wrapped = mario.wrap(["dask"], pipeline)
                fitted = wrapped.fit(samples)
                lazy_result = fitted.transform(samples_transform)
                transformed_samples = lazy_result.compute(scheduler=dask_client)

            produced = [s.data for s in transformed_samples]
            _assert_all_close_numpy_array(oracle, produced)
Пример #3
0
    def _run(dask_enabled):
        """Transform one ``SampleSet`` through a transformer-only pipeline.

        The pipeline holds ``offset`` (2) steps built by
        ``_build_transformer`` and is never fitted here. The flattened output
        data must match ``X + offset`` and every returned sample set must
        contain 10 samples. With ``dask_enabled`` the pipeline is wrapped with
        ``mario.wrap(["dask"], ...)`` and computed single-threaded.
        """
        offset = 2
        X = np.ones(shape=(10, 2), dtype=int)
        oracle = X + offset

        members = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]
        samples_transform = mario.SampleSet(members, key="1")

        with tempfile.TemporaryDirectory() as d:
            steps = [(f"{i}", _build_transformer(d, i)) for i in range(offset)]
            pipeline = Pipeline(steps)

            if not dask_enabled:
                transformed_samples = pipeline.transform([samples_transform])
            else:
                wrapped = mario.wrap(["dask"], pipeline)
                lazy_result = wrapped.transform([samples_transform])
                transformed_samples = lazy_result.compute(
                    scheduler="single-threaded"
                )

            # Flatten the sample sets into one list of per-sample data.
            flattened = [
                s.data for sample_set in transformed_samples for s in sample_set
            ]
            _assert_all_close_numpy_array(oracle, flattened)
            assert np.all([len(s) == 10 for s in transformed_samples])
Пример #4
0
def test_checkpoint_fittable_sample_transformer():
    """Check a checkpointed, sample-wrapped ``DummyWithFit`` transformer.

    Runs ``fit().transform()``, then ``fit_transform()``, and finally
    ``fit_transform()`` again after the temporary directory was deleted.
    After each run the checkpoints are validated against ``X + 1``.
    """
    X = np.ones(shape=(10, 2), dtype=int)
    oracle = X + 1
    samples = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]

    with tempfile.TemporaryDirectory() as d:
        features_dir = os.path.join(d, "features")
        model_path = os.path.join(d, "model.pkl")

        transformer = mario.wrap(
            [DummyWithFit, "sample", "checkpoint"],
            model_path=model_path,
            features_dir=features_dir,
        )
        assert not mario.utils.is_estimator_stateless(transformer)

        def _check(result):
            # Same checkpoint validation is applied after every run.
            _assert_checkpoints(result, oracle, model_path, features_dir, False)

        fitted = transformer.fit(samples)
        _check(fitted.transform(samples))

        features = transformer.fit_transform(samples)
        _check(features)
        _assert_delayed_samples(features)

        # Wipe everything on disk, then make sure fit_transform still works.
        shutil.rmtree(d)
        _check(transformer.fit_transform(samples))
Пример #5
0
def test_checkpoint_fittable_pipeline():
    """Fit a pipeline of two ``_build_estimator`` steps and check ``X + 3``.

    The pipeline is fitted on samples keyed 0..9 and then transforms a
    second set of samples keyed 10..19.
    """
    X = np.ones(shape=(10, 2), dtype=int)
    oracle = X + 3
    samples = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]
    samples_transform = [
        mario.Sample(row, key=str(idx + 10)) for idx, row in enumerate(X)
    ]

    with tempfile.TemporaryDirectory() as d:
        steps = [(f"{i}", _build_estimator(d, i)) for i in range(2)]
        pipeline = Pipeline(steps)
        pipeline.fit(samples)

        transformed_samples = pipeline.transform(samples_transform)
        produced = [s.data for s in transformed_samples]
        _assert_all_close_numpy_array(oracle, produced)
Пример #6
0
def test_pca():
    """Exercise ``SamplePCA`` and ``CheckpointSamplePCA`` on random data.

    Both estimators are fitted with two components; each transformed sample
    must have shape ``(n_components,)``. The checkpointing variant must also
    write the model file and one ``<key>.h5`` feature file.
    """
    n_components = 2
    expected_shape = (n_components,)

    # Wrap every row of a random 100x10 matrix into a Sample.
    X = np.random.rand(100, 10)
    samples = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]

    # --- plain sample estimator: fit ---
    estimator = mario.transformers.SamplePCA(n_components=n_components)
    estimator = estimator.fit(samples)

    # check_is_fitted returns None when the estimator is fitted, see
    # https://scikit-learn.org/stable/modules/generated/sklearn.utils.validation.check_is_fitted.html
    assert check_is_fitted(estimator, "n_components_") is None

    # --- plain sample estimator: transform ---
    samples_tr = estimator.transform(samples)
    assert samples_tr[0].data.shape == expected_shape

    # --- checkpointing estimator ---
    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        estimator = mario.transformers.CheckpointSamplePCA(
            n_components=n_components, features_dir=d, model_path=model_path
        )

        # fit must persist the model
        estimator = estimator.fit(samples)
        assert check_is_fitted(estimator, "n_components_") is None
        assert os.path.exists(model_path)

        # transform must persist the features of the first sample
        samples_tr = estimator.transform(samples)
        first = samples_tr[0]
        assert first.data.shape == expected_shape
        assert os.path.exists(os.path.join(d, first.key + ".h5"))
Пример #7
0
def test_linearize():
    """Check the plain, sample and checkpoint flavours of ``Linearize``.

    A ``(10, 10, 10)`` array of zeros must come out reshaped to
    ``(10, -1)`` in all three cases; the checkpoint flavour must also write
    ``1.h5`` into its features directory.
    """

    def _check(actual, expected):
        assert np.allclose(actual, expected), (actual, expected)

    X = np.zeros(shape=(10, 10, 10))
    oracle = X.reshape((10, -1))

    # Bare transformer applied directly to the array.
    plain = mario.transformers.Linearize()
    _check(plain.transform(X), oracle)

    # Sample-aware transformer applied to wrapped rows.
    samples = [mario.Sample(x, key=f"{i}") for i, x in enumerate(X)]
    sample_tf = mario.transformers.SampleLinearize()
    X_tr = sample_tf.transform(samples)
    _check([s.data for s in X_tr], oracle)

    # Checkpointing transformer writing into a temporary directory.
    with tempfile.TemporaryDirectory() as d:
        checkpoint_tf = mario.transformers.CheckpointSampleLinearize(
            features_dir=d
        )
        X_tr = checkpoint_tf.transform(samples)
        _check([s.data for s in X_tr], oracle)
        assert os.path.exists(os.path.join(d, "1.h5"))
Пример #8
0
def test_checkpoint_function_sample_transfomer():
    """Check a checkpointed ``FunctionTransformer`` that adds an offset.

    Three scenarios are covered:

    1. both ``model_path`` and ``features_dir`` are given;
    2. neither of them is given;
    3. only ``features_dir`` is given, together with ``hash_fn``.
    """
    offset = 3
    X = np.arange(20, dtype=int).reshape(10, 2)
    oracle = X + offset
    samples = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]

    def _make_transformer(**checkpoint_kwargs):
        # All scenarios share the wrapped function; only the checkpoint
        # related keyword arguments differ.
        return mario.wrap(
            [FunctionTransformer, "sample", "checkpoint"],
            func=_offset_add_func,
            kw_args=dict(offset=offset),
            validate=True,
            **checkpoint_kwargs,
        )

    # Scenario 1: model and features are checkpointed.
    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        features_dir = os.path.join(d, "features")
        transformer = _make_transformer(
            model_path=model_path, features_dir=features_dir
        )

        def _check(result):
            _assert_checkpoints(result, oracle, model_path, features_dir, True)

        _check(transformer.transform(samples))

        features = transformer.fit_transform(samples)
        _check(features)
        _assert_delayed_samples(features)

        # Wipe everything on disk, then make sure fit_transform still works.
        shutil.rmtree(d)
        _check(transformer.fit_transform(samples))

    # Scenario 2: neither model_path nor features_dir is provided.
    transformer = _make_transformer()
    features = transformer.transform(samples)
    _assert_all_close_numpy_array(oracle, [s.data for s in features])

    # Scenario 3: only features_dir is provided, along with a hash function.
    with tempfile.TemporaryDirectory() as dir_name:
        transformer = _make_transformer(
            features_dir=dir_name, hash_fn=hash_string
        )

        features = transformer.transform(samples)
        # The second-to-last path component must be castable to an integer
        # (int() raises otherwise).
        hash_component = features[0]._load.args[0].split("/")[-2]
        assert isinstance(int(hash_component), int)

        _assert_all_close_numpy_array(oracle, [s.data for s in features])
Пример #9
0
def test_dask_checkpoint_transform_pipeline():
    """Transform a dask bag of samples with a dask-wrapped transformer.

    The transformer is wrapped with ``transform_tag="CPU"``; exactly one dask
    tag is expected and the computed result must contain 10 items.
    """
    X = np.ones(shape=(10, 2), dtype=int)
    samples_transform = [
        mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)
    ]

    with tempfile.TemporaryDirectory() as d:
        bag_transformer = mario.ToDaskBag()
        estimator = mario.wrap(
            ["dask"], _build_transformer(d, 0), transform_tag="CPU"
        )

        bag = bag_transformer.transform(samples_transform)
        X_tr = estimator.transform(bag)

        tags = mario.dask_tags(estimator)
        assert len(tags) == 1
        computed = X_tr.compute(scheduler="single-threaded")
        assert len(computed) == 10
Пример #10
0
def test_failing_sample_transformer():
    """Check that failing sample transformers yield ``None`` data.

    Two scenarios are run through a two-step pipeline of sample-wrapped
    transformers:

    * ``HalfFailingDummyTransformer``: the expected output is an object
      array filled with 2 in which rows ``[::2]`` and ``[1::4]`` are ``None``.
    * ``FullFailingDummyTransformer``: every output is expected to be
      ``None``.
    """
    X = np.zeros(shape=(10, 2))
    samples = [mario.Sample(data) for data in X]
    # ``np.object`` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin ``object`` is the equivalent dtype and works on all versions.
    expected = np.full_like(X, 2, dtype=object)
    expected[::2] = None
    expected[1::4] = None

    transformer = Pipeline(
        [
            ("1", mario.wrap([HalfFailingDummyTransformer, "sample"])),
            ("2", mario.wrap([HalfFailingDummyTransformer, "sample"])),
        ]
    )
    features = transformer.transform(samples)

    features = [f.data for f in features]
    assert len(expected) == len(
        features
    ), f"Expected: {len(expected)} but got: {len(features)}"
    # Row-wise comparison; ``e`` is always an object array, so ``e == f``
    # is element-wise even when ``f`` is None.
    assert all(
        (e == f).all() for e, f in zip(expected, features)
    ), f"Expected: {expected} but got: {features}"

    samples = [mario.Sample(data) for data in X]
    expected = [None] * X.shape[0]
    transformer = Pipeline(
        [
            ("1", mario.wrap([FullFailingDummyTransformer, "sample"])),
            ("2", mario.wrap([FullFailingDummyTransformer, "sample"])),
        ]
    )
    features = transformer.transform(samples)

    features = [f.data for f in features]
    assert len(expected) == len(
        features
    ), f"Expected: {len(expected)} but got: {len(features)}"
    assert all(
        e == f for e, f in zip(expected, features)
    ), f"Expected: {expected} but got: {features}"
Пример #11
0
def test_fittable_sample_transformer():
    """Check a sample-wrapped ``DummyWithFit`` via both fitting call styles.

    ``fit().transform()`` and ``fit_transform()`` must both produce data
    equal to ``X + 1``.
    """
    X = np.ones(shape=(10, 2), dtype=int)
    oracle = X + 1
    samples = [mario.Sample(row) for row in X]

    # Wrap the estimator class so that it operates on Sample objects.
    transformer = mario.wrap([DummyWithFit, "sample"])

    fitted = transformer.fit(samples)
    features = fitted.transform(samples)
    _assert_all_close_numpy_array(oracle, [s.data for s in features])

    features = transformer.fit_transform(samples)
    _assert_all_close_numpy_array(oracle, [s.data for s in features])
Пример #12
0
def test_function_sample_transfomer():
    """Check a sample-wrapped ``FunctionTransformer`` that adds 3.

    Both ``transform()`` and ``fit_transform()`` must produce ``X + 3``.
    """
    X = np.zeros(shape=(10, 2), dtype=int)
    oracle = X + 3
    samples = [mario.Sample(row) for row in X]

    transformer = mario.wrap(
        [FunctionTransformer, "sample"],
        func=_offset_add_func,
        kw_args=dict(offset=3),
        validate=True,
    )

    transformed = transformer.transform(samples)
    _assert_all_close_numpy_array(oracle, [s.data for s in transformed])

    fit_transformed = transformer.fit_transform(samples)
    _assert_all_close_numpy_array(oracle, [s.data for s in fit_transformed])
Пример #13
0
def _build_toy_samples(delayed=False):
    """Build ten toy samples over a ``(10, 5)`` integer array of ones.

    Args:
        delayed: When true, build ``mario.DelayedSample`` objects whose data
            and ``key`` are provided through ``functools.partial`` loaders;
            otherwise build plain ``mario.Sample`` objects.

    Returns:
        A tuple ``(X, samples)`` with the array and the list of samples.
        Keys are the stringified row indices.
    """
    X = np.ones(shape=(10, 5), dtype=int)

    if not delayed:
        eager = [mario.Sample(row, key=str(idx)) for idx, row in enumerate(X)]
        return X, eager

    def _load(index, attr):
        # Loader shared by all delayed attributes; other names yield None.
        if attr == "key":
            return str(index)
        if attr == "data":
            return X[index]

    lazy = [
        mario.DelayedSample(
            partial(_load, idx, "data"),
            delayed_attributes={"key": partial(_load, idx, "key")},
        )
        for idx in range(len(X))
    ]
    return X, lazy
Пример #14
0
def _build_iris_dataset(shuffle=False, delayed=False):
    """Build a dataset from the iris data via ``mario.xr.samples_to_dataset``.

    Args:
        shuffle: Forwarded to ``samples_to_dataset``.
        delayed: When true, samples are ``mario.DelayedSample`` objects whose
            data, ``key`` and ``target`` are loaded through partials;
            otherwise plain ``mario.Sample`` objects are used.

    Returns:
        The object returned by ``samples_to_dataset``, called with
        ``npartitions=3`` and a ``meta`` array built from the first row.
    """
    iris = datasets.load_iris()
    X = iris.data

    if not delayed:
        keys = [str(k) for k in range(len(X))]
        samples = [
            mario.Sample(x, target=y, key=k)
            for x, y, k in zip(iris.data, iris.target, keys)
        ]
    else:

        def _load(index, attr):
            # Loader shared by all delayed attributes; other names yield None.
            if attr == "target":
                return iris.target[index]
            if attr == "key":
                return str(index)
            if attr == "data":
                return X[index]

        samples = [
            mario.DelayedSample(
                partial(_load, idx, "data"),
                delayed_attributes={
                    "key": partial(_load, idx, "key"),
                    "target": partial(_load, idx, "target"),
                },
            )
            for idx in range(len(X))
        ]

    # The first row serves as the meta template with a single "feature" dim.
    meta = xr.DataArray(X[0], dims=("feature",))
    return mario.xr.samples_to_dataset(
        samples, meta=meta, npartitions=3, shuffle=shuffle
    )
Пример #15
0
def test_failing_checkpoint_transformer():
    """Check that failing checkpointed transformers yield ``None`` data.

    Two scenarios are run through a two-step pipeline of checkpointed,
    sample-wrapped transformers, each step writing to its own features
    directory:

    * ``HalfFailingDummyTransformer``: outputs are compared against an array
      of 2s with NaN rows at ``[::2]`` and ``[1::4]``; ``None`` outputs are
      replaced by NaN rows before the comparison.
    * ``FullFailingDummyTransformer``: every output is expected to be
      ``None``.
    """
    X = np.zeros(shape=(10, 2))

    def _two_step_pipeline(transformer_cls, root):
        # Both steps wrap the same class but checkpoint into separate dirs.
        stages = [
            (
                name,
                mario.wrap(
                    [transformer_cls, "sample", "checkpoint"],
                    features_dir=os.path.join(root, subdir),
                ),
            )
            for name, subdir in (("1", "features_1"), ("2", "features_2"))
        ]
        return Pipeline(stages)

    # --- scenario 1: part of the samples fail ---
    samples = [mario.Sample(row, key=idx) for idx, row in enumerate(X)]
    # X is a float array, so assigning None stores NaN in those rows.
    expected = np.full_like(X, 2)
    expected[::2] = None
    expected[1::4] = None
    expected = list(expected)

    with tempfile.TemporaryDirectory() as d:
        transformer = _two_step_pipeline(HalfFailingDummyTransformer, d)
        features = transformer.transform(samples)

        nan_row_width = X.shape[1]
        rows = []
        for f in features:
            if f.data is None:
                rows.append(np.full(nan_row_width, np.nan))
            else:
                rows.append(f.data)
        np_features = np.array(rows)

        assert len(expected) == len(
            np_features
        ), f"Expected: {len(expected)} but got: {len(np_features)}"
        assert np.allclose(
            expected, np_features, equal_nan=True
        ), f"Expected: {expected} but got: {np_features}"

    # --- scenario 2: all samples fail ---
    samples = [mario.Sample(row, key=idx) for idx, row in enumerate(X)]
    expected = [None] * X.shape[0]

    with tempfile.TemporaryDirectory() as d:
        transformer = _two_step_pipeline(FullFailingDummyTransformer, d)
        features = transformer.transform(samples)

        assert len(expected) == len(
            features
        ), f"Expected: {len(expected)} but got: {len(features)}"
        assert all(
            e == f.data for e, f in zip(expected, features)
        ), f"Expected: {expected} but got: {features}"