Exemplo n.º 1
0
    def test_boston(self):
        """Round-trip an L2 Normalizer fitted on the Boston dataset through
        the converter and compare against scikit-learn's own transform."""
        from sklearn.datasets import load_boston

        dataset = load_boston()
        model = Normalizer(norm='l2').fit(dataset.data)

        spec = converter.convert(model, dataset.feature_names, 'out')

        # One {feature_name: value} dict per data row.
        inputs = [dict(zip(dataset.feature_names, sample))
                  for sample in dataset.data]
        # Expected outputs come from the scikit-learn model itself.
        expected = [{"out": transformed}
                    for transformed in model.transform(dataset.data)]

        evaluate_transformer(spec, inputs, expected)
Exemplo n.º 2
0
    def test_boston_OHE_pipeline(self):
        """One-hot encode selected Boston columns inside a Pipeline and
        verify the converted spec reproduces the pipeline's output."""
        data = load_boston()

        for categorical_features in [[3], [8], [3, 8], [8, 3]]:
            # Wrap the encoder in a pipeline so that output-dimension
            # handling between stages is exercised as well.
            pipeline = Pipeline([
                ("OHE", OneHotEncoder(categorical_features=categorical_features)),
                ("Normalizer", Normalizer()),
            ])
            pipeline.fit(data.data.copy(), data.target)

            # Convert the fitted pipeline to a spec.
            spec = sklearn.convert(pipeline, data.feature_names, 'out').get_spec()

            # Prediction evaluation is only available on macOS 10.13+.
            if macos_version() >= (10, 13):
                inputs = [dict(zip(data.feature_names, sample))
                          for sample in data.data]
                expected = [{"out": transformed}
                            for transformed in pipeline.transform(data.data.copy())]

                result = evaluate_transformer(spec, inputs, expected)
                assert result["num_errors"] == 0
Exemplo n.º 3
0
    def test_random(self):
        """Check Normalizer conversion for every supported norm on random data."""
        # Generate some random data_imputeValue.multiArrayValue[i]
        X = _np.random.random(size=(50, 3))

        for norm_kind in ('l1', 'l2', 'max'):
            model = Normalizer(norm=norm_kind)
            expected = model.fit_transform(X)

            spec = converter.convert(model, ["a", 'b', 'c'], 'out')

            inputs = [dict(zip(["a", "b", "c"], sample)) for sample in X]
            expected_rows = [{"out": sample} for sample in expected]
            evaluate_transformer(spec, inputs, expected_rows)
Exemplo n.º 4
0
    def test_random():
        """Check Normalizer conversion for every supported norm on random data."""
        # Generate some random data_imputeValue.multiArrayValue[i]
        X = _np.random.random(size=(50, 3))

        for norm_kind in ("l1", "l2", "max"):
            fitted = Normalizer(norm=norm_kind)
            expected = fitted.fit_transform(X)

            spec = converter.convert(fitted, ["a", "b", "c"], "out")

            inputs = [dict(zip(["a", "b", "c"], sample)) for sample in X]
            expected_rows = [{"out": sample} for sample in expected]
            evaluate_transformer(spec, inputs, expected_rows)
Exemplo n.º 5
0
    def test_conversion_one_column_of_several(self):
        """Convert a OneHotEncoder that encodes only column 0 of a
        two-column input and verify the spec matches scikit-learn's output."""
        scikit_model = OneHotEncoder(categorical_features=[0])
        scikit_model.fit(copy(self.scikit_data_multiple_cols))
        spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'], 'out').get_spec()

        # Prediction evaluation is only available on macOS 10.13+.
        if macos_version() >= (10, 13):
            test_data = [{'feature_1': row[0], 'feature_2': row[1]} for row in self.scikit_data_multiple_cols]
            scikit_output = [{'out': row} for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()]
            metrics = evaluate_transformer(spec, test_data, scikit_output)

            self.assertIsNotNone(spec)
            self.assertIsNotNone(spec.description)
            # assertEquals is a deprecated unittest alias (removed in 3.12);
            # use assertEqual.
            self.assertEqual(metrics['num_errors'], 0)
Exemplo n.º 6
0
    def _test_conversion(self, data, trained_dict_vectorizer):
        """Convert *trained_dict_vectorizer* and compare the converted model's
        output on *data* against scikit-learn's own transform."""
        expected = trained_dict_vectorizer.transform(data)

        converted = sklearn.convert(
            trained_dict_vectorizer,
            input_features="features",
            output_feature_names="output")

        report = evaluate_transformer(
            converted,
            [{"features": record} for record in data],
            [{"output": row} for row in expected],
            True)

        assert report["num_errors"] == 0
Exemplo n.º 7
0
    def test_conversion_one_column(self):
        """Fit a OneHotEncoder on a single feature column and verify the
        converted spec reproduces scikit-learn's densified output."""
        # Fit a single OHE
        scikit_model = OneHotEncoder()
        scikit_model.fit(self.scikit_data)
        spec = sklearn.convert(scikit_model, 'single_feature', 'out').get_spec()

        # Prediction evaluation is only available on macOS 10.13+.
        if macos_version() >= (10, 13):
            test_data = [{'single_feature' : row} for row in self.scikit_data]
            scikit_output = [{'out' : row} for row in scikit_model.transform(self.scikit_data).toarray()]
            metrics = evaluate_transformer(spec, test_data, scikit_output)

            self.assertIsNotNone(spec)
            self.assertIsNotNone(spec.description)
            # assertEquals is a deprecated unittest alias (removed in 3.12);
            # use assertEqual.
            self.assertEqual(metrics['num_errors'], 0)
Exemplo n.º 8
0
    def test_random(self):
        """Verify StandardScaler conversion on random data."""
        # Generate some random data
        X = _np.random.random(size=(50, 3))

        scaler = StandardScaler()
        expected = scaler.fit_transform(X)

        spec = converter.convert(scaler, ["a", 'b', 'c'], 'out').get_spec()

        # Prediction evaluation is only available on macOS 10.13+.
        if macos_version() >= (10, 13):
            inputs = [dict(zip(["a", "b", "c"], sample)) for sample in X]
            expected_dicts = [{"out": sample} for sample in expected]
            metrics = evaluate_transformer(spec, inputs, expected_dicts)

            assert metrics["num_errors"] == 0
Exemplo n.º 9
0
    def test_conversion_one_column(self):
        """Fit a single-column OneHotEncoder and check the converted spec
        against scikit-learn's densified output."""
        # Fit a single OHE
        encoder = OneHotEncoder()
        encoder.fit(self.scikit_data)
        spec = sklearn.convert(encoder, "single_feature",
                               "out").get_spec()

        inputs = [{"single_feature": value} for value in self.scikit_data]
        dense = encoder.transform(self.scikit_data).toarray()
        expected = [{"out": encoded} for encoded in dense]
        metrics = evaluate_transformer(spec, inputs, expected)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        self.assertEqual(metrics["num_errors"], 0)
Exemplo n.º 10
0
    def test_boston(self):
        """Verify StandardScaler conversion against the Boston dataset."""
        from sklearn.datasets import load_boston

        dataset = load_boston()
        scaler = StandardScaler().fit(dataset.data)

        spec = converter.convert(scaler, dataset.feature_names, 'out').get_spec()

        # Prediction evaluation is only available on macOS 10.13+.
        if macos_version() >= (10, 13):
            inputs = [dict(zip(dataset.feature_names, sample))
                      for sample in dataset.data]
            expected = [{"out": scaled}
                        for scaled in scaler.transform(dataset.data)]

            metrics = evaluate_transformer(spec, inputs, expected)
            assert metrics["num_errors"] == 0
Exemplo n.º 11
0
    def test_boston_OHE(self):
        """One-hot encode several Boston column subsets and check conversion."""
        data = load_boston()

        for categorical_features in [[3], [8], [3, 8], [8, 3]]:
            encoder = OneHotEncoder(categorical_features=categorical_features,
                                    sparse=False)
            encoder.fit(data.data, data.target)

            # Convert the fitted encoder to a spec.
            spec = sklearn.convert(encoder, data.feature_names, 'out').get_spec()

            inputs = [dict(zip(data.feature_names, sample))
                      for sample in data.data]
            expected = [{"out": encoded}
                        for encoded in encoder.transform(data.data)]

            # Prediction evaluation is only available on macOS 10.13+.
            if macos_version() >= (10, 13):
                result = evaluate_transformer(spec, inputs, expected)
                assert result["num_errors"] == 0
Exemplo n.º 12
0
    def test_boston_OHE_plus_normalizer(self):
        """Chain a OneHotEncoder with a StandardScaler in a Pipeline and
        verify the converted model reproduces the pipeline output."""
        data = load_boston()

        pipeline = Pipeline([
            ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
            ("Scaler", StandardScaler()),
        ])
        pipeline.fit(data.data, data.target)

        # Convert the fitted pipeline.
        spec = convert(pipeline, data.feature_names, 'out')

        # Prediction evaluation is only available on macOS 10.13+.
        if macos_version() >= (10, 13):
            inputs = [dict(zip(data.feature_names, sample))
                      for sample in data.data]
            expected = [{"out": transformed}
                        for transformed in pipeline.transform(data.data)]

            result = evaluate_transformer(spec, inputs, expected)
            assert result["num_errors"] == 0
Exemplo n.º 13
0
    def test_conversion_many_columns(self):
        """Fit a OneHotEncoder over both columns of a two-column input and
        verify the converted spec matches scikit-learn's densified output."""
        scikit_model = OneHotEncoder()
        scikit_model.fit(self.scikit_data_multiple_cols)
        spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'],
                               'out').get_spec()

        test_data = [{
            'feature_1': row[0],
            'feature_2': row[1]
        } for row in self.scikit_data_multiple_cols]
        scikit_output = [{
            'out': row
        } for row in scikit_model.transform(
            self.scikit_data_multiple_cols).toarray()]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        # assertEquals is a deprecated unittest alias (removed in 3.12);
        # use assertEqual.
        self.assertEqual(metrics['num_errors'], 0)
Exemplo n.º 14
0
    def test_conversion_one_column_of_several(self):
        """Encode only column 0 of a two-column input and verify conversion."""
        encoder = OneHotEncoder(categorical_features=[0])
        encoder.fit(copy(self.scikit_data_multiple_cols))
        spec = sklearn.convert(encoder, ["feature_1", "feature_2"],
                               "out").get_spec()

        inputs = [
            {"feature_1": sample[0], "feature_2": sample[1]}
            for sample in self.scikit_data_multiple_cols
        ]
        dense = encoder.transform(self.scikit_data_multiple_cols).toarray()
        expected = [{"out": encoded} for encoded in dense]
        metrics = evaluate_transformer(spec, inputs, expected)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        self.assertEqual(metrics["num_errors"], 0)
    def test_random():
        """Verify StandardScaler conversion on random data."""
        # Generate some random data
        X = _np.random.random(size=(50, 3))

        scaler = StandardScaler()
        expected = scaler.fit_transform(X)

        spec = converter.convert(scaler, ["a", "b", "c"], "out").get_spec()

        inputs = [dict(zip(["a", "b", "c"], sample)) for sample in X]
        metrics = evaluate_transformer(
            spec, inputs, [{"out": sample} for sample in expected])

        # Fail explicitly rather than via the assert statement.
        if metrics["num_errors"] != 0:
            raise AssertionError
Exemplo n.º 16
0
    def test_conversion_boston(self):
        """Fit Imputers over the Boston data with injected missing values and
        verify each converted spec reproduces the imputer's own output.

        Sweeps every (strategy, missing-value marker) combination; for each,
        the same randomly chosen cells are overwritten with the marker before
        fitting.
        """
        from sklearn.datasets import load_boston

        scikit_data = load_boston()

        sh = scikit_data.data.shape

        # Fixed seed so the injected missing cells are deterministic
        # across runs.
        rn.seed(0)
        missing_value_indices = [(rn.randint(sh[0]), rn.randint(sh[1]))
                                 for k in range(sh[0])]

        for strategy in ["mean", "median", "most_frequent"]:
            for missing_value in [0, 'NaN', -999]:

                # Fresh copy each iteration so markers don't accumulate.
                X = np.array(scikit_data.data).copy()

                for i, j in missing_value_indices:
                    X[i, j] = missing_value

                model = Imputer(missing_values=missing_value,
                                strategy=strategy)
                model = model.fit(X)

                # Expected output from scikit-learn itself.
                tr_X = model.transform(X.copy())

                spec = converter.convert(model, scikit_data.feature_names,
                                         'out')

                # Prediction evaluation is only available on macOS 10.13+.
                if macos_version() >= (10, 13):
                    input_data = [
                        dict(zip(scikit_data.feature_names, row)) for row in X
                    ]

                    output_data = [{"out": row} for row in tr_X]

                    result = evaluate_transformer(spec, input_data,
                                                  output_data)

                    assert result["num_errors"] == 0
    def test_boston():
        """Verify StandardScaler conversion against the Boston dataset."""
        from sklearn.datasets import load_boston

        dataset = load_boston()
        scaler = StandardScaler().fit(dataset.data)

        spec = converter.convert(scaler, dataset.feature_names,
                                 "out").get_spec()

        inputs = [
            dict(zip(dataset.feature_names, sample))
            for sample in dataset.data
        ]
        expected = [{"out": scaled}
                    for scaled in scaler.transform(dataset.data)]

        metrics = evaluate_transformer(spec, inputs, expected)

        # Fail explicitly rather than via the assert statement.
        if metrics["num_errors"] != 0:
            raise AssertionError
    def test_random_sparse_data(self):
        """Sweep OneHotEncoder conversion over random categorical data.

        For each dtype, tests every (sparse, categorical_features)
        combination twice: once with a bare OneHotEncoder and once with the
        encoder wrapped in a Pipeline, comparing the converted spec's output
        to scikit-learn's (densified when sparse) transform.
        """
        n_columns = 8
        n_categories = 20

        import numpy.random as rn

        # Fixed seed so the generated category table is deterministic.
        rn.seed(0)
        categories = rn.randint(50000, size=(n_columns, n_categories))

        for dt in ["int32", "float32", "float64"]:

            # 100 rows; each cell is a random pick from that column's
            # category pool.
            _X = np.array(
                [[
                    categories[j, rn.randint(n_categories)]
                    for j in range(n_columns)
                ] for i in range(100)],
                dtype=dt,
            )

            # Test this data on a bunch of possible inputs.
            for sparse in (True, False):
                for categorical_features in [
                        "all",
                    [3],
                    [4],
                        range(2, 8),
                        range(0, 4),
                        range(0, 8),
                ]:
                    X = _X.copy()

                    # This appears to be the only type now working.
                    if X.dtype != np.dtype(dt):
                        raise AssertionError

                    model = OneHotEncoder(
                        categorical_features=categorical_features,
                        sparse=sparse)
                    model.fit(X)

                    # Convert the model
                    spec = sklearn.convert(model, [("data", Array(n_columns))],
                                           "out")

                    X_out = model.transform(X)
                    if sparse:
                        # Densify so rows can be compared element-wise.
                        X_out = X_out.todense()

                    input_data = [{"data": row} for row in X]
                    output_data = [{"out": row} for row in X_out]

                    result = evaluate_transformer(spec, input_data,
                                                  output_data)

                    if result["num_errors"] != 0:
                        raise AssertionError

            # Test normal data inside a pipeline
            for sparse in (True, False):
                for categorical_features in [
                        "all",
                    [3],
                    [4],
                        range(2, 8),
                        range(0, 4),
                        range(0, 8),
                ]:
                    X = _X.copy()

                    model = Pipeline([
                        (
                            "OHE",
                            OneHotEncoder(
                                categorical_features=categorical_features,
                                sparse=sparse,
                            ),
                        ),
                        ("Normalizer", Normalizer()),
                    ])

                    model.fit(X)

                    # Convert the model
                    spec = sklearn.convert(model, [("data", Array(n_columns))],
                                           "out").get_spec()

                    X_out = model.transform(X)
                    if sparse:
                        # Densify so rows can be compared element-wise.
                        X_out = X_out.todense()

                    input_data = [{"data": row} for row in X]
                    output_data = [{"out": row} for row in X_out]

                    result = evaluate_transformer(spec, input_data,
                                                  output_data)

                    if result["num_errors"] != 0:
                        raise AssertionError
Exemplo n.º 19
0
    def test_random_sparse_data(self):
        """Sweep OneHotEncoder conversion over random categorical data.

        For each dtype, tests every (sparse, categorical_features)
        combination twice: once with a bare OneHotEncoder and once with the
        encoder wrapped in a Pipeline. Evaluation of predictions is gated on
        macOS 10.13+.
        """
        n_columns = 8
        n_categories = 20

        import numpy.random as rn
        # Fixed seed so the generated category table is deterministic.
        rn.seed(0)
        categories = rn.randint(50000, size=(n_columns, n_categories))

        for dt in ['int32', 'float32', 'float64']:

            # 100 rows; each cell is a random pick from that column's
            # category pool.
            _X = np.array([[
                categories[j, rn.randint(n_categories)]
                for j in range(n_columns)
            ] for i in range(100)],
                          dtype=dt)

            # Test this data on a bunch of possible inputs.
            for sparse in (True, False):
                for categorical_features in [
                        'all', [3], [4],
                        range(2, 8),
                        range(0, 4),
                        range(0, 8)
                ]:
                    X = _X.copy()

                    # This appears to be the only type now working.
                    assert X.dtype == np.dtype(dt)

                    model = OneHotEncoder(
                        categorical_features=categorical_features,
                        sparse=sparse)
                    model.fit(X)

                    # Convert the model
                    spec = sklearn.convert(model, [('data', Array(n_columns))],
                                           'out')

                    # Prediction evaluation is only available on macOS 10.13+.
                    if macos_version() >= (10, 13):
                        X_out = model.transform(X)
                        if sparse:
                            # Densify so rows can be compared element-wise.
                            X_out = X_out.todense()

                        input_data = [{'data': row} for row in X]
                        output_data = [{"out": row} for row in X_out]

                        result = evaluate_transformer(spec, input_data,
                                                      output_data)

                        assert result["num_errors"] == 0

            # Test normal data inside a pipeline
            for sparse in (True, False):
                for categorical_features in [
                        'all', [3], [4],
                        range(2, 8),
                        range(0, 4),
                        range(0, 8)
                ]:
                    X = _X.copy()

                    model = Pipeline([
                        ("OHE",
                         OneHotEncoder(
                             categorical_features=categorical_features,
                             sparse=sparse)), ("Normalizer", Normalizer())
                    ])

                    model.fit(X)

                    # Convert the model
                    spec = sklearn.convert(model, [('data', Array(n_columns))],
                                           'out').get_spec()

                    # Prediction evaluation is only available on macOS 10.13+.
                    if macos_version() >= (10, 13):
                        X_out = model.transform(X)
                        if sparse:
                            # Densify so rows can be compared element-wise.
                            X_out = X_out.todense()

                        input_data = [{'data': row} for row in X]
                        output_data = [{"out": row} for row in X_out]

                        result = evaluate_transformer(spec, input_data,
                                                      output_data)

                        assert result["num_errors"] == 0