Example #1
File: data.py Project: zilongqiu/mleap
# Imports implied by the snippet; SklearnImputer aliases the (pre-0.22)
# sklearn.preprocessing.Imputer, and ImputerSerializer is defined elsewhere
# in the project.
import uuid

import pandas as pd

from mleap.sklearn.preprocessing.data import FeatureExtractor
from sklearn.preprocessing import Imputer as SklearnImputer


class Imputer(SklearnImputer):
    # Assumed class attribute: __init__ reads self.op, and the serialized
    # model records op "imputer" (see the imputer extension test below).
    op = 'imputer'
    def __init__(self,
                 missing_values="NaN",
                 strategy="mean",
                 axis=0,
                 verbose=0,
                 copy=True,
                 input_features=None,
                 output_features=None):
        self.name = "{}_{}".format(self.op, uuid.uuid1())
        self.input_features = input_features
        self.output_features = output_features
        self.input_shapes = {'data_shape': [{'shape': 'scalar'}]}
        self.feature_extractor = FeatureExtractor(
            input_scalars=[input_features],
            output_vector='extracted_' + output_features,
            output_vector_items=[output_features])
        SklearnImputer.__init__(self, missing_values, strategy, axis, verbose,
                                copy)

    def fit(self, X, y=None):
        super(Imputer, self).fit(self.feature_extractor.transform(X))
        return self

    def transform(self, X):
        return pd.DataFrame(
            super(Imputer,
                  self).transform(self.feature_extractor.transform(X)))

    def serialize_to_bundle(self, path, model_name):
        ImputerSerializer().serialize_to_bundle(self, path, model_name)
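A minimal usage sketch for this extension (not part of the original file); the DataFrame and the output directory are illustrative assumptions:

import numpy as np
import pandas as pd

# Illustrative data; column 'a' matches input_features below.
df = pd.DataFrame({'a': [1.0, np.nan, 3.0]})

imputer = Imputer(input_features='a', output_features='a_imputed')
imputer.fit(df)                  # extracts 'a' internally, then fits the "mean" strategy
imputed = imputer.transform(df)  # DataFrame with NaN replaced by the column mean
imputer.serialize_to_bundle('/tmp', imputer.name)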
Example #2
File: data_test.py Project: samlex20/mleap
    def feature_extractor_test(self):

        extract_features = ['a', 'd']

        feature_extractor = FeatureExtractor(input_scalars=extract_features,
                                             output_vector='extract_features_output',
                                             output_vector_items=["{}_out".format(x) for x in extract_features])

        res = feature_extractor.fit_transform(self.df)

        self.assertEqual(len(res.columns), 2)

        feature_extractor.serialize_to_bundle(self.tmp_dir, feature_extractor.name)

        # Test node.json
        with open("{}/{}.node/node.json".format(self.tmp_dir, feature_extractor.name)) as json_data:
            node = json.load(json_data)

        self.assertEqual(feature_extractor.name, node['name'])
        self.assertEqual(feature_extractor.input_features[0], node['shape']['inputs'][0]['name'])
        self.assertEqual(feature_extractor.input_features[1], node['shape']['inputs'][1]['name'])
        self.assertEqual(feature_extractor.output_vector, node['shape']['outputs'][0]['name'])

        # Test model.json
        with open("{}/{}.node/model.json".format(self.tmp_dir, feature_extractor.name)) as json_data:
            model = json.load(json_data)

        expected_model = {
            "op": "vector_assembler",
            "attributes": {
                "input_shapes": {
                    "data_shape": [
                        {
                            "base": "scalar",
                            "isNullable": False
                        },
                        {
                            "base": "scalar",
                            "isNullable": False
                        }],
                    "type": "list"
                }
            }
        }

        self.assertEqual(expected_model['op'], model['op'])
        self.assertEqual(expected_model['attributes']['input_shapes']['data_shape'][0]['base'],
                         model['attributes']['input_shapes']['data_shape'][0]['base'])
        self.assertEqual(expected_model['attributes']['input_shapes']['data_shape'][0]['isNullable'],
                         model['attributes']['input_shapes']['data_shape'][0]['isNullable'])
        self.assertEqual(expected_model['attributes']['input_shapes']['data_shape'][1]['base'],
                         model['attributes']['input_shapes']['data_shape'][1]['base'])
        self.assertEqual(expected_model['attributes']['input_shapes']['data_shape'][1]['isNullable'],
                         model['attributes']['input_shapes']['data_shape'][1]['isNullable'])
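For reference, the on-disk layout implied by the paths this test reads (a sketch; the exact file contents are produced by serialize_to_bundle):

# <tmp_dir>/<feature_extractor.name>.node/
#     node.json    # node name plus input/output shapes
#     model.json   # op ("vector_assembler") and its attributes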
Example #3
    def test_min_max_scaler_multi_deserializer(self):

        extract_features = ['a', 'b']
        feature_extractor = FeatureExtractor(
            input_scalars=['a', 'b'],
            output_vector='extracted_multi_outputs',
            output_vector_items=["{}_out".format(x) for x in extract_features])

        scaler = MinMaxScaler()
        scaler.mlinit(prior_tf=feature_extractor,
                      output_features=['a_scaled', 'b_scaled'])

        # Fit on both columns: the scaler was initialized with two output
        # features and transforms both 'a' and 'b' below.
        scaler.fit(self.df[['a', 'b']])

        scaler.serialize_to_bundle(self.tmp_dir, scaler.name)

        # Deserialize the MinMaxScaler
        node_name = "{}.node".format(scaler.name)
        min_max_scaler_tf = MinMaxScaler()
        min_max_scaler_tf.deserialize_from_bundle(self.tmp_dir, node_name)

        # Transform some sample data
        res_a = scaler.transform(self.df[['a', 'b']])
        res_b = min_max_scaler_tf.transform(self.df[['a', 'b']])

        self.assertEqual(res_a[0][0], res_b[0][0])
        self.assertEqual(res_a[0][1], res_b[0][1])

        self.assertEqual(scaler.name, min_max_scaler_tf.name)
        self.assertEqual(scaler.op, min_max_scaler_tf.op)
Example #4
    def test_logistic_regression_cv_serializer(self):

        logistic_regression = LogisticRegressionCV(fit_intercept=True)
        logistic_regression.mlinit(input_features='a',
                                   prediction_column='e_binary')

        extract_features = ['e']
        feature_extractor = FeatureExtractor(
            input_scalars=['e'],
            output_vector='extracted_e_output',
            output_vector_items=["{}_out".format(x) for x in extract_features])

        binarizer = Binarizer(threshold=0.0)
        binarizer.mlinit(prior_tf=feature_extractor,
                         output_features='e_binary')

        Xres = binarizer.fit_transform(self.df[['a']])

        logistic_regression.fit(self.df[['a']], Xres)

        logistic_regression.serialize_to_bundle(self.tmp_dir,
                                                logistic_regression.name)

        # Test model.json
        with open("{}/{}.node/model.json".format(
                self.tmp_dir, logistic_regression.name)) as json_data:
            model = json.load(json_data)

        self.assertEqual(model['op'], 'logistic_regression')
        self.assertIsNotNone(model['attributes']['intercept']['double'])
Example #5
# Imports implied by the test; Imputer is the custom extension shown in
# Example #1.
import json
import shutil
import tempfile
import unittest

import numpy as np
import pandas as pd

from mleap.sklearn.preprocessing.data import FeatureExtractor


class TestImputerExtension(unittest.TestCase):
    def setUp(self):
        self.df = pd.DataFrame(
            [[0.85281608, 1.50669264], [-1.04544152, np.nan],
             [0.41515407, -0.29941475], [np.nan, -0.96775275],
             [np.nan, -0.85734022]],
            columns=['a', 'b'])
        self.feature_extractor = FeatureExtractor(input_scalars=['a'],
                                                  output_vector='a_extracted')
        self.tmp_dir = tempfile.mkdtemp(prefix="mleap.python.tests")

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)

    def test_imputer_extension_serialization_succeeds(self):
        imputer = Imputer(input_features='a', output_features='a_imputed')

        imputer.fit(self.feature_extractor.transform(self.df))
        imputer.serialize_to_bundle(self.tmp_dir, imputer.name)

        expected_model = {
            "op": "imputer",
            "attributes": {
                "surrogate_value": {
                    "double": self.df.a.mean()
                },
                "strategy": {
                    "string": "mean"
                }
            }
        }

        with open("{}/{}.node/model.json".format(self.tmp_dir,
                                                 imputer.name)) as json_data:
            actual_model = json.load(json_data)

        self.assertEqual(expected_model, actual_model)

        with open("{}/{}.node/node.json".format(self.tmp_dir,
                                                imputer.name)) as json_data:
            node = json.load(json_data)

        self.assertEqual(imputer.name, node['name'])
        self.assertEqual("a", node['shape']['inputs'][0]['name'])
        self.assertEqual("a_imputed", node['shape']['outputs'][0]['name'])
Example #6
    def test_standard_scaler_multi_deserializer(self):

        extract_features = ['a', 'b']
        feature_extractor = FeatureExtractor(
            input_scalars=['a', 'b'],
            output_vector='extracted_multi_outputs',
            output_vector_items=["{}_out".format(x) for x in extract_features])

        # Serialize a standard scaler to a bundle
        standard_scaler = StandardScaler(with_mean=True, with_std=True)

        standard_scaler.mlinit(prior_tf=feature_extractor,
                               output_features=['a_scaled', 'b_scaled'])

        standard_scaler.fit(self.df[['a', 'b']])

        standard_scaler.serialize_to_bundle(self.tmp_dir, standard_scaler.name)

        # Now deserialize it back

        node_name = "{}.node".format(standard_scaler.name)

        standard_scaler_tf = StandardScaler()

        standard_scaler_tf = standard_scaler_tf.deserialize_from_bundle(
            self.tmp_dir, node_name)

        # Transform some sample data
        res_a = standard_scaler.transform(self.df[['a', 'b']])
        res_b = standard_scaler_tf.transform(self.df[['a', 'b']])

        self.assertEqual(res_a[0][0], res_b[0][0])
        self.assertEqual(res_a[0][1], res_b[0][1])
        self.assertEqual(standard_scaler.name, standard_scaler_tf.name)
        self.assertEqual(standard_scaler.op, standard_scaler_tf.op)
        self.assertEqual(standard_scaler.mean_[0], standard_scaler_tf.mean_[0])
        self.assertEqual(standard_scaler.mean_[1], standard_scaler_tf.mean_[1])
        self.assertEqual(standard_scaler.scale_[0],
                         standard_scaler_tf.scale_[0])
        self.assertEqual(standard_scaler.scale_[1],
                         standard_scaler_tf.scale_[1])
Example #7
    def test_logistic_regression_cv_deserializer(self):

        logistic_regression = LogisticRegressionCV(fit_intercept=True)
        logistic_regression.mlinit(input_features='a',
                                   prediction_column='e_binary')

        extract_features = ['e']
        feature_extractor = FeatureExtractor(
            input_scalars=['e'],
            output_vector='extracted_e_output',
            output_vector_items=["{}_out".format(x) for x in extract_features])

        binarizer = Binarizer(threshold=0.0)
        binarizer.mlinit(prior_tf=feature_extractor,
                         output_features='e_binary')

        Xres = binarizer.fit_transform(self.df[['a']])

        logistic_regression.fit(self.df[['a']], Xres)

        logistic_regression.serialize_to_bundle(self.tmp_dir,
                                                logistic_regression.name)

        # Sanity-check that model.json was written and parses as valid JSON
        with open("{}/{}.node/model.json".format(
                self.tmp_dir, logistic_regression.name)) as json_data:
            model = json.load(json_data)

        # Now deserialize it back
        node_name = "{}.node".format(logistic_regression.name)
        logistic_regression_tf = LogisticRegressionCV()
        logistic_regression_tf = logistic_regression_tf.deserialize_from_bundle(
            self.tmp_dir, node_name)

        res_a = logistic_regression.predict(self.df[['a']])
        res_b = logistic_regression_tf.predict(self.df[['a']])

        self.assertEqual(res_a[0], res_b[0])
        self.assertEqual(res_a[1], res_b[1])
        self.assertEqual(res_a[2], res_b[2])
Example #8
    def test_min_max_scaler_serializer(self):

        extract_features = ['a']
        feature_extractor = FeatureExtractor(
            input_scalars=['a'],
            output_vector='extracted_a_output',
            output_vector_items=["{}_out".format(x) for x in extract_features])

        scaler = MinMaxScaler()
        scaler.mlinit(prior_tf=feature_extractor, output_features='a_scaled')

        scaler.fit(self.df[['a']])

        scaler.serialize_to_bundle(self.tmp_dir, scaler.name)

        expected_min = self.df.a.min()
        expected_max = self.df.a.max()

        expected_model = {
            "op": "min_max_scaler",
            "attributes": {
                "min": {
                    "double": [expected_min],
                    "shape": {
                        "dimensions": [{
                            "size": 1,
                            "name": ""
                        }]
                    },
                    "type": "tensor"
                },
                "max": {
                    "double": [expected_max],
                    "shape": {
                        "dimensions": [{
                            "size": 1,
                            "name": ""
                        }]
                    },
                    "type": "tensor"
                }
            }
        }

        self.assertEqual(expected_min, scaler.data_min_.tolist()[0])
        self.assertEqual(expected_max, scaler.data_max_.tolist()[0])

        # Test model.json
        with open("{}/{}.node/model.json".format(self.tmp_dir,
                                                 scaler.name)) as json_data:
            model = json.load(json_data)

        self.assertEqual(scaler.op, expected_model['op'])
        self.assertEqual(
            expected_model['attributes']['min']['shape']['dimensions'][0]['size'],
            model['attributes']['min']['shape']['dimensions'][0]['size'])
        self.assertEqual(
            expected_model['attributes']['max']['shape']['dimensions'][0]['size'],
            model['attributes']['max']['shape']['dimensions'][0]['size'])
        self.assertEqual(expected_model['attributes']['min']['double'][0],
                         model['attributes']['min']['double'][0])
        self.assertEqual(expected_model['attributes']['max']['double'][0],
                         model['attributes']['max']['double'][0])

        # Test node.json
        with open("{}/{}.node/node.json".format(self.tmp_dir,
                                                scaler.name)) as json_data:
            node = json.load(json_data)

        self.assertEqual(scaler.name, node['name'])
        self.assertEqual(scaler.input_features,
                         node['shape']['inputs'][0]['name'])
        self.assertEqual(scaler.output_features,
                         node['shape']['outputs'][0]['name'])
Example #9
from mleap.sklearn.preprocessing.data import FeatureExtractor

# Import Scikit Transformer(s)
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv(
    'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
input_features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

output_vector_name = 'extracted_features'  # used only for serialization purposes
output_features = list(input_features)

feature_extractor_tf = FeatureExtractor(input_scalars=input_features,
                                        output_vector=output_vector_name,
                                        output_vector_items=output_features)

classification_tf = RandomForestClassifier(bootstrap=True,
                                           class_weight=None,
                                           criterion='gini',
                                           max_depth=2,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=10,
                                           n_jobs=1,
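The snippet is truncated mid-call above. A minimal sketch of how these pieces are typically combined and serialized (not part of the original example; the module paths follow mleap's sklearn extensions, while the classifier's mlinit arguments and the bundle path are assumptions):

import mleap.sklearn.pipeline          # adds mlinit/serialize_to_bundle to Pipeline
import mleap.sklearn.ensemble.forest   # adds mlinit to RandomForestClassifier

# 'species' is the label column in the iris dataset loaded above.
classification_tf.mlinit(input_features=output_vector_name,
                         prediction_column='species',
                         feature_names=input_features)

pipeline = Pipeline([(feature_extractor_tf.name, feature_extractor_tf),
                     (classification_tf.name, classification_tf)])
pipeline.mlinit()

pipeline.fit(data[input_features], data['species'])
pipeline.serialize_to_bundle('/tmp', 'iris-rf-pipeline', init=True)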
Example #10
X = huis3.loc[:, feature_cols]
y = huis3.prijs

## fit the model (lr is a LinearRegression defined earlier in the script)
outlm = lr.fit(X, y)
outlm
## inspect the coefficients
outlm.intercept_
outlm.coef_

############  model pipeline #################

# Define our feature extractor
feature_extractor_tf = FeatureExtractor(
    input_scalars=feature_cols,
    output_vector='unscaled_cont_features'
)

# Vector Assembler, for serialization purposes only
feature_extractor_lr_model_tf = FeatureExtractor(
    input_vectors=[feature_extractor_tf],
    output_vector='input_features'
)

feature_extractor_lr_model_tf.skip_fit_transform = True

# Define our linear regression
lr_model = LinearRegression()
lr_model.mlinit(
    input_features='input_features',
    prediction_column='prijs'
)
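The script ends here; a minimal continuation sketch (not part of the original snippet) showing how such a pipeline is typically assembled and serialized, assuming mleap's Pipeline extension; the bundle name and path are illustrative:

import mleap.sklearn.pipeline  # adds mlinit/serialize_to_bundle to Pipeline
from sklearn.pipeline import Pipeline

model_pipeline = Pipeline([
    (feature_extractor_tf.name, feature_extractor_tf),
    (feature_extractor_lr_model_tf.name, feature_extractor_lr_model_tf),
    (lr_model.name, lr_model),
])
model_pipeline.mlinit()

model_pipeline.fit(huis3[feature_cols], huis3.prijs)
model_pipeline.serialize_to_bundle('/tmp', 'huis-lr-pipeline', init=True)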