class Imputer(SklearnImputer):
    """MLeap-serializable wrapper around scikit-learn's Imputer.

    Routes a single scalar input column through a FeatureExtractor before
    delegating to the underlying scikit-learn imputer, and knows how to
    write itself out as an MLeap bundle node.
    """

    def __init__(self, missing_values="NaN", strategy="mean", axis=0,
                 verbose=0, copy=True, input_features=None,
                 output_features=None):
        # Unique node name used when the bundle is written to disk.
        # self.op is presumably provided by a serialization mixin/class
        # attribute — TODO confirm.
        self.name = "{}_{}".format(self.op, uuid.uuid1())
        self.input_features = input_features
        self.output_features = output_features
        self.input_shapes = {'data_shape': [{'shape': 'scalar'}]}
        # Pull the single scalar input column into a vector for imputation.
        self.feature_extractor = FeatureExtractor(
            input_scalars=[input_features],
            output_vector='extracted_' + output_features,
            output_vector_items=[output_features])
        SklearnImputer.__init__(self, missing_values, strategy, axis,
                                verbose, copy)

    def fit(self, X, y=None):
        """Fit the underlying imputer on the extracted feature column."""
        extracted = self.feature_extractor.transform(X)
        super(Imputer, self).fit(extracted)
        return self

    def transform(self, X):
        """Impute missing values; result is wrapped in a DataFrame."""
        extracted = self.feature_extractor.transform(X)
        imputed = super(Imputer, self).transform(extracted)
        return pd.DataFrame(imputed)

    def serialize_to_bundle(self, path, model_name):
        """Write this imputer as an MLeap bundle node under *path*."""
        ImputerSerializer().serialize_to_bundle(self, path, model_name)
def feature_extractor_test(self):
    """Serialize a two-column FeatureExtractor and verify its node/model JSON."""
    columns = ['a', 'd']
    extractor = FeatureExtractor(
        input_scalars=columns,
        output_vector='extract_features_output',
        output_vector_items=["{}_out".format(x) for x in columns])

    transformed = extractor.fit_transform(self.df)
    self.assertEqual(len(transformed.columns), 2)

    extractor.serialize_to_bundle(self.tmp_dir, extractor.name)

    # Test node.json
    with open("{}/{}.node/node.json".format(self.tmp_dir, extractor.name)) as json_data:
        node = json.load(json_data)

    self.assertEqual(extractor.name, node['name'])
    self.assertEqual(extractor.input_features[0], node['shape']['inputs'][0]['name'])
    self.assertEqual(extractor.input_features[1], node['shape']['inputs'][1]['name'])
    self.assertEqual(extractor.output_vector, node['shape']['outputs'][0]['name'])

    # Test model.json
    with open("{}/{}.node/model.json".format(self.tmp_dir, extractor.name)) as json_data:
        model = json.load(json_data)

    expected_model = {
        "op": "vector_assembler",
        "attributes": {
            "input_shapes": {
                "data_shape": [
                    {"base": "scalar", "isNullable": False},
                    {"base": "scalar", "isNullable": False}],
                "type": "list"
            }
        }
    }

    self.assertEqual(expected_model['op'], model['op'])
    # Compare the per-column shape entries pairwise.
    expected_shapes = expected_model['attributes']['input_shapes']['data_shape']
    actual_shapes = model['attributes']['input_shapes']['data_shape']
    for expected, actual in zip(expected_shapes, actual_shapes):
        self.assertEqual(expected['base'], actual['base'])
        self.assertEqual(expected['isNullable'], actual['isNullable'])
def test_min_max_scaler_multi_deserializer(self):
    """Round-trip a two-output MinMaxScaler through bundle (de)serialization."""
    columns = ['a', 'b']
    extractor = FeatureExtractor(
        input_scalars=['a', 'b'],
        output_vector='extracted_multi_outputs',
        output_vector_items=["{}_out".format(x) for x in columns])

    scaler = MinMaxScaler()
    scaler.mlinit(prior_tf=extractor,
                  output_features=['a_scaled', 'b_scaled'])
    # NOTE(review): fit sees only column 'a' while transform below feeds
    # both 'a' and 'b' — confirm this asymmetry is intentional.
    scaler.fit(self.df[['a']])
    scaler.serialize_to_bundle(self.tmp_dir, scaler.name)

    # Deserialize the MinMaxScaler
    node_name = "{}.node".format(scaler.name)
    restored = MinMaxScaler()
    restored.deserialize_from_bundle(self.tmp_dir, node_name)

    # Transform some sample data
    res_a = scaler.transform(self.df[['a', 'b']])
    res_b = restored.transform(self.df[['a', 'b']])

    self.assertEqual(res_a[0][0], res_b[0][0])
    self.assertEqual(res_a[0][1], res_b[0][1])
    self.assertEqual(scaler.name, restored.name)
    self.assertEqual(scaler.op, restored.op)
def test_logistic_regression_cv_serializer(self):
    """Serialize a fitted LogisticRegressionCV and check its model.json."""
    log_reg = LogisticRegressionCV(fit_intercept=True)
    log_reg.mlinit(input_features='a', prediction_column='e_binary')

    columns = ['e']
    extractor = FeatureExtractor(
        input_scalars=['e'],
        output_vector='extracted_e_output',
        output_vector_items=["{}_out".format(x) for x in columns])

    # Binarize to produce the 0/1 labels the classifier is fit against.
    binarizer = Binarizer(threshold=0.0)
    binarizer.mlinit(prior_tf=extractor, output_features='e_binary')
    Xres = binarizer.fit_transform(self.df[['a']])

    log_reg.fit(self.df[['a']], Xres)
    log_reg.serialize_to_bundle(self.tmp_dir, log_reg.name)

    # Test model.json
    model_path = "{}/{}.node/model.json".format(
        self.tmp_dir, log_reg.name)
    with open(model_path) as json_data:
        model = json.load(json_data)

    self.assertEqual(model['op'], 'logistic_regression')
    self.assertTrue(model['attributes']['intercept']['double'] is not None)
def __init__(self, missing_values="NaN", strategy="mean", axis=0, verbose=0,
             copy=True, input_features=None, output_features=None):
    # Unique bundle node name; self.op is presumably supplied as a class
    # attribute by the serialization machinery — TODO confirm.
    self.name = "{}_{}".format(self.op, uuid.uuid1())
    self.input_features = input_features
    self.output_features = output_features
    self.input_shapes = {'data_shape': [{'shape': 'scalar'}]}
    # Extracts the single scalar input column into a vector.
    # NOTE(review): 'extracted_' + output_features raises TypeError when
    # output_features is left at its default of None.
    self.feature_extractor = FeatureExtractor(
        input_scalars=[input_features],
        output_vector='extracted_' + output_features,
        output_vector_items=[output_features])
    SklearnImputer.__init__(self, missing_values, strategy, axis, verbose,
                            copy)
class TestImputerExtension(unittest.TestCase):
    """Tests for the MLeap Imputer extension's bundle serialization."""

    def setUp(self):
        # Use np.nan — the np.NaN alias was removed in NumPy 2.0; the value
        # is identical on every NumPy version.
        self.df = pd.DataFrame(
            [[0.85281608, 1.50669264],
             [-1.04544152, np.nan],
             [0.41515407, -0.29941475],
             [np.nan, -0.96775275],
             [np.nan, -0.85734022]],
            columns=['a', 'b'])
        self.feature_extractor = FeatureExtractor(input_scalars=['a'],
                                                  output_vector='a_extracted')
        self.tmp_dir = tempfile.mkdtemp(prefix="mleap.python.tests")

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)

    def test_imputer_extension_serialization_succeeds(self):
        """Fit an Imputer on column 'a' and verify model.json and node.json."""
        imputer = Imputer(input_features='a', output_features='a_imputed')
        imputer.fit(self.feature_extractor.transform(self.df))
        imputer.serialize_to_bundle(self.tmp_dir, imputer.name)

        # With strategy "mean" the surrogate must equal the column mean
        # (pandas' .mean() skips NaN entries).
        expected_model = {
            "op": "imputer",
            "attributes": {
                "surrogate_value": {
                    "double": self.df.a.mean()
                },
                "strategy": {
                    "string": "mean"
                }
            }
        }

        with open("{}/{}.node/model.json".format(self.tmp_dir, imputer.name)) as json_data:
            actual_model = json.load(json_data)
        self.assertEqual(expected_model, actual_model)

        with open("{}/{}.node/node.json".format(self.tmp_dir, imputer.name)) as json_data:
            node = json.load(json_data)
        self.assertEqual(imputer.name, node['name'])
        self.assertEqual("a", node['shape']['inputs'][0]['name'])
        self.assertEqual("a_imputed", node['shape']['outputs'][0]['name'])
def test_standard_scaler_multi_deserializer(self):
    """Round-trip a two-column StandardScaler through bundle (de)serialization."""
    columns = ['a', 'b']
    extractor = FeatureExtractor(
        input_scalars=['a', 'b'],
        output_vector='extracted_multi_outputs',
        output_vector_items=["{}_out".format(x) for x in columns])

    # Serialize a standard scaler to a bundle
    scaler = StandardScaler(with_mean=True, with_std=True)
    scaler.mlinit(prior_tf=extractor,
                  output_features=['a_scaled', 'b_scaled'])
    scaler.fit(self.df[['a', 'b']])
    scaler.serialize_to_bundle(self.tmp_dir, scaler.name)

    # Now deserialize it back
    node_name = "{}.node".format(scaler.name)
    restored = StandardScaler()
    restored = restored.deserialize_from_bundle(self.tmp_dir, node_name)

    # Transform some sample data
    res_a = scaler.transform(self.df[['a', 'b']])
    res_b = restored.transform(self.df[['a', 'b']])

    self.assertEqual(res_a[0][0], res_b[0][0])
    self.assertEqual(res_a[0][1], res_b[0][1])
    self.assertEqual(scaler.name, restored.name)
    self.assertEqual(scaler.op, restored.op)
    # The fitted statistics must survive the round trip unchanged.
    for i in (0, 1):
        self.assertEqual(scaler.mean_[i], restored.mean_[i])
        self.assertEqual(scaler.scale_[i], restored.scale_[i])
def test_logistic_regression_cv_deserializer(self):
    """Round-trip a fitted LogisticRegressionCV and compare predictions."""
    log_reg = LogisticRegressionCV(fit_intercept=True)
    log_reg.mlinit(input_features='a', prediction_column='e_binary')

    columns = ['e']
    extractor = FeatureExtractor(
        input_scalars=['e'],
        output_vector='extracted_e_output',
        output_vector_items=["{}_out".format(x) for x in columns])

    binarizer = Binarizer(threshold=0.0)
    binarizer.mlinit(prior_tf=extractor, output_features='e_binary')
    Xres = binarizer.fit_transform(self.df[['a']])

    log_reg.fit(self.df[['a']], Xres)
    log_reg.serialize_to_bundle(self.tmp_dir, log_reg.name)

    # Test model.json
    # NOTE(review): the loaded model is not asserted on; the read only
    # verifies the file was written and parses as JSON.
    with open("{}/{}.node/model.json".format(
            self.tmp_dir, log_reg.name)) as json_data:
        model = json.load(json_data)

    # Now deserialize it back
    node_name = "{}.node".format(log_reg.name)
    restored = LogisticRegressionCV()
    restored = restored.deserialize_from_bundle(self.tmp_dir, node_name)

    res_a = log_reg.predict(self.df[['a']])
    res_b = restored.predict(self.df[['a']])
    for i in (0, 1, 2):
        self.assertEqual(res_a[i], res_b[i])
def test_min_max_scaler_serializer(self):
    """Serialize a single-column MinMaxScaler and verify model/node JSON."""
    columns = ['a']
    extractor = FeatureExtractor(
        input_scalars=['a'],
        output_vector='extracted_a_output',
        output_vector_items=["{}_out".format(x) for x in columns])

    scaler = MinMaxScaler()
    scaler.mlinit(prior_tf=extractor, output_features='a_scaled')
    scaler.fit(self.df[['a']])
    scaler.serialize_to_bundle(self.tmp_dir, scaler.name)

    expected_min = self.df.a.min()
    expected_max = self.df.a.max()
    expected_model = {
        "op": "min_max_scaler",
        "attributes": {
            "min": {
                "double": [expected_min],
                "shape": {"dimensions": [{"size": 1, "name": ""}]},
                "type": "tensor"
            },
            "max": {
                "double": [expected_max],
                "shape": {"dimensions": [{"size": 1, "name": ""}]},
                "type": "tensor"
            }
        }
    }

    self.assertEqual(expected_min, scaler.data_min_.tolist()[0])
    self.assertEqual(expected_max, scaler.data_max_.tolist()[0])

    # Test model.json
    with open("{}/{}.node/model.json".format(self.tmp_dir, scaler.name)) as json_data:
        model = json.load(json_data)

    self.assertEqual(scaler.op, expected_model['op'])
    # Both the 'min' and 'max' tensors share the same expected structure.
    for attr in ('min', 'max'):
        self.assertEqual(
            expected_model['attributes'][attr]['shape']['dimensions'][0]['size'],
            model['attributes'][attr]['shape']['dimensions'][0]['size'])
        self.assertEqual(expected_model['attributes'][attr]['double'][0],
                         model['attributes'][attr]['double'][0])

    # Test node.json
    with open("{}/{}.node/node.json".format(self.tmp_dir, scaler.name)) as json_data:
        node = json.load(json_data)

    self.assertEqual(scaler.name, node['name'])
    self.assertEqual(scaler.input_features, node['shape']['inputs'][0]['name'])
    self.assertEqual(scaler.output_features, node['shape']['outputs'][0]['name'])
from mleap.sklearn.preprocessing.data import FeatureExtractor # Import Scikit Transformer(s) import pandas as pd from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestClassifier data = pd.read_csv( 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') input_features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] output_vector_name = 'extracted_features' # Used only for serialization purposes output_features = [x for x in input_features] feature_extractor_tf = FeatureExtractor(input_scalars=input_features, output_vector=output_vector_name, output_vector_items=output_features) classification_tf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
X = huis3.loc[:, feature_cols] y = huis3.prijs ## fit model outlm = lr.fit(X,y) outlm ## bekijk coefficienten outlm.intercept_ outlm.coef_ ############ model pipeline ################# # Define our linear regression feature_extractor_tf = FeatureExtractor( input_scalars = feature_cols, output_vector = 'unscaled_cont_features' ) # Vector Assembler, for serialization purposes only feature_extractor_lr_model_tf = FeatureExtractor( input_vectors = [feature_extractor_tf], output_vector = 'input_features' ) feature_extractor_lr_model_tf.skip_fit_transform = True # Define our linear regression lr_model = LinearRegression() lr_model.mlinit( input_features = 'input_features', prediction_column = 'prijs'