def _test_evaluation(self, allow_slow):
    """
    Test that the same predictions are made
    """
    # Generate some smallish (some kernels take too long on anything else) random data
    x, y = [], []
    for _ in range(50):
        cur_x1, cur_x2 = random.gauss(2, 3), random.gauss(-1, 2)
        x.append([cur_x1, cur_x2])
        y.append(1 + 2 * cur_x1 + 3 * cur_x2)

    input_names = ['x1', 'x2']
    df = pd.DataFrame(x, columns=input_names)

    # Parameters to test
    kernel_parameters = [{}, {'kernel': 'rbf', 'gamma': 1.2},
                         {'kernel': 'linear'},
                         {'kernel': 'poly'}, {'kernel': 'poly', 'degree': 2},
                         {'kernel': 'poly', 'gamma': 0.75},
                         {'kernel': 'poly', 'degree': 0, 'gamma': 0.9, 'coef0': 2},
                         {'kernel': 'sigmoid'}, {'kernel': 'sigmoid', 'gamma': 1.3},
                         {'kernel': 'sigmoid', 'coef0': 0.8},
                         {'kernel': 'sigmoid', 'coef0': 0.8, 'gamma': 0.5}
                         ]
    non_kernel_parameters = [{}, {'C': 1}, {'C': 1.5, 'shrinking': True},
                             {'C': 0.5, 'shrinking': False, 'nu': 0.9}]

    # Test
    for param1 in non_kernel_parameters:
        for param2 in kernel_parameters:
            cur_params = param1.copy()
            cur_params.update(param2)

            cur_model = NuSVR(**cur_params)
            cur_model.fit(x, y)
            df['prediction'] = cur_model.predict(x)

            spec = scikit_converter.convert(cur_model, input_names, 'target')

            # CoreML model evaluation requires macOS 10.13+.
            if macos_version() >= (10, 13):
                metrics = evaluate_regressor(spec, df)
                # Fix: assertAlmostEquals is a deprecated unittest alias;
                # use assertAlmostEqual.
                self.assertAlmostEqual(metrics['max_error'], 0)

            # In quick mode, exercise only the first parameter combination.
            if not allow_slow:
                break

        if not allow_slow:
            break
def _train_convert_evaluate_assert(self, **scikit_params):
    """Train a GradientBoostingRegressor, convert it, and check CoreML parity."""
    model = GradientBoostingRegressor(random_state=1, **scikit_params)
    model.fit(self.X, self.target)

    # Convert the fitted model to a CoreML spec.
    spec = skl_converter.convert(model, self.feature_names, self.output_name)

    # CoreML evaluation is only available on macOS 10.13+.
    if _is_macos() and _macos_version() >= (10, 13):
        frame = pd.DataFrame(self.X, columns=self.feature_names)
        frame["prediction"] = model.predict(self.X)
        self._check_metrics(
            evaluate_regressor(spec, frame, "target", verbose=False),
            scikit_params,
        )
def test_conversion_with_sparse_X(self):
    """Tests conversion of a model that's fitted with sparse data."""
    n_rows, n_cols = 100, 64
    # KNeighborsClassifier only supports the CSR sparse format.
    X_sparse = sparse.rand(n_rows, n_cols, format='csr')
    # The labels themselves don't matter - just use 100 of the Iris ones.
    labels = self.iris_y[0:n_rows]

    sklearn_model = KNeighborsClassifier(algorithm='brute')
    sklearn_model.fit(X_sparse, labels)

    # Conversion should succeed and yield a non-empty spec.
    coreml_spec = sklearn.convert(sklearn_model).get_spec()
    self.assertIsNotNone(coreml_spec)
def _train_convert_evaluate(self, **scikit_params):
    """
    Train a scikit-learn model, convert it and then evaluate it with CoreML
    """
    model = GradientBoostingClassifier(random_state=1, **scikit_params)
    model.fit(self.X, self.target)

    # Convert the fitted classifier to a CoreML spec.
    spec = skl_converter.convert(model, self.feature_names, self.output_name)

    # Collect scikit-learn's predictions alongside the features.
    frame = pd.DataFrame(self.X, columns=self.feature_names)
    frame['prediction'] = model.predict(self.X)

    # Compare CoreML's predictions against scikit-learn's.
    return evaluate_classifier(spec, frame)
def test_random(self):
    """Round-trip a Normalizer over random data for each supported norm."""
    X = _np.random.random(size=(50, 3))
    names = ["a", "b", "c"]

    for norm in ("l1", "l2", "max"):
        model = Normalizer(norm=norm)
        expected = model.fit_transform(X)
        spec = converter.convert(model, names, "out")
        evaluate_transformer(
            spec,
            [dict(zip(names, row)) for row in X],
            [{"out": row} for row in expected],
        )
def test_linear_regression_evaluation(self):
    """
    Check that the evaluation results are the same in scikit learn and coremltools
    """
    input_names = self.scikit_data.feature_names
    df = pd.DataFrame(self.scikit_data.data, columns=input_names)

    for normalize_value in (True, False):
        cur_model = LinearRegression(normalize=normalize_value)
        cur_model.fit(self.scikit_data['data'], self.scikit_data['target'])
        spec = convert(cur_model, input_names, 'target')

        # CoreML model evaluation requires macOS 10.13+.
        if macos_version() >= (10, 13):
            df['prediction'] = cur_model.predict(self.scikit_data.data)
            metrics = evaluate_regressor(spec, df)
            # Fix: assertAlmostEquals is a deprecated unittest alias;
            # use assertAlmostEqual.
            self.assertAlmostEqual(metrics['max_error'], 0)
def test_random():
    """Check a converted Normalizer agrees with scikit-learn on random data."""
    X = _np.random.random(size=(50, 3))
    feature_names = ["a", "b", "c"]

    for norm_type in ("l1", "l2", "max"):
        normalizer = Normalizer(norm=norm_type)
        transformed = normalizer.fit_transform(X)
        spec = converter.convert(normalizer, feature_names, "out")
        evaluate_transformer(
            spec,
            [dict(zip(feature_names, row)) for row in X],
            [{"out": row} for row in transformed],
        )
def test_conversion_brute_algorithm(self):
    """Tests conversion of a scikit KNeighborsClassifier using the brute force algorithm."""
    model = KNeighborsClassifier(algorithm='brute', n_neighbors=42)
    model.fit(self.iris_X, self.iris_y)

    spec = sklearn.convert(model, 'single_input', 'single_output').get_spec()
    self.assertIsNotNone(spec)
    self.assertTrue(spec.HasField("kNearestNeighborsClassifier"))

    # Alias the nested protobuf messages to keep the assertions readable.
    knn = spec.kNearestNeighborsClassifier
    index = knn.nearestNeighborsIndex

    self.assertEqual(knn.k, 42)
    self.assertTrue(knn.HasField("uniformWeighting"))
    self.assertEqual(index.numberOfDimensions, len(self.iris_X[0]))
    self.assertTrue(index.HasField("linearIndex"))
    self.assertTrue(index.HasField("squaredEuclideanDistance"))

    self.validate_labels(spec, self.iris_y)
    self.validate_float_samples(spec, self.iris_X)
def _train_convert_evaluate(self, **scikit_params):
    """
    Train a scikit-learn model, convert it and then evaluate it with CoreML
    """
    model = DecisionTreeRegressor(random_state=1, **scikit_params)
    model.fit(self.X, self.target)

    # Convert the fitted tree to a CoreML spec.
    spec = skl_converter.convert(model, self.feature_names, self.output_name)

    # Collect the reference predictions.
    frame = pd.DataFrame(self.X, columns=self.feature_names)
    frame['prediction'] = model.predict(self.X)

    # Compare CoreML against scikit-learn.
    return evaluate_regressor(spec, frame, target='target', verbose=False)
def _train_convert_evaluate_assert(self, **scikit_params):
    """
    Train a scikit-learn model, convert it and then evaluate it with CoreML
    """
    model = RandomForestRegressor(random_state=1, **scikit_params)
    model.fit(self.X, self.target)

    # Convert the model
    spec = skl_converter.convert(model, self.feature_names, self.output_name)

    # CoreML evaluation requires macOS 10.13 or newer.
    if _is_macos() and _macos_version() >= (10, 13):
        frame = pd.DataFrame(self.X, columns=self.feature_names)
        frame["prediction"] = model.predict(self.X)
        self._check_metrics(
            evaluate_regressor(spec, frame, verbose=False), scikit_params)
def test_random(self):
    """Verify a converted StandardScaler matches scikit-learn on random data."""
    X = _np.random.random(size=(50, 3))
    names = ["a", "b", "c"]

    scaler = StandardScaler()
    expected = scaler.fit_transform(X)
    spec = converter.convert(scaler, names, 'out').get_spec()

    # evaluate_transformer requires macOS 10.13+.
    if macos_version() >= (10, 13):
        metrics = evaluate_transformer(
            spec,
            [dict(zip(names, row)) for row in X],
            [{"out": row} for row in expected],
        )
        assert metrics["num_errors"] == 0
def test_pipeline_rename(self):
    """Renaming a spec input must not change the model's predictions."""
    # Convert
    scikit_spec = converter.convert(self.scikit_model).get_spec()
    model = MLModel(scikit_spec)
    sample_data = self.scikit_data.data[0]

    # Rename
    rename_feature(scikit_spec, "input", "renamed_input")
    renamed_model = MLModel(scikit_spec)

    # Check the predictions
    if _is_macos() and _macos_version() >= (10, 13):
        out_dict = model.predict({"input": sample_data})
        out_dict_renamed = renamed_model.predict({"renamed_input": sample_data})
        # Fix: assertAlmostEqual is meant for scalars, not lists; for lists it
        # only works via the `first == second` shortcut and produces a
        # TypeError on mismatch.  assertEqual has the identical pass condition
        # and correct failure diagnostics.
        self.assertEqual(list(out_dict.keys()), list(out_dict_renamed.keys()))
        self.assertEqual(
            list(out_dict.values()), list(out_dict_renamed.values())
        )
def test_boston_OHE(self):
    """Convert OneHotEncoders over several categorical-column choices."""
    data = load_boston()

    for categorical_features in ([3], [8], [3, 8], [8, 3]):
        # Fit an encoder on the selected categorical columns.
        model = OneHotEncoder(
            categorical_features=categorical_features, sparse=False)
        model.fit(data.data, data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data)]

        # Evaluation requires macOS 10.13+.
        if macos_version() >= (10, 13):
            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
def test_boston(self):
    """Converted Normalizer must reproduce scikit-learn's transform exactly."""
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Normalizer(norm="l2").fit(scikit_data.data)

    spec = converter.convert(scikit_model, scikit_data.feature_names, "out")

    input_data = [
        dict(zip(scikit_data.feature_names, row)) for row in scikit_data.data
    ]
    output_data = [
        {"out": row} for row in scikit_model.transform(scikit_data.data)
    ]

    # Fix: the evaluation result was previously discarded, so this test
    # could never fail.  Assert the converted model agrees exactly, matching
    # the sibling StandardScaler test.
    metrics = evaluate_transformer(spec, input_data, output_data)
    assert metrics["num_errors"] == 0
def test_conversion_many_columns(self):
    """Convert a OneHotEncoder fitted on two feature columns."""
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data_multiple_cols)

    spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'],
                           'out').get_spec()

    test_data = [{
        'feature_1': row[0],
        'feature_2': row[1]
    } for row in self.scikit_data_multiple_cols]
    scikit_output = [{
        'out': row
    } for row in scikit_model.transform(
        self.scikit_data_multiple_cols).toarray()]

    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    # Fix: assertEquals is a deprecated unittest alias; use assertEqual.
    self.assertEqual(metrics['num_errors'], 0)
def test_boston_OHE_plus_normalizer(self):
    """A OneHotEncoder + StandardScaler pipeline survives conversion intact."""
    data = load_boston()

    pipeline = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Scaler", StandardScaler()),
    ])
    pipeline.fit(data.data, data.target)

    # Convert the model
    spec = convert(pipeline, data.feature_names, 'out')

    # Evaluation requires macOS 10.13+.
    if macos_version() >= (10, 13):
        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in pipeline.transform(data.data)]
        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
def test_conversion_one_column_of_several(self):
    """Convert an encoder that one-hot encodes only the first of two columns."""
    model = OneHotEncoder(categorical_features=[0])
    model.fit(copy(self.scikit_data_multiple_cols))

    spec = sklearn.convert(
        model, ["feature_1", "feature_2"], "out").get_spec()
    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)

    rows = self.scikit_data_multiple_cols
    test_data = [
        {"feature_1": row[0], "feature_2": row[1]} for row in rows
    ]
    scikit_output = [
        {"out": row} for row in model.transform(rows).toarray()
    ]

    metrics = evaluate_transformer(spec, test_data, scikit_output)
    self.assertEqual(metrics["num_errors"], 0)
def test_conversion_kd_tree_algorithm(self):
    """Tests conversion of a scikit KNeighborsClassifier using the kd_tree algorithm."""
    # (Docstring corrected: it previously said "brute force", but this test
    # configures and verifies the kd_tree index.)
    test_leaf_size = 23
    test_n_neighbors = 42

    scikit_model = KNeighborsClassifier(algorithm='kd_tree',
                                        leaf_size=test_leaf_size,
                                        n_neighbors=test_n_neighbors)
    scikit_model.fit(self.iris_X, self.iris_y)

    coreml_model = sklearn.convert(scikit_model, 'single_input',
                                   'single_output')
    coreml_spec = coreml_model.get_spec()
    self.assertIsNotNone(coreml_spec)
    self.assertTrue(coreml_spec.HasField("kNearestNeighborsClassifier"))
    # n_neighbors becomes the spec's default neighbor count...
    self.assertEqual(
        coreml_spec.kNearestNeighborsClassifier.numberOfNeighbors.
        defaultValue, test_n_neighbors)
    # ...with an allowed range of [1, number of training samples].
    self.assertEqual(
        coreml_spec.kNearestNeighborsClassifier.numberOfNeighbors.range.
        minValue, 1)
    self.assertEqual(
        coreml_spec.kNearestNeighborsClassifier.numberOfNeighbors.range.
        maxValue, len(self.iris_X))
    self.assertTrue(
        coreml_spec.kNearestNeighborsClassifier.HasField(
            "uniformWeighting"))
    self.assertEqual(
        coreml_spec.kNearestNeighborsClassifier.nearestNeighborsIndex.
        numberOfDimensions, len(self.iris_X[0]))
    # kd_tree maps to a singleKdTreeIndex carrying the same leaf size.
    self.assertTrue(
        coreml_spec.kNearestNeighborsClassifier.nearestNeighborsIndex.
        HasField("singleKdTreeIndex"))
    self.assertEqual(
        test_leaf_size, coreml_spec.kNearestNeighborsClassifier.
        nearestNeighborsIndex.singleKdTreeIndex.leafSize)
    self.assertTrue(
        coreml_spec.kNearestNeighborsClassifier.nearestNeighborsIndex.
        HasField("squaredEuclideanDistance"))

    self.validate_labels(coreml_spec, self.iris_y)
    self.validate_float_samples(coreml_spec, self.iris_X)
def test_random():
    """Check a converted StandardScaler reproduces scikit-learn's output."""
    # Generate some random data
    X = _np.random.random(size=(50, 3))

    cur_model = StandardScaler()
    output = cur_model.fit_transform(X)

    spec = converter.convert(cur_model, ["a", "b", "c"], "out").get_spec()
    metrics = evaluate_transformer(
        spec,
        [dict(zip(["a", "b", "c"], row)) for row in X],
        [{"out": row} for row in output],
    )
    # Fix: a bare `raise AssertionError` reported nothing on failure; a plain
    # assert with the metrics dict gives the error count in the message.
    assert metrics["num_errors"] == 0, metrics
def test_conversion_one_column_of_several(self):
    """Convert an encoder that one-hot encodes only the first of two columns."""
    scikit_model = OneHotEncoder(categorical_features=[0])
    scikit_model.fit(copy(self.scikit_data_multiple_cols))

    spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'],
                           'out').get_spec()

    # Evaluation requires macOS 10.13+.
    if macos_version() >= (10, 13):
        test_data = [{
            'feature_1': row[0],
            'feature_2': row[1]
        } for row in self.scikit_data_multiple_cols]
        scikit_output = [{
            'out': row
        } for row in scikit_model.transform(
            self.scikit_data_multiple_cols).toarray()]

        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        # Fix: assertEquals is a deprecated unittest alias; use assertEqual.
        self.assertEqual(metrics['num_errors'], 0)
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    # NOTE(review): setUpClass receives the class object, so this parameter
    # is conventionally named `cls`; presumably an @classmethod decorator
    # precedes this def — confirm in the full file.
    if not (HAS_SKLEARN):
        return

    scikit_data = load_boston()
    feature_names = scikit_data.feature_names

    scikit_model = LinearRegression()
    scikit_model.fit(scikit_data['data'], scikit_data['target'])
    scikit_spec = converter.convert(scikit_model, feature_names,
                                    'target').get_spec()

    # Save the data and the model (assignments land on the class, since
    # `self` here is the class itself)
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
    self.scikit_spec = scikit_spec
def test_boston(self):
    """Converted StandardScaler must reproduce scikit-learn's transform."""
    from sklearn.datasets import load_boston

    dataset = load_boston()
    scaler = StandardScaler().fit(dataset.data)

    spec = converter.convert(scaler, dataset.feature_names, 'out').get_spec()

    inputs = [dict(zip(dataset.feature_names, row)) for row in dataset.data]
    expected = [{"out": row} for row in scaler.transform(dataset.data)]

    metrics = evaluate_transformer(spec, inputs, expected)
    assert metrics["num_errors"] == 0
def test_boston_OHE_plus_trees(self):
    """A OneHotEncoder + GradientBoostingRegressor pipeline round-trips through CoreML."""
    data = load_boston()

    pipeline = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Trees", GradientBoostingRegressor(random_state=1)),
    ])
    pipeline.fit(data.data, data.target)

    # Convert the model
    spec = convert(pipeline, data.feature_names, 'target')

    # Get predictions
    frame = pd.DataFrame(data.data, columns=data.feature_names)
    frame['prediction'] = pipeline.predict(data.data)

    # Evaluate it
    result = evaluate_regressor(spec, frame, 'target', verbose=False)
    assert result["max_error"] < 0.0001
def test_conversion_boston(self):
    """Convert Imputer models over Boston data with injected missing values."""
    from sklearn.datasets import load_boston
    scikit_data = load_boston()
    sh = scikit_data.data.shape

    # Seed so the injected missing-value positions are reproducible.
    # NOTE(review): `rn.randint` is called with a single bound, which matches
    # numpy.random.randint (half-open [0, high)) rather than the stdlib
    # `random` module — confirm the `rn` alias at the top of the file.
    rn.seed(0)
    missing_value_indices = [(rn.randint(sh[0]), rn.randint(sh[1]))
                             for k in range(sh[0])]

    for strategy in ["mean", "median", "most_frequent"]:
        # 'NaN' (the string) is the sentinel the legacy sklearn Imputer API
        # uses for actual NaNs; 0 and -999 exercise numeric sentinels.
        for missing_value in [0, 'NaN', -999]:

            # Overwrite the chosen positions with the sentinel value.
            X = np.array(scikit_data.data).copy()
            for i, j in missing_value_indices:
                X[i, j] = missing_value

            model = Imputer(missing_values=missing_value, strategy=strategy)
            model = model.fit(X)

            tr_X = model.transform(X.copy())

            spec = converter.convert(model, scikit_data.feature_names, 'out')

            # Evaluation requires macOS 10.13+.
            if macos_version() >= (10, 13):
                input_data = [
                    dict(zip(scikit_data.feature_names, row)) for row in X
                ]
                output_data = [{"out": row} for row in tr_X]

                result = evaluate_transformer(spec, input_data, output_data)
                assert result["num_errors"] == 0
def _test_conversion(self, data, trained_dict_vectorizer):
    """Check a fitted DictVectorizer's CoreML conversion against sklearn."""
    expected = trained_dict_vectorizer.transform(data)

    model = sklearn.convert(
        trained_dict_vectorizer,
        input_features="features",
        output_feature_names="output",
    )

    # CoreML evaluation requires macOS 10.13+.
    if _is_macos() and _macos_version() >= (10, 13):
        result = evaluate_transformer(
            model,
            [{"features": row} for row in data],
            [{"output": out_row} for out_row in expected],
            True,
        )
        assert result["num_errors"] == 0
def _conversion_and_evaluation_helper_for_linear_svc(self, class_labels):
    """
    Fit LinearSVC under several argument combinations, convert each model,
    and check the CoreML predictions match scikit-learn exactly.
    """
    ARGS = [
        {},
        {"C": 0.75, "loss": "hinge"},
        {"penalty": "l1", "dual": False},
        {"tol": 0.001, "fit_intercept": False},
        {"intercept_scaling": 1.5},
    ]

    x, y = GlmCassifierTest._generate_random_data(class_labels)
    column_names = ["x1", "x2"]
    df = pd.DataFrame(x, columns=column_names)

    for cur_args in ARGS:
        print(class_labels, cur_args)
        cur_model = LinearSVC(**cur_args)
        cur_model.fit(x, y)

        spec = convert(cur_model,
                       input_features=column_names,
                       output_feature_names="target")

        # CoreML evaluation requires macOS 10.13+.
        if _is_macos() and _macos_version() >= (10, 13):
            df["prediction"] = cur_model.predict(x)

            # Fix: corrected the typo in the local name (was
            # `cur_eval_metics`) and replaced the deprecated assertEquals
            # alias with assertEqual.
            cur_eval_metrics = evaluate_classifier(spec, df, verbose=False)
            self.assertEqual(cur_eval_metrics["num_errors"], 0)
def test_linear_svr_evaluation(self):
    """
    Check that the evaluation results are the same in scikit learn and coremltools
    """
    ARGS = [
        {},
        {"C": 0.5, "epsilon": 0.25},
        {"dual": False, "loss": "squared_epsilon_insensitive"},
        {"tol": 0.005},
        {"fit_intercept": False},
        {"intercept_scaling": 1.5},
    ]

    feature_names = self.scikit_data.feature_names
    frame = pd.DataFrame(self.scikit_data.data, columns=feature_names)

    for params in ARGS:
        print(params)
        model = LinearSVR(**params)
        model.fit(self.scikit_data["data"], self.scikit_data["target"])

        spec = convert(model, feature_names, "target")

        frame["prediction"] = model.predict(self.scikit_data.data)
        metrics = evaluate_regressor(spec, frame)
        self.assertAlmostEqual(metrics["max_error"], 0)
def _test_boston_OHE_plus_trees(self, loss='ls'):
    """Pipeline of OHE + gradient-boosted trees converts and matches sklearn."""
    data = load_boston()

    pipeline = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Trees", GradientBoostingRegressor(random_state=1, loss=loss)),
    ])
    pipeline.fit(data.data, data.target)

    # Convert the model
    spec = convert(pipeline, data.feature_names, "target")

    # CoreML evaluation requires macOS 10.13+.
    if _is_macos() and _macos_version() >= (10, 13):
        # Get predictions
        frame = pd.DataFrame(data.data, columns=data.feature_names)
        frame["prediction"] = pipeline.predict(data.data)

        # Evaluate it
        result = evaluate_regressor(spec, frame, "target", verbose=False)
        assert result["max_error"] < 0.0001
def test_boston_OHE_pipeline(self):
    """One-hot encoding inside a pipeline keeps output dimensions consistent."""
    data = load_boston()

    for categorical_features in ([3], [8], [3, 8], [8, 3]):
        # Put it in a pipeline so that we can test whether the output
        # dimension handling is correct.
        model = Pipeline([
            ("OHE", OneHotEncoder(categorical_features=categorical_features)),
            ("Normalizer", Normalizer()),
        ])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        # Evaluation requires macOS 10.13+.
        if macos_version() >= (10, 13):
            input_data = [
                dict(zip(data.feature_names, row)) for row in data.data
            ]
            output_data = [
                {"out": row} for row in model.transform(data.data.copy())
            ]
            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
def test_conversion_distance_function_good(self):
    """Tests conversion of a scikit KNeighborsClassifier with a valid distance metric."""
    scikit_model = KNeighborsClassifier(algorithm="brute", metric="euclidean")
    scikit_model.fit(self.iris_X, self.iris_y)

    coreml_model = sklearn.convert(scikit_model, "single_input", "single_output")
    coreml_spec = coreml_model.get_spec()
    self.assertIsNotNone(coreml_spec)
    self.assertTrue(
        coreml_spec.kNearestNeighborsClassifier.nearestNeighborsIndex.
        HasField("squaredEuclideanDistance"))

    # Minkowski metric with p=2 is equivalent to the squared Euclidean distance
    scikit_model = KNeighborsClassifier(algorithm="brute", metric="minkowski", p=2)
    scikit_model.fit(self.iris_X, self.iris_y)
    # Fix: the second model was never converted — the old spec was re-fetched
    # from the first conversion, so the minkowski case was not actually
    # exercised.  Convert the new model before inspecting its spec.
    coreml_model = sklearn.convert(scikit_model, "single_input", "single_output")
    coreml_spec = coreml_model.get_spec()
    self.assertIsNotNone(coreml_spec)
    self.assertTrue(
        coreml_spec.kNearestNeighborsClassifier.nearestNeighborsIndex.
        HasField("squaredEuclideanDistance"))