def test_validate_sklearn_knn_models_multiclass(self):
    """KNN classifier (multi-class): the exported PMML must validate against the schema.

    Fix: artifact name had a typo ('numlti' -> 'multi'), matching the
    naming used by the other multi-class validation tests.
    """
    model = KNeighborsClassifier()
    pipe = Pipeline([('model', model)])
    pipe.fit(self.X, self.y)
    file_name = 'knn_model_multi_class_classification.pmml'
    skl_to_pmml(pipe, self.features, 'species', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_sklearn_04(self):
    """Gradient boosting on Titanic: check the segmentation layout of the PMML."""
    titanic = pd.read_csv("nyoka/tests/titanic_train.csv")
    features = titanic.columns
    target = 'Survived'
    f_name = "gb_pmml.pmml"
    pipeline_obj = Pipeline([
        ("imp", Imputer(strategy="median")),
        ("gbc", GradientBoostingClassifier(n_estimators=10)),
    ])
    pipeline_obj.fit(titanic[features], titanic[target])
    skl_to_pmml(pipeline_obj, features, target, f_name)
    reconstructed = pml.parse(f_name, True)
    segmentation = reconstructed.MiningModel[0].Segmentation
    # 1. a GBM classifier is exported as a model chain
    self.assertEqual(segmentation.multipleModelMethod, "modelChain")
    # 2. binary problem -> two segments in the chain
    self.assertEqual(len(segmentation.Segment), 2)
    # 3. the final segment maps the raw score through a logit
    self.assertEqual(
        segmentation.Segment[1].RegressionModel.normalizationMethod, "logit")
def test_sklearn_02(self):
    """KNN on iris: distance measure and neighbor count must survive the export."""
    iris = datasets.load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['Species'] = iris.target
    features = frame.columns.drop('Species')
    target = 'Species'
    f_name = "knn_pmml.pmml"
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
    ])
    pipeline.fit(frame[features], frame[target])
    skl_to_pmml(pipeline, features, target, f_name)
    reconstructed = pml.parse(f_name, True)
    knn_model = reconstructed.NearestNeighborModel[0]
    # 1. the euclidean comparison measure must be present
    self.assertIsNotNone(knn_model.ComparisonMeasure.euclidean)
    # 2. the measure is a distance, not a similarity
    self.assertEqual(knn_model.ComparisonMeasure.kind, "distance")
    # 3. neighbor count round-trips unchanged
    self.assertEqual(pipeline.steps[-1][-1].n_neighbors,
                     knn_model.numberOfNeighbors)
def test_validate_sklearn_lda_models_multiclass(self):
    """LDA classifier (multi-class): the exported PMML must validate against the schema.

    Fix: artifact name had a typo ('numlti' -> 'multi').
    """
    model = LinearDiscriminantAnalysis()
    pipe = Pipeline([('model', model)])
    pipe.fit(self.X, self.y)
    file_name = 'lda_model_multi_class_classification.pmml'
    skl_to_pmml(pipe, self.features, 'species', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_35_isolation_forest(self):
    """Isolation forest anomaly scoring: local scores and labels must match Zementis.

    Fixes: the banner printed "test 34" inside test 35 and had an
    unbalanced parenthesis in the message.
    """
    print("\ntest 35 (Isolation Forest)\n")
    # server anomaly flag -> sklearn prediction label
    detection_map = {'true': -1, 'false': 1}
    X = numpy.array([
        [1, 2, 3, 4],
        [2, 1, 3, 4],
        [3, 2, 1, 4],
        [3, 2, 4, 1],
        [4, 3, 2, 1],
        [2, 4, 3, 1],
    ], dtype=numpy.float32)
    test_data = numpy.array([[0, 4, 0, 7], [4, 0, 4, 7]])
    features = ['a', 'b', 'c', 'd']
    model = IsolationForest(n_estimators=40, contamination=0)
    pipeline_obj = Pipeline([("model", model)])
    pipeline_obj.fit(X)
    file_name = 'test35sklearn.pmml'
    skl_to_pmml(pipeline_obj, features, '', file_name)
    model_pred = pipeline_obj.predict(test_data)
    model_scores = model.score_samples(test_data)
    model_name = self.adapa_utility.upload_to_zserver(file_name)
    z_predictions = self.adapa_utility.score_in_zserver(
        model_name, 'nyoka/tests/test_forest.csv', 'ANOMALY')
    cnt = 0
    for idx, value in enumerate(z_predictions):
        score, is_anomaly = value.split(",")
        # the server reports the negated score_samples value
        score = -1 * float(score)
        if "{:.6f}".format(score) != "{:.6f}".format(model_scores[idx]) \
                or model_pred[idx] != detection_map[is_anomaly]:
            cnt += 1
    self.assertEqual(cnt, 0)
def test_validate_sklearn_tree_models_multiclass(self):
    """Decision tree with PCA preprocessing (multi-class): PMML must be schema-valid.

    Fix: artifact name had a typo ('numlti' -> 'multi').
    """
    model = DecisionTreeClassifier()
    pipe = Pipeline([('pca', PCA()), ('model', model)])
    pipe.fit(self.X, self.y)
    file_name = 'tree_model_multi_class_classification.pmml'
    skl_to_pmml(pipe, self.features, 'species', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_svm_models_binary_class(self):
    """SVC with max-abs scaling (binary): exported PMML must be schema-valid."""
    pipe = Pipeline([
        ('scaler', MaxAbsScaler()),
        ('model', SVC()),
    ])
    pipe.fit(self.X, self.y_bin)
    file_name = 'svm_model_binary_classification.pmml'
    skl_to_pmml(pipe, self.features, 'binary', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_linear_models_binary_class(self):
    """Logistic regression with standard scaling (binary): PMML must be schema-valid.

    Fix: pipeline step name had a typo ('sclaer' -> 'scaler').
    """
    model = LogisticRegression()
    pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipe.fit(self.X, self.y_bin)
    file_name = 'linear_model_binary_classification.pmml'
    skl_to_pmml(pipe, self.features, 'binary', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_03_logistic_regression_with_scaler(self):
    """Multi-class logistic regression with column-wise MinMax scaling, scored on Zementis."""
    print(
        "\ntest 03 (logistic regression with preprocessing) [multi-class]\n"
    )
    X, X_test, y, features, target, test_file = \
        self.data_utility.get_data_for_multi_class_classification()
    mapper = DataFrameMapper([
        (["sepal length (cm)", "sepal width (cm)"], MinMaxScaler()),
        (["petal length (cm)", "petal width (cm)"], None),
    ])
    pipeline = Pipeline([("mapper", mapper), ("model", LogisticRegression())])
    pipeline.fit(X, y)
    file_name = 'test03sklearn.pmml'
    skl_to_pmml(pipeline, features, target, file_name)
    model_name = self.adapa_utility.upload_to_zserver(file_name)
    predictions, probabilities = self.adapa_utility.score_in_zserver(
        model_name, test_file)
    local_pred = pipeline.predict(X_test)
    local_prob = pipeline.predict_proba(X_test)
    # server-side predictions and probabilities must agree with local ones
    self.assertEqual(
        self.adapa_utility.compare_predictions(predictions, local_pred), True)
    self.assertEqual(
        self.adapa_utility.compare_probability(probabilities, local_prob), True)
def test_sklearn_03(self):
    """Random forest on iris: segment count and voting method must round-trip."""
    iris = datasets.load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['Species'] = iris.target
    features = frame.columns.drop('Species')
    target = 'Species'
    f_name = "rf_pmml.pmml"
    model = RandomForestClassifier(n_estimators=100)
    mapper = DataFrameMapper([
        (['sepal length (cm)', 'sepal width (cm)'], StandardScaler()),
        (['petal length (cm)', 'petal width (cm)'], Imputer()),
    ])
    pipeline = Pipeline([("mapping", mapper), ("rfc", model)])
    pipeline.fit(frame[features], frame[target])
    skl_to_pmml(pipeline, features, target, f_name)
    reconstructed = pml.parse(f_name, True)
    segmentation = reconstructed.MiningModel[0].Segmentation
    # 1. one PMML segment per tree in the forest
    self.assertEqual(model.n_estimators, len(segmentation.Segment))
    # 2. forests aggregate by majority vote
    self.assertEqual(segmentation.multipleModelMethod, "majorityVote")
def test_36_one_class_svm(self):
    """One-class SVM anomaly scoring: local scores and labels must match Zementis.

    Fix: the printed banner had an unbalanced parenthesis.
    """
    print("\ntest 36 (One Class SVM)\n")
    # server anomaly flag -> sklearn prediction label
    detection_map = {'true': -1, 'false': 1}
    df = pd.read_csv("nyoka/tests/train_ocsvm.csv")
    df_test = pd.read_csv("nyoka/tests/test_ocsvm.csv")
    features = df.columns
    model = OneClassSVM(nu=0.1)
    pipeline_obj = Pipeline([("model", model)])
    pipeline_obj.fit(df)
    file_name = 'test36sklearn.pmml'
    skl_to_pmml(pipeline_obj, features, '', file_name)
    model_pred = pipeline_obj.predict(df_test)
    model_scores = pipeline_obj.decision_function(df_test)
    model_name = self.adapa_utility.upload_to_zserver(file_name)
    z_predictions = self.adapa_utility.score_in_zserver(
        model_name, 'nyoka/tests/test_ocsvm.csv', 'ANOMALY')
    cnt = 0
    for idx, value in enumerate(z_predictions):
        score, is_anomaly = value.split(",")
        score = float(score)
        if "{:.6f}".format(score) != "{:.6f}".format(model_scores[idx]) \
                or model_pred[idx] != detection_map[is_anomaly]:
            cnt += 1
    self.assertEqual(cnt, 0)
def test_validate_sklearn_gboost_models_binary_class(self):
    """Gradient boosting with robust scaling (binary): PMML must be schema-valid."""
    pipe = Pipeline([
        ('scaler', RobustScaler()),
        ('model', GradientBoostingClassifier()),
    ])
    pipe.fit(self.X, self.y_bin)
    file_name = 'gboost_model_binary_classification.pmml'
    skl_to_pmml(pipe, self.features, 'binary', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_knn_models_regression(self):
    """KNN regressor: exported PMML must validate against the schema."""
    pipe = Pipeline([('model', KNeighborsRegressor())])
    pipe.fit(self.X_reg, self.y_reg)
    file_name = 'knn_model_regression.pmml'
    skl_to_pmml(pipe, self.features_reg, 'target', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_kmeans_models(self):
    """KMeans clustering model: exported PMML must validate against the schema."""
    pipe = Pipeline([('model', KMeans())])
    pipe.fit(self.X)
    file_name = 'kmeans_model.pmml'
    skl_to_pmml(pipe, self.features, 'target', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_sklearn_01(self):
    """SVC on iris: intercepts and the RBF gamma must survive the PMML round trip."""
    iris = datasets.load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['Species'] = iris.target
    features = frame.columns.drop('Species')
    target = 'Species'
    f_name = "svc_pmml.pmml"
    model = SVC()
    pipeline = Pipeline([('svm', model)])
    pipeline.fit(frame[features], frame[target])
    skl_to_pmml(pipeline, features, target, f_name)
    reconstructed = pml.parse(f_name, True)
    # 1. each exported SVM carries the model intercept as its absolute coefficient
    machines = reconstructed.SupportVectorMachineModel[0].SupportVectorMachine
    for intercept, machine in zip(model.intercept_, machines):
        self.assertEqual(
            "{:.16f}".format(intercept),
            "{:.16f}".format(machine.Coefficients.absoluteValue))
    # 2. RBF kernel gamma matches the fitted value
    self.assertEqual(
        reconstructed.SupportVectorMachineModel[0].RadialBasisKernelType.gamma,
        model._gamma)
def test_validate_sklearn_gnb_models_binary_class(self):
    """Gaussian naive Bayes (binary): exported PMML must validate against the schema."""
    pipe = Pipeline([('model', GaussianNB())])
    pipe.fit(self.X, self.y_bin)
    file_name = 'gnb_model_binary_classification.pmml'
    skl_to_pmml(pipe, self.features, 'binary', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_sklearn_30(self):
    """KMeans on iris: skl_to_pmml must produce an output file."""
    iris = datasets.load_iris()
    pipe = Pipeline([('model', KMeans())])
    pipe.fit(iris.data)
    file_name = 'kmeans_model.pmml'
    skl_to_pmml(pipe, iris.feature_names, 'target', file_name)
    self.assertEqual(os.path.isfile(file_name), True)
def test_validate_sklearn_linear_models_regression(self):
    """Linear regression with imputation and polynomial features: PMML must be schema-valid."""
    pipe = Pipeline([
        ('impute', Imputer()),
        ('feat', PolynomialFeatures()),
        ('model', LinearRegression()),
    ])
    pipe.fit(self.X_reg, self.y_reg)
    file_name = 'linear_model_regression.pmml'
    skl_to_pmml(pipe, self.features_reg, 'target', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_mlp_models_multiclass(self):
    """MLP classifier (multi-class): the exported PMML must validate against the schema.

    Fix: artifact name had a typo ('numlti' -> 'multi').
    """
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier()
    pipe = Pipeline([('model', model)])
    pipe.fit(self.X, self.y)
    file_name = 'mlp_model_multi_class_classification.pmml'
    skl_to_pmml(pipe, self.features, 'species', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_sklearn_40(self):
    """Passing a bare estimator (no Pipeline) to skl_to_pmml must raise TypeError."""
    iris = datasets.load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['Species'] = iris.target
    target = 'Species'
    features = frame.columns.drop('Species')
    model = GaussianProcessClassifier()
    model.fit(frame[features], frame[target])
    with self.assertRaises(TypeError):
        skl_to_pmml(model, features, target, "no_pipeline.pmml")
def test_validate_isolation_forest(self):
    """IsolationForest with scaling and imputation: exported PMML must be schema-valid."""
    iris = datasets.load_iris()
    X = iris.data
    features = iris.feature_names
    model = IsolationForest()
    pipeline = Pipeline([
        ('standard_scaler', StandardScaler()),
        ('Imputer', Imputer()),
        ('model', model),
    ])
    pipeline.fit(X)
    # artifact named after the estimator class
    file_name = model.__class__.__name__ + '.pmml'
    skl_to_pmml(pipeline, features, pmml_f_name=file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_rf_models_binary_class(self):
    """Random forest with a LabelBinarizer-mapped extra column (binary): PMML must be schema-valid."""
    df = pd.DataFrame(data=self.X, columns=self.features)
    df['new'] = [i % 3 for i in range(self.X.shape[0])]
    df['binary'] = self.y_bin
    pipe = Pipeline([
        ('mapper', DataFrameMapper([('new', LabelBinarizer())])),
        ('model', RandomForestClassifier()),
    ])
    columns = self.features + ['new']
    pipe.fit(df[columns], df.binary)
    file_name = 'rf_model_binary_classification.pmml'
    skl_to_pmml(pipe, columns, 'binary', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_gboost_models_multiclass(self):
    """Gradient boosting with a LabelEncoder-mapped extra column (multi-class): PMML must be schema-valid.

    Fix: artifact name had a typo ('numlti' -> 'multi').
    """
    df = pd.DataFrame(data=self.X, columns=self.features)
    df['new'] = [i % 3 for i in range(self.X.shape[0])]
    df['species'] = self.y
    model = GradientBoostingClassifier()
    pipe = Pipeline([('mapper', DataFrameMapper([('new', LabelEncoder())])),
                     ('model', model)])
    pipe.fit(df.drop(['species'], axis=1), df.species)
    file_name = 'gboost_model_multi_class_classification.pmml'
    skl_to_pmml(pipe, self.features + ['new'], 'species', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_sklearn_linear_models_multiclass(self):
    """Logistic regression with a binarized sepal-length column (multi-class): PMML must be schema-valid."""
    df = pd.DataFrame(data=self.X, columns=self.features)
    df['species'] = self.y
    mapper = DataFrameMapper([(['sepal length (cm)'], Binarizer())])
    pipe = Pipeline([
        ('mapper', mapper),
        ('model', LogisticRegression()),
    ])
    pipe.fit(df[self.features], df.species)
    file_name = 'linear_model_multi_class_classification.pmml'
    skl_to_pmml(pipe, self.features, 'species', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_sklearn_37(self):
    """Scaler + imputer + logistic regression pipeline: export must produce a file."""
    iris = datasets.load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['Species'] = iris.target
    target = 'Species'
    features = frame.columns.drop('Species')
    pipeline = Pipeline([
        ('new', StandardScaler()),
        ('imputer', Imputer()),
        ('model', LogisticRegression()),
    ])
    pipeline.fit(frame[features], frame[target])
    skl_to_pmml(pipeline, features, target, "imputer.pmml")
    self.assertEqual(os.path.isfile("imputer.pmml"), True)
def test_sklearn_31(self):
    """Gradient boosting with max-abs scaling on iris: export must produce a file.

    Fix: artifact name had a typo ('numlti' -> 'multi').
    """
    iris = datasets.load_iris()
    irisd = pd.DataFrame(iris.data, columns=iris.feature_names)
    irisd['Species'] = iris.target
    target = 'Species'
    features = irisd.columns.drop('Species')
    model = GradientBoostingClassifier()
    pipe = Pipeline([('scaler', MaxAbsScaler()), ('model', model)])
    pipe.fit(irisd[features], irisd[target])
    file_name = 'gbc_model_multi_class_classification.pmml'
    skl_to_pmml(pipe, iris.feature_names, target, file_name)
    self.assertEqual(os.path.isfile(file_name), True)
def test_validate_ocsvm(self):
    """One-class SVM with scaling and imputation: exported PMML must be schema-valid."""
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    features = iris.feature_names
    model = OneClassSVM()
    pipeline = Pipeline([
        ('standard_scaler', StandardScaler()),
        ('Imputer', Imputer()),
        ('model', model),
    ])
    pipeline.fit(X, y)
    # artifact named after the estimator class
    file_name = model.__class__.__name__ + '.pmml'
    skl_to_pmml(pipeline, features, pmml_f_name=file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_validate_lag(self):
    """Lag (stddev aggregation) preprocessing before logistic regression: PMML must be schema-valid."""
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    features = iris.feature_names
    model = LogisticRegression()
    pipeline = Pipeline([
        ('lag', Lag(aggregation="stddev", value=3)),
        ('model', model),
    ])
    pipeline.fit(X, y)
    file_name = model.__class__.__name__ + 'lag_stddev.pmml'
    skl_to_pmml(pipeline, features, 'species', pmml_f_name=file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_sklearn_39(self):
    """GaussianProcessClassifier export is unsupported and must raise NotImplementedError."""
    iris = datasets.load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['Species'] = iris.target
    target = 'Species'
    features = frame.columns.drop('Species')
    pipeline = Pipeline([('model', GaussianProcessClassifier())])
    pipeline.fit(frame[features], frame[target])
    with self.assertRaises(NotImplementedError):
        skl_to_pmml(pipeline, numpy.array(features), target, "gpc.pmml")
def test_validate_sklearn_sgd_with_text(self):
    """SGD classifier over TF-IDF text features: exported PMML must be schema-valid."""
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    # keep the fixture tiny: four documents are enough for export
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', model),
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)