# Assumed imports for these test methods (the surrounding test-case classes
# and their setUp are not shown here); the helper module paths below are a
# best guess at the regression-test lib layout.
import os
import tarfile

from sparktkregtests.lib import config
from sparktkregtests.lib import scoring_utils


def test_arimax_scoring(self):
    """Tests standard usage of arimax."""
    output = self.context.models.timeseries.arimax.train(
        self.train_frame, self.ts_column, self.x_columns, 1, 0, 1, 0)

    # flatten the time series and exogenous columns into plain lists
    timeseries_column = self.actual_data.take(
        n=self.actual_data.count(), columns=self.ts_column)
    y = [item for sublist in timeseries_column for item in sublist]
    x_columns = self.actual_data.take(
        n=self.actual_data.count(), columns=self.x_columns)
    x = [item for sublist in x_columns for item in sublist]

    predict_frame = output.predict(
        self.actual_data, self.ts_column, self.x_columns)
    predict_data = predict_frame.take(
        n=self.actual_data.count(), columns="predicted_y")
    expected_score = [item for sublist in predict_data for item in sublist]

    model_path = output.export_to_mar(
        self.get_export_file(self.get_name("arimax")))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        r = scorer.score([{"y": y, "x_values": x}])
        scored = r.json()["data"][0]["score"]
        self.assertEqual(scored, expected_score)
def test_revise_model(self):
    """Tests revise api in scoring engine"""
    kmodel = self.context.models.clustering.kmeans.train(
        self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"], 5)
    kmodel.predict(self.frame_test)
    old_model_path = kmodel.export_to_mar(
        self.get_export_file(self.get_name("kmeans")))

    # create a revised model trained on fewer columns and clusters
    kmodel_revised = self.context.models.clustering.kmeans.train(
        self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4"], 4,
        max_iterations=10)
    result_revised = kmodel_revised.predict(self.frame_test)
    test_rows = result_revised.to_pandas(50)
    revised_model_path = kmodel_revised.export_to_mar(
        self.get_export_file(self.get_name("kmeans_revised")))

    with scoring_utils.scorer(old_model_path, self.id()) as scorer:
        # swap the revised model into the running engine
        res = scorer.revise(revised_model_path)
        self.assertEqual(res.json()["status"], "success")

        # the engine should now score with the revised model
        for _, i in test_rows.iterrows():
            res = scorer.score(
                [dict(zip(["Vec1", "Vec2", "Vec3", "Vec4"],
                          list(i[0:4])))])
            self.assertEqual(i["cluster"], res.json()["data"][0]['score'])
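# The assertions in the revise tests lean on the scoring engine's JSON
# envelope: scorer.score() posts a list of {column: value} records and the
# engine replies with {"data": [...]}, one entry per record, carrying the
# prediction under a model-specific key ("score" for kmeans, "Prediction"
# for the regressors). An illustrative exchange (values made up):
#
#   request:  [{"Vec1": 0.1, "Vec2": 0.2, "Vec3": 0.3, "Vec4": 0.4}]
#   response: {"data": [{"Vec1": 0.1, ..., "score": 2}]}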
def test_scoring_pipeline(self):
    """Test scoring_pipeline"""
    model = self.context.models.classification.naive_bayes.train(
        self.frame, ['f1', 'f2', 'f3'], "label")
    res = model.predict(self.frame, ['f1', 'f2', 'f3'])
    analysis = res.to_pandas()
    file_name = self.get_name("naive_bayes")
    model_path = model.export_to_mar(self.get_export_file(file_name))

    # bundle the pipeline functions and config into a tarball
    self.tarfile = "pipeline.tar"
    pipeline_funcs = os.path.join(
        config.root, "regression-tests", "sparktkregtests",
        "testcases", "scoretests", "pipeline_funcs.py")
    pipeline_config = os.path.join(
        config.root, "regression-tests", "sparktkregtests",
        "testcases", "scoretests", "pipeline_config.json")
    tar = tarfile.open(self.tarfile, "w:gz")
    tar.add(pipeline_funcs, "pipeline_funcs.py")
    tar.add(pipeline_config, "pipeline_config.json")
    tar.close()

    with scoring_utils.scorer(model_path, self.id(), pipeline=True,
                              pipeline_filename=self.tarfile) as scorer:
        for _, i in analysis.iterrows():
            r = scorer.score(
                [dict(zip(['f1', 'f2', 'f3'],
                          map(lambda x: int(x), (i[1:4]))))])
            self.assertEqual(r.json(), i['predicted_class'])
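# For reference, a hypothetical shape for the two pipeline files bundled
# above. The real pipeline_funcs.py / pipeline_config.json live beside the
# tests and define the actual contract; this sketch only illustrates the
# pre/post-processing idea, and the function names are assumptions.
#
# pipeline_funcs.py:
#
#     def pre_process(records):
#         """Coerce incoming feature values to int before scoring."""
#         return [dict((k, int(v)) for k, v in r.items()) for r in records]
#
#     def post_process(response):
#         """Reduce the engine envelope to the bare predicted class."""
#         return response["data"][0]["Score"]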
def test_arima_scoring(self):
    """Tests standard usage of arima."""
    timeseries_column = self.train_frame.take(
        n=self.train_frame.count(), columns=self.ts_column)
    timeseries_data = [
        item for sublist in timeseries_column for item in sublist]

    output = self.context.models.timeseries.arima.train(
        timeseries_data, 1, 0, 1)
    predict = output.predict(0)
    model_path = output.export_to_mar(
        self.get_export_file(self.get_name("arima")))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        r = scorer.score([{"future": 0, "timeseries": timeseries_data}])
        scored = r.json()["data"][0]["predicted_values"]
        self.assertEqual(scored, predict)
def test_model_scoring(self):
    """Test publishing a linear regression model"""
    model = self.context.models.regression.linear_regression.train(
        self.frame, ['c1', 'c2', 'c3', 'c4'], 'label')
    predict = model.predict(self.frame, ['c1', 'c2', 'c3', 'c4'])
    test_rows = predict.to_pandas(50)
    file_name = self.get_name("linear_regression")
    model_path = model.export_to_mar(self.get_export_file(file_name))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        for _, i in test_rows.iterrows():
            res = scorer.score(
                [dict(zip(["c1", "c2", "c3", "c4"], list(i[0:4])))])
            self.assertEqual(i['predicted_value'],
                             res.json()["data"][0]['Prediction'])
def test_model_scoring(self):
    """Test naive bayes model scoring"""
    model = self.context.models.classification.naive_bayes.train(
        self.frame, ['f1', 'f2', 'f3'], "label")
    res = model.predict(self.frame, ['f1', 'f2', 'f3'])
    analysis = res.to_pandas()
    file_name = self.get_name("naive_bayes")
    model_path = model.export_to_mar(self.get_export_file(file_name))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        for _, i in analysis.iterrows():
            r = scorer.score(
                [dict(zip(['f1', 'f2', 'f3'],
                          map(lambda x: int(x), (i[1:4]))))])
            self.assertEqual(r.json()["data"][0]['Score'],
                             i['predicted_class'])
def test_model_scoring(self):
    """Tests standard usage of the kmeans cluster algorithm."""
    kmodel = self.context.models.clustering.kmeans.train(
        self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"], 5)
    result_frame = kmodel.predict(self.frame_test)
    test_rows = result_frame.to_pandas(50)
    result = kmodel.export_to_mar(
        self.get_export_file(self.get_name("kmeans")))

    with scoring_utils.scorer(result, self.id()) as scorer:
        for _, i in test_rows.iterrows():
            res = scorer.score(
                [dict(zip(["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"],
                          list(i[0:5])))])
            self.assertEqual(i["cluster"], res.json()["data"][0]['score'])
def test_max_scoring(self):
    """Tests standard usage of the MAX timeseries model."""
    output = self.context.models.timeseries.max.train(
        self.train_frame, self.ts_column, self.x_columns, 1, 0)
    predict_frame = output.predict(
        self.actual_data, self.ts_column, self.x_columns)

    # flatten the time series and exogenous columns into plain lists
    timeseries_column = self.actual_data.take(
        n=self.actual_data.count(), columns=self.ts_column)
    y = [item for sublist in timeseries_column for item in sublist]
    x_columns = self.actual_data.take(
        n=self.actual_data.count(), columns=self.x_columns)
    x = [item for sublist in x_columns for item in sublist]

    predict_data = predict_frame.take(
        n=self.actual_data.count(), columns="predicted_y")
    expected_score = [item for sublist in predict_data for item in sublist]

    model_path = output.export_to_mar(
        self.get_export_file(self.get_name("max")))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        r = scorer.score([{"y": y, "x_values": x}])
        scored = r.json()["data"][0]["score"]
        self.assertEqual(scored, expected_score)
def test_model_scoring(self):
    """Test lda model scoring"""
    model = self.context.models.clustering.lda.train(
        self.lda_frame, 'paper', 'word', 'count',
        num_topics=5, max_iterations=10, seed=0)
    test_phrase = [
        "word-0-0", "word-1-0", "word-2-0", "word-3-0", "word-4-0"]
    file_name = self.get_name("lda")
    model_path = model.export_to_mar(self.get_export_file(file_name))
    res = model.predict(test_phrase)["topics_given_doc"]

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        result = scorer.score([{"paper": test_phrase}]).json()
        for i, j in zip(res, result[u"data"][0]["topics_given_doc"]):
            self.assertAlmostEqual(i, j)
def test_reg_scoring(self):
    """Test random forest regressor scoring model"""
    rfmodel = self.context.models.regression.random_forest_regressor.train(
        self.frame, ["feat1", "feat2"], "class", seed=0)
    predresult = rfmodel.predict(self.frame)
    preddf = predresult.to_pandas(predresult.count())
    file_name = self.get_name("random_forest_regressor")
    model_path = rfmodel.export_to_mar(self.get_export_file(file_name))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        for i, row in preddf.iterrows():
            res = scorer.score(
                [dict(zip(["feat1", "feat2"], list(row[0:2])))])
            self.assertAlmostEqual(
                float(row[3]),
                float(res.json()["data"][0]['Prediction']))
def test_model_scoring(self):
    """Verify that SvmModel operates as expected."""
    # Test set is a 3x3 square lattice of points
    # with a fully accurate, linear, unbiased divider.
    train_lattice = ["+++", "++-", "---"]
    training_frame = self.lattice2frame(train_lattice)
    svm_model = self.context.models.classification.svm.train(
        training_frame, ["x", "y"], u"model_class")
    file_name = self.get_name("svm")
    model_path = svm_model.export_to_mar(self.get_export_file(file_name))
    test_rows = training_frame.to_pandas(training_frame.count())

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        for _, i in test_rows.iterrows():
            res = scorer.score([dict(zip(["x", "y"], list(i[0:2])))])
            self.assertEqual(i[2], res.json()["data"][0]['Prediction'])
def test_model_scoring(self):
    """Test publishing a gmm model"""
    model = self.context.models.clustering.gmm.train(
        self.frame, ["x1", "x2"], column_scalings=[1.0, 1.0],
        k=5, max_iterations=500, seed=20, convergence_tol=0.0001)
    predict = model.predict(self.frame)
    test_rows = predict.to_pandas(predict.count())
    file_name = self.get_name("gmm")
    model_path = model.export_to_mar(self.get_export_file(file_name))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        for i, row in test_rows.iterrows():
            res = scorer.score([dict(zip(["x1", "x2"], list(row[0:2])))])
            self.assertEqual(row["predicted_cluster"],
                             res.json()["data"][0]['Score'])
def test_model_scoring(self):
    """Test publishing a logistic regression model"""
    model = self.context.models.classification.logistic_regression.train(
        self.frame, ["vec0", "vec1", "vec2", "vec3", "vec4"], 'res')
    predict = model.predict(
        self.frame, ["vec0", "vec1", "vec2", "vec3", "vec4"])
    test_rows = predict.to_pandas(100)
    file_name = self.get_name("logistic_regression")
    model_path = model.export_to_mar(self.get_export_file(file_name))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        for i, row in test_rows.iterrows():
            res = scorer.score(
                [dict(zip(["vec0", "vec1", "vec2", "vec3", "vec4"],
                          list(row[0:5])))])
            self.assertEqual(row["predicted_label"],
                             res.json()["data"][0]['PredictedLabel'])
def test_model_scoring(self):
    """Test pca scoring"""
    columns = ["X1", "X2", "X3", "X4", "X5",
               "X6", "X7", "X8", "X9", "X10"]
    model = self.context.models.dimreduction.pca.train(
        self.frame, columns, False, 10)
    file_name = self.get_name("pca")
    model_path = model.export_to_mar(self.get_export_file(file_name))

    with scoring_utils.scorer(model_path, self.id()) as scorer:
        baseline = model.predict(self.frame, mean_centered=False)
        testvals = baseline.to_pandas(50)
        for _, i in testvals.iterrows():
            r = scorer.score([dict(zip(columns, list(i[0:10])))])
            # compare each scored principal component to the baseline;
            # a plain loop (not map) so the assertions always execute
            scored = r.json()["data"][-1]["principal_components"]
            for x, y in zip(scored, i[10:]):
                self.assertAlmostEqual(float(x), float(y))
def test_revise_model(self):
    """Tests revise api in scoring engine"""
    model = self.context.models.regression.linear_regression.train(
        self.frame, ['c1', 'c2', 'c3', 'c4'], 'label')
    old_model_path = model.export_to_mar(
        self.get_export_file(self.get_name("lin_reg")))

    # create a revised model trained on fewer columns
    model_revised = self.context.models.regression.linear_regression.train(
        self.frame, ['c1', 'c2', 'c3'], 'label', max_iterations=10)
    result_revised = model_revised.predict(self.frame, ['c1', 'c2', 'c3'])
    test_rows = result_revised.to_pandas(50)
    revised_model_path = model_revised.export_to_mar(
        self.get_export_file(self.get_name("lin_reg_revised")))

    with scoring_utils.scorer(old_model_path, self.id()) as scorer:
        res = scorer.revise(revised_model_path)
        self.assertEqual(res.json()["status"], "success")

        for _, i in test_rows.iterrows():
            res = scorer.score(
                [dict(zip(["c1", "c2", "c3"], list(i[0:3])))])
            self.assertEqual(i['predicted_value'],
                             res.json()["data"][0]['Prediction'])
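# A minimal sketch of the scorer context manager these tests depend on.
# The real implementation lives in the regression-test lib
# (scoring_utils.scorer); the launcher command, port, and endpoint paths
# below are assumptions for illustration only.
import contextlib
import subprocess

import requests


class _ScorerClient(object):
    """Tiny REST client for a running scoring engine (illustrative)."""

    def __init__(self, base_url):
        self.base_url = base_url

    def score(self, records):
        # POST a list of {column: value} dicts; the engine returns each
        # record with its prediction fields added under "data".
        return requests.post(self.base_url + "/score", json=records)

    def revise(self, model_path):
        # Hot-swap the served model for a new .mar archive.
        with open(model_path, "rb") as f:
            return requests.post(self.base_url + "/revise", data=f)


@contextlib.contextmanager
def scorer(model_path, name, pipeline=False, pipeline_filename=None):
    # `name` identifies the test run; the real launcher also handles the
    # optional pipeline tarball. Command and port are hypothetical.
    proc = subprocess.Popen(
        ["model-scoring", "--model", model_path, "--port", "9100"])
    try:
        yield _ScorerClient("http://localhost:9100")
    finally:
        proc.terminate()
        proc.wait()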