@classmethod
def setUpClass(cls):
    # Create a temporary directory to store the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    # Create an instance of the model
    cls.model = XGBRegressorModel(
        XGBRegressorModelConfig(
            features=Features(
                Feature("Feature1", float, 1), Feature("Feature2", float, 1)
            ),
            predict=Feature("Target", float, 1),
            directory=cls.model_dir.name,
        )
    )
    # Generate data for the function f(x1, x2) = 2*x1 + 3*x2
    _n_data = 2000
    _temp_data = np.random.rand(2, _n_data)
    cls.records = [
        Record(
            "x" + str(random.random()),
            data={
                "features": {
                    "Feature1": float(_temp_data[0][i]),
                    "Feature2": float(_temp_data[1][i]),
                    "Target": 2 * _temp_data[0][i] + 3 * _temp_data[1][i],
                }
            },
        )
        for i in range(_n_data)
    ]
    cls.trainingsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[:1800]))
    )
    cls.testsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[1800:]))
    )
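These fixtures imply a training step that runs before the scoring tests further down; a minimal sketch, assuming DFFML's high-level train() helper and a hypothetical test_00_train method name:

# Hypothetical first test in the sequence: train on the in-memory source
# built in setUpClass. The method name test_00_train is an assumption.
async def test_00_train(self):
    await train(self.model, self.trainingsource)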
async def test_predict(self):
    self.required_plugins("dffml-model-scikit")
    # Import SciKit modules
    dffml_model_scikit = importlib.import_module("dffml_model_scikit")
    # Instantiate the model
    model = dffml_model_scikit.LinearRegressionModel(
        directory=self.mktempdir(),
        predict=Feature("Salary", int, 1),
        features=Features(
            Feature("Years", int, 1),
            Feature("Expertise", int, 1),
            Feature("Trust", float, 1),
        ),
    )
    training_data = CSVSource(filename=self.train_filename)
    test_data = CSVSource(filename=self.test_filename)
    predict_data = CSVSource(filename=self.predict_filename)
    # Train the model
    await train(model, training_data)
    # Assess accuracy
    await accuracy(model, test_data)
    # Make predictions
    predictions = [
        prediction async for prediction in predict(model, predict_data)
    ]
    self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
    self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)
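The predictions[0][2] indexing above relies on the high-level predict() helper yielding (key, features, prediction) tuples; a small sketch unpacking one of them, with illustrative variable names:

# Sketch: unpacking a single prediction tuple from the list above.
key, features, prediction = predictions[0]
assert round(prediction["Salary"]["value"]) == 70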
def test_config_set(self):
    config = FakeTesting.config(
        parse_unknown(
            "--test-fake-name", "feedface",
            "--test-num", "-4.2",
            "--test-fake-label", "default-label",
            "--test-fake-readonly",
            "--test-files", "a", "b", "c",
            "--test-fake-source", "csv",
            "--test-source-filename", "file.csv",
            "--test-features", "Year:int:1", "Commits:int:10",
        )
    )
    self.assertEqual(config.num, -4.2)
    self.assertEqual(config.files, ["a", "b", "c"])
    self.assertEqual(config.name, "feedface")
    self.assertEqual(config.label, "default-label")
    self.assertTrue(config.readonly)
    self.assertTrue(isinstance(config.source, CSVSource))
    self.assertEqual(config.source.config.filename, pathlib.Path("file.csv"))
    self.assertEqual(
        config.features,
        Features(Feature("Year", int, 1), Feature("Commits", int, 10)),
    )
def test_config_defaults(self):
    config = FakeTesting.config(
        parse_unknown(
            "--test-fake-name", "feedface",
            "--test-num", "-4.2",
            "--test-files", "a", "b", "c",
            "--test-source-filename", "file.json",
            "--test-features", "Year:int:1", "Commits:int:10",
            "--test-fake-nums", "100",
        )
    )
    self.assertEqual(config.num, -4.2)
    self.assertEqual(config.files, ["a", "b", "c"])
    self.assertEqual(config.name, "feedface")
    self.assertEqual(config.label, "unlabeled")
    self.assertFalse(config.readonly)
    self.assertTrue(isinstance(config.source, JSONSource))
    self.assertEqual(config.source.config.filename, pathlib.Path("file.json"))
    self.assertEqual(
        config.features,
        Features(Feature("Year", int, 1), Feature("Commits", int, 10)),
    )
    self.assertEqual(config.nums, (100,))
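Both tests parse CLI-style flags into a config object; a minimal sketch of the equivalent direct construction, assuming a backing FakeTestingConfig class whose field names match the --test-* flags (the class name is an assumption):

# Illustrative only: building the same config without CLI parsing.
config = FakeTestingConfig(
    name="feedface",
    num=-4.2,
    files=["a", "b", "c"],
    features=Features(Feature("Year", int, 1), Feature("Commits", int, 10)),
)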
@classmethod
def setUpClass(cls):
    # Create a temporary directory to store the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    # Create an instance of the model
    cls.model = AnomalyModel(
        features=Features(
            Feature("A", int, 1),
            Feature("B", int, 2),
        ),
        predict=Feature("Y", int, 1),
        directory=cls.model_dir.name,
    )
    # Generate normally distributed data
    _n_data = 1800
    _temp_data = np.random.normal(2, 1, size=(2, _n_data))
    cls.records = [
        Record(
            "x" + str(random.random()),
            data={
                "features": {
                    "A": float(_temp_data[0][i]),
                    "B": float(_temp_data[1][i]),
                    "Y": (_temp_data[0][i] > 1 - _temp_data[1][i]).astype(int),
                }
            },
        )
        for i in range(_n_data)
    ]
    cls.trainingsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[:1400]))
    )
    cls.testsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[1400:]))
    )
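The label condition A > 1 - B is equivalent to A + B > 1; a small sketch of the decision rule, rewritten for readability with illustrative inputs:

# Equivalent label rule (illustrative helper, not part of the test).
def label(a: float, b: float) -> int:
    return int(a + b > 1)

assert label(2.0, 0.5) == 1
assert label(0.2, 0.3) == 0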
async def test_01_accuracy(self):
    scorer = MeanSquaredErrorAccuracy()
    # Use the test data to score the model
    res = await score(
        self.model, scorer, Feature("Target", float, 1), self.testsource
    )
    # Ensure a score was returned
    self.assertTrue(res)
async def test_02_predict(self):
    # Compare training and test scores to check for overfitting
    res_train = await score(
        self.model,
        self.scorer,
        Feature("Target", float, 1),
        self.trainingsource,
    )
    res_test = await score(
        self.model,
        self.scorer,
        Feature("Target", float, 1),
        self.testsource,
    )
    # The test fails if the difference between the training and test
    # scores is more than 5%
    self.assertLess(res_train - res_test, 0.05)
async def test_01_accuracy(self):
    # Use the test data to assess the model's accuracy
    res = await score(
        self.model,
        self.scorer,
        Feature("Target", float, 1),
        self.testsource,
    )
    # Ensure the accuracy is at least 80%
    self.assertTrue(0.8 <= res)
async def test_predict(self):
    self.required_plugins("dffml-model-scikit")
    # Import SciKit modules
    dffml_model_scikit = importlib.import_module("dffml_model_scikit")
    # Instantiate the model
    model = dffml_model_scikit.LinearRegressionModel(
        location=self.mktempdir(),
        predict=Feature("Salary", int, 1),
        features=Features(
            Feature("Years", int, 1),
            Feature("Expertise", int, 1),
            Feature("Trust", float, 1),
        ),
    )
    training_data = CSVSource(filename=self.train_filename)
    test_data = CSVSource(filename=self.test_filename)
    predict_data = CSVSource(filename=self.predict_filename)
    # Train the model
    await train(model, training_data)
    # Assess accuracy
    scorer = MeanSquaredErrorAccuracy()
    await score(model, scorer, Feature("Salary", int, 1), test_data)
    # Make predictions
    predictions = [
        prediction async for prediction in predict(model, predict_data)
    ]
    self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
    self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)
    # Repeat with input data passed as lists rather than sources
    await train(model, *self.train_data)
    await score(model, scorer, Feature("Salary", int, 1), *self.test_data)
    predictions = [
        prediction
        async for prediction in predict(model, *self.predict_data)
    ]
    self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
    self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)
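The *self.train_data calls above assume fixtures holding plain dicts of feature values, which the high-level helpers accept directly; a minimal sketch of what those fixtures might look like (the exact rows are illustrative assumptions):

# Illustrative fixtures: one dict of feature values per record.
train_data = [
    {"Years": 0, "Expertise": 1, "Trust": 0.1, "Salary": 10},
    {"Years": 1, "Expertise": 3, "Trust": 0.2, "Salary": 20},
]
test_data = [
    {"Years": 2, "Expertise": 5, "Trust": 0.3, "Salary": 30},
]
predict_data = [
    {"Years": 6, "Expertise": 13, "Trust": 0.7},
]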
    **{
        "directory": (
            pathlib.Path,
            field("Directory where state should be saved"),
        ),
        "features": (Features, field("Features to train on")),
    },
    **config_fields,
}
if estimator_type in unsupervised_estimators:
    dffml_config_properties["predict"] = (
        Feature,
        field(
            "Name used as meaning of prediction",
            default=Feature(name="cluster", dtype=str, length=1),
        ),
    )
dffml_config = make_config_numpy(
    name + "ModelConfig", cls, properties=dffml_config_properties
)
dffml_cls_ctx = type(
    name + "ModelContext",
    (parentContext,),
    {},
)
dffml_cls = type(
    name + "Model",
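The two type() calls build the context and model classes dynamically; a rough static equivalent, assuming a hypothetical "Example" estimator name (the fragment is truncated before the model class body, so the CONFIG/CONTEXT attributes below are assumptions based on DFFML's usual Model convention):

# Rough static equivalent of the dynamic class creation above (illustrative).
class ExampleModelContext(parentContext):
    pass

class ExampleModel(Model):
    CONFIG = dffml_config
    CONTEXT = ExampleModelContext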
async def test_01_accuracy(self):
    res = await score(
        self.model, self.scorer, Feature("Tag", str, 1), self.train_sources
    )
    self.assertGreaterEqual(res, 0)
@config
class FakeTestingConfig2:
    name: str = field("Name of FakeTesting2")
    num: float
    features: Features = Features(
        Feature("default", int, 1), Feature("features", int, 10)
    )
    label: str = "unlabeled"
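A minimal usage sketch for this config class, assuming DFFML's @config machinery supplies dataclass-style keyword construction (the values are illustrative):

# Illustrative: required fields (name, num) must be supplied; features and
# label fall back to their declared defaults.
cfg = FakeTestingConfig2(name="example", num=3.14)
assert cfg.label == "unlabeled"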