def test_binarizer(self):
    """Check Binarizer params: listing, defaults, setters, copy and constructor kwargs."""
    b0 = Binarizer()
    self.assertListEqual(
        b0.params,
        [
            b0.inputCol,
            b0.inputCols,
            b0.outputCol,
            b0.outputCols,
            b0.threshold,
            b0.thresholds,
        ],
    )
    # Logical `not` is required here: bitwise `~` on a bool yields -1 or -2,
    # both truthy, which would make this assertion pass unconditionally.
    self.assertTrue(all(not b0.isSet(p) for p in b0.params))
    self.assertTrue(b0.hasDefault(b0.threshold))
    self.assertEqual(b0.getThreshold(), 0.0)

    b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
    # Only some of the listed params were assigned above, so not all are set.
    self.assertFalse(all(b0.isSet(p) for p in b0.params))
    self.assertEqual(b0.getThreshold(), 1.0)
    self.assertEqual(b0.getInputCol(), "input")
    self.assertEqual(b0.getOutputCol(), "output")

    # A copy with an override keeps the uid and param list but takes the new value.
    b0c = b0.copy({b0.threshold: 2.0})
    self.assertEqual(b0c.uid, b0.uid)
    self.assertListEqual(b0c.params, b0.params)
    self.assertEqual(b0c.getThreshold(), 2.0)

    # A separately constructed instance gets its own uid and honours constructor kwargs.
    b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
    self.assertNotEqual(b1.uid, b0.uid)
    self.assertEqual(b1.getThreshold(), 2.0)
    self.assertEqual(b1.getInputCol(), "input")
    self.assertEqual(b1.getOutputCol(), "output")
def test_preserve_set_state(self):
    """Verify that ``threshold`` stays unset across ``transform`` and a param sync from Java.

    The Binarizer is built with only ``inputCol``; ``threshold`` is never
    assigned, so ``isSet("threshold")`` must be false both before and after.
    """
    frame = self.spark.createDataFrame([(0.5,)], ["data"])
    transformer = Binarizer(inputCol="data")

    # Precondition: threshold has not been explicitly set.
    self.assertFalse(transformer.isSet("threshold"))

    transformer.transform(frame)
    transformer._transfer_params_from_java()

    self.assertFalse(
        transformer.isSet("threshold"),
        "Params not explicitly set should remain unset after transform",
    )
def test_preserve_set_state(self):
    """Ensure an unset ``threshold`` is still unset after transforming and syncing params.

    Only ``inputCol`` is passed to the constructor, so ``threshold`` is
    expected to report as not set throughout the test.
    """
    input_df = self.spark.createDataFrame([(0.5,)], ["data"])
    bin_stage = Binarizer(inputCol="data")

    was_set_before = bin_stage.isSet("threshold")
    self.assertFalse(was_set_before)

    # Run the transform, then pull the param state back from the Java side.
    bin_stage.transform(input_df)
    bin_stage._transfer_params_from_java()

    is_set_after = bin_stage.isSet("threshold")
    self.assertFalse(
        is_set_after,
        "Params not explicitly set should remain unset after transform",
    )
def test_default_params_transferred(self):
    """Verify that a Python-side default for ``outputCol`` is honoured by ``transform``.

    The default is overwritten directly in ``_defaultParamMap`` without
    setting the param; the transformed frame must then expose a column
    named ``my_default`` while ``outputCol`` still reports as unset.
    """
    frame = self.spark.createDataFrame([(0.5,)], ["data"])
    transformer = Binarizer(inputCol="data")

    # intentionally change the pyspark default, but don't set it
    transformer._defaultParamMap[transformer.outputCol] = "my_default"

    transformed = transformer.transform(frame)
    rows = transformed.select("my_default").collect()

    self.assertFalse(transformer.isSet(transformer.outputCol))
    first_row = rows[0]
    self.assertEqual(first_row[0], 1.0)
def test_default_params_transferred(self):
    """Check that an overridden ``outputCol`` default reaches the transform output.

    ``_defaultParamMap`` is edited in place so the param has a new default
    but is never explicitly set; the result is read back through the
    ``my_default`` column.
    """
    input_df = self.spark.createDataFrame([(0.5,)], ["data"])
    bin_stage = Binarizer(inputCol="data")

    # intentionally change the pyspark default, but don't set it
    output_param = bin_stage.outputCol
    bin_stage._defaultParamMap[output_param] = "my_default"

    collected = bin_stage.transform(input_df).select("my_default").collect()

    # The param must still count as unset even though its default was used.
    self.assertFalse(bin_stage.isSet(bin_stage.outputCol))
    self.assertEqual(collected[0][0], 1.0)
def test_binarizer(self):
    """Check Binarizer params: listing, defaults, setters, copy and constructor kwargs."""
    b0 = Binarizer()
    self.assertListEqual(b0.params, [b0.inputCol, b0.outputCol, b0.threshold])
    # Logical `not` is required here: bitwise `~` on a bool yields -1 or -2,
    # both truthy, which would make this assertion pass unconditionally.
    self.assertTrue(all(not b0.isSet(p) for p in b0.params))
    self.assertTrue(b0.hasDefault(b0.threshold))
    self.assertEqual(b0.getThreshold(), 0.0)

    b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
    # Every listed param has now been assigned explicitly.
    self.assertTrue(all(b0.isSet(p) for p in b0.params))
    self.assertEqual(b0.getThreshold(), 1.0)
    self.assertEqual(b0.getInputCol(), "input")
    self.assertEqual(b0.getOutputCol(), "output")

    # A copy with an override keeps the uid and param list but takes the new value.
    b0c = b0.copy({b0.threshold: 2.0})
    self.assertEqual(b0c.uid, b0.uid)
    self.assertListEqual(b0c.params, b0.params)
    self.assertEqual(b0c.getThreshold(), 2.0)

    # A separately constructed instance gets its own uid and honours constructor kwargs.
    b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
    self.assertNotEqual(b1.uid, b0.uid)
    self.assertEqual(b1.getThreshold(), 2.0)
    self.assertEqual(b1.getInputCol(), "input")
    self.assertEqual(b1.getOutputCol(), "output")