def test_type_error(self):
    """Fitting must raise TypeError when column types don't suit the estimator."""
    # A string-typed "features" column cannot be fed to PCA.
    badFeaturesDF = self.spark.createDataFrame(
        [("a", 0), ("b", 0)]).toDF("features", "key")
    with self.assertRaises(TypeError):
        KeyedEstimator(sklearnEstimator=PCA()).fit(badFeaturesDF)

    # An array-typed "y" column is not a valid regression label.
    rows = [(Vectors.dense([i]), [i], 0) for i in range(10)]
    badLabelDF = self.spark.createDataFrame(rows).toDF("features", "y", "key")
    with self.assertRaises(TypeError):
        KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y").fit(badLabelDF)
def test_correct_estimator_type(self):
    """Estimator type is inferred from the sklearn estimator unless overridden."""
    customClusterer = KeyedModelTests._CustomClusterer()
    customTransformer = KeyedModelTests._CustomTransformer()
    cases = [
        (KeyedEstimator(sklearnEstimator=PCA()), "transformer"),
        (KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y"),
         "predictor"),
        (KeyedEstimator(sklearnEstimator=DBSCAN()), "clusterer"),
        (KeyedEstimator(sklearnEstimator=KMeans()), "clusterer"),
        # An explicit estimatorType overrides the inferred one.
        (KeyedEstimator(sklearnEstimator=KMeans(), estimatorType="transformer"),
         "transformer"),
        (KeyedEstimator(sklearnEstimator=customClusterer), "clusterer"),
        (KeyedEstimator(sklearnEstimator=customClusterer,
                        estimatorType="transformer"), "transformer"),
        (KeyedEstimator(sklearnEstimator=customTransformer), "transformer"),
    ]
    for keyedEstimator, expectedType in cases:
        self.checkEstimatorType(keyedEstimator, expectedType)
def test_invalid_argument(self):
    """Invalid constructor configurations must raise ValueError.

    NOTE(review): a longer test_invalid_argument also appears in this file;
    within one class the later definition shadows the earlier — confirm
    whether both versions are intended to exist.
    """
    # sklearnEstimator is mandatory.
    self.assertRaises(ValueError, KeyedEstimator)

    class SomeUDC(object):
        pass

    badConfigs = [
        # sklearnEstimator must be an sklearn estimator.
        {"sklearnEstimator": 5},
        {"sklearnEstimator": SomeUDC()},
        # At least one key column is required.
        {"sklearnEstimator": PCA(), "keyCols": []},
        # No column may be named "estimator".
        {"sklearnEstimator": PCA(), "keyCols": ["key", "estimator"]},
        {"sklearnEstimator": PCA(), "xCol": "estimator"},
        {"sklearnEstimator": LinearRegression(), "yCol": "estimator"},
        {"sklearnEstimator": PCA(), "yCol": "estimator"},
        # A regressor without a yCol is rejected.
        {"sklearnEstimator": LinearRegression()},
    ]
    for badKwargs in badConfigs:
        with self.assertRaises(ValueError):
            KeyedEstimator(**badKwargs)
def test_attr_error(self):
    """Estimators typed against their sklearn capabilities fail at prediction.

    Each configuration pairs an sklearn estimator with an estimatorType it
    cannot actually serve (e.g. PCA as a clusterer), and
    checkPredictionAttrError verifies the resulting attribute error.
    """
    mismatched = [
        KeyedEstimator(sklearnEstimator=PCA(), estimatorType="clusterer"),
        KeyedEstimator(sklearnEstimator=PCA(), yCol="y",
                       estimatorType="predictor"),
        KeyedEstimator(sklearnEstimator=DBSCAN(), estimatorType="transformer"),
        KeyedEstimator(sklearnEstimator=DBSCAN(), yCol="y",
                       estimatorType="predictor"),
    ]
    for keyedEstimator in mismatched:
        self.checkPredictionAttrError(keyedEstimator)
def test_surprise_key(self):
    """Keys never seen during fit transform to a null output value."""
    keyedEstimator = KeyedEstimator(sklearnEstimator=PCA())
    schema = StructType().add("features", LongType()).add("key", LongType())

    # Fit on an empty frame: no per-key models are learned.
    emptyDF = self.spark.createDataFrame([], schema)
    keyedModel = keyedEstimator.fit(emptyDF)
    self.assertEqual(keyedModel.keyedModels.collect(), [])
    self.assertEqual(keyedModel.keyedModels.dtypes,
                     [("key", LongType().simpleString()),
                      ("estimator", "sklearn-estimator")])

    # Transforming an unseen key yields None in the output column.
    surpriseDF = self.spark.createDataFrame([(1, 2)], schema)
    transformed = keyedModel.transform(surpriseDF)
    self.assertEqual(transformed.collect(), [(1, 2, None)])
    self.assertEqual(transformed.dtypes,
                     [("features", "bigint"),
                      ("key", "bigint"),
                      ("output", "vector")])
def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen, **kwargs): NUSERS = 10 # featureGen() should generate a np rank-1 ndarray of equal length # labelGen() should generate a scalar assert (labelGen is not None) == ("yCol" in kwargs) isPredictor = labelGen is not None # sklearn's LinearRegression estimator is stable even if undetermined. # User keys are just [0, NUSERS), repeated for each key if there are multiple columns. # The i-th user has i examples. keyCols = kwargs.get("keyCols", KeyedEstimator._paramSpecs["keyCols"]["default"]) outputCol = kwargs.get("outputCol", KeyedEstimator._paramSpecs["outputCol"]["default"]) xCol = kwargs.get("xCol", KeyedEstimator._paramSpecs["xCol"]["default"]) nExamplesPerUser = lambda i: max(minExamples, i + 1) userKeys = [[i for _ in keyCols] for i in range(NUSERS)] features = [[featureGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)] useless = [["useless col" for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)] if isPredictor: labels = [[labelGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)] else: labels = None Xs = [np.vstack(x) for x in features] ys = [np.array(y) for y in labels] if isPredictor else repeat(None) localEstimators = [sklearn.base.clone(kwargs["sklearnEstimator"]).fit(X, y) for X, y in zip(Xs, ys)] expectedDF = pd.DataFrame(userKeys, columns=keyCols) expectedDF["estimator"] = localEstimators def flattenAndConvertNumpy(x): return [Vectors.dense(i) if isinstance(i, np.ndarray) else i for i in chain.from_iterable(x)] inputDF = pd.DataFrame.from_dict( {k: [i for i in range(NUSERS) for _ in range(nExamplesPerUser(i))] for k in keyCols}) inputDF[xCol] = flattenAndConvertNumpy(features) inputDF["useless"] = flattenAndConvertNumpy(useless) if labels: inputDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels) inputDF = self.spark.createDataFrame(inputDF) ke = KeyedEstimator(**kwargs) km = ke.fit(inputDF) actualDF = km.keyedModels.toPandas() 
_assertPandasAlmostEqual(actualDF, expectedDF, keyCols) # Test users with different amounts of points. nTestPerUser = lambda i: NUSERS // 4 if i < NUSERS // 2 else NUSERS * 3 // 4 testFeatures = [[featureGen() for _ in range(nTestPerUser(i))] for i in range(NUSERS)] # "useless" column has nothing to do with computation, but is essential for keeping order # the same between the spark and non-spark versions useless = [range(nTestPerUser(i)) for i in range(NUSERS)] inputDF = pd.DataFrame.from_dict( {k: [i for i in range(NUSERS) for _ in range(nTestPerUser(i))] for k in keyCols}) inputDF[xCol] = flattenAndConvertNumpy(testFeatures) inputDF["useless"] = flattenAndConvertNumpy(useless) estimatorType = km.sklearnEstimatorType # tested to be correct elsewhere def makeOutput(estimator, X): if estimatorType == "transformer": return estimator.transform(X) else: assert estimatorType == "predictor" or estimatorType == "clusterer" return estimator.predict(X).tolist() Xs = [np.vstack(x) for x in testFeatures] expectedOutput = map(makeOutput, localEstimators, Xs) expectedDF = inputDF.copy(deep=True) expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput) inputDF = self.spark.createDataFrame(inputDF) actualDF = km.transform(inputDF).toPandas() _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])
def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen, **kwargs): NUSERS = 10 # featureGen() should generate a np rank-1 ndarray of equal length # labelGen() should generate a scalar assert (labelGen is not None) == ("yCol" in kwargs) isPredictor = labelGen is not None # sklearn's LinearRegression estimator is stable even if undetermined. # User keys are just [0, NUSERS), repeated for each key if there are multiple columns. # The i-th user has i examples. keyCols = kwargs.get("keyCols", KeyedEstimator._paramSpecs["keyCols"]["default"]) outputCol = kwargs.get( "outputCol", KeyedEstimator._paramSpecs["outputCol"]["default"]) xCol = kwargs.get("xCol", KeyedEstimator._paramSpecs["xCol"]["default"]) nExamplesPerUser = lambda i: max(minExamples, i + 1) userKeys = [[i for _ in keyCols] for i in range(NUSERS)] features = [[featureGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)] useless = [["useless col" for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)] if isPredictor: labels = [[labelGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)] else: labels = None Xs = [np.vstack(x) for x in features] ys = [np.array(y) for y in labels] if isPredictor else repeat(None) localEstimators = [ sklearn.base.clone(kwargs["sklearnEstimator"]).fit(X, y) for X, y in zip(Xs, ys) ] expectedDF = pd.DataFrame(userKeys, columns=keyCols) expectedDF["estimator"] = localEstimators def flattenAndConvertNumpy(x): return [ Vectors.dense(i) if isinstance(i, np.ndarray) else i for i in chain.from_iterable(x) ] inputDF = pd.DataFrame.from_dict({ k: [i for i in range(NUSERS) for _ in range(nExamplesPerUser(i))] for k in keyCols }) inputDF[xCol] = flattenAndConvertNumpy(features) inputDF["useless"] = flattenAndConvertNumpy(useless) if labels: inputDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels) inputDF = self.spark.createDataFrame(inputDF) ke = KeyedEstimator(**kwargs) km = ke.fit(inputDF) actualDF = km.keyedModels.toPandas() 
_assertPandasAlmostEqual(actualDF, expectedDF, keyCols) # Test users with different amounts of points. nTestPerUser = lambda i: NUSERS // 4 if i < NUSERS // 2 else NUSERS * 3 // 4 testFeatures = [[featureGen() for _ in range(nTestPerUser(i))] for i in range(NUSERS)] # "useless" column has nothing to do with computation, but is essential for keeping order # the same between the spark and non-spark versions useless = [range(nTestPerUser(i)) for i in range(NUSERS)] inputDF = pd.DataFrame.from_dict({ k: [i for i in range(NUSERS) for _ in range(nTestPerUser(i))] for k in keyCols }) inputDF[xCol] = flattenAndConvertNumpy(testFeatures) inputDF["useless"] = flattenAndConvertNumpy(useless) estimatorType = km.sklearnEstimatorType # tested to be correct elsewhere def makeOutput(estimator, X): if estimatorType == "transformer": return estimator.transform(X) else: assert estimatorType == "predictor" or estimatorType == "clusterer" return estimator.predict(X).tolist() Xs = [np.vstack(x) for x in testFeatures] expectedOutput = map(makeOutput, localEstimators, Xs) expectedDF = inputDF.copy(deep=True) expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput) inputDF = self.spark.createDataFrame(inputDF) actualDF = km.transform(inputDF).toPandas() _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])
def test_invalid_argument(self):
    """Invalid constructor configurations must raise.

    NOTE(review): a shorter test_invalid_argument also appears in this file;
    within one class the later definition shadows the earlier — confirm
    whether both versions are intended to exist.
    """
    # Need to specify sklearnEstimator
    self.assertRaises(ValueError, KeyedEstimator)

    # sklearnEstimator must be a sklearn.base.Estimator
    class SomeUDC(object):
        pass

    for badEstimator in (5, SomeUDC()):
        with self.assertRaises(ValueError):
            KeyedEstimator(sklearnEstimator=badEstimator)

    # Must have fit()
    with self.assertRaises(AttributeError):
        KeyedEstimator(sklearnEstimator=KeyedModelTests._CustomMissingFit())

    valueErrorConfigs = [
        # Must have key columns
        {"sklearnEstimator": PCA(), "keyCols": []},
        # Columns can't have "estimator" name in them
        {"sklearnEstimator": PCA(), "keyCols": ["key", "estimator"]},
        {"sklearnEstimator": PCA(), "xCol": "estimator"},
        {"sklearnEstimator": LinearRegression(), "yCol": "estimator"},
        {"sklearnEstimator": PCA(), "yCol": "estimator"},
        # Presence of yCol requires predictor
        {"sklearnEstimator": LinearRegression(), "yCol": "y",
         "estimatorType": "transformer"},
        {"sklearnEstimator": LinearRegression(), "yCol": "y",
         "estimatorType": "clusterer"},
        # estimatorType must be one of the three options
        {"sklearnEstimator": PCA(), "estimatorType": "regressor"},
    ]
    for badKwargs in valueErrorConfigs:
        with self.assertRaises(ValueError):
            KeyedEstimator(**badKwargs)
def test_defaults(self):
    """Every param with a declared default is reported by getOrDefault."""
    keyedEstimator = KeyedEstimator(sklearnEstimator=PCA())
    for paramName, paramSpec in KeyedEstimator._paramSpecs.items():
        # Only params that declare a default are checked.
        if "default" not in paramSpec:
            continue
        self.assertEqual(paramSpec["default"],
                         keyedEstimator.getOrDefault(paramName))
def test_create_no_errors(self):
    """Valid configurations construct without raising."""
    validConfigs = (
        {"sklearnEstimator": PCA()},
        {"sklearnEstimator": LinearRegression(), "yCol": "yCol"},
        {"sklearnEstimator": KeyedModelTests._CustomTransformer()},
    )
    for validKwargs in validConfigs:
        KeyedEstimator(**validKwargs)