示例#1
0
    def test_type_error(self):
        """Fitting on columns of an unusable type must raise TypeError."""
        # A string-typed "features" column cannot be fed to PCA.
        rows = [("a", 0), ("b", 0)]
        frame = self.spark.createDataFrame(rows).toDF("features", "key")
        pca_estimator = KeyedEstimator(sklearnEstimator=PCA())
        self.assertRaises(TypeError, pca_estimator.fit, frame)

        # An array-typed label column "y" cannot be fed to LinearRegression.
        labeled = [(Vectors.dense([i]), [i], 0) for i in range(10)]
        frame = self.spark.createDataFrame(labeled).toDF("features", "y", "key")
        lr_estimator = KeyedEstimator(sklearnEstimator=LinearRegression(),
                                      yCol="y")
        self.assertRaises(TypeError, lr_estimator.fit, frame)
示例#2
0
    def test_correct_estimator_type(self):
        """The estimator type is inferred from the sklearn estimator,
        and an explicit ``estimatorType`` argument overrides the inference."""
        # Inferred types for stock sklearn estimators.
        inferred = [
            (KeyedEstimator(sklearnEstimator=PCA()), "transformer"),
            (KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y"),
             "predictor"),
            (KeyedEstimator(sklearnEstimator=DBSCAN()), "clusterer"),
            (KeyedEstimator(sklearnEstimator=KMeans()), "clusterer"),
        ]
        for estimator, expected in inferred:
            self.checkEstimatorType(estimator, expected)

        # Explicit estimatorType wins over the inferred "clusterer".
        self.checkEstimatorType(
            KeyedEstimator(sklearnEstimator=KMeans(),
                           estimatorType="transformer"),
            "transformer")

        # A user-defined clusterer is inferred, and can also be overridden.
        custom = KeyedModelTests._CustomClusterer()
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=custom),
                                "clusterer")
        self.checkEstimatorType(
            KeyedEstimator(sklearnEstimator=custom,
                           estimatorType="transformer"),
            "transformer")

        # A user-defined transformer is inferred as such.
        self.checkEstimatorType(
            KeyedEstimator(
                sklearnEstimator=KeyedModelTests._CustomTransformer()),
            "transformer")
示例#3
0
    def test_invalid_argument(self):
        """Invalid constructor arguments must raise ValueError."""
        # sklearnEstimator is a required argument.
        self.assertRaises(ValueError, KeyedEstimator)

        # sklearnEstimator must be an sklearn estimator instance; an int
        # or an arbitrary user-defined class is rejected.
        create = lambda: KeyedEstimator(sklearnEstimator=5)
        self.assertRaises(ValueError, create)

        class SomeUDC(object):
            pass

        create = lambda: KeyedEstimator(sklearnEstimator=SomeUDC())
        self.assertRaises(ValueError, create)

        # At least one key column is required.
        create = lambda: KeyedEstimator(sklearnEstimator=PCA(), keyCols=[])
        self.assertRaises(ValueError, create)

        # "estimator" is a reserved column name: not allowed as a key
        # column, x column, or y column.
        create = lambda: KeyedEstimator(sklearnEstimator=PCA(),
                                        keyCols=["key", "estimator"])
        self.assertRaises(ValueError, create)

        create = lambda: KeyedEstimator(sklearnEstimator=PCA(),
                                        xCol="estimator")
        self.assertRaises(ValueError, create)

        create = lambda: KeyedEstimator(sklearnEstimator=LinearRegression(),
                                        yCol="estimator")
        self.assertRaises(ValueError, create)

        create = lambda: KeyedEstimator(sklearnEstimator=PCA(),
                                        yCol="estimator")
        self.assertRaises(ValueError, create)

        # A predictor (LinearRegression) configured without a yCol is
        # rejected.
        create = lambda: KeyedEstimator(sklearnEstimator=LinearRegression())
        self.assertRaises(ValueError, create)
示例#4
0
    def test_attr_error(self):
        """Forcing an estimator into a role it does not support should
        surface an attribute error at prediction time (checked by the
        checkPredictionAttrError helper)."""
        # PCA forced into clusterer/predictor roles, and DBSCAN forced
        # into transformer/predictor roles.
        mismatched = [
            KeyedEstimator(sklearnEstimator=PCA(), estimatorType="clusterer"),
            KeyedEstimator(sklearnEstimator=PCA(), yCol="y",
                           estimatorType="predictor"),
            KeyedEstimator(sklearnEstimator=DBSCAN(),
                           estimatorType="transformer"),
            KeyedEstimator(sklearnEstimator=DBSCAN(), yCol="y",
                           estimatorType="predictor"),
        ]
        for ke in mismatched:
            self.checkPredictionAttrError(ke)
示例#5
0
    def test_surprise_key(self):
        """Keys never seen during fit transform to a null output value."""
        estimator = KeyedEstimator(sklearnEstimator=PCA())
        schema = (StructType()
                  .add("features", LongType())
                  .add("key", LongType()))

        # Fitting on an empty frame yields a model with no per-key entries.
        model = estimator.fit(self.spark.createDataFrame([], schema))
        self.assertEqual(model.keyedModels.collect(), [])
        self.assertEqual(model.keyedModels.dtypes,
                         [("key", LongType().simpleString()),
                          ("estimator", "sklearn-estimator")])

        # Transforming a row whose key has no model produces None output.
        result = model.transform(self.spark.createDataFrame([(1, 2)], schema))
        self.assertEqual(result.collect(), [(1, 2, None)])
        self.assertEqual(result.dtypes, [("features", "bigint"),
                                         ("key", "bigint"),
                                         ("output", "vector")])
    def test_surprise_key(self):
        """An unseen key at transform time yields a None output column."""
        # NOTE(review): this redefines test_surprise_key; within one class
        # body the later definition shadows the earlier one.
        ke = KeyedEstimator(sklearnEstimator=PCA())
        schema = StructType() \
            .add("features", LongType()) \
            .add("key", LongType())

        # No data -> no keyed models, but the schema is still well-formed.
        km = ke.fit(self.spark.createDataFrame([], schema))
        self.assertEqual(km.keyedModels.collect(), [])
        expected_model_dtypes = [("key", LongType().simpleString()),
                                 ("estimator", "sklearn-estimator")]
        self.assertEqual(km.keyedModels.dtypes, expected_model_dtypes)

        # The single row's key (2) was never fitted, so output is None.
        transformed = km.transform(self.spark.createDataFrame([(1, 2)], schema))
        self.assertEqual(transformed.collect(), [(1, 2, None)])
        expected_dtypes = [("features", "bigint"),
                           ("key", "bigint"),
                           ("output", "vector")]
        self.assertEqual(transformed.dtypes, expected_dtypes)
    def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen, **kwargs):
        """Check that a KeyedEstimator fitted per key matches locally
        fitted sklearn estimators, for both the stored models and the
        transform/predict output.

        minExamples -- lower bound on the number of examples per user key
        featureGen  -- nullary callable producing one feature row
        labelGen    -- nullary callable producing a scalar label, or None
                       for non-predictors
        kwargs      -- forwarded to KeyedEstimator; must contain
                       "sklearnEstimator", and "yCol" iff labelGen is given
        """
        NUSERS = 10
        # featureGen() should generate a np rank-1 ndarray of equal length
        # labelGen() should generate a scalar
        assert (labelGen is not None) == ("yCol" in kwargs)
        isPredictor = labelGen is not None

        # sklearn's LinearRegression estimator is stable even if undetermined.
        # User keys are just [0, NUSERS), repeated for each key if there are multiple columns.
        # The i-th user has i examples.

        # Fall back to the KeyedEstimator declared defaults for any column
        # names not overridden by kwargs.
        keyCols = kwargs.get("keyCols", KeyedEstimator._paramSpecs["keyCols"]["default"])
        outputCol = kwargs.get("outputCol", KeyedEstimator._paramSpecs["outputCol"]["default"])
        xCol = kwargs.get("xCol", KeyedEstimator._paramSpecs["xCol"]["default"])

        nExamplesPerUser = lambda i: max(minExamples, i + 1)
        userKeys = [[i for _ in keyCols] for i in range(NUSERS)]
        features = [[featureGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
        useless = [["useless col" for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
        if isPredictor:
            labels = [[labelGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
        else:
            labels = None

        # Fit one local sklearn estimator per user; these are the expected
        # per-key models. clone() keeps the template estimator unfitted.
        Xs = [np.vstack(x) for x in features]
        ys = [np.array(y) for y in labels] if isPredictor else repeat(None)
        localEstimators = [sklearn.base.clone(kwargs["sklearnEstimator"]).fit(X, y)
                           for X, y in zip(Xs, ys)]
        expectedDF = pd.DataFrame(userKeys, columns=keyCols)
        expectedDF["estimator"] = localEstimators

        # Flatten the per-user nested lists into one flat column, converting
        # ndarray rows to Spark Vectors for DataFrame interop.
        def flattenAndConvertNumpy(x):
            return [Vectors.dense(i) if isinstance(i, np.ndarray) else i
                    for i in chain.from_iterable(x)]

        inputDF = pd.DataFrame.from_dict(
            {k: [i for i in range(NUSERS) for _ in range(nExamplesPerUser(i))] for k in keyCols})
        inputDF[xCol] = flattenAndConvertNumpy(features)
        inputDF["useless"] = flattenAndConvertNumpy(useless)
        # NOTE(review): truthiness here is equivalent to isPredictor, since
        # labels is a non-empty list exactly when isPredictor is True.
        if labels:
            inputDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels)
        inputDF = self.spark.createDataFrame(inputDF)

        ke = KeyedEstimator(**kwargs)
        km = ke.fit(inputDF)

        # The Spark-side fitted models must match the locally fitted ones.
        actualDF = km.keyedModels.toPandas()
        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols)

        # Test users with different amounts of points.
        nTestPerUser = lambda i: NUSERS // 4 if i < NUSERS // 2 else NUSERS * 3 // 4
        testFeatures = [[featureGen() for _ in range(nTestPerUser(i))] for i in range(NUSERS)]
        # "useless" column has nothing to do with computation, but is essential for keeping order
        # the same between the spark and non-spark versions
        useless = [range(nTestPerUser(i)) for i in range(NUSERS)]
        inputDF = pd.DataFrame.from_dict(
            {k: [i for i in range(NUSERS) for _ in range(nTestPerUser(i))] for k in keyCols})
        inputDF[xCol] = flattenAndConvertNumpy(testFeatures)
        inputDF["useless"] = flattenAndConvertNumpy(useless)

        estimatorType = km.sklearnEstimatorType  # tested to be correct elsewhere

        # Compute the expected output locally: transformers use transform(),
        # predictors/clusterers use predict().
        def makeOutput(estimator, X):
            if estimatorType == "transformer":
                return estimator.transform(X)
            else:
                assert estimatorType == "predictor" or estimatorType == "clusterer"
                return estimator.predict(X).tolist()

        Xs = [np.vstack(x) for x in testFeatures]
        expectedOutput = map(makeOutput, localEstimators, Xs)
        expectedDF = inputDF.copy(deep=True)
        expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput)

        # Transform through the keyed model and compare row-for-row,
        # keyed by (keyCols + "useless") to fix the ordering.
        inputDF = self.spark.createDataFrame(inputDF)
        actualDF = km.transform(inputDF).toPandas()

        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])
示例#8
0
    def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen,
                                  **kwargs):
        """Verify KeyedEstimator equivalence with local sklearn fits.

        For NUSERS synthetic user keys, fits one local sklearn estimator
        per key and checks that (a) the keyed model table matches those
        estimators and (b) transform output matches the local
        transform()/predict() results.

        NOTE(review): reformatted duplicate of the checkKeyedModelEquivalent
        definition that appears earlier in this file.

        minExamples -- lower bound on examples generated per user key
        featureGen  -- nullary callable producing one feature row
        labelGen    -- nullary callable producing a scalar label, or None
        kwargs      -- forwarded to KeyedEstimator; must contain
                       "sklearnEstimator", and "yCol" iff labelGen is given
        """
        NUSERS = 10
        # featureGen() should generate a np rank-1 ndarray of equal length
        # labelGen() should generate a scalar
        assert (labelGen is not None) == ("yCol" in kwargs)
        isPredictor = labelGen is not None

        # sklearn's LinearRegression estimator is stable even if undetermined.
        # User keys are just [0, NUSERS), repeated for each key if there are multiple columns.
        # The i-th user has i examples.

        # Column names default to the KeyedEstimator declared defaults.
        keyCols = kwargs.get("keyCols",
                             KeyedEstimator._paramSpecs["keyCols"]["default"])
        outputCol = kwargs.get(
            "outputCol", KeyedEstimator._paramSpecs["outputCol"]["default"])
        xCol = kwargs.get("xCol",
                          KeyedEstimator._paramSpecs["xCol"]["default"])

        nExamplesPerUser = lambda i: max(minExamples, i + 1)
        userKeys = [[i for _ in keyCols] for i in range(NUSERS)]
        features = [[featureGen() for _ in range(nExamplesPerUser(i))]
                    for i in range(NUSERS)]
        useless = [["useless col" for _ in range(nExamplesPerUser(i))]
                   for i in range(NUSERS)]
        if isPredictor:
            labels = [[labelGen() for _ in range(nExamplesPerUser(i))]
                      for i in range(NUSERS)]
        else:
            labels = None

        # Expected models: one cloned-and-fitted estimator per user key.
        Xs = [np.vstack(x) for x in features]
        ys = [np.array(y) for y in labels] if isPredictor else repeat(None)
        localEstimators = [
            sklearn.base.clone(kwargs["sklearnEstimator"]).fit(X, y)
            for X, y in zip(Xs, ys)
        ]
        expectedDF = pd.DataFrame(userKeys, columns=keyCols)
        expectedDF["estimator"] = localEstimators

        # Flatten nested per-user lists into one flat column, converting
        # ndarrays to Spark Vectors.
        def flattenAndConvertNumpy(x):
            return [
                Vectors.dense(i) if isinstance(i, np.ndarray) else i
                for i in chain.from_iterable(x)
            ]

        inputDF = pd.DataFrame.from_dict({
            k: [i for i in range(NUSERS) for _ in range(nExamplesPerUser(i))]
            for k in keyCols
        })
        inputDF[xCol] = flattenAndConvertNumpy(features)
        inputDF["useless"] = flattenAndConvertNumpy(useless)
        # NOTE(review): equivalent to `if isPredictor:` — labels is a
        # non-empty list exactly when isPredictor is True.
        if labels:
            inputDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels)
        inputDF = self.spark.createDataFrame(inputDF)

        ke = KeyedEstimator(**kwargs)
        km = ke.fit(inputDF)

        # Spark-side fitted models must match the local ones.
        actualDF = km.keyedModels.toPandas()
        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols)

        # Test users with different amounts of points.
        nTestPerUser = lambda i: NUSERS // 4 if i < NUSERS // 2 else NUSERS * 3 // 4
        testFeatures = [[featureGen() for _ in range(nTestPerUser(i))]
                        for i in range(NUSERS)]
        # "useless" column has nothing to do with computation, but is essential for keeping order
        # the same between the spark and non-spark versions
        useless = [range(nTestPerUser(i)) for i in range(NUSERS)]
        inputDF = pd.DataFrame.from_dict({
            k: [i for i in range(NUSERS) for _ in range(nTestPerUser(i))]
            for k in keyCols
        })
        inputDF[xCol] = flattenAndConvertNumpy(testFeatures)
        inputDF["useless"] = flattenAndConvertNumpy(useless)

        estimatorType = km.sklearnEstimatorType  # tested to be correct elsewhere

        # Expected output per role: transformers use transform(),
        # predictors/clusterers use predict().
        def makeOutput(estimator, X):
            if estimatorType == "transformer":
                return estimator.transform(X)
            else:
                assert estimatorType == "predictor" or estimatorType == "clusterer"
                return estimator.predict(X).tolist()

        Xs = [np.vstack(x) for x in testFeatures]
        expectedOutput = map(makeOutput, localEstimators, Xs)
        expectedDF = inputDF.copy(deep=True)
        expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput)

        # Compare transform output row-for-row, keyed on
        # (keyCols + "useless") to pin the row ordering.
        inputDF = self.spark.createDataFrame(inputDF)
        actualDF = km.transform(inputDF).toPandas()

        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])
示例#9
0
    def test_invalid_argument(self):
        """Every invalid constructor configuration must raise."""

        def assertBadArgs(error, **kwargs):
            # Defer construction into a thunk so assertRaises can catch it.
            self.assertRaises(error, lambda: KeyedEstimator(**kwargs))

        # Need to specify sklearnEstimator
        self.assertRaises(ValueError, KeyedEstimator)

        # sklearnEstimator must be a sklearn.base.Estimator
        assertBadArgs(ValueError, sklearnEstimator=5)

        class SomeUDC(object):
            pass

        assertBadArgs(ValueError, sklearnEstimator=SomeUDC())

        # Must have fit()
        assertBadArgs(AttributeError,
                      sklearnEstimator=KeyedModelTests._CustomMissingFit())

        # Must have key columns
        assertBadArgs(ValueError, sklearnEstimator=PCA(), keyCols=[])

        # Columns can't have "estimator" name in them
        assertBadArgs(ValueError, sklearnEstimator=PCA(),
                      keyCols=["key", "estimator"])
        assertBadArgs(ValueError, sklearnEstimator=PCA(), xCol="estimator")
        assertBadArgs(ValueError, sklearnEstimator=LinearRegression(),
                      yCol="estimator")
        assertBadArgs(ValueError, sklearnEstimator=PCA(), yCol="estimator")

        # Presence of yCol requires predictor
        assertBadArgs(ValueError, sklearnEstimator=LinearRegression(),
                      yCol="y", estimatorType="transformer")
        assertBadArgs(ValueError, sklearnEstimator=LinearRegression(),
                      yCol="y", estimatorType="clusterer")

        # estimatorType must be one of the three options
        assertBadArgs(ValueError, sklearnEstimator=PCA(),
                      estimatorType="regressor")
示例#10
0
 def test_defaults(self):
     """getOrDefault must return the default recorded in _paramSpecs for
     every parameter that declares one."""
     ke = KeyedEstimator(sklearnEstimator=PCA())
     for name, spec in KeyedEstimator._paramSpecs.items():
         if "default" in spec:
             self.assertEqual(spec["default"], ke.getOrDefault(name))
示例#11
0
 def test_create_no_errors(self):
     """These constructor invocations are valid and must not raise."""
     # A transformer (PCA), a predictor (LinearRegression with yCol), and
     # a user-defined transformer are all accepted.
     constructors = (
         lambda: KeyedEstimator(sklearnEstimator=PCA()),
         lambda: KeyedEstimator(sklearnEstimator=LinearRegression(),
                                yCol="yCol"),
         lambda: KeyedEstimator(
             sklearnEstimator=KeyedModelTests._CustomTransformer()),
     )
     for construct in constructors:
         construct()
示例#12
0
 def test_defaults(self):
     """Declared parameter defaults must round-trip through getOrDefault."""
     # NOTE(review): duplicate of a test_defaults definition appearing
     # earlier in this file; the later one shadows the earlier.
     estimator = KeyedEstimator(sklearnEstimator=PCA())
     defaulted = ((name, spec["default"])
                  for name, spec in KeyedEstimator._paramSpecs.items()
                  if "default" in spec)
     for param_name, expected_default in defaulted:
         self.assertEqual(expected_default,
                          estimator.getOrDefault(param_name))