Python KeyedEstimator.fit 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: spark_sklearn.keyed_models

클래스/타입: KeyedEstimator

메소드/함수: fit

hotexamples.com에서의 예제들: 4

Python KeyedEstimator.fit 예제 4개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, Python의 spark_sklearn.keyed_models.KeyedEstimator.fit에 대한 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

KeyedEstimator(9)

fit(2)

getOrDefault(1)

자주 사용되는 메소드들

KeyedEstimator (9)

fit (2)

getOrDefault (1)

예제 #1

파일 보기

    def test_surprise_key(self):
        """Transform a row whose key was never seen during fitting.

        The estimator is fit on an empty DataFrame, so the fitted model holds
        no per-key estimators. Transforming a single row afterwards is
        expected to yield ``None`` in the "output" column.
        """
        schema = StructType().add("features", LongType()).add("key", LongType())
        estimator = KeyedEstimator(sklearnEstimator=PCA())

        emptyDF = self.spark.createDataFrame([], schema)
        model = estimator.fit(emptyDF)

        # No training rows means no models, but the model table is still typed.
        expectedModelTypes = [
            ("key", LongType().simpleString()),
            ("estimator", "sklearn-estimator"),
        ]
        self.assertEqual(model.keyedModels.collect(), [])
        self.assertEqual(model.keyedModels.dtypes, expectedModelTypes)

        singleRowDF = self.spark.createDataFrame([(1, 2)], schema)
        transformed = model.transform(singleRowDF)

        # The unseen key gets a null output; the input columns pass through.
        expectedOutputTypes = [
            ("features", "bigint"),
            ("key", "bigint"),
            ("output", "vector"),
        ]
        self.assertEqual(transformed.collect(), [(1, 2, None)])
        self.assertEqual(transformed.dtypes, expectedOutputTypes)

예제 #2

파일 보기

파일: test_keyed_models.py 프로젝트: dominguezus/spark-sklearn

    def test_surprise_key(self):
        """Check fit on an empty DataFrame and transform of an unfitted key.

        Asserts that the fitted model table is empty with the expected column
        types, and that transforming one row produces ``None`` in the
        "output" column while keeping the "features" and "key" values.
        """
        keyedEstimator = KeyedEstimator(sklearnEstimator=PCA())
        rowSchema = StructType().add("features", LongType())
        rowSchema = rowSchema.add("key", LongType())

        # Phase 1: fit with zero rows.
        keyedModel = keyedEstimator.fit(self.spark.createDataFrame([], rowSchema))
        self.assertEqual(keyedModel.keyedModels.collect(), [])
        self.assertEqual(
            keyedModel.keyedModels.dtypes,
            [("key", LongType().simpleString()), ("estimator", "sklearn-estimator")])

        # Phase 2: transform a row whose key has no fitted estimator.
        result = keyedModel.transform(self.spark.createDataFrame([(1, 2)], rowSchema))
        self.assertEqual(result.collect(), [(1, 2, None)])
        self.assertEqual(
            result.dtypes,
            [("features", "bigint"), ("key", "bigint"), ("output", "vector")])

예제 #3

파일 보기

파일: test_keyed_models.py 프로젝트: dominguezus/spark-sklearn

    def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen, **kwargs):
        """Check that a KeyedEstimator agrees with per-key estimators fit locally.

        Training rows are generated for NUSERS integer user keys. A clone of
        ``kwargs["sklearnEstimator"]`` is fit locally on each user's rows, and
        the same rows are fit through ``KeyedEstimator(**kwargs)``. The fitted
        models, and then the outputs on a fresh batch of rows, are compared
        with ``_assertPandasAlmostEqual``.

        :param minExamples: lower bound on the number of training rows per
            user; user ``i`` gets ``max(minExamples, i + 1)`` rows.
        :param featureGen: zero-argument callable; should return a rank-1
            numpy ndarray of the same length on every call.
        :param labelGen: zero-argument callable that should return a scalar
            label, or None. Must be given exactly when "yCol" is in kwargs.
        :param kwargs: forwarded to ``KeyedEstimator``; "sklearnEstimator" is
            required, and "keyCols", "outputCol", "xCol" fall back to the
            defaults in ``KeyedEstimator._paramSpecs``.
        """
        NUSERS = 10
        assert (labelGen is not None) == ("yCol" in kwargs)
        hasLabels = labelGen is not None

        # Original author's note: sklearn's LinearRegression estimator is stable
        # even if undetermined.
        # User keys are the integers [0, NUSERS); each key column holds the same value.

        paramSpecs = KeyedEstimator._paramSpecs
        keyCols = kwargs.get("keyCols", paramSpecs["keyCols"]["default"])
        outputCol = kwargs.get("outputCol", paramSpecs["outputCol"]["default"])
        xCol = kwargs.get("xCol", paramSpecs["xCol"]["default"])
        users = range(NUSERS)

        def trainCount(user):
            return max(minExamples, user + 1)

        def keyColumns(rowsPerUser):
            # One key value per generated row, duplicated into every key column.
            return {col: [user for user in users for _ in range(rowsPerUser(user))]
                    for col in keyCols}

        def flattenAndConvertNumpy(nested):
            # Flatten one nesting level; ndarray items become dense Vectors.
            flat = []
            for item in chain.from_iterable(nested):
                flat.append(Vectors.dense(item) if isinstance(item, np.ndarray) else item)
            return flat

        # ---- Training data (features are generated before labels) ----
        userKeys = [[user] * len(keyCols) for user in users]
        features = [[featureGen() for _ in range(trainCount(user))] for user in users]
        useless = [["useless col"] * trainCount(user) for user in users]
        labels = None
        if hasLabels:
            labels = [[labelGen() for _ in range(trainCount(user))] for user in users]

        # ---- Reference models, fit locally with plain sklearn ----
        trainXs = [np.vstack(rows) for rows in features]
        if hasLabels:
            trainYs = [np.array(userLabels) for userLabels in labels]
        else:
            trainYs = repeat(None)
        prototype = kwargs["sklearnEstimator"]
        localEstimators = []
        for X, y in zip(trainXs, trainYs):
            localEstimators.append(sklearn.base.clone(prototype).fit(X, y))
        expectedModels = pd.DataFrame(userKeys, columns=keyCols)
        expectedModels["estimator"] = localEstimators

        # ---- Same rows through the KeyedEstimator ----
        trainDF = pd.DataFrame.from_dict(keyColumns(trainCount))
        trainDF[xCol] = flattenAndConvertNumpy(features)
        trainDF["useless"] = flattenAndConvertNumpy(useless)
        if labels:
            trainDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels)
        trainDF = self.spark.createDataFrame(trainDF)

        km = KeyedEstimator(**kwargs).fit(trainDF)
        _assertPandasAlmostEqual(km.keyedModels.toPandas(), expectedModels, keyCols)

        # ---- Test data: users get different numbers of points ----
        def testCount(user):
            if user < NUSERS // 2:
                return NUSERS // 4
            return NUSERS * 3 // 4

        testFeatures = [[featureGen() for _ in range(testCount(user))] for user in users]
        # Per the original note, the "useless" column plays no part in the
        # computation but keeps row order the same between the spark and
        # non-spark versions; it is passed with keyCols to the final comparison.
        useless = [range(testCount(user)) for user in users]
        testDF = pd.DataFrame.from_dict(keyColumns(testCount))
        testDF[xCol] = flattenAndConvertNumpy(testFeatures)
        testDF["useless"] = flattenAndConvertNumpy(useless)

        estimatorType = km.sklearnEstimatorType  # tested to be correct elsewhere

        def makeOutput(estimator, X):
            if estimatorType == "transformer":
                return estimator.transform(X)
            assert estimatorType in ("predictor", "clusterer")
            return estimator.predict(X).tolist()

        testXs = [np.vstack(rows) for rows in testFeatures]
        # Kept lazy: outputs are computed while being flattened below.
        expectedOutput = (makeOutput(est, X) for est, X in zip(localEstimators, testXs))
        expectedDF = testDF.copy(deep=True)
        expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput)

        actualDF = km.transform(self.spark.createDataFrame(testDF)).toPandas()

        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])

예제 #4

파일 보기

    def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen,
                                  **kwargs):
        """Compare KeyedEstimator results against sklearn estimators fit per key.

        Generates training rows for NUSERS integer keys, fits a clone of
        ``kwargs["sklearnEstimator"]`` on each key's rows locally, fits
        ``KeyedEstimator(**kwargs)`` on the combined rows, and asserts via
        ``_assertPandasAlmostEqual`` that the models match. It then generates
        new rows and asserts that the keyed model's transform output matches
        the local estimators' outputs.

        :param minExamples: minimum number of training rows for each key; key
            ``i`` receives ``max(minExamples, i + 1)`` rows.
        :param featureGen: called with no arguments; should generate a numpy
            rank-1 ndarray of equal length each time.
        :param labelGen: called with no arguments; should generate a scalar.
            May be None, and must be non-None exactly when "yCol" is in kwargs.
        :param kwargs: passed through to ``KeyedEstimator``. Must include
            "sklearnEstimator"; "keyCols", "outputCol" and "xCol" default to
            the values in ``KeyedEstimator._paramSpecs``.
        """
        NUSERS = 10
        assert (labelGen is not None) == ("yCol" in kwargs)
        isPredictor = labelGen is not None

        # Original author's note: sklearn's LinearRegression estimator is stable
        # even if undetermined.

        def paramOrDefault(name):
            return kwargs.get(name, KeyedEstimator._paramSpecs[name]["default"])

        keyCols = paramOrDefault("keyCols")
        outputCol = paramOrDefault("outputCol")
        xCol = paramOrDefault("xCol")

        def flattenAndConvertNumpy(groups):
            # Remove one nesting level and wrap ndarray elements as dense Vectors.
            return [
                Vectors.dense(elem) if isinstance(elem, np.ndarray) else elem
                for group in groups
                for elem in group
            ]

        def keyFrame(counts):
            # Key i is repeated counts[i] times; every key column gets the same values.
            rowKeys = [key for key, count in enumerate(counts) for _ in range(count)]
            return pd.DataFrame.from_dict({col: list(rowKeys) for col in keyCols})

        # Key i gets max(minExamples, i + 1) training rows.
        trainCounts = [max(minExamples, key + 1) for key in range(NUSERS)]
        userKeys = [[key for _ in keyCols] for key in range(NUSERS)]
        features = [[featureGen() for _ in range(count)] for count in trainCounts]
        useless = [["useless col" for _ in range(count)] for count in trainCounts]
        labels = (
            [[labelGen() for _ in range(count)] for count in trainCounts]
            if isPredictor else None
        )

        # Local reference: one independent clone of the estimator per key.
        Xs = [np.vstack(rows) for rows in features]
        ys = repeat(None) if not isPredictor else [np.array(col) for col in labels]
        localEstimators = [
            sklearn.base.clone(kwargs["sklearnEstimator"]).fit(X, y)
            for X, y in zip(Xs, ys)
        ]
        expectedDF = pd.DataFrame(userKeys, columns=keyCols)
        expectedDF["estimator"] = localEstimators

        inputDF = keyFrame(trainCounts)
        inputDF[xCol] = flattenAndConvertNumpy(features)
        inputDF["useless"] = flattenAndConvertNumpy(useless)
        if labels:
            inputDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels)
        inputDF = self.spark.createDataFrame(inputDF)

        keyedEstimator = KeyedEstimator(**kwargs)
        km = keyedEstimator.fit(inputDF)

        actualDF = km.keyedModels.toPandas()
        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols)

        # Test users with different amounts of points: the first half of the
        # keys get NUSERS // 4 rows each, the rest get NUSERS * 3 // 4.
        testCounts = [
            NUSERS // 4 if key < NUSERS // 2 else NUSERS * 3 // 4
            for key in range(NUSERS)
        ]
        testFeatures = [[featureGen() for _ in range(count)] for count in testCounts]
        # Original note: the "useless" column has nothing to do with the
        # computation, but is essential for keeping order the same between
        # the spark and non-spark versions.
        useless = [range(count) for count in testCounts]
        inputDF = keyFrame(testCounts)
        inputDF[xCol] = flattenAndConvertNumpy(testFeatures)
        inputDF["useless"] = flattenAndConvertNumpy(useless)

        estimatorType = km.sklearnEstimatorType  # tested to be correct elsewhere

        def makeOutput(estimator, X):
            if estimatorType != "transformer":
                assert estimatorType == "predictor" or estimatorType == "clusterer"
                return estimator.predict(X).tolist()
            return estimator.transform(X)

        Xs = [np.vstack(rows) for rows in testFeatures]
        # Lazy on purpose: outputs are produced while being flattened below.
        expectedOutput = map(makeOutput, localEstimators, Xs)
        expectedDF = inputDF.copy(deep=True)
        expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput)

        inputDF = self.spark.createDataFrame(inputDF)
        actualDF = km.transform(inputDF).toPandas()

        _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])