Example #1
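This `_fit` implementation is from spark-sklearn's `KeyedEstimator`. It projects the input DataFrame down to the key, feature, and (optional) label columns, groups by `keyCols`, and uses `gapply` to fit a fresh clone of the configured scikit-learn estimator on each key group. The fitted clones are pickled inside the grouped UDF (a workaround until SPARK-15989 is resolved), then unpickled into the `SparkSklearnEstimator` UDT and returned as a `KeyedModel` holding one estimator per key.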
    def _fit(self, dataset):
        keyCols = self.getOrDefault("keyCols")
        xCol = self.getOrDefault("xCol")
        yCol = self.getOrDefault("yCol")
        isLabelled = yCol is not None
        estimatorType = self.getOrDefault("estimatorType")
        assert isLabelled == (estimatorType == "predictor"), \
            "yCol is {}, but it should {}be None for a {} estimatorType".format(
                yCol, "not " if isLabelled else "", estimatorType)

        _validateXCol(dataset.schema, xCol)

        cols = keyCols[:]
        cols.append(xCol)
        if isLabelled:
            cols.append(yCol)

        oneDimensional = _isOneDimensional(dataset.schema, xCol)
        projected = dataset.select(*cols) # also verifies all cols are present
        outputSchema = StructType().add("estimator", _SparkSklearnEstimatorUDT.sqlType())
        grouped = projected.groupBy(*keyCols)
        estimator = self.getOrDefault("sklearnEstimator")

        # Potential optimization: broadcast estimator

        import pandas as pd
        def fitEstimator(_, pandasDF):
            X = _prepareXCol(pandasDF[xCol], oneDimensional)
            y = pandasDF[yCol].values if isLabelled else None
            # Potential optimization - del pandasDF

            estimatorClone = sklearn.base.clone(estimator)
            estimatorClone.fit(X, y)
            pickled = pickle.dumps(estimatorClone)
            # Potential optimization - del estimatorClone

            # Until SPARK-15989 is resolved, we can't output the sklearn UDT directly here.
            return pd.DataFrame.from_records([(pickled,)])

        fitted = gapply(grouped, fitEstimator, outputSchema)

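        # Unpickle each fitted estimator and wrap it in SparkSklearnEstimator so it
        # can be stored as a UDT column in the resulting DataFrame.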
        extractSklearn = udf(lambda estimatorStr: SparkSklearnEstimator(pickle.loads(estimatorStr)),
                             SparkSklearnEstimator.__UDT__)
        keyedSklearnEstimators = fitted.select(
            *chain(keyCols, [extractSklearn(fitted["estimator"]).alias("estimator")]))

        if isLabelled:
            outputType = dataset.schema[yCol].dataType
        else:
            outputType = Vector.__UDT__

        return KeyedModel(keyCols=keyCols, xCol=xCol, outputCol=self.getOrDefault("outputCol"),
                          yCol=yCol, estimatorType=self.getOrDefault("estimatorType"),
                          keyedSklearnEstimators=keyedSklearnEstimators, outputType=outputType)
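For orientation, a minimal usage sketch of the predictor path (assuming spark-sklearn's documented `KeyedEstimator` defaults of `keyCols=["key"]` and `xCol="features"`, and an active `SparkSession` named `spark`; data and names are illustrative):

from sklearn.linear_model import LinearRegression
from pyspark.ml.linalg import Vectors
from spark_sklearn import KeyedEstimator

# One (key, features, y) row per observation; _fit trains one model per key.
df = spark.createDataFrame(
    [(user, Vectors.dense([float(i)]), float(user + 2 * i))
     for user in range(2) for i in range(5)],
    ["key", "features", "y"])

# yCol is set, so estimatorType is "predictor" and each group is fit with (X, y).
model = KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y").fit(df)
model.keyedModels.show()  # assumes the documented keyedModels attribute: one row per key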
Example #2
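This later revision of the same method adds two refinements over Example #1: unsupervised estimators whose `fit` takes only `X` (e.g., `sklearn.cluster.bicluster`) are handled explicitly, and the output type is chosen per `estimatorType` (the label column's type for predictors, `LongType` for clusterers, `Vector` for transformers). It also passes `sklearnEstimator` through to the returned `KeyedModel`.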
    def _fit(self, dataset):
        keyCols = self.getOrDefault("keyCols")
        xCol = self.getOrDefault("xCol")
        yCol = self.getOrDefault("yCol")
        isLabelled = yCol is not None
        estimatorType = self.getOrDefault("estimatorType")
        assert isLabelled == (estimatorType == "predictor"), \
            "yCol is {}, but it should {}be None for a {} estimatorType".format(
                yCol, "not " if isLabelled else "", estimatorType)

        _validateXCol(dataset.schema, xCol)

        cols = keyCols[:]
        cols.append(xCol)
        if isLabelled:
            cols.append(yCol)

        oneDimensional = _isOneDimensional(dataset.schema, xCol)
        projected = dataset.select(*cols)  # also verifies all cols are present
        outputSchema = StructType().add("estimator", _SparkSklearnEstimatorUDT.sqlType())
        grouped = projected.groupBy(*keyCols)
        estimator = self.getOrDefault("sklearnEstimator")

        # Potential optimization: broadcast estimator
        # Potential optimization (perhaps better at the gapply() level): currently,
        # batched python evaluation may cause OOM if two large key groups land on one
        # machine. (1) Key groups should be evenly distributed. (2) gapply could make
        # smarter use of memory and reduce copies. (3) Batched python function evaluation
        # could be smarter about its data handoff to python - perhaps it could set up a
        # pipe with the python process for per-row data loading.

        import pandas as pd

        def fitEstimator(_, pandasDF):
            X = _prepareXCol(pandasDF[xCol], oneDimensional)
            y = pandasDF[yCol].values if isLabelled else None
            # Potential optimization - del pandasDF

            estimatorClone = sklearn.base.clone(estimator)
            if y is None:
                estimatorClone.fit(X)  # fit may have 1 argument (e.g., sklearn.cluster.bicluster)
            else:
                estimatorClone.fit(X, y)
            pickled = pickle.dumps(estimatorClone)
            # Potential optimization - del estimatorClone

            # Until SPARK-15989 is resolved, we can't output the sklearn UDT directly here.
            return pd.DataFrame.from_records([(pickled,)])

        fitted = gapply(grouped, fitEstimator, outputSchema)

        extractSklearn = udf(lambda estimatorStr: SparkSklearnEstimator(pickle.loads(estimatorStr)),
                             SparkSklearnEstimator.__UDT__)
        keyedSklearnEstimators = fitted.select(
            *chain(keyCols, [extractSklearn(fitted["estimator"]).alias("estimator")]))

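        # The model's output column type mirrors the estimator type: predictors reuse
        # the label column's type, clusterers emit long cluster ids, and transformers
        # emit Vectors.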
        if isLabelled:
            assert estimatorType == "predictor", estimatorType
            outputType = dataset.schema[yCol].dataType
        elif estimatorType == "clusterer":
            outputType = LongType()
        else:
            assert estimatorType == "transformer", estimatorType
            outputType = Vector.__UDT__

        return KeyedModel(sklearnEstimator=estimator, keyCols=keyCols, xCol=xCol,
                          outputCol=self.getOrDefault("outputCol"),
                          yCol=yCol, estimatorType=estimatorType,
                          keyedSklearnEstimators=keyedSklearnEstimators, outputType=outputType)
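The unsupervised branches above can be exercised with a clusterer; a sketch under the same assumptions as before (the explicit `estimatorType="clusterer"` matches the Param this method reads; data and names are illustrative):

from sklearn.cluster import KMeans
from pyspark.ml.linalg import Vectors
from spark_sklearn import KeyedEstimator

df = spark.createDataFrame(
    [(user, Vectors.dense([float(i % 3)])) for user in range(2) for i in range(6)],
    ["key", "features"])

# No yCol: fitEstimator calls fit(X) with a single argument, and the
# "clusterer" estimatorType makes the model's output column a LongType label.
model = KeyedEstimator(sklearnEstimator=KMeans(n_clusters=3),
                       estimatorType="clusterer").fit(df)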