Пример #1
0
    def __init__(self, sklearnEstimator=None, keyCols=None, xCol=None, outputCol=None, yCol=None,
                 estimatorType=None, keyedSklearnEstimators=None, outputType=None):
        """The constructor is used by :class:`KeyedEstimator` to generate a :class:`KeyedModel`; it
        is not intended for external use.

        The arguments are checked with assertions before any param is set:

        * ``yCol`` must be given if and only if ``estimatorType`` is ``"predictor"``.
        * ``estimatorType`` must be ``"transformer"``, ``"clusterer"`` or ``"predictor"``.
        * ``outputType`` must equal ``Vector.__UDT__`` for transformers and ``LongType()`` for
          clusterers.
        * ``keyCols`` must be non-empty.
        * ``keyedSklearnEstimators.columns`` must be exactly ``keyCols`` plus ``"estimator"``.

        :raise AssertionError: if any of the invariants above is violated.
        :raise TypeError: if ``yCol`` is set and the type of ``outputType`` is not one of
                          ``KeyedModel._sql_types``.
        """

        # A label column is expected exactly when the estimator is a predictor.
        isLabelled = estimatorType == "predictor"
        assert isLabelled == (yCol is not None), \
            "yCol is {}, but it should {}be None for a {} estimatorType".format(
                yCol, "not " if isLabelled else "", estimatorType)
        assert estimatorType in ["transformer", "clusterer", "predictor"], estimatorType

        def implies(a, b):
            return not a or b
        assert implies(estimatorType == "transformer", outputType == Vector.__UDT__), outputType
        assert implies(estimatorType == "clusterer", outputType == LongType()), outputType
        assert len(keyCols) > 0, len(keyCols)
        # The message is parenthesised so that format() applies to the whole text rather than
        # only to its last string literal.
        assert set(keyedSklearnEstimators.columns) == (set(keyCols) | {"estimator"}), \
            ("keyedSklearnEstimator columns {} should have both key columns {} and "
             "an estimator column").format(keyedSklearnEstimators.columns, keyCols)

        # The superclass expects Param attributes to already be set, so we only init it after
        # doing so.
        for paramName, paramSpec in KeyedModel._paramSpecs.items():
            setattr(self, paramName, Param(Params._dummy(), paramName, paramSpec["doc"]))
        super(KeyedModel, self).__init__()
        if yCol and type(outputType) not in KeyedModel._sql_types:
            raise TypeError("Output type {} is not an AtomicType (expected for {} estimator)"
                            .format(outputType, estimatorType))
        # NOTE(review): _input_kwargs is not assigned in this method; presumably a keyword-only
        # decorator outside this view populates it -- confirm.
        self._set(**self._input_kwargs)
Пример #2
0
    def __init__(self, sklearnEstimator=None, keyCols=["key"], xCol="features",
                 outputCol="output", yCol=None, estimatorType=None):
        """For all instances, the ordered list of ``keyCols`` determine the set of groups which each
        ``sklearnEstimator`` is applied to.

        For every unique ``keyCols`` value, the remaining columns are aggregated and used to train
        the scikit-learn estimator.

        ``estimatorType`` inference is conducted as follows: if ``yCol`` is specified, then this is
        assumed to be of ``"predictor"`` type, else a ``"transformer"`` or a ``"clusterer"``,
        depending on the estimator having the ``transform()`` or ``fit_predict()`` attributes, with
        ``"clusterer"`` being chosen in case both attributes are present.

        :param sklearnEstimator: An instance of a scikit-learn estimator, with parameters configured
                                 as desired for each user.
        :param keyCols: Key column names list used to group data to which models are applied, where
                        order implies lexicographical importance.
        :param xCol: Name of column of input features used for training and
                     transformation/prediction.
        :param outputCol: Name of the output column.
        :param yCol: Specifies name of label column for regression or classification pipelines.
                     Required for predictors, must be unspecified or ``None`` for transformers.
        :param estimatorType: Identifies the type of scikit-learn estimator being used, which
                              changes the interface the ``sklearnEstimator`` is expected to have.
                              This parameter's value is inferred using reflection by default,
                              but may be manually overridden.

        :raise ValueError: if ``sklearnEstimator`` is ``None``.
        :raise ValueError: if ``sklearnEstimator`` does not derive from
                           ``sklearn.base.BaseEstimator``.
        :raise ValueError: if ``keyCols`` is empty or ``None``.
        :raise ValueError: if any of ``keyCols``, ``xCol`` or ``yCol`` has the name
                           ``"estimator"``.
        :raise AttributeError: if reflection checks indicate that parameter estimator is not equipped
                               with a ``fit()`` method.
        """
        if sklearnEstimator is None:
            raise ValueError("sklearnEstimator should be specified")
        if not isinstance(sklearnEstimator, sklearn.base.BaseEstimator):
            raise ValueError("sklearnEstimator should be an sklearn.base.BaseEstimator")
        # Truthiness also rejects None with the documented ValueError instead of letting
        # len(None) raise a TypeError.
        if not keyCols:
            raise ValueError("keyCols should not be empty")
        # The check covers xCol and yCol as well, so the message names all three sources.
        if "estimator" in keyCols + [xCol, yCol]:
            raise ValueError(
                "keyCols, xCol and yCol should not contain a column named \"estimator\"")

        # The superclass expects Param attributes to already be set, so we only init it after
        # doing so.
        for paramName, paramSpec in KeyedEstimator._paramSpecs.items():
            setattr(self, paramName, Param(Params._dummy(), paramName, paramSpec["doc"]))
        super(KeyedEstimator, self).__init__()
        self._setDefault(**{paramName: paramSpec["default"]
                            for paramName, paramSpec in KeyedEstimator._paramSpecs.items()
                            if "default" in paramSpec})
        # NOTE(review): _input_kwargs is not assigned in this method; presumably a keyword-only
        # decorator outside this view populates it -- confirm.
        kwargs = KeyedEstimator._inferredParams(sklearnEstimator, self._input_kwargs)
        self._set(**kwargs)

        self._verifyEstimatorType()
class HasTFInputGraph(Params):
    """
    Mixin exposing the ``tfInputGraph`` param: a serializable object derived from a TensorFlow
    computation graph.

    Values are converted with ``SparkDLTypeConverters.toTFInputGraph`` and the default
    registered by the constructor is ``None``.
    """
    tfInputGraph = Param(
        Params._dummy(),
        "tfInputGraph",
        "A serializable object derived from a TensorFlow computation graph",
        typeConverter=SparkDLTypeConverters.toTFInputGraph)

    def __init__(self):
        super(HasTFInputGraph, self).__init__()
        # Register None as the default for the param.
        self._setDefault(tfInputGraph=None)

    def setTFInputGraph(self, value):
        """Set ``tfInputGraph`` to ``value`` and return the result of ``_set``."""
        return self._set(tfInputGraph=value)

    def getTFInputGraph(self):
        """Return the value of ``tfInputGraph`` or its default value."""
        graphParam = self.tfInputGraph
        return self.getOrDefault(graphParam)
class HasLabelCol(Params):
    """
    Mixin for the ``labelCol`` param.

    When training Keras image models in a supervised learning setting,
    users will provide a :py:obj:`DataFrame` column with the labels.

    .. note:: The Estimator expects this column to contain data directly usable for the Keras
              model. This usually means that the labels are already encoded in one-hot format.
              Please consider adding a :py:obj:`OneHotEncoder` to transform the label column.
    """
    labelCol = Param(
        Params._dummy(),
        "labelCol",
        "name of the column storing the training data labels",
        typeConverter=TypeConverters.toString)

    def setLabelCol(self, value):
        """Set ``labelCol`` to ``value`` and return the result of ``_set``."""
        return self._set(labelCol=value)

    def getLabelCol(self):
        """Return the value of ``labelCol`` or its default value."""
        labelParam = self.labelCol
        return self.getOrDefault(labelParam)
Пример #5
0
class _CrossValidatorParams(_ValidatorParams):
    """
    Shared params of :py:class:`CrossValidator` and :py:class:`CrossValidatorModel`.

    Adds the ``numFolds`` param (converted with ``TypeConverters.toInt``) on top of
    ``_ValidatorParams``.

    .. versionadded:: 3.0.0
    """

    numFolds = Param(
        Params._dummy(), "numFolds", "number of folds for cross validation",
        typeConverter=TypeConverters.toInt)

    @since("1.4.0")
    def getNumFolds(self):
        """
        Returns the value of numFolds, or its default value when it is not set.
        """
        foldsParam = self.numFolds
        return self.getOrDefault(foldsParam)
Пример #6
0
class _HasInitialRatesCol(Params):
    """
    Mixin for the ``initialRatesCol`` param: the dataframe column holding the initial
    poisson rates of mixtures.
    """

    initialRatesCol = Param(
        Params._dummy(),
        "initialRatesCol",
        "Initial poisson rates of mixtures from dataframe column",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(_HasInitialRatesCol, self).__init__()

    def getInitialRatesCol(self):
        """
        Returns the value of the initial rates column or its default value.
        """
        ratesColParam = self.initialRatesCol
        return self.getOrDefault(ratesColParam)
Пример #7
0
class _HasInitialRates(Params):
    """
    Mixin for the ``initialRates`` param: the initial poisson rates of mixtures, given as a
    list of floats.
    """

    initialRates = Param(
        Params._dummy(),
        "initialRates",
        "Initial poisson rates of mixtures, as a list of floats",
        typeConverter=TypeConverters.toListFloat)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(_HasInitialRates, self).__init__()

    def getInitialRates(self):
        """
        Returns the value of initial rates or its default value.
        """
        ratesParam = self.initialRates
        return self.getOrDefault(ratesParam)
Пример #8
0
class HasKerasOptimizer(Params):
    """
    Mixin for the ``kerasOptimizer`` param: the name of the optimizer used for training a
    Keras model.

    Values are converted with ``SparkDLTypeConverters.toKerasOptimizer``; the constructor
    registers ``'adam'`` as the default.
    """
    kerasOptimizer = Param(Params._dummy(), "kerasOptimizer",
                           "Name of the optimizer for training a Keras model",
                           typeConverter=SparkDLTypeConverters.toKerasOptimizer)

    def __init__(self):
        super(HasKerasOptimizer, self).__init__()
        # NOTE(phi-dbq): This is the recommended optimizer as of September 2017.
        self._setDefault(kerasOptimizer='adam')

    def setKerasOptimizer(self, value):
        """Set ``kerasOptimizer`` to ``value`` and return the result of ``_set``."""
        return self._set(kerasOptimizer=value)

    def getKerasOptimizer(self):
        """Return the value of ``kerasOptimizer`` or its default value."""
        optimizerParam = self.kerasOptimizer
        return self.getOrDefault(optimizerParam)
Пример #9
0
class DeepImageFeaturizer(Transformer, HasInputCol, HasOutputCol):
    """
    Applies the model specified by its popular name, with its prediction layer(s) chopped off,
    to the image column in DataFrame. The output is a MLlib Vector so that DeepImageFeaturizer
    can be used in a MLlib Pipeline.

    The work is delegated to ``_NamedImageTransformer`` with ``featurize=True``.
    """

    modelName = Param(Params._dummy(), "modelName", "A deep learning model name",
                      typeConverter=SparkDLTypeConverters.supportedNameConverter(
                          SUPPORTED_MODELS))

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, modelName=None):
        """
        __init__(self, inputCol=None, outputCol=None, modelName=None)

        Initialises the parent classes, then forwards the keyword arguments to ``setParams``.
        """
        super(DeepImageFeaturizer, self).__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, modelName=None):
        """
        setParams(self, inputCol=None, outputCol=None, modelName=None)

        Sets the params from the keyword arguments and returns this instance.
        """
        self._set(**self._input_kwargs)
        return self

    def setModelName(self, value):
        """Set ``modelName`` to ``value`` and return the result of ``_set``."""
        return self._set(modelName=value)

    def getModelName(self):
        """Return the value of ``modelName`` or its default value."""
        nameParam = self.modelName
        return self.getOrDefault(nameParam)

    def _transform(self, dataset):
        """Transform ``dataset`` with a ``_NamedImageTransformer`` built from this instance's
        input column, output column and model name, with featurization enabled."""
        return _NamedImageTransformer(
            inputCol=self.getInputCol(),
            outputCol=self.getOutputCol(),
            modelName=self.getModelName(),
            featurize=True).transform(dataset)
Пример #10
0
class HasMeasurementModel(Params):
    """
    Mixin for the ``measurementModel`` param: the measurement model matrix.
    """

    measurementModel = Param(
        Params._dummy(), "measurementModel",
        "Measurement matrix, when multiplied with the state it should give the measurement vector",
        typeConverter=TypeConverters.toMatrix)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasMeasurementModel, self).__init__()

    def getMeasurementModel(self):
        """
        Returns the value of the measurement model matrix or its default value.
        """
        modelParam = self.measurementModel
        return self.getOrDefault(modelParam)
Пример #11
0
class HasMeasurementCol(Params):
    """
    Mixin for the ``measurementCol`` param: the name of the measurement vector column.
    """

    measurementCol = Param(
        Params._dummy(), "measurementCol",
        "Column name for measurement vector. Missing measurements are allowed with nulls in the data",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasMeasurementCol, self).__init__()

    def getMeasurementCol(self):
        """
        Returns the value of the measurement column or its default value.
        """
        colParam = self.measurementCol
        return self.getOrDefault(colParam)
Пример #12
0
class HasEmbeddingsProperties(Params):
    """Mixin for the ``dimension`` param: the number of embedding dimensions."""

    dimension = Param(
        Params._dummy(),
        "dimension",
        "Number of embedding dimensions",
        typeConverter=TypeConverters.toInt)

    def setDimension(self, value):
        """Sets embeddings dimension.

        Parameters
        ----------
        value : int
            Embeddings dimension

        Returns
        -------
        The result of ``self._set``.
        """
        return self._set(dimension=value)

    def getDimension(self):
        """Gets embeddings dimension, or its default value when it is not set."""
        dimensionParam = self.dimension
        return self.getOrDefault(dimensionParam)
Пример #13
0
class HasBatchTrainTol(Params):
    """
    Mixin for the ``batchTrainTol`` param: the stop tolerance of batch train iterations.
    """

    batchTrainTol = Param(
        Params._dummy(), "batchTrainTol",
        "Min change in loglikelihood to stop iterations in batch EM mode. Default is 0.1",
        typeConverter=TypeConverters.toFloat)

    def __init__(self):
        # No default is registered in this class; the constructor only chains to the parent.
        super(HasBatchTrainTol, self).__init__()

    def getBatchTrainTol(self):
        """
        Returns the value of batchTrainTol or its default value
        """
        tolParam = self.batchTrainTol
        return self.getOrDefault(tolParam)
Пример #14
0
class HasUpdateHoldoutCol(Params):
    """
    Mixin for the ``updateHoldoutCol`` param: a dataframe column supplying the update holdout.
    """

    updateHoldoutCol = Param(
        Params._dummy(), "updateHoldoutCol",
        "updateHoldout from dataframe column rather than a constant value across all states",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasUpdateHoldoutCol, self).__init__()

    def getUpdateHoldoutCol(self):
        """
        Returns the value of update holdout col or its default value
        """
        holdoutColParam = self.updateHoldoutCol
        return self.getOrDefault(holdoutColParam)
Пример #15
0
class HasSampleCol(Params):
    """
    Mixin for the ``sampleCol`` param: the column used as input to mixture models.
    """

    sampleCol = Param(
        Params._dummy(), "sampleCol",
        "Column name for input to mixture models",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasSampleCol, self).__init__()

    def getSampleCol(self):
        """
        Returns the value of the sample column or its default value.
        """
        sampleParam = self.sampleCol
        return self.getOrDefault(sampleParam)
Пример #16
0
class HasInitialMixtureModelCol(Params):
    """
    Mixin for the ``initialMixtureModelCol`` param: a struct column providing the initial
    mixture model.
    """

    initialMixtureModelCol = Param(
        Params._dummy(), "initialMixtureModelCol",
        "Sets the initial mixture model from struct column conforming to mixture distribution",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasInitialMixtureModelCol, self).__init__()

    def getInitialMixtureModelCol(self):
        """
        Returns the value of initial mixture model col or its default value.
        """
        mixtureColParam = self.initialMixtureModelCol
        return self.getOrDefault(mixtureColParam)
Пример #17
0
class HasMultipleModelAdaptiveEstimationEnabled(Params):
    """
    Mixin for the ``multipleModelAdaptiveEstimationEnabled`` param: a boolean flag enabling the
    Multiple Model Adaptive Estimation (MMAE) output mode.
    """

    # The doc uses implicit literal concatenation with explicit trailing spaces so that the
    # sentences do not run into each other in the rendered param description.
    multipleModelAdaptiveEstimationEnabled = Param(
        Params._dummy(),
        "multipleModelAdaptiveEstimationEnabled",
        "Flag for enabling Multiple Model Adaptive Estimation (MMAE) output mode. When enabled, "
        "MMAE mode outputs a single state estimate from the output of all kalman states of the "
        "transformer. States are weighted based on their sliding likelihood",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasMultipleModelAdaptiveEstimationEnabled, self).__init__()

    def getMultipleModelAdaptiveEstimationEnabled(self):
        """
        Gets the value of MMAE output mode flag or its default value.
        """
        return self.getOrDefault(self.multipleModelAdaptiveEstimationEnabled)
Пример #18
0
class _HasLearningRate(Params):
    """
    Mixin for the ``learningRate`` param: the Normalized LMS learning rate.
    """

    learningRate = Param(
        Params._dummy(), "learningRate",
        "Learning rate for Normalized LMS. If there is no interference, the default value of 1.0 is optimal",
        typeConverter=TypeConverters.toFloat)

    def __init__(self):
        # No default is registered in this class; the constructor only chains to the parent.
        super(_HasLearningRate, self).__init__()

    def getLearningRate(self):
        """
        Returns the value of learning rate or its default value.
        """
        rateParam = self.learningRate
        return self.getOrDefault(rateParam)
Пример #19
0
class _HasRegularizationConstant(Params):
    """
    Mixin for the ``regularizationConstant`` param: a regularization term for stability.
    """

    regularizationConstant = Param(
        Params._dummy(), "regularizationConstant",
        "Regularization term for stability, default is 1.0",
        typeConverter=TypeConverters.toFloat)

    def __init__(self):
        # No default is registered in this class; the constructor only chains to the parent.
        super(_HasRegularizationConstant, self).__init__()

    def getRegularizationConstant(self):
        """
        Returns the value of regularization constant or its default value.
        """
        constantParam = self.regularizationConstant
        return self.getOrDefault(constantParam)
Пример #20
0
class HasStateTimeoutMode(Params):
    """
    Mixin for the ``timeoutMode`` param: the state timeout mode for clearing states without
    updates, one of "none", "process" or "event".
    """

    timeoutMode = Param(
        Params._dummy(), "timeoutMode",
        "Timeout mode for clearing the states that didn't receive measurements.",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasStateTimeoutMode, self).__init__()

    def getTimeoutMode(self):
        """
        Returns the value of timeout mode or its default value.
        """
        modeParam = self.timeoutMode
        return self.getOrDefault(modeParam)
Пример #21
0
class HasEventTimeCol(Params):
    """
    Mixin for the ``eventTimeCol`` param: the column marking the event time of measurements.
    """

    eventTimeCol = Param(
        Params._dummy(), "eventTimeCol",
        "Column marking the event time of the received measurements",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasEventTimeCol, self).__init__()

    def getEventTimeCol(self):
        """
        Returns the value of event time column or its default value.
        """
        eventTimeParam = self.eventTimeCol
        return self.getOrDefault(eventTimeParam)
Пример #22
0
class HasStateTimeoutDuration(Params):
    """
    Mixin for the ``stateTimeoutDuration`` param: how long to wait before timing out the state.

    The value is converted with ``TypeConverters.toString``.
    """

    stateTimeoutDuration = Param(
        Params._dummy(), "stateTimeoutDuration",
        "Duration to wait before timing out the state",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasStateTimeoutDuration, self).__init__()

    def getStateTimeoutDuration(self):
        """
        Returns the value of state timeout duration or its default value.
        """
        durationParam = self.stateTimeoutDuration
        return self.getOrDefault(durationParam)
Пример #23
0
class HasCalculateLoglikelihood(Params):
    """
    Mixin for the ``calculateLoglikelihood`` param: a boolean flag enabling loglikelihood
    calculation.
    """

    calculateLoglikelihood = Param(
        Params._dummy(), "calculateLoglikelihood",
        "When true, loglikelihood of residual will be calculated & added to output DataFrame. Default is false",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        # No default is registered in this class; the constructor only chains to the parent.
        super(HasCalculateLoglikelihood, self).__init__()

    def getCalculateLoglikelihood(self):
        """
        Returns the value of loglikelihood calculation flag or its default value.
        """
        flagParam = self.calculateLoglikelihood
        return self.getOrDefault(flagParam)
Пример #24
0
class HasInitialStateDistributionCol(Params):
    """
    Mixin for the ``initialStateDistributionCol`` param: a struct column carrying the initial
    state distribution.
    """

    initialStateDistributionCol = Param(
        Params._dummy(), "initialStateDistributionCol",
        "Parameter for initial state distribution as struct col",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasInitialStateDistributionCol, self).__init__()

    def getInitialStateDistributionCol(self):
        """
        Returns the value of initial distribution column or its default value.
        """
        distributionColParam = self.initialStateDistributionCol
        return self.getOrDefault(distributionColParam)
Пример #25
0
class HasOutputSystemMatrices(Params):
    """
    Mixin for the ``outputSystemMatrices`` param: a boolean flag enabling the output of system
    matrices along with the state.
    """

    outputSystemMatrices = Param(
        Params._dummy(), "outputSystemMatrices",
        "When true, the system matrices will be added to output DataFrame. Default is false",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        # No default is registered in this class; the constructor only chains to the parent.
        super(HasOutputSystemMatrices, self).__init__()

    def getOutputSystemMatrices(self):
        """
        Returns the value of the system matrices output flag or its default value.
        """
        flagParam = self.outputSystemMatrices
        return self.getOrDefault(flagParam)
Пример #26
0
class HasProcessModel(Params):
    """
    Mixin for the ``processModel`` param: the process model matrix.
    """

    processModel = Param(
        Params._dummy(), "processModel",
        "Process model matrix, transitions the state to the next state when applied",
        typeConverter=TypeConverters.toMatrix)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasProcessModel, self).__init__()

    def getProcessModel(self):
        """
        Returns the value of process model matrix or its default value.
        """
        modelParam = self.processModel
        return self.getOrDefault(modelParam)
Пример #27
0
class HasInitialStateCovarianceCol(Params):
    """
    Mixin for the ``initialStateCovarianceCol`` param: the column name of the initial state
    covariance matrix.
    """

    initialStateCovarianceCol = Param(
        Params._dummy(), "initialStateCovarianceCol",
        "Column name for initial state covariance matrix.",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasInitialStateCovarianceCol, self).__init__()

    def getInitialStateCovarianceCol(self):
        """
        Returns the value of initial covariance column or its default value.
        """
        covarianceColParam = self.initialStateCovarianceCol
        return self.getOrDefault(covarianceColParam)
Пример #28
0
class HasBatchTrainMaxIter(Params):
    """
    Mixin for the ``batchTrainMaxIter`` param: the maximum iterations in batch train mode.
    """

    batchTrainMaxIter = Param(
        Params._dummy(), "batchTrainMaxIter",
        "Maximum iterations in batch train mode, default is 30",
        typeConverter=TypeConverters.toInt)

    def __init__(self):
        # No default is registered in this class; the constructor only chains to the parent.
        super(HasBatchTrainMaxIter, self).__init__()

    def getBatchTrainMaxIter(self):
        """
        Returns the value of batchTrainMaxIter or its default value
        """
        maxIterParam = self.batchTrainMaxIter
        return self.getOrDefault(maxIterParam)
Пример #29
0
    def test_params(self):
        # Exercises param discovery, default/set/defined state, rejection of a foreign param and
        # the explainParams() text on a TestParams instance.
        subject = TestParams()
        maxIterParam = subject.maxIter
        inputColParam = subject.inputCol
        seedParam = subject.seed

        self.assertEqual(subject.params, [inputColParam, maxIterParam, seedParam])

        # maxIter: has a default of 10 and is not set until setMaxIter is called.
        self.assertTrue(subject.hasParam(maxIterParam.name))
        self.assertTrue(subject.hasDefault(maxIterParam))
        self.assertFalse(subject.isSet(maxIterParam))
        self.assertTrue(subject.isDefined(maxIterParam))
        self.assertEqual(subject.getMaxIter(), 10)
        subject.setMaxIter(100)
        self.assertTrue(subject.isSet(maxIterParam))
        self.assertEqual(subject.getMaxIter(), 100)

        # inputCol: neither a default nor a value, so reading it must fail.
        self.assertTrue(subject.hasParam(inputColParam.name))
        self.assertFalse(subject.hasDefault(inputColParam))
        self.assertFalse(subject.isSet(inputColParam))
        self.assertFalse(subject.isDefined(inputColParam))
        with self.assertRaises(KeyError):
            subject.getInputCol()

        # A param that is not a member of the instance must be rejected by set().
        foreignParam = Param(
            Params._dummy(),
            "otherParam",
            "Parameter used to test that set raises an error for a non-member parameter.",
            typeConverter=TypeConverters.toString)
        with self.assertRaises(ValueError):
            subject.set(foreignParam, "value")

        # Since the default is normally random, set it to a known number for debug str
        subject._setDefault(seed=41)
        subject.setSeed(43)

        expectedLines = [
            "inputCol: input column name. (undefined)",
            "maxIter: max number of iterations (>= 0). (default: 10, current: 100)",
            "seed: random seed. (default: 41, current: 43)",
        ]
        self.assertEqual(subject.explainParams(), "\n".join(expectedLines))
Пример #30
0
class _TrainValidationSplitParams(_ValidatorParams):
    """
    Params for :py:class:`TrainValidationSplit` and :py:class:`TrainValidationSplitModel`.

    .. versionadded:: 3.0.0
    """

    # The doc is built from adjacent string literals: a backslash continuation inside a single
    # literal would embed the next line's indentation into the text.
    trainRatio = Param(Params._dummy(),
                       "trainRatio",
                       "Param for ratio between train and "
                       "validation data. Must be between 0 and 1.",
                       typeConverter=TypeConverters.toFloat)

    @since("2.0.0")
    def getTrainRatio(self):
        """
        Gets the value of trainRatio or its default value.
        """
        return self.getOrDefault(self.trainRatio)
Пример #31
0
class HasUpdateHoldout(Params):
    """
    Mixin for the ``updateHoldout`` param: after how many samples the mixture starts
    calculating estimates.
    """

    # Adjacent literals carry an explicit trailing space so the two sentences do not run
    # together in the rendered param description.
    updateHoldout = Param(
        Params._dummy(), "updateHoldout",
        "Controls after how many samples the mixture will start calculating estimates. "
        "Preventing update in first few samples might be preferred for stability.",
        TypeConverters.toInt)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasUpdateHoldout, self).__init__()

    def getUpdateHoldout(self):
        """
        Gets the value of update holdout or its default value
        """
        return self.getOrDefault(self.updateHoldout)
Пример #32
0
class HasDecayRate(Params):
    """
    Mixin for the ``decayRate`` param: the rate of a decaying step size.
    """
    # Adjacent literals carry explicit trailing spaces so the sentences do not run together
    # in the rendered param description.
    decayRate = Param(
        Params._dummy(), "decayRate",
        "Step size as a decaying function rather than a constant, which might be preferred in "
        "batch training. If set, the step size will be replaced with the output of the function "
        "stepSize = (2 + kIter)**(-decayRate)",
        TypeConverters.toFloat)

    def __init__(self):
        # No default is registered here; the constructor only chains to the parent.
        super(HasDecayRate, self).__init__()

    def getDecayRate(self):
        """
        Gets the value of decayRate or its default value.
        """
        return self.getOrDefault(self.decayRate)

    def getDecayingStepSizeEnabled(self):
        """
        Gets the value of decayRate or its default value.

        Kept for backward compatibility: despite its name, this returns the decay rate
        itself rather than a flag. Prefer :py:meth:`getDecayRate`.
        """
        return self.getOrDefault(self.decayRate)
Пример #33
0
    def test_params(self):
        # Checks the Params bookkeeping of TestParams: the param list, default/set/defined
        # state, the error raised for a non-member param and the explainParams() output.
        tested = TestParams()
        maxIterParam = tested.maxIter
        inputColParam = tested.inputCol
        seedParam = tested.seed

        declared = tested.params
        self.assertEqual(declared, [inputColParam, maxIterParam, seedParam])

        # maxIter starts out with only a default value (10); setting it overrides the default.
        self.assertTrue(tested.hasParam(maxIterParam.name))
        self.assertTrue(tested.hasDefault(maxIterParam))
        self.assertFalse(tested.isSet(maxIterParam))
        self.assertTrue(tested.isDefined(maxIterParam))
        self.assertEqual(tested.getMaxIter(), 10)
        tested.setMaxIter(100)
        self.assertTrue(tested.isSet(maxIterParam))
        self.assertEqual(tested.getMaxIter(), 100)

        # inputCol has neither a default nor a value, so its getter raises.
        self.assertTrue(tested.hasParam(inputColParam.name))
        self.assertFalse(tested.hasDefault(inputColParam))
        self.assertFalse(tested.isSet(inputColParam))
        self.assertFalse(tested.isDefined(inputColParam))
        self.assertRaises(KeyError, tested.getInputCol)

        # set() must refuse a param that does not belong to the instance.
        strayParam = Param(
            Params._dummy(), "otherParam",
            "Parameter used to test that set raises an error for a non-member parameter.",
            typeConverter=TypeConverters.toString)
        self.assertRaises(ValueError, tested.set, strayParam, "value")

        # Since the default is normally random, set it to a known number for debug str
        tested._setDefault(seed=41)
        tested.setSeed(43)

        expected = "\n".join((
            "inputCol: input column name. (undefined)",
            "maxIter: max number of iterations (>= 0). (default: 10, current: 100)",
            "seed: random seed. (default: 41, current: 43)",
        ))
        self.assertEqual(tested.explainParams(), expected)