def __init__(self, sklearnEstimator=None, keyCols=None, xCol=None, outputCol=None,
             yCol=None, estimatorType=None, keyedSklearnEstimators=None, outputType=None):
    """The constructor is used by :class:`KeyedEstimator` to generate a
    :class:`KeyedModel`; it is not intended for external use."""
    isPredictor = estimatorType == "predictor"
    assert isPredictor == (yCol is not None), \
        "yCol is {}, but it should {}be None for a {} estimatorType".format(
            yCol, "not " if isPredictor else "", estimatorType)
    assert estimatorType in ["transformer", "clusterer", "predictor"], estimatorType

    def implies(a, b):
        return not a or b
    assert implies(estimatorType == "transformer", outputType == Vector.__UDT__), outputType
    assert implies(estimatorType == "clusterer", outputType == LongType()), outputType
    assert len(keyCols) > 0, len(keyCols)
    assert set(keyedSklearnEstimators.columns) == (set(keyCols) | {"estimator"}), \
        ("keyedSklearnEstimators columns {} should have both key columns {} and "
         "an estimator column").format(keyedSklearnEstimators.columns, keyCols)
    # The superclass expects Param attributes to already be set, so we only init it after
    # doing so.
    for paramName, paramSpec in KeyedModel._paramSpecs.items():
        setattr(self, paramName, Param(Params._dummy(), paramName, paramSpec["doc"]))
    super(KeyedModel, self).__init__()
    if yCol and type(outputType) not in KeyedModel._sql_types:
        raise TypeError("Output type {} is not an AtomicType (expected for {} estimator)"
                        .format(outputType, estimatorType))
    self._set(**self._input_kwargs)

def __init__(self, sklearnEstimator=None, keyCols=["key"], xCol="features", outputCol="output", yCol=None, estimatorType=None): """For all instances, the ordered list of ``keyCols`` determine the set of groups which each ``sklearnEstimator`` is applied to. For every unique ``keyCols`` value, the remaining columns are aggregated and used to train the scikit-learn estimator. ``estimatorType`` inference is conducted as follows: if ``yCol`` is specified, then this is assumed to be of ``"predictor"`` type, else a ``"transformer"`` or a ``"clusterer"``, depending on the estimator having the ``transform()`` or ``fit_predict()`` attributes, with ``"clusterer"`` being chosen in case both attributes are present. :param sklearnEstimator: An instance of a scikit-learn estimator, with parameters configured as desired for each user. :param keyCols: Key column names list used to group data to which models are applied, where order implies lexicographical importance. :param xCol: Name of column of input features used for training and transformation/prediction. :param yCol: Specifies name of label column for regression or classification pipelines. Required for predictors, must be unspecified or ``None`` for transformers. :param estimatorType: Identifies the type of scikit-learn estimator being used, which changes the interface the ``sklearnEstimator`` is expected to have. This parameter's value is inferred using reflection by default, but may be manually overriden. :raise ValueError: if ``sklearnEstimator`` is ``None``. :raise ValueError: if ``sklearnEstimator`` does not derive from ``sklearn.base.BaseEstimator``. :raise ValueError: if ``keyCols`` is empty. :raise ValueError: if any column has the name ``"estimator"`` :raise AttributeError: if reflection checks indicate that parameter estimator is not equipped with a ``fit()`` method. """ if sklearnEstimator is None: raise ValueError("sklearnEstimator should be specified") if not isinstance(sklearnEstimator, sklearn.base.BaseEstimator): raise ValueError("sklearnEstimator should be an sklearn.base.BaseEstimator") if len(keyCols) == 0: raise ValueError("keyCols should not be empty") if "estimator" in keyCols + [xCol, yCol]: raise ValueError("keyCols should not contain a column named \"estimator\"") # The superclass expects Param attributes to already be set, so we only init it after # doing so. for paramName, paramSpec in KeyedEstimator._paramSpecs.items(): setattr(self, paramName, Param(Params._dummy(), paramName, paramSpec["doc"])) super(KeyedEstimator, self).__init__() self._setDefault(**{paramName: paramSpec["default"] for paramName, paramSpec in KeyedEstimator._paramSpecs.items() if "default" in paramSpec}) kwargs = KeyedEstimator._inferredParams(sklearnEstimator, self._input_kwargs) self._set(**kwargs) self._verifyEstimatorType()
class HasTFInputGraph(Params):
    """
    Mixin for param tfInputGraph: a serializable object derived from a TensorFlow
    computation graph.
    """
    tfInputGraph = Param(Params._dummy(), "tfInputGraph",
                         "A serializable object derived from a TensorFlow computation graph",
                         typeConverter=SparkDLTypeConverters.toTFInputGraph)

    def __init__(self):
        super(HasTFInputGraph, self).__init__()
        self._setDefault(tfInputGraph=None)

    def setTFInputGraph(self, value):
        return self._set(tfInputGraph=value)

    def getTFInputGraph(self):
        return self.getOrDefault(self.tfInputGraph)

class HasLabelCol(Params):
    """
    When training Keras image models in a supervised learning setting, users will provide
    a :py:obj:`DataFrame` column with the labels.

    .. note:: The Estimator expects this column to contain data directly usable for the
              Keras model. This usually means that the labels are already encoded in
              one-hot format. Please consider adding a :py:obj:`OneHotEncoder` to
              transform the label column.
    """
    labelCol = Param(Params._dummy(), "labelCol",
                     "name of the column storing the training data labels",
                     typeConverter=TypeConverters.toString)

    def setLabelCol(self, value):
        return self._set(labelCol=value)

    def getLabelCol(self):
        return self.getOrDefault(self.labelCol)

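# A hedged sketch of the note's suggestion: one-hot encode integer class
# indices before training, using the Spark 3.x OneHotEncoder API. Assumes an
# active SparkSession named `spark`; the column names are illustrative.
from pyspark.ml.feature import OneHotEncoder

labels = spark.createDataFrame([(0,), (1,), (2,)], ["labelIndex"])
encoder = OneHotEncoder(inputCols=["labelIndex"], outputCols=["label"],
                        dropLast=False)  # keep every category for true one-hot labels
encoded = encoder.fit(labels).transform(labels)  # "label" now holds one-hot vectors
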
class _CrossValidatorParams(_ValidatorParams):
    """
    Params for :py:class:`CrossValidator` and :py:class:`CrossValidatorModel`.

    .. versionadded:: 3.0.0
    """
    numFolds = Param(Params._dummy(), "numFolds",
                     "number of folds for cross validation",
                     typeConverter=TypeConverters.toInt)

    @since("1.4.0")
    def getNumFolds(self):
        """
        Gets the value of numFolds or its default value.
        """
        return self.getOrDefault(self.numFolds)

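# For context, a standard pyspark.ml tuning setup where numFolds is consumed.
# The estimator, grid, and data are illustrative; `train` would need
# "features" and "label" columns.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(), numFolds=3)
# cvModel = cv.fit(train)
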
class _HasInitialRatesCol(Params):
    """
    Mixin for initial Poisson rates parameter, read from a DataFrame column.
    """
    initialRatesCol = Param(
        Params._dummy(),
        "initialRatesCol",
        "Initial poisson rates of mixtures from dataframe column",
        TypeConverters.toString)

    def __init__(self):
        super(_HasInitialRatesCol, self).__init__()

    def getInitialRatesCol(self):
        """
        Gets the value of initial rates column or its default value.
        """
        return self.getOrDefault(self.initialRatesCol)

class _HasInitialRates(Params):
    """
    Mixin for initial Poisson rates parameter.
    """
    initialRates = Param(
        Params._dummy(),
        "initialRates",
        "Initial poisson rates of mixtures, as a list of floats",
        TypeConverters.toListFloat)

    def __init__(self):
        super(_HasInitialRates, self).__init__()

    def getInitialRates(self):
        """
        Gets the value of initial rates or its default value.
        """
        return self.getOrDefault(self.initialRates)

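# The two mixins above follow a value-vs-column pattern that recurs below: a
# constant applied to every state, or a per-row DataFrame column overriding it.
# A hypothetical resolver (not part of the library) sketching how a consumer
# might prefer the column when set and fall back to the constant otherwise.
import pyspark.sql.functions as F

class _InitialRatesResolver(_HasInitialRates, _HasInitialRatesCol):
    def _initialRatesExpr(self):
        # The column variant wins when defined; otherwise lift the constant
        # list into an array column so downstream code sees a uniform type.
        if self.isDefined(self.initialRatesCol):
            return F.col(self.getInitialRatesCol())
        return F.array([F.lit(r) for r in self.getInitialRates()])
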
class HasKerasOptimizer(Params):
    """
    Mixin for param kerasOptimizer: name of the optimizer for training a Keras model.
    """
    kerasOptimizer = Param(
        Params._dummy(),
        "kerasOptimizer",
        "Name of the optimizer for training a Keras model",
        typeConverter=SparkDLTypeConverters.toKerasOptimizer)

    def __init__(self):
        super(HasKerasOptimizer, self).__init__()
        # NOTE(phi-dbq): This is the recommended optimizer as of September 2017.
        self._setDefault(kerasOptimizer='adam')

    def setKerasOptimizer(self, value):
        return self._set(kerasOptimizer=value)

    def getKerasOptimizer(self):
        return self.getOrDefault(self.kerasOptimizer)

class DeepImageFeaturizer(Transformer, HasInputCol, HasOutputCol):
    """
    Applies the model specified by its popular name, with its prediction layer(s) chopped
    off, to the image column in a DataFrame. The output is an MLlib Vector so that
    DeepImageFeaturizer can be used in an MLlib Pipeline.
    """
    modelName = Param(
        Params._dummy(),
        "modelName",
        "A deep learning model name",
        typeConverter=SparkDLTypeConverters.supportedNameConverter(SUPPORTED_MODELS))

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, modelName=None):
        """
        __init__(self, inputCol=None, outputCol=None, modelName=None)
        """
        super(DeepImageFeaturizer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, modelName=None):
        """
        setParams(self, inputCol=None, outputCol=None, modelName=None)
        """
        kwargs = self._input_kwargs
        self._set(**kwargs)
        return self

    def setModelName(self, value):
        return self._set(modelName=value)

    def getModelName(self):
        return self.getOrDefault(self.modelName)

    def _transform(self, dataset):
        transformer = _NamedImageTransformer(inputCol=self.getInputCol(),
                                             outputCol=self.getOutputCol(),
                                             modelName=self.getModelName(),
                                             featurize=True)
        return transformer.transform(dataset)

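# Typical transfer-learning usage, per the sparkdl pattern: the featurizer
# feeds a plain MLlib classifier inside a Pipeline. The model name and the
# DataFrame `imageDF` (with an "image" column) are illustrative assumptions.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[featurizer, lr])
# pipelineModel = pipeline.fit(imageDF)
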
class HasMeasurementModel(Params):
    """
    Mixin for param measurement model matrix.
    """
    measurementModel = Param(
        Params._dummy(),
        "measurementModel",
        "Measurement matrix, when multiplied with the state it should give the measurement vector",
        typeConverter=TypeConverters.toMatrix)

    def __init__(self):
        super(HasMeasurementModel, self).__init__()

    def getMeasurementModel(self):
        """
        Gets the value of measurement model matrix or its default value.
        """
        return self.getOrDefault(self.measurementModel)

class HasMeasurementCol(Params):
    """
    Mixin for param for measurement column.
    """
    measurementCol = Param(
        Params._dummy(),
        "measurementCol",
        "Column name for measurement vector. Missing measurements are allowed with nulls in the data",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasMeasurementCol, self).__init__()

    def getMeasurementCol(self):
        """
        Gets the value of measurement column or its default value.
        """
        return self.getOrDefault(self.measurementCol)

class HasEmbeddingsProperties(Params):
    dimension = Param(Params._dummy(),
                      "dimension",
                      "Number of embedding dimensions",
                      typeConverter=TypeConverters.toInt)

    def setDimension(self, value):
        """Sets embeddings dimension.

        Parameters
        ----------
        value : int
            Embeddings dimension
        """
        return self._set(dimension=value)

    def getDimension(self):
        """Gets embeddings dimension."""
        return self.getOrDefault(self.dimension)

class HasBatchTrainTol(Params):
    """
    Mixin for batch train iteration stop tolerance.
    """
    batchTrainTol = Param(
        Params._dummy(),
        "batchTrainTol",
        "Min change in loglikelihood to stop iterations in batch EM mode. Default is 0.1",
        TypeConverters.toFloat)

    def __init__(self):
        super(HasBatchTrainTol, self).__init__()

    def getBatchTrainTol(self):
        """
        Gets the value of batchTrainTol or its default value.
        """
        return self.getOrDefault(self.batchTrainTol)

class HasUpdateHoldoutCol(Params):
    """
    Mixin for update holdout parameter, read from a DataFrame column.
    """
    updateHoldoutCol = Param(
        Params._dummy(),
        "updateHoldoutCol",
        "updateHoldout from dataframe column rather than a constant value across all states",
        TypeConverters.toString)

    def __init__(self):
        super(HasUpdateHoldoutCol, self).__init__()

    def getUpdateHoldoutCol(self):
        """
        Gets the value of update holdout column or its default value.
        """
        return self.getOrDefault(self.updateHoldoutCol)

class HasSampleCol(Params):
    """
    Mixin for sample column parameter.
    """
    sampleCol = Param(
        Params._dummy(),
        "sampleCol",
        "Column name for input to mixture models",
        TypeConverters.toString)

    def __init__(self):
        super(HasSampleCol, self).__init__()

    def getSampleCol(self):
        """
        Gets the value of sample column or its default value.
        """
        return self.getOrDefault(self.sampleCol)

class HasInitialMixtureModelCol(Params):
    """
    Mixin for initial mixture model parameter.
    """
    initialMixtureModelCol = Param(
        Params._dummy(),
        "initialMixtureModelCol",
        "Sets the initial mixture model from struct column conforming to mixture distribution",
        TypeConverters.toString)

    def __init__(self):
        super(HasInitialMixtureModelCol, self).__init__()

    def getInitialMixtureModelCol(self):
        """
        Gets the value of initial mixture model col or its default value.
        """
        return self.getOrDefault(self.initialMixtureModelCol)

class HasMultipleModelAdaptiveEstimationEnabled(Params):
    """
    Mixin for param for enabling Multiple Model Adaptive Estimation (MMAE) output mode.
    """
    multipleModelAdaptiveEstimationEnabled = Param(
        Params._dummy(),
        "multipleModelAdaptiveEstimationEnabled",
        "Flag for enabling Multiple Model Adaptive Estimation (MMAE) output mode. When enabled, " +
        "MMAE mode outputs a single state estimate from the output of all kalman states of the " +
        "transformer. States are weighted based on their sliding likelihood",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(HasMultipleModelAdaptiveEstimationEnabled, self).__init__()

    def getMultipleModelAdaptiveEstimationEnabled(self):
        """
        Gets the value of the MMAE output mode flag.
        """
        return self.getOrDefault(self.multipleModelAdaptiveEstimationEnabled)

class _HasLearningRate(Params):
    """
    Mixin for param Normalized LMS learning rate.
    """
    learningRate = Param(
        Params._dummy(),
        "learningRate",
        "Learning rate for Normalized LMS. If there is no interference, the default value of 1.0 is optimal",
        typeConverter=TypeConverters.toFloat)

    def __init__(self):
        super(_HasLearningRate, self).__init__()

    def getLearningRate(self):
        """
        Gets the value of learning rate or its default value.
        """
        return self.getOrDefault(self.learningRate)

class _HasRegularizationConstant(Params):
    """
    Mixin for param for regularization constant.
    """
    regularizationConstant = Param(
        Params._dummy(),
        "regularizationConstant",
        "Regularization term for stability, default is 1.0",
        typeConverter=TypeConverters.toFloat)

    def __init__(self):
        super(_HasRegularizationConstant, self).__init__()

    def getRegularizationConstant(self):
        """
        Gets the value of regularization constant or its default value.
        """
        return self.getOrDefault(self.regularizationConstant)

class HasStateTimeoutMode(Params):
    """
    Mixin for param for state timeout mode for clearing states without updates,
    one of "none", "process" or "event".
    """
    timeoutMode = Param(
        Params._dummy(),
        "timeoutMode",
        "Timeout mode for clearing the states that didn't receive measurements.",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasStateTimeoutMode, self).__init__()

    def getTimeoutMode(self):
        """
        Gets the value of timeout mode or its default value.
        """
        return self.getOrDefault(self.timeoutMode)

class HasEventTimeCol(Params):
    """
    Mixin for param for event time column.
    """
    eventTimeCol = Param(
        Params._dummy(),
        "eventTimeCol",
        "Column marking the event time of the received measurements",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasEventTimeCol, self).__init__()

    def getEventTimeCol(self):
        """
        Gets the value of event time column or its default value.
        """
        return self.getOrDefault(self.eventTimeCol)

class HasStateTimeoutDuration(Params):
    """
    Mixin for param for state timeout duration.
    """
    stateTimeoutDuration = Param(
        Params._dummy(),
        "stateTimeoutDuration",
        "Duration to wait before timing out the state",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasStateTimeoutDuration, self).__init__()

    def getStateTimeoutDuration(self):
        """
        Gets the value of state timeout duration or its default value.
        """
        return self.getOrDefault(self.stateTimeoutDuration)

class HasCalculateLoglikelihood(Params):
    """
    Mixin for param for enabling loglikelihood calculation.
    """
    calculateLoglikelihood = Param(
        Params._dummy(),
        "calculateLoglikelihood",
        "When true, loglikelihood of residual will be calculated & added to output DataFrame. " +
        "Default is false",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(HasCalculateLoglikelihood, self).__init__()

    def getCalculateLoglikelihood(self):
        """
        Gets the value of the loglikelihood calculation flag.
        """
        return self.getOrDefault(self.calculateLoglikelihood)

class HasInitialStateDistributionCol(Params):
    """
    Mixin for param for initial state distribution column.
    """
    initialStateDistributionCol = Param(
        Params._dummy(),
        "initialStateDistributionCol",
        "Parameter for initial state distribution as struct col",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasInitialStateDistributionCol, self).__init__()

    def getInitialStateDistributionCol(self):
        """
        Gets the value of initial distribution column or its default value.
        """
        return self.getOrDefault(self.initialStateDistributionCol)

class HasOutputSystemMatrices(Params):
    """
    Mixin for param for enabling the output of system matrices along with the state.
    """
    outputSystemMatrices = Param(
        Params._dummy(),
        "outputSystemMatrices",
        "When true, the system matrices will be added to output DataFrame. Default is false",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(HasOutputSystemMatrices, self).__init__()

    def getOutputSystemMatrices(self):
        """
        Gets the value of the output system matrices flag or its default value.
        """
        return self.getOrDefault(self.outputSystemMatrices)

class HasProcessModel(Params):
    """
    Mixin for param process model matrix.
    """
    processModel = Param(
        Params._dummy(),
        "processModel",
        "Process model matrix, transitions the state to the next state when applied",
        typeConverter=TypeConverters.toMatrix)

    def __init__(self):
        super(HasProcessModel, self).__init__()

    def getProcessModel(self):
        """
        Gets the value of process model matrix or its default value.
        """
        return self.getOrDefault(self.processModel)

class HasInitialStateCovarianceCol(Params):
    """
    Mixin for param for initial covariance column.
    """
    initialStateCovarianceCol = Param(
        Params._dummy(),
        "initialStateCovarianceCol",
        "Column name for initial state covariance matrix.",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasInitialStateCovarianceCol, self).__init__()

    def getInitialStateCovarianceCol(self):
        """
        Gets the value of initial covariance column or its default value.
        """
        return self.getOrDefault(self.initialStateCovarianceCol)

class HasBatchTrainMaxIter(Params):
    """
    Mixin for batch train max iterations.
    """
    batchTrainMaxIter = Param(
        Params._dummy(),
        "batchTrainMaxIter",
        "Maximum iterations in batch train mode, default is 30",
        TypeConverters.toInt)

    def __init__(self):
        super(HasBatchTrainMaxIter, self).__init__()

    def getBatchTrainMaxIter(self):
        """
        Gets the value of batchTrainMaxIter or its default value.
        """
        return self.getOrDefault(self.batchTrainMaxIter)

def test_params(self):
    testParams = TestParams()
    maxIter = testParams.maxIter
    inputCol = testParams.inputCol
    seed = testParams.seed

    params = testParams.params
    self.assertEqual(params, [inputCol, maxIter, seed])

    self.assertTrue(testParams.hasParam(maxIter.name))
    self.assertTrue(testParams.hasDefault(maxIter))
    self.assertFalse(testParams.isSet(maxIter))
    self.assertTrue(testParams.isDefined(maxIter))
    self.assertEqual(testParams.getMaxIter(), 10)
    testParams.setMaxIter(100)
    self.assertTrue(testParams.isSet(maxIter))
    self.assertEqual(testParams.getMaxIter(), 100)

    self.assertTrue(testParams.hasParam(inputCol.name))
    self.assertFalse(testParams.hasDefault(inputCol))
    self.assertFalse(testParams.isSet(inputCol))
    self.assertFalse(testParams.isDefined(inputCol))
    with self.assertRaises(KeyError):
        testParams.getInputCol()

    otherParam = Param(Params._dummy(), "otherParam",
                       "Parameter used to test that set raises an error for "
                       "a non-member parameter.",
                       typeConverter=TypeConverters.toString)
    with self.assertRaises(ValueError):
        testParams.set(otherParam, "value")

    # Since the default is normally random, set it to a known number for debug str
    testParams._setDefault(seed=41)
    testParams.setSeed(43)

    self.assertEqual(
        testParams.explainParams(),
        "\n".join(["inputCol: input column name. (undefined)",
                   "maxIter: max number of iterations (>= 0). (default: 10, current: 100)",
                   "seed: random seed. (default: 41, current: 43)"]))

class _TrainValidationSplitParams(_ValidatorParams):
    """
    Params for :py:class:`TrainValidationSplit` and :py:class:`TrainValidationSplitModel`.

    .. versionadded:: 3.0.0
    """
    trainRatio = Param(Params._dummy(), "trainRatio",
                       "Param for ratio between train and validation data. "
                       "Must be between 0 and 1.",
                       typeConverter=TypeConverters.toFloat)

    @since("2.0.0")
    def getTrainRatio(self):
        """
        Gets the value of trainRatio or its default value.
        """
        return self.getOrDefault(self.trainRatio)

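# For context, how trainRatio is consumed by TrainValidationSplit in
# pyspark.ml.tuning; the estimator, grid, and data here are illustrative.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

lr = LinearRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.8)  # 80% train, 20% validation
# tvsModel = tvs.fit(train)
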
class HasUpdateHoldout(Params):
    """
    Mixin for update holdout parameter.
    """
    updateHoldout = Param(
        Params._dummy(),
        "updateHoldout",
        "Controls after how many samples the mixture will start calculating estimates. " +
        "Preventing update in first few samples might be preferred for stability.",
        TypeConverters.toInt)

    def __init__(self):
        super(HasUpdateHoldout, self).__init__()

    def getUpdateHoldout(self):
        """
        Gets the value of update holdout or its default value.
        """
        return self.getOrDefault(self.updateHoldout)

class HasDecayRate(Params):
    """
    Mixin for decaying step size parameter.
    """
    decayRate = Param(
        Params._dummy(),
        "decayRate",
        "Step size as a decaying function rather than a constant, which might be preferred " +
        "in batch training. If set, the step size will be replaced with the output of the " +
        "function stepSize = (2 + kIter)**(-decayRate)",
        TypeConverters.toFloat)

    def __init__(self):
        super(HasDecayRate, self).__init__()

    def getDecayingStepSizeEnabled(self):
        """
        Gets the value of the decay rate parameter or its default value.
        """
        return self.getOrDefault(self.decayRate)

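# A quick, standalone illustration of the schedule named in the param
# description, stepSize = (2 + kIter) ** (-decayRate): the step size shrinks
# monotonically toward zero as the iteration count grows. The value 0.85 is
# only an example, not a library default.
decayRate = 0.85
for kIter in range(4):
    stepSize = (2 + kIter) ** (-decayRate)
    print(kIter, round(stepSize, 4))
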