Example #1
def __init__(self):
    super(HasLabelCol, self).__init__()
    #: param for label column name
    self.labelCol = Param(self, "labelCol", "label column name")
    # the default 'label' is always non-None, so set it unconditionally
    self._setDefault(labelCol='label')
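
A minimal sketch of how such a shared-param mixin is typically consumed; the ExampleStage class is hypothetical and the import location is an assumption (newer PySpark keeps these mixins in pyspark.ml.param.shared):

from pyspark.ml.param.shared import HasLabelCol  # assumed import location

class ExampleStage(HasLabelCol):
    """Hypothetical stage that only mixes in the shared label-column param."""
    def __init__(self):
        super(ExampleStage, self).__init__()

stage = ExampleStage()
stage.getOrDefault(stage.labelCol)   # 'label', the default set in HasLabelCol.__init__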
Example #2
class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
    """
    Evaluator for Regression, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
    ...   (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = RegressionEvaluator(predictionCol="raw")
    >>> evaluator.evaluate(dataset)
    2.842...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
    0.993...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
    2.649...

    .. versionadded:: 1.4.0
    """
    # Because `CrossValidator` maximizes the evaluation value,
    # metrics that should be minimized (e.g., `"rmse"`, `"mse"`, `"mae"`)
    # are negated before being returned.
    metricName = Param(Params._dummy(),
                       "metricName",
                       "metric name in evaluation (mse|rmse|r2|mae)",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="rmse"):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="rmse")
        """
        super(RegressionEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
        self._setDefault(predictionCol="prediction",
                         labelCol="label",
                         metricName="rmse")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        self._set(metricName=value)
        return self

    @since("1.4.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("1.4.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="rmse"):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="rmse")
        Sets params for regression evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)
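
Because "rmse", "mse" and "mae" are negated in this version, the evaluator can be dropped straight into model selection, which simply maximizes the score. A rough sketch; LinearRegression, the param grid, and train_df are assumptions, not part of this example:

from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LinearRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
evaluator = RegressionEvaluator(metricName="rmse")  # negated internally, so larger is better
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
# cvModel = cv.fit(train_df)  # train_df needs a "features" Vector column and a "label" column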
Example #3
class CanLoadImage(Params):
    """
    In the standard Keras workflow, the user provides an image loading function
    that takes a file path URI and converts it to an image tensor ready
    to be fed to the desired Keras model.

    This parameter allows users to specify such an image loading function.
    When using inside a pipeline stage, calling this function on an input DataFrame
    will load each image from the image URI column, encode the image in
    our :py:obj:`~sparkdl.imageIO.imageSchema` format and store it in the
    :py:meth:`~_loadedImageCol` column.

    Below is an example ``image_loader`` function that loads images compatible
    with Xception (https://arxiv.org/abs/1610.02357).


    .. code-block:: python

        from keras.applications.xception import preprocess_input
        import numpy as np
        import PIL.Image

        def image_loader(uri):
            img = PIL.Image.open(uri).convert('RGB')
            img_resized = img.resize((299, 299), PIL.Image.ANTIALIAS)
            img_arr = np.array(img_resized).astype(np.float32)
            img_tnsr = preprocess_input(img_arr[np.newaxis, :])
            return img_tnsr
    """

    imageLoader = Param(
        Params._dummy(),
        "imageLoader",
        """Function containing the logic for loading and pre-processing images. The function
        should take in a URI string and return a 4-d numpy.array with shape (batch_size (1),
        height, width, num_channels). Expected to return result with color channels in RGB
        order.""")

    def setImageLoader(self, value):
        return self._set(imageLoader=value)

    def getImageLoader(self):
        return self.getOrDefault(self.imageLoader)

    def _loadedImageCol(self):  # pylint: disable=no-self-use
        return "__sdl_img"

    def loadImagesInternal(self, dataframe, inputCol):
        """
        Load image files specified in dataset as image format specified in `sparkdl.image.imageIO`.
        """
        # plan 1: udf(loader() + convert from np.array to imageSchema) -> call TFImageTransformer
        # plan 2: udf(loader()) ... we don't support np.array as a dataframe column type...
        loader = self.getImageLoader()
        # Loading from external resources can fail, so we allow None to be returned

        def load_image_uri_impl(uri):
            try:
                return imageArrayToStruct(_reverseChannels(loader(uri)))
            except BaseException:  # pylint: disable=bare-except
                return None
        load_udf = udf(load_image_uri_impl, ImageSchema.imageSchema['image'].dataType)
        return dataframe.withColumn(self._loadedImageCol(), load_udf(dataframe[inputCol]))
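
A hedged usage sketch: any pipeline stage that mixes in CanLoadImage can have the docstring's image_loader attached and applied. SomeKerasImageStage and uri_df below are placeholders, not sparkdl classes:

stage = SomeKerasImageStage()              # hypothetical stage mixing in CanLoadImage
stage.setImageLoader(image_loader)         # the Xception loader from the docstring above
loaded_df = stage.loadImagesInternal(uri_df, inputCol="uri")
# loaded_df now carries the "__sdl_img" column holding images in sparkdl's imageSchema format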
Example #4
class RankingEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
                       JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for Ranking, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [([1.0, 6.0, 2.0, 7.0, 8.0, 3.0, 9.0, 10.0, 4.0, 5.0],
    ...     [1.0, 2.0, 3.0, 4.0, 5.0]),
    ...     ([4.0, 1.0, 5.0, 6.0, 2.0, 7.0, 3.0, 8.0, 9.0, 10.0], [1.0, 2.0, 3.0]),
    ...     ([1.0, 2.0, 3.0, 4.0, 5.0], [])]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = RankingEvaluator(predictionCol="prediction")
    >>> evaluator.evaluate(dataset)
    0.35...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "precisionAtK", evaluator.k: 2})
    0.33...
    >>> ranke_path = temp_path + "/ranke"
    >>> evaluator.save(ranke_path)
    >>> evaluator2 = RankingEvaluator.load(ranke_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 3.0.0
    """
    metricName = Param(Params._dummy(),
                       "metricName", "metric name in evaluation "
                       "(meanAveragePrecision|meanAveragePrecisionAtK|"
                       "precisionAtK|ndcgAtK|recallAtK)",
                       typeConverter=TypeConverters.toString)
    k = Param(
        Params._dummy(),
        "k",
        "The ranking position value used in meanAveragePrecisionAtK|precisionAtK|"
        "ndcgAtK|recallAtK. Must be > 0. The default value is 10.",
        typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="meanAveragePrecision",
                 k=10):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="meanAveragePrecision", k=10)
        """
        super(RankingEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RankingEvaluator", self.uid)
        self._setDefault(metricName="meanAveragePrecision", k=10)
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @since("3.0.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("3.0.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setK(self, value):
        """
        Sets the value of :py:attr:`k`.
        """
        return self._set(k=value)

    @since("3.0.0")
    def getK(self):
        """
        Gets the value of k or its default value.
        """
        return self.getOrDefault(self.k)

    @keyword_only
    @since("3.0.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="meanAveragePrecision",
                  k=10):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="meanAveragePrecision", k=10)
        Sets params for ranking evaluator.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
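
For reference, precisionAtK and recallAtK follow the usual per-row definitions, averaged over the rows of the dataset (a standard restatement, not quoted from Spark):

    \text{precision@}k = \frac{|\,\text{top-}k(\text{prediction}) \cap \text{label}\,|}{k},
    \qquad
    \text{recall@}k = \frac{|\,\text{top-}k(\text{prediction}) \cap \text{label}\,|}{|\text{label}|}

With k=2 the precisionAtK doctest above averages per-row precisions of 0.5, 0.5 and 0.0, giving the 0.33... shown.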
Example #5
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol,
                                    HasRawPredictionCol):
    """
    Evaluator for binary classification, which expects two input
    columns: rawPrediction and label.

    >>> from pyspark.mllib.linalg import Vectors
    >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
    ...    [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw")
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.83...
    """

    # a placeholder to make it appear in the generated doc
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (areaUnderROC|areaUnderPR)")

    @keyword_only
    def __init__(self,
                 rawPredictionCol="rawPrediction",
                 labelCol="label",
                 metricName="areaUnderROC"):
        """
        __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
                 metricName="areaUnderROC")
        """
        super(BinaryClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator",
            self.uid)
        #: param for metric name in evaluation (areaUnderROC|areaUnderPR)
        self.metricName = Param(
            self, "metricName",
            "metric name in evaluation (areaUnderROC|areaUnderPR)")
        self._setDefault(rawPredictionCol="rawPrediction",
                         labelCol="label",
                         metricName="areaUnderROC")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        self._paramMap[self.metricName] = value
        return self

    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    def setParams(self,
                  rawPredictionCol="rawPrediction",
                  labelCol="label",
                  metricName="areaUnderROC"):
        """
        setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
                  metricName="areaUnderROC")
        Sets params for binary classification evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)
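
A rough end-to-end sketch of how this evaluator is usually fed; LogisticRegression and the train_df/test_df names are assumptions, not part of this example:

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label")
model = lr.fit(train_df)
scored = model.transform(test_df)            # adds a "rawPrediction" column
evaluator = BinaryClassificationEvaluator()  # defaults to rawPredictionCol="rawPrediction"
auc_roc = evaluator.evaluate(scored)
auc_pr = evaluator.evaluate(scored, {evaluator.metricName: "areaUnderPR"})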
Example #6
class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
                          HasWeightCol, JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for Regression, which expects input columns prediction, label
    and an optional weight column.

    >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
    ...   (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = RegressionEvaluator(predictionCol="raw")
    >>> evaluator.evaluate(dataset)
    2.842...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
    0.993...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
    2.649...
    >>> re_path = temp_path + "/re"
    >>> evaluator.save(re_path)
    >>> evaluator2 = RegressionEvaluator.load(re_path)
    >>> str(evaluator2.getPredictionCol())
    'raw'
    >>> scoreAndLabelsAndWeight = [(-28.98343821, -27.0, 1.0), (20.21491975, 21.5, 0.8),
    ...   (-25.98418959, -22.0, 1.0), (30.69731842, 33.0, 0.6), (74.69283752, 71.0, 0.2)]
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
    ...
    >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight")
    >>> evaluator.evaluate(dataset)
    2.740...

    .. versionadded:: 1.4.0
    """
    metricName = Param(Params._dummy(),
                       "metricName",
                       """metric name in evaluation - one of:
                       rmse - root mean squared error (default)
                       mse - mean squared error
                       r2 - r^2 metric
                       mae - mean absolute error.""",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="rmse",
                 weightCol=None):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="rmse", weightCol=None)
        """
        super(RegressionEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
        self._setDefault(metricName="rmse")
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.4.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("1.4.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="rmse",
                  weightCol=None):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="rmse", weightCol=None)
        Sets params for regression evaluator.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
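
The weighted doctest value follows the conventional weighted-RMSE definition (my own restatement, not quoted from Spark):

    \mathrm{RMSE}_w = \sqrt{\frac{\sum_i w_i\,(\hat{y}_i - y_i)^2}{\sum_i w_i}}

Plugging the five (prediction, label, weight) rows above into this formula gives about 2.74, matching the doctest.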
Example #7
class MultilabelClassificationEvaluator(JavaEvaluator, HasLabelCol,
                                        HasPredictionCol, JavaMLReadable,
                                        JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for Multilabel Classification, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]),
    ...     ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]),
    ...     ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = MultilabelClassificationEvaluator(predictionCol="prediction")
    >>> evaluator.evaluate(dataset)
    0.63...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.54...
    >>> mlce_path = temp_path + "/mlce"
    >>> evaluator.save(mlce_path)
    >>> evaluator2 = MultilabelClassificationEvaluator.load(mlce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 3.0.0
    """
    metricName = Param(
        Params._dummy(),
        "metricName", "metric name in evaluation "
        "(subsetAccuracy|accuracy|hammingLoss|precision|recall|f1Measure|"
        "precisionByLabel|recallByLabel|f1MeasureByLabel|microPrecision|"
        "microRecall|microF1Measure)",
        typeConverter=TypeConverters.toString)
    metricLabel = Param(
        Params._dummy(),
        "metricLabel",
        "The class whose metric will be computed in precisionByLabel|"
        "recallByLabel|f1MeasureByLabel. "
        "Must be >= 0. The default value is 0.",
        typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="f1Measure",
                 metricLabel=0.0):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1Measure", metricLabel=0.0)
        """
        super(MultilabelClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MultilabelClassificationEvaluator",
            self.uid)
        self._setDefault(metricName="f1Measure", metricLabel=0.0)
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @since("3.0.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("3.0.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setMetricLabel(self, value):
        """
        Sets the value of :py:attr:`metricLabel`.
        """
        return self._set(metricLabel=value)

    @since("3.0.0")
    def getMetricLabel(self):
        """
        Gets the value of metricLabel or its default value.
        """
        return self.getOrDefault(self.metricLabel)

    @keyword_only
    @since("3.0.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="f1Measure",
                  metricLabel=0.0):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1Measure", metricLabel=0.0)
        Sets params for multilabel classification evaluator.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
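
The default f1Measure is the example-based F1: per row, the overlap between the predicted and true label sets, averaged over rows (standard definition, consistent with the 0.63... doctest above, whose seven rows average to about 0.638):

    F_1 = \frac{1}{N}\sum_{i=1}^{N} \frac{2\,|P_i \cap L_i|}{|P_i| + |L_i|}

where P_i is the predicted label set and L_i the true label set of row i.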
Example #8
def __init__(self):
    super(HasFeaturesCol, self).__init__()
    #: param for features column name
    self.featuresCol = Param(self, "featuresCol", "features column name")
    # the default 'features' is always non-None, so set it unconditionally
    self._setDefault(featuresCol='features')
Example #9
class TFImageTransformer(Transformer, HasInputCol, HasOutputCol, HasOutputMode):
    """
    Applies the Tensorflow graph to the image column in DataFrame.

    Restrictions of the current API:

    * Does not use minibatches, which is a major low-hanging fruit for performance.
    * Only one output node can be specified.
    * The output is expected to be an image or a 1-d vector.
    * All images in the dataframe are expected to be of the same numerical data type
      (i.e. the dtype of the values in the numpy array representation is the same.)

    We assume all graphs have a "minibatch" dimension (i.e. an unknown leading
    dimension) in the tensor shapes.

    .. note:: The input tensorflow graph should have its weights frozen into constants,
              since a new session is created inside this transformer.
    """

    USER_GRAPH_NAMESPACE = 'given'
    NEW_OUTPUT_PREFIX = 'sdl_flattened'

    graph = Param(Params._dummy(), "graph", "A TensorFlow computation graph",
                  typeConverter=SparkDLTypeConverters.toTFGraph)
    inputTensor = Param(Params._dummy(), "inputTensor",
                        "A TensorFlow tensor object or name representing the input image",
                        typeConverter=SparkDLTypeConverters.toStringOrTFTensor)
    outputTensor = Param(Params._dummy(), "outputTensor",
                         "A TensorFlow tensor object or name representing the output",
                         typeConverter=SparkDLTypeConverters.toStringOrTFTensor)

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, graph=None,
                 inputTensor=utils.IMAGE_INPUT_PLACEHOLDER_NAME, outputTensor=None,
                 outputMode="vector"):
        """
        __init__(self, inputCol=None, outputCol=None, graph=None,
                 inputTensor=utils.IMAGE_INPUT_PLACEHOLDER_NAME, outputTensor=None,
                 outputMode="vector")
        """
        super(TFImageTransformer, self).__init__()
        self._setDefault(inputTensor=utils.IMAGE_INPUT_PLACEHOLDER_NAME)
        self._setDefault(outputMode="vector")
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, graph=None,
                  inputTensor=utils.IMAGE_INPUT_PLACEHOLDER_NAME, outputTensor=None,
                  outputMode="vector"):
        """
        setParams(self, inputCol=None, outputCol=None, graph=None,
                  inputTensor=utils.IMAGE_INPUT_PLACEHOLDER_NAME, outputTensor=None,
                  outputMode="vector")
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setGraph(self, value):
        return self._set(graph=value)

    def setInputTensor(self, value):
        return self._set(inputTensor=value)

    def setOutputTensor(self, value):
        return self._set(outputTensor=value)

    def getGraph(self):
        return self.getOrDefault(self.graph)

    def getInputTensor(self):
        tensor_or_name = self.getOrDefault(self.inputTensor)
        if isinstance(tensor_or_name, tf.Tensor):
            return tensor_or_name
        else:
            return self.getGraph().get_tensor_by_name(tensor_or_name)

    def getOutputTensor(self):
        tensor_or_name = self.getOrDefault(self.outputTensor)
        if isinstance(tensor_or_name, tf.Tensor):
            return tensor_or_name
        else:
            return self.getGraph().get_tensor_by_name(tensor_or_name)

    def _transform(self, dataset):
        graph = self.getGraph()
        composed_graph = self._addReshapeLayers(graph, self._getImageDtype(dataset))
        final_graph = self._stripGraph(composed_graph)

        with final_graph.as_default():
            image = dataset[self.getInputCol()]
            image_df_exploded = (dataset
              .withColumn("__sdl_image_height", image.height)
              .withColumn("__sdl_image_width", image.width)
              .withColumn("__sdl_image_nchannels", image.nChannels)
              .withColumn("__sdl_image_data", image.data)
            )

            final_output_name = self._getFinalOutputTensorName()
            output_tensor = final_graph.get_tensor_by_name(final_output_name)
            final_df = (
                tfs.map_rows([output_tensor], image_df_exploded,
                             feed_dict={
                                 "height": "__sdl_image_height",
                                 "width": "__sdl_image_width",
                                 "num_channels": "__sdl_image_nchannels",
                                 "image_buffer": "__sdl_image_data"})
                .drop("__sdl_image_height", "__sdl_image_width", "__sdl_image_nchannels",
                      "__sdl_image_data")
            )

            tfs_output_name = tfx.op_name(final_graph, output_tensor)
            original_output_name = self._getOriginalOutputTensorName()
            output_shape = final_graph.get_tensor_by_name(original_output_name).shape
            output_mode = self.getOrDefault(self.outputMode)
            # TODO: support non-1d tensors (return np.array).
            if output_mode == "image":
                return self._convertOutputToImage(final_df, tfs_output_name, output_shape)
            else:
                assert output_mode == "vector", "Unknown output mode: %s" % output_mode
                return self._convertOutputToVector(final_df, tfs_output_name)

    def _getImageDtype(self, dataset):
        # This may not be the best way to get the type of image, but it is one way.
        # Assumes that the dtype for all images is the same in the given dataframe.
        pdf = dataset.select(self.getInputCol()).take(1)
        img = pdf[0][self.getInputCol()]
        img_type = sparkModeLookup[img.mode]
        return img_type.dtype

    def _addReshapeLayers(self, tf_graph, dtype="uint8"):
        input_tensor_name = self.getInputTensor().name

        gdef = tf_graph.as_graph_def(add_shapes=True)
        g = tf.Graph()
        with g.as_default():
            # Flat image data -> image dimensions
            height = tf.placeholder(tf.int32, [], name="height")
            width = tf.placeholder(tf.int32, [], name="width")
            num_channels = tf.placeholder(tf.int32, [], name="num_channels")
            image_buffer = tf.placeholder(tf.string, [], name="image_buffer")
            # Note: the shape argument is required for tensorframes as it uses a
            # slightly older version of tensorflow.
            shape = tf.reshape(tf.stack([height, width, num_channels], axis=0), shape=(3,),
                               name='shape')
            if dtype == "uint8":
                image_uint8 = tf.decode_raw(image_buffer, tf.uint8, name="decode_raw")
                image_float = tf.to_float(image_uint8)
            else:
                assert dtype == SparkMode.FLOAT32, "Unsupported dtype for image: %s" % dtype
                image_float = tf.decode_raw(image_buffer, tf.float32, name="decode_raw")
            image_reshaped = tf.reshape(image_float, shape, name="reshaped")
            image_reshaped_expanded = tf.expand_dims(image_reshaped, 0, name="expanded")

            # Add on the original graph
            tf.import_graph_def(gdef, input_map={input_tensor_name: image_reshaped_expanded},
                                return_elements=[self.getOutputTensor().name],
                                name=self.USER_GRAPH_NAMESPACE)

            # Flatten the output for tensorframes
            output_node = g.get_tensor_by_name(self._getOriginalOutputTensorName())
            _ = tf.reshape(output_node[0],  # batch-size = 1,
                           shape=[-1], name=self._getFinalOutputOpName())
        return g

    # Sometimes the tf graph contains a bunch of stuff that doesn't lead to the
    # output. TensorFrames does not like that, so we strip out the parts that
    # are not necessary for the computation at hand.
    def _stripGraph(self, tf_graph):
        gdef = tfx.strip_and_freeze_until([self._getFinalOutputOpName()], tf_graph)
        g = tf.Graph()
        with g.as_default():
            tf.import_graph_def(gdef, name='')
        return g

    def _getOriginalOutputTensorName(self):
        return self.USER_GRAPH_NAMESPACE + '/' + self.getOutputTensor().name

    def _getFinalOutputTensorName(self):
        return self.NEW_OUTPUT_PREFIX + '_' + self.getOutputTensor().name

    def _getFinalOutputOpName(self):
        return tfx.as_op_name(self._getFinalOutputTensorName())

    def _convertOutputToImage(self, df, tfs_output_col, output_shape):
        assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
        height = int(output_shape[1])
        width = int(output_shape[2])
        def to_image(orig_image, numeric_data):
            # Assume the returned image has float pixels but same #channels as input
            mode = orig_image.mode if orig_image.mode == "float32" else "RGB-float32"
            return [mode, height, width, orig_image.nChannels,
                    bytearray(np.array(numeric_data).astype(np.float32).tobytes())]
        to_image_udf = udf(to_image, imageSchema)
        return (
            df.withColumn(self.getOutputCol(),
                          to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
              .drop(tfs_output_col)
        )

    def _convertOutputToVector(self, df, tfs_output_col):
        """
        Converts the output python list to MLlib Vector.
        """
        return (
            df.withColumn(self.getOutputCol(), JVMAPI.listToMLlibVectorUDF(df[tfs_output_col]))
              .drop(tfs_output_col)
        )
Example #10
def __init__(self):
    super(HasMaxIter, self).__init__()
    #: param for max number of iterations
    self.maxIter = Param(self, "maxIter", "max number of iterations")
    # maxIter has no default value, so nothing is set here
Example #11
def __init__(self):
    super(HasRegParam, self).__init__()
    #: param for regularization constant
    self.regParam = Param(self, "regParam", "regularization constant")
    # regParam has no default value, so nothing is set here
Example #12
def __init__(self):
    super(HasNumFeatures, self).__init__()
    #: param for number of features
    self.numFeatures = Param(self, "numFeatures", "number of features")
    # numFeatures has no default value, so nothing is set here
Example #13
def __init__(self):
    super(HasOutputCol, self).__init__()
    #: param for output column name
    self.outputCol = Param(self, "outputCol", "output column name")
    # outputCol has no default value, so nothing is set here
Example #14
def __init__(self):
    super(HasInputCols, self).__init__()
    #: param for input column names
    self.inputCols = Param(self, "inputCols", "input column names")
    # inputCols has no default value, so nothing is set here
Example #15
class TFImageTransformer(Transformer, HasInputCol, HasOutputCol,
                         HasOutputMode):
    """
    Applies the Tensorflow graph to the image column in DataFrame.

    Restrictions of the current API:

    * Does not use minibatches, which is a major low-hanging fruit for performance.
    * Only one output node can be specified.
    * The output is expected to be an image or a 1-d vector.
    * All images in the dataframe are expected to be of the same numerical data type
      (i.e. the dtype of the values in the numpy array representation is the same.)

    We assume all graphs have a "minibatch" dimension (i.e. an unknown leading
    dimension) in the tensor shapes.

    .. note:: The input tensorflow graph should have its weights frozen into constants,
              since a new session is created inside this transformer.
    """

    graph = Param(Params._dummy(),
                  "graph",
                  "A TensorFlow computation graph",
                  typeConverter=SparkDLTypeConverters.toTFGraph)
    inputTensor = Param(
        Params._dummy(),
        "inputTensor",
        "A TensorFlow tensor object or name representing the input image",
        typeConverter=SparkDLTypeConverters.toTFTensorName)
    outputTensor = Param(
        Params._dummy(),
        "outputTensor",
        "A TensorFlow tensor object or name representing the output",
        typeConverter=SparkDLTypeConverters.toTFTensorName)
    channelOrder = Param(
        Params._dummy(),
        "channelOrder",
        "Strign specifying the expected color channel order, can be one of L,RGB,BGR",
        typeConverter=SparkDLTypeConverters.toChannelOrder)

    @keyword_only
    def __init__(self,
                 channelOrder,
                 inputCol=None,
                 outputCol=None,
                 graph=None,
                 inputTensor=IMAGE_INPUT_TENSOR_NAME,
                 outputTensor=None,
                 outputMode="vector"):
        """
        __init__(self, channelOrder, inputCol=None, outputCol=None, graph=None,
                 inputTensor=IMAGE_INPUT_TENSOR_NAME, outputTensor=None, outputMode="vector")
        :param channelOrder: ordering of the color channels; one of RGB,
            BGR, or L (grayscale)
        """
        super(TFImageTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
        self._setDefault(inputTensor=IMAGE_INPUT_TENSOR_NAME)
        self.channelOrder = channelOrder

    @keyword_only
    def setParams(self,
                  channelOrder=None,
                  inputCol=None,
                  outputCol=None,
                  graph=None,
                  inputTensor=IMAGE_INPUT_TENSOR_NAME,
                  outputTensor=None,
                  outputMode="vector"):
        """
        setParams(self, channelOrder=None, inputCol=None, outputCol=None, graph=None,
                  inputTensor=IMAGE_INPUT_TENSOR_NAME, outputTensor=None, outputMode="vector")
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setGraph(self, value):
        return self._set(graph=value)

    def setInputTensor(self, value):
        return self._set(inputTensor=value)

    def setOutputTensor(self, value):
        return self._set(outputTensor=value)

    def getGraph(self):
        return self.getOrDefault(self.graph)

    def getInputTensor(self):
        tensor_name = self.getOrDefault(self.inputTensor)
        return self.getGraph().get_tensor_by_name(tensor_name)

    def getOutputTensor(self):
        tensor_name = self.getOrDefault(self.outputTensor)
        return self.getGraph().get_tensor_by_name(tensor_name)

    def _transform(self, dataset):
        graph = self.getGraph()
        composed_graph = self._addReshapeLayers(graph,
                                                self._getImageDtype(dataset))
        final_graph = self._stripGraph(composed_graph)
        with final_graph.as_default():  # pylint: disable=not-context-manager
            image = dataset[self.getInputCol()]
            image_df_exploded = (dataset
                                 .withColumn("__sdl_image_height", image.height)
                                 .withColumn("__sdl_image_width", image.width)
                                 .withColumn("__sdl_image_nchannels", image.nChannels)
                                 .withColumn("__sdl_image_data", image.data)
                                )  # yapf: disable

            final_output_name = self._getFinalOutputTensorName()
            output_tensor = final_graph.get_tensor_by_name(final_output_name)
            final_df = (
                tfs.map_rows([output_tensor], image_df_exploded,
                             feed_dict={
                                 "height": "__sdl_image_height",
                                 "width": "__sdl_image_width",
                                 "num_channels": "__sdl_image_nchannels",
                                 "image_buffer": "__sdl_image_data"})
                .drop("__sdl_image_height", "__sdl_image_width", "__sdl_image_nchannels",
                      "__sdl_image_data")
            )   # yapf: disable

            tfs_output_name = tfx.op_name(output_tensor, final_graph)
            original_output_name = self._getOriginalOutputTensorName()
            output_shape = final_graph.get_tensor_by_name(
                original_output_name).shape
            output_mode = self.getOrDefault(self.outputMode)
            # TODO: support non-1d tensors (return np.array).
            if output_mode == "image":
                return self._convertOutputToImage(final_df, tfs_output_name,
                                                  output_shape)
            else:
                assert output_mode == "vector", "Unknown output mode: %s" % output_mode
                return self._convertOutputToVector(final_df, tfs_output_name)

    def _getImageDtype(self, dataset):
        # This may not be the best way to get the type of image, but it is one way.
        # Assumes that the dtype for all images is the same in the given dataframe.
        pdf = dataset.select(self.getInputCol()).take(1)
        img = pdf[0][self.getInputCol()]
        img_type = imageIO.imageTypeByOrdinal(img.mode)
        return img_type.dtype

    # TODO: duplicate code, same functionality as sparkdl.graph.pieces.py::builSpImageConverter
    # TODO: It should be extracted as a util function and shared
    def _addReshapeLayers(self, tf_graph, dtype="uint8"):
        input_tensor_name = self.getInputTensor().name

        gdef = tf_graph.as_graph_def(add_shapes=True)
        g = tf.Graph()  # pylint: disable=invalid-name
        with g.as_default():  # pylint: disable=not-context-manager
            # Flat image data -> image dimensions
            height = tf.placeholder(tf.int32, [], name="height")
            width = tf.placeholder(tf.int32, [], name="width")
            num_channels = tf.placeholder(tf.int32, [], name="num_channels")
            image_buffer = tf.placeholder(tf.string, [], name="image_buffer")
            # Note: the shape argument is required for tensorframes as it uses a
            # slightly older version of tensorflow.
            shape_tensor = tf.stack([height, width, num_channels], axis=0)
            shape = tf.reshape(shape_tensor, shape=(3, ), name='shape')
            if dtype == "uint8":
                image_uint8 = tf.decode_raw(image_buffer,
                                            tf.uint8,
                                            name="decode_raw")
                image_float = tf.to_float(image_uint8)
            else:
                assert dtype == "float32", "Unsupported dtype for image: %s" % dtype
                image_float = tf.decode_raw(image_buffer,
                                            tf.float32,
                                            name="decode_raw")
            image_reshaped = tf.reshape(image_float, shape, name="reshaped")
            image_reshaped = imageIO.fixColorChannelOrdering(
                self.channelOrder, image_reshaped)
            image_reshaped_expanded = tf.expand_dims(image_reshaped,
                                                     0,
                                                     name="expanded")

            # Add on the original graph
            tf.import_graph_def(
                gdef,
                input_map={input_tensor_name: image_reshaped_expanded},
                return_elements=[self.getOutputTensor().name],
                name=USER_GRAPH_NAMESPACE)

            # Flatten the output for tensorframes
            output_node = g.get_tensor_by_name(
                self._getOriginalOutputTensorName())
            _ = tf.reshape(output_node[0],
                           shape=[-1],
                           name=self._getFinalOutputOpName())
        return g

    # Sometimes the tf graph contains a bunch of stuff that doesn't lead to the
    # output. TensorFrames does not like that, so we strip out the parts that
    # are not necessary for the computation at hand.
    def _stripGraph(self, tf_graph):
        gdef = tfx.strip_and_freeze_until([self._getFinalOutputOpName()],
                                          tf_graph)
        g = tf.Graph()  # pylint: disable=invalid-name
        with g.as_default():  # pylint: disable=not-context-manager
            tf.import_graph_def(gdef, name='')
        return g

    def _getOriginalOutputTensorName(self):
        return USER_GRAPH_NAMESPACE + '/' + self.getOutputTensor().name

    def _getFinalOutputTensorName(self):
        return NEW_OUTPUT_PREFIX + '_' + self.getOutputTensor().name

    def _getFinalOutputOpName(self):
        return tfx.op_name(self._getFinalOutputTensorName())

    def _convertOutputToImage(self, df, tfs_output_col, output_shape):
        assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
        height = int(output_shape[1])
        width = int(output_shape[2])

        def to_image(orig_image, numeric_data):
            # Assume the returned image has float pixels but same #channels as input
            mode = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
            data = bytearray(
                np.array(numeric_data).astype(np.float32).tobytes())
            nChannels = orig_image.nChannels
            return Row(origin="",
                       mode=mode.ord,
                       height=height,
                       width=width,
                       nChannels=nChannels,
                       data=data)

        to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
        resDf = df.withColumn(
            self.getOutputCol(),
            to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
        return resDf.drop(tfs_output_col)

    def _convertOutputToVector(self, df, tfs_output_col):
        """
        Converts the output python list to MLlib Vector.
        """
        return df\
            .withColumn(self.getOutputCol(), JVMAPI.listToMLlibVectorUDF(df[tfs_output_col]))\
            .drop(tfs_output_col)
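
A construction-only sketch; the TF1 graph g, the tensor names, and image_df are placeholders the caller must supply:

# g: a TF1-style tf.Graph whose input placeholder accepts a 4-d image batch
transformer = TFImageTransformer(channelOrder="RGB",
                                 inputCol="image",
                                 outputCol="features",
                                 graph=g,
                                 inputTensor="input_placeholder:0",  # assumed tensor name
                                 outputTensor="logits:0",            # assumed tensor name
                                 outputMode="vector")
features_df = transformer.transform(image_df)  # image_df holds image-schema rows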
Example #16
def __init__(self):
    super(HasFake, self).__init__()
    self.fake = Param(self, "fake", "fake param")
Example #17
class _ALSParams(_ALSModelParams, HasMaxIter, HasRegParam, HasCheckpointInterval, HasSeed):
    """
    Params for :py:class:`ALS`.

    .. versionadded:: 3.0.0
    """

    rank = Param(
        Params._dummy(), "rank", "rank of the factorization", typeConverter=TypeConverters.toInt
    )
    numUserBlocks = Param(
        Params._dummy(),
        "numUserBlocks",
        "number of user blocks",
        typeConverter=TypeConverters.toInt,
    )
    numItemBlocks = Param(
        Params._dummy(),
        "numItemBlocks",
        "number of item blocks",
        typeConverter=TypeConverters.toInt,
    )
    implicitPrefs = Param(
        Params._dummy(),
        "implicitPrefs",
        "whether to use implicit preference",
        typeConverter=TypeConverters.toBoolean,
    )
    alpha = Param(
        Params._dummy(),
        "alpha",
        "alpha for implicit preference",
        typeConverter=TypeConverters.toFloat,
    )

    ratingCol = Param(
        Params._dummy(),
        "ratingCol",
        "column name for ratings",
        typeConverter=TypeConverters.toString,
    )
    nonnegative = Param(
        Params._dummy(),
        "nonnegative",
        "whether to use nonnegative constraint for least squares",
        typeConverter=TypeConverters.toBoolean,
    )
    intermediateStorageLevel = Param(
        Params._dummy(),
        "intermediateStorageLevel",
        "StorageLevel for intermediate datasets. Cannot be 'NONE'.",
        typeConverter=TypeConverters.toString,
    )
    finalStorageLevel = Param(
        Params._dummy(),
        "finalStorageLevel",
        "StorageLevel for ALS model factors.",
        typeConverter=TypeConverters.toString,
    )

    def __init__(self, *args):
        super(_ALSParams, self).__init__(*args)
        self._setDefault(
            rank=10,
            maxIter=10,
            regParam=0.1,
            numUserBlocks=10,
            numItemBlocks=10,
            implicitPrefs=False,
            alpha=1.0,
            userCol="user",
            itemCol="item",
            ratingCol="rating",
            nonnegative=False,
            checkpointInterval=10,
            intermediateStorageLevel="MEMORY_AND_DISK",
            finalStorageLevel="MEMORY_AND_DISK",
            coldStartStrategy="nan",
        )

    @since("1.4.0")
    def getRank(self):
        """
        Gets the value of rank or its default value.
        """
        return self.getOrDefault(self.rank)

    @since("1.4.0")
    def getNumUserBlocks(self):
        """
        Gets the value of numUserBlocks or its default value.
        """
        return self.getOrDefault(self.numUserBlocks)

    @since("1.4.0")
    def getNumItemBlocks(self):
        """
        Gets the value of numItemBlocks or its default value.
        """
        return self.getOrDefault(self.numItemBlocks)

    @since("1.4.0")
    def getImplicitPrefs(self):
        """
        Gets the value of implicitPrefs or its default value.
        """
        return self.getOrDefault(self.implicitPrefs)

    @since("1.4.0")
    def getAlpha(self):
        """
        Gets the value of alpha or its default value.
        """
        return self.getOrDefault(self.alpha)

    @since("1.4.0")
    def getRatingCol(self):
        """
        Gets the value of ratingCol or its default value.
        """
        return self.getOrDefault(self.ratingCol)

    @since("1.4.0")
    def getNonnegative(self):
        """
        Gets the value of nonnegative or its default value.
        """
        return self.getOrDefault(self.nonnegative)

    @since("2.0.0")
    def getIntermediateStorageLevel(self):
        """
        Gets the value of intermediateStorageLevel or its default value.
        """
        return self.getOrDefault(self.intermediateStorageLevel)

    @since("2.0.0")
    def getFinalStorageLevel(self):
        """
        Gets the value of finalStorageLevel or its default value.
        """
        return self.getOrDefault(self.finalStorageLevel)
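
These params are surfaced through the ALS estimator itself; a short conventional usage sketch (ratings_df is assumed to have the default user/item/rating columns):

from pyspark.ml.recommendation import ALS

als = ALS(rank=10, maxIter=10, regParam=0.1,
          userCol="user", itemCol="item", ratingCol="rating",
          coldStartStrategy="drop")   # "drop" avoids NaN predictions during evaluation
model = als.fit(ratings_df)
top_items = model.recommendForAllUsers(5)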
Example #18
def __init__(self):
    super(HasThrowableProperty, self).__init__()
    self.p = Param(self, "none", "empty param")
Example #19
class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol,
                                        HasPredictionCol, HasWeightCol,
                                        JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for Multiclass Classification, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "truePositiveRateByLabel",
    ...     evaluator.metricLabel: 1.0})
    0.75...
    >>> mce_path = temp_path + "/mce"
    >>> evaluator.save(mce_path)
    >>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'
    >>> scoreAndLabelsAndWeight = [(0.0, 0.0, 1.0), (0.0, 1.0, 1.0), (0.0, 0.0, 1.0),
    ...     (1.0, 0.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0),
    ...     (2.0, 2.0, 1.0), (2.0, 0.0, 1.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["prediction", "label", "weight"])
    ...
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
    ...     weightCol="weight")
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
    0.66...

    .. versionadded:: 1.5.0
    """
    metricName = Param(
        Params._dummy(),
        "metricName", "metric name in evaluation "
        "(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|"
        "weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|"
        "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel)",
        typeConverter=TypeConverters.toString)
    metricLabel = Param(
        Params._dummy(),
        "metricLabel",
        "The class whose metric will be computed in truePositiveRateByLabel|"
        "falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel."
        " Must be >= 0. The default value is 0.",
        typeConverter=TypeConverters.toFloat)
    beta = Param(Params._dummy(),
                 "beta",
                 "The beta value used in weightedFMeasure|fMeasureByLabel."
                 " Must be > 0. The default value is 1.",
                 typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="f1",
                 weightCol=None,
                 metricLabel=0.0,
                 beta=1.0):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0)
        """
        super(MulticlassClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator",
            self.uid)
        self._setDefault(metricName="f1", metricLabel=0.0, beta=1.0)
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @since("1.5.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.5.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @since("3.0.0")
    def setMetricLabel(self, value):
        """
        Sets the value of :py:attr:`metricLabel`.
        """
        return self._set(metricLabel=value)

    @since("3.0.0")
    def getMetricLabel(self):
        """
        Gets the value of metricLabel or its default value.
        """
        return self.getOrDefault(self.metricLabel)

    @since("3.0.0")
    def setBeta(self, value):
        """
        Sets the value of :py:attr:`beta`.
        """
        return self._set(beta=value)

    @since("3.0.0")
    def getBeta(self):
        """
        Gets the value of beta or its default value.
        """
        return self.getOrDefault(self.beta)

    @keyword_only
    @since("1.5.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="f1",
                  weightCol=None,
                  metricLabel=0.0,
                  beta=1.0):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0)
        Sets params for multiclass classification evaluator.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
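
The beta parameter enters through the standard F-beta measure (stated for reference; standard definition, not quoted from Spark):

    F_\beta = (1 + \beta^2)\,\frac{\text{precision} \cdot \text{recall}}{\beta^2\,\text{precision} + \text{recall}}

With the default beta = 1 this reduces to the usual F1 score.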
Example #20
class XGBoostSageMakerEstimator(SageMakerEstimatorBase):
    """
    A :class:`~sagemaker_pyspark.SageMakerEstimator` that runs an XGBoost training job in
    Amazon SageMaker and returns a :class:`~sagemaker_pyspark.SageMakerModel` that can be used to
    transform a DataFrame using the hosted XGBoost model.  XGBoost is an open-source distributed
    gradient boosting library that Amazon SageMaker has adapted to run on Amazon SageMaker.

    XGBoost trains and infers on LibSVM-formatted data. XGBoostSageMakerEstimator uses Spark's
    LibSVMFileFormat to write the training DataFrame to S3, and serializes Rows to LibSVM for
    inference, selecting the column named "features" by default, expected to contain a Vector of
    Doubles.

    Inferences made against an Endpoint hosting an XGBoost model contain a "prediction" field
    appended to the input DataFrame as a column of Doubles, containing the prediction corresponding
    to the given Vector of features.

    See `XGBoost github <https://github.com/dmlc/xgboost>`__ for more on XGBoost

    Args:
        sageMakerRole (IAMRole): The SageMaker TrainingJob and Hosting IAM Role. Used by
            SageMaker to access S3 and ECR Resources. SageMaker hosted Endpoint instances
            launched by this Estimator run with this role.
        trainingInstanceType (str): The SageMaker TrainingJob Instance Type to use.
        trainingInstanceCount (int): The number of instances of instanceType to run a
            SageMaker Training Job with.
        endpointInstanceType (str): The SageMaker Endpoint Config instance type.
        endpointInitialInstanceCount (int): The SageMaker Endpoint Config minimum number of
            instances that can be used to host modelImage.
        requestRowSerializer (RequestRowSerializer): Serializes Spark DataFrame Rows for
            transformation by Models built from this Estimator.
        responseRowDeserializer (ResponseRowDeserializer): Deserializes an Endpoint response into a
            series of Rows.
        trainingInputS3DataPath (S3Resource): An S3 location to upload SageMaker Training Job input
            data to.
        trainingOutputS3DataPath (S3Resource): An S3 location for SageMaker to store Training Job
            output data to.
        trainingInstanceVolumeSizeInGB (int): The EBS volume size in gigabytes of each instance.
        trainingProjectedColumns (List): The columns to project from the Dataset being fit before
            training. If an Optional.empty is passed then no specific projection will occur and
            all columns will be serialized.
        trainingChannelName (str): The SageMaker Channel name to input serialized Dataset fit
            input to.
        trainingContentType (str): The MIME type of the training data.
        trainingS3DataDistribution (str): The SageMaker Training Job S3 data distribution scheme.
        trainingSparkDataFormat (str): The Spark Data Format name used to serialize the Dataset
            being fit for input to SageMaker.
        trainingSparkDataFormatOptions (dict): The Spark Data Format Options used during
            serialization of the Dataset being fit.
        trainingInputMode (str): The SageMaker Training Job Channel input mode.
        trainingCompressionCodec (str): The type of compression to use when serializing the
            Dataset being fit for input to SageMaker.
        trainingMaxRuntimeInSeconds (int): A SageMaker Training Job Termination Condition
            MaxRuntimeInHours.
        trainingKmsKeyId (str): A KMS key ID for the Output Data Source.
        modelEnvironmentVariables (dict): The environment variables that SageMaker will set on the
            model container during execution.
        endpointCreationPolicy (EndpointCreationPolicy): Defines how a SageMaker Endpoint
            referenced by a SageMakerModel is created.
        sagemakerClient (AmazonSageMaker): Amazon SageMaker client. Used to send CreateTrainingJob,
            CreateModel, and CreateEndpoint requests.
        region (str): The region in which to run the algorithm. If not specified, gets the region
            from the DefaultAwsRegionProviderChain.
        s3Client (AmazonS3): Used to create a bucket for staging SageMaker Training Job
            input and/or output if either are set to S3AutoCreatePath.
        stsClient (AmazonSTS): Used to resolve the account number when creating staging
            input / output buckets.
        modelPrependInputRowsToTransformationRows (bool): Whether the transformation result on
            Models built by this Estimator should also include the input Rows. If true,
            each output Row is formed by a concatenation of the input Row with the corresponding
            Row produced by SageMaker Endpoint invocation, produced by responseRowDeserializer.
            If false, each output Row is just taken from responseRowDeserializer.
        deleteStagingDataAfterTraining (bool): Whether to remove the training data on s3 after
            training is complete or failed.
        namePolicyFactory (NamePolicyFactory): The NamePolicyFactory to use when naming SageMaker
            entities created during fit.
        uid (str): The unique identifier of this Estimator. Used to represent this stage in Spark
            ML pipelines.

       """
    _wrapped_class = \
        "com.amazonaws.services.sagemaker.sparksdk.algorithms.XGBoostSageMakerEstimator"

    booster = Param(
        Params._dummy(), "booster",
        "Which booster to use. Can be 'gbtree', 'gblinear' or 'dart'. "
        "gbtree and dart use tree based model while gblinear uses linear function.",
        typeConverter=TypeConverters.toString)

    silent = Param(
        Params._dummy(), "silent",
        "Whether in silent mode."
        "0 means print running messages, 1 means silent mode.",
        typeConverter=TypeConverters.toInt)

    nthread = Param(
        Params._dummy(), "nthread",
        "Number of parallel threads used to run xgboot. Must be >= 1.",
        typeConverter=TypeConverters.toInt)

    eta = Param(
        Params._dummy(), "eta",
        "Step size shrinkage used in update to prevent overfitting. After each boosting step, "
        "we can directly get the weights of new features, and eta shrinks the feature weights "
        "to make the boosting process more conservative. Must be in [0, 1].",
        typeConverter=TypeConverters.toFloat)

    gamma = Param(
        Params._dummy(), "gamma",
        "Minimum loss reduction required to make an additional partition on a leaf node"
        " of the tree. The larger the value, the more conservative the algorithm will be. "
        "Must be >= 0.",
        typeConverter=TypeConverters.toFloat)

    max_depth = Param(
        Params._dummy(), "max_depth",
        "Maximum depth of a tree. Increasing this value makes the model more complex "
        "and more likely to overfit. Must be >= 0.",
        typeConverter=TypeConverters.toFloat)

    min_child_weight = Param(
        Params._dummy(), "min_child_weight",
        "Minimum sum of instance weight (hessian) needed in a child. If the tree partition step "
        "results in a leaf node with the sum of instance weight less than min_child_weight, then "
        "the building process will give up further partitioning. In linear regression mode, "
        "this simply corresponds to minimum number of instances needed to be in each node. "
        "The larger the value, the more conservative the algorithm will be. Must be >= 0.",
        typeConverter=TypeConverters.toFloat)

    max_delta_step = Param(
        Params._dummy(), "max_delta_step",
        "Maximum delta step we allow each tree's weight estimation to be. "
        "If the value is set to 0, it means there is no constraint. If it is set to a positive "
        "value, it can help make the update step more conservative. Usually this parameter is "
        "not needed, but it might help in logistic regression when the classes are extremely"
        " imbalanced. Setting it to value of 1-10 might help control the update. Must be >= 0.",
        typeConverter=TypeConverters.toFloat)

    subsample = Param(
        Params._dummy(), "subsample",
        "Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost will "
        "randomly collect half of the data instances to grow trees, which helps "
        "prevent overfitting. Must be in (0, 1].",
        typeConverter=TypeConverters.toFloat)

    colsample_bytree = Param(
        Params._dummy(), "colsample_bytree",
        "Subsample ratio of columns when constructing each tree. Must be in (0, 1].",
        typeConverter=TypeConverters.toFloat)

    colsample_bylevel = Param(
        Params._dummy(), "colsample_bylevel",
        "Subsample ratio of columns for each split, in each level. Must be in (0, 1].",
        typeConverter=TypeConverters.toFloat)

    _lambda = Param(
        Params._dummy(), "lambda",
        "L2 regularization term on weights. Increasing this value"
        " will make the model more conservative.",
        typeConverter=TypeConverters.toFloat)

    alpha = Param(
        Params._dummy(), "alpha",
        "L1 regularization term on weights. Increasing this value "
        "will make the model more conservative.",
        typeConverter=TypeConverters.toFloat)

    tree_method = Param(
        Params._dummy(), "tree_method",
        "The tree construction algorithm used in XGBoost. Can be "
        "'auto', 'exact', 'approx' or 'hist'",
        typeConverter=TypeConverters.toString)

    sketch_eps = Param(
        Params._dummy(), "sketch_eps",
        "Used only for approximate greedy algorithm. This translates into O(1 / sketch_eps) "
        "number of bins. Compared to directly selecting the number of bins, this comes with a "
        "theoretical guarantee of sketch accuracy. "
        "Must be in (0, 1).",
        typeConverter=TypeConverters.toFloat)

    scale_pos_weight = Param(
        Params._dummy(), "scale_pos_weight",
        "Controls the balance of positive and negative weights. It's useful for unbalanced "
        "classes. A typical value to consider: sum(negative cases) / sum(positive cases).",
        typeConverter=TypeConverters.toFloat)

    updater = Param(
        Params._dummy(), "updater",
        "A comma separated string defining the sequence of tree updaters to run, "
        "providing a modular way to construct and to modify the trees. "
        "This is an advanced parameter that is usually set automatically, "
        "depending on some other parameters. Can be "
        "'grow_colmaker', 'distcol', 'grow_histmaker', 'grow_local_histmaker',"
        "'grow_skmaker', 'sync', 'refresh', 'prune'.",
        typeConverter=TypeConverters.toString)

    refresh_leaf = Param(
        Params._dummy(), "refresh_leaf",
        "This is a parameter of the 'refresh' updater plugin. When set to true, tree leaves and "
        "tree node stats are updated. When set to false, only tree node stats are updated.",
        typeConverter=TypeConverters.toInt)

    process_type = Param(
        Params._dummy(), "process_type",
        "The type of boosting process to run. Can be 'default', 'update'",
        typeConverter=TypeConverters.toString)

    grow_policy = Param(
        Params._dummy(), "grow_policy",
        "Controls the way that new nodes are added to the tree. Currently supported "
        "only if tree_method is set to 'hist'. Can be 'depthwise' or 'lossguide'.",
        typeConverter=TypeConverters.toString)

    max_leaves = Param(
        Params._dummy(), "max_leaves",
        "Maximum number of nodes to be added. Relevant only if grow_policy = lossguide.",
        typeConverter=TypeConverters.toInt)

    max_bin = Param(
        Params._dummy(), "max_bin",
        "Maximum number of discrete bins to bucket continuous features. "
        "Used only if tree_method = hist.",
        typeConverter=TypeConverters.toInt)

    sample_type = Param(
        Params._dummy(), "sample_type",
        "Type of sampling algorithm. Can be 'uniform' or 'weighted'. "
        "'uniform': dropped trees are selected uniformly. "
        "'weighted': dropped trees are selected in proportion to weight.",
        typeConverter=TypeConverters.toString)

    normalize_type = Param(
        Params._dummy(), "normalize_type",
        "Type of normalization algorithm. Can be 'tree' or 'forest'. "
        "'tree': new trees have the same weight as each of the dropped trees. "
        "'forest': new trees have the same weight as the sum of the dropped trees (forest).",
        typeConverter=TypeConverters.toString)

    rate_drop = Param(
        Params._dummy(), "rate_drop",
        "Dropout rate (a fraction of previous trees to drop during the dropout). "
        "Must be in [0.0, 1.0].",
        typeConverter=TypeConverters.toFloat)

    one_drop = Param(
        Params._dummy(), "one_drop",
        "When this flag is enabled, at least one tree is always dropped during the dropout.",
        typeConverter=TypeConverters.toInt)

    skip_drop = Param(
        Params._dummy(), "skip_drop",
        "Probability of skipping the dropout procedure during a boosting iteration. "
        "Must be in [0.0, 1.0].",
        typeConverter=TypeConverters.toFloat)

    lambda_bias = Param(
        Params._dummy(), "lambda_bias",
        "L2 regularization term on bias. Must be in [0, 1].",
        typeConverter=TypeConverters.toFloat)

    tweedie_variance_power = Param(
        Params._dummy(), "tweedie_variance_power",
        "Parameter that controls the variance of the Tweedie distribution. Must be in (1.0, 2.0).",
        typeConverter=TypeConverters.toFloat)

    objective = Param(
        Params._dummy(), "objective",
        "Specifies the learning objective. "
        "\"reg:linear\" --linear regression "
        "\"reg:logistic\" --logistic regression "
        "\"binary:logistic\" --logistic regression for binary classification, "
        "output is probability "
        "\"binary:logitraw\" --logistic regression for binary classification, output is"
        " score before logistic transformation "
        "\"count:poisson\" --poisson regression for count data, output mean of poisson"
        " distribution; max_delta_step is set to 0.7 by default in poisson regression "
        "(used to safeguard optimization) "
        "\"multi:softmax\" --multiclass classification using the softmax objective. "
        "You also need to set num_class (number of classes) "
        "\"multi:softprob\" --same as softmax, but output a vector of ndata * nclass, "
        "which can be further reshaped to an ndata x nclass matrix. "
        "The result contains predicted probability of each data point belonging to each class. "
        "\"rank:pairwise\" --set XGBoost to do ranking task by minimizing the pairwise loss "
        "\"reg:gamma\" --gamma regression with log-link. Output is a mean of gamma distribution. "
        "It might be useful, e.g., for modeling insurance claims severity, or for any outcome "
        "that might be gamma-distributed. "
        "\"reg:tweedie\" --Tweedie regression with log-link. It might be useful, e.g., for "
        "modeling total loss in insurance, or for any outcome that might be"
        " Tweedie-distributed.",
        typeConverter=TypeConverters.toString)

    num_class = Param(
        Params._dummy(), "num_class",
        "Number of classes. >= 1",
        typeConverter=TypeConverters.toInt)

    base_score = Param(
        Params._dummy(), "base_score",
        "the initial prediction score of all instances, global bias. Value range: [0.0, 1.0]",
        typeConverter=TypeConverters.toFloat)

    eval_metric = Param(
        Params._dummy(), "eval_metric",
        "Evaluation metrics for validation data. A default metric will be assigned according to"
        " objective (rmse for regression, error for classification, and mean average "
        "precision for ranking). Values: 'rmse', 'mae', 'logloss', 'error', 'error@t', 'merror', "
        "'mlogloss', 'auc', 'ndcg', 'map', 'ndcg@n', 'ndcg-', 'ndcg@n-', 'map-', 'map@n-'.",
        typeConverter=TypeConverters.toString)

    seed = Param(
        Params._dummy(), "seed",
        "Random number seed",
        typeConverter=TypeConverters.toFloat)

    num_round = Param(
        Params._dummy(), "num_round",
        "The number of rounds to run the training. Must be >= 1",
        typeConverter=TypeConverters.toInt)

    def __init__(self,
                 trainingInstanceType,
                 trainingInstanceCount,
                 endpointInstanceType,
                 endpointInitialInstanceCount,
                 sagemakerRole=IAMRoleFromConfig(),
                 requestRowSerializer=LibSVMRequestRowSerializer(),
                 responseRowDeserializer=XGBoostCSVRowDeserializer(),
                 trainingInputS3DataPath=S3AutoCreatePath(),
                 trainingOutputS3DataPath=S3AutoCreatePath(),
                 trainingInstanceVolumeSizeInGB=1024,
                 trainingProjectedColumns=None,
                 trainingChannelName="train",
                 trainingContentType=None,
                 trainingS3DataDistribution="ShardedByS3Key",
                 trainingSparkDataFormat="libsvm",
                 trainingSparkDataFormatOptions=None,
                 trainingInputMode="File",
                 trainingCompressionCodec=None,
                 trainingMaxRuntimeInSeconds=24*60*60,
                 trainingKmsKeyId=None,
                 modelEnvironmentVariables=None,
                 endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_CONSTRUCT,
                 sagemakerClient=SageMakerClients.create_sagemaker_client(),
                 region=None,
                 s3Client=SageMakerClients.create_s3_default_client(),
                 stsClient=SageMakerClients.create_sts_default_client(),
                 modelPrependInputRowsToTransformationRows=True,
                 deleteStagingDataAfterTraining=True,
                 namePolicyFactory=RandomNamePolicyFactory(),
                 uid=None):

        if trainingSparkDataFormatOptions is None:
            trainingSparkDataFormatOptions = {}

        if modelEnvironmentVariables is None:
            modelEnvironmentVariables = {}

        if uid is None:
            uid = Identifiable._randomUID()

        kwargs = locals()
        del kwargs['self']
        super(XGBoostSageMakerEstimator, self).__init__(**kwargs)

    def _get_java_obj(self, **kwargs):
        return self._new_java_obj(
            XGBoostSageMakerEstimator._wrapped_class,
            kwargs['sagemakerRole'],
            kwargs['trainingInstanceType'],
            kwargs['trainingInstanceCount'],
            kwargs['endpointInstanceType'],
            kwargs['endpointInitialInstanceCount'],
            kwargs['requestRowSerializer'],
            kwargs['responseRowDeserializer'],
            kwargs['trainingInputS3DataPath'],
            kwargs['trainingOutputS3DataPath'],
            kwargs['trainingInstanceVolumeSizeInGB'],
            Option(kwargs['trainingProjectedColumns']),
            kwargs['trainingChannelName'],
            Option(kwargs['trainingContentType']),
            kwargs['trainingS3DataDistribution'],
            kwargs['trainingSparkDataFormat'],
            kwargs['trainingSparkDataFormatOptions'],
            kwargs['trainingInputMode'],
            Option(kwargs['trainingCompressionCodec']),
            kwargs['trainingMaxRuntimeInSeconds'],
            Option(kwargs['trainingKmsKeyId']),
            kwargs['modelEnvironmentVariables'],
            kwargs['endpointCreationPolicy'],
            kwargs['sagemakerClient'],
            Option(kwargs['region']),
            kwargs['s3Client'],
            kwargs['stsClient'],
            kwargs['modelPrependInputRowsToTransformationRows'],
            kwargs['deleteStagingDataAfterTraining'],
            kwargs['namePolicyFactory'],
            kwargs['uid']
        )

    def getBooster(self):
        return self.getOrDefault(self.booster)

    def setBooster(self, value):
        if value not in ('gbtree', 'gblinear', 'dart'):
            raise ValueError("booster must be 'gbtree', 'gblinear' or 'dart'. got: %s" % value)
        self._set(booster=value)

    def getSilent(self):
        return self.getOrDefault(self.silent)

    def setSilent(self, value):
        if value not in (0, 1):
            raise ValueError("silent must be either 0 or 1. got: %s" % value)
        self._set(silent=value)

    def getNThread(self):
        return self.getOrDefault(self.nthread)

    def setNThread(self, value):
        if value < 1:
            raise ValueError("nthread must be >= 1 got: %s" % value)
        self._set(nthread=value)

    def getEta(self):
        return self.getOrDefault(self.eta)

    def setEta(self, value):
        if value < 0 or value > 1:
            raise ValueError("eta must be within range [0.0, 1.0] got: %s" % value)
        self._set(eta=value)

    def getGamma(self):
        return self.getOrDefault(self.gamma)

    def setGamma(self, value):
        if value < 0:
            raise ValueError("gamma must be >= 0  got: %s" % value)
        self._set(gamma=value)

    def getMaxDepth(self):
        return self.getOrDefault(self.max_depth)

    def setMaxDepth(self, value):
        if value < 0:
            raise ValueError("gamma must be >=0 got: %s" % value)
        self._set(max_depth=value)

    def getMinChildWeight(self):
        return self.getOrDefault(self.min_child_weight)

    def setMinChildWeight(self, value):
        if value < 0:
            raise ValueError("min_child_weight must be >= 0 got: %s" % value)
        self._set(min_child_weight=value)

    def getMaxDeltaStep(self):
        return self.getOrDefault(self.max_delta_step)

    def setMaxDeltaStep(self, value):
        if value < 0:
            raise ValueError("max_delta_weight must be >=0 got: %s" % value)
        self._set(max_delta_step=value)

    def getSubsample(self):
        return self.getOrDefault(self.subsample)

    def setSubsample(self, value):
        if value <= 0 or value > 1:
            raise ValueError("subsample must be in range (0, 1] got: %s" % value)
        self._set(subsample=value)

    def getColSampleByTree(self):
        return self.getOrDefault(self.colsample_bytree)

    def setColSampleByTree(self, value):
        if value <= 0 or value > 1:
            raise ValueError("colsample_bytree must be in range (0, 1] got: %s" % value)
        self._set(colsample_bytree=value)

    def getColSampleByLevel(self):
        return self.getOrDefault(self.colsample_bylevel)

    def setColSampleByLevel(self, value):
        if value <= 0 or value > 1:
            raise ValueError("colsample_by_level must be in range (0, 1] got: %s" % value)
        self._set(colsample_bylevel=value)

    def getLambda(self):
        return self.getOrDefault(self._lambda)

    def setLambda(self, value):
        self._set(_lambda=value)

    def getAlpha(self):
        return self.getOrDefault(self.alpha)

    def setAlpha(self, value):
        self._set(alpha=value)

    def getTreeMethod(self):
        return self.getOrDefault(self.tree_method)

    def setTreeMethod(self, value):
        if value not in ("auto", "exact", "approx", "hist"):
            raise ValueError("tree_method must be one of: 'auto', 'exact', 'approx', 'hist', "
                             "got: %s" % value)
        self._set(tree_method=value)

    def getSketchEps(self):
        return self.getOrDefault(self.sketch_eps)

    def setSketchEps(self, value):
        if value <= 0 or value >= 1:
            raise ValueError("sketch_eps must be in range (0, 1) got: %s" % value)
        self._set(sketch_eps=value)

    def getScalePosWeight(self):
        return self.getOrDefault(self.scale_pos_weight)

    def setScalePosWeight(self, value):
        self._set(scale_pos_weight=value)

    def getUpdater(self):
        return self.getOrDefault(self.updater)

    def setUpdater(self, value):
        valid_tokens = ("grow_colmaker", "distcol", "grow_histmaker", "grow_local_histmaker",
                        "grow_skmaker", "sync", "refresh", "prune")
        tokens = value.split(",")
        for token in tokens:
            if token.strip() not in valid_tokens:
                raise ValueError("values allowed in updater are: %s, found: %s " %
                                 (','.join(valid_tokens), token))
        self._set(updater=value)

    def getRefreshLeaf(self):
        return self.getOrDefault(self.refresh_leaf)

    def setRefreshLeaf(self, value):
        if value not in (0, 1):
            raise ValueError("refresh_leaf must be either 0 or 1, got: %s" % value)
        self._set(refresh_leaf=value)

    def getProcessType(self):
        return self.getOrDefault(self.process_type)

    def setProcessType(self, value):
        if value not in ("default", "update"):
            raise ValueError("process_type must be 'default' or 'update', got: %s" % value)
        self._set(process_type=value)

    def getGrowPolicy(self):
        return self.getOrDefault(self.grow_policy)

    def setGrowPolicy(self, value):
        if value not in ("depthwise", "lossguide"):
            raise ValueError("grow_policy must be 'depthwise' or 'lossguide', got: %s" % value)
        self._set(grow_policy=value)

    def getMaxLeaves(self):
        return self.getOrDefault(self.max_leaves)

    def setMaxLeaves(self, value):
        if value < 0:
            raise ValueError("max_leaves must be >=0, got: %s" % value)
        self._set(max_leaves=value)

    def getMaxBin(self):
        return self.getOrDefault(self.max_bin)

    def setMaxBin(self, value):
        if value < 1:
            raise ValueError("max_bin must be >=1, got: %s" % value)
        self._set(max_bin=value)

    def getSampleType(self):
        return self.getOrDefault(self.sample_type)

    def setSampleType(self, value):
        if value not in ("uniform", "weighted"):
            raise ValueError("sample_type must be 'uniform' or 'weighted', got: %s" % value)
        self._set(sample_type=value)

    def getNormalizeType(self):
        return self.getOrDefault(self.normalize_type)

    def setNormalizeType(self, value):
        if value not in ("tree", "forest"):
            raise ValueError("normalize_type must be 'tree' or 'forest', got: %s" % value)
        self._set(normalize_type=value)

    def getRateDrop(self):
        return self.getOrDefault(self.rate_drop)

    def setRateDrop(self, value):
        if value < 0 or value > 1:
            raise ValueError("rate_drop must be in range [0.0, 1.0], got: %s" % value)
        self._set(rate_drop=value)

    def getOneDrop(self):
        return self.getOrDefault(self.one_drop)

    def setOneDrop(self, value):
        if value not in (0, 1):
            raise ValueError("one_drop must be 0 or 1, got: %s" % value)
        self._set(one_drop=value)

    def getSkipDrop(self):
        return self.getOrDefault(self.skip_drop)

    def setSkipDrop(self, value):
        if value < 0 or value > 1:
            raise ValueError("skip_drop must be in range [0.0, 1.0], got: %s" % value)
        self._set(skip_drop=value)

    def getLambdaBias(self):
        return self.getOrDefault(self.lambda_bias)

    def setLambdaBias(self, value):
        if value < 0 or value > 1:
            raise ValueError("lambda_bias must in range [0.0, 1.0], got: %s" % value)
        self._set(lambda_bias=value)

    def getTweedieVariancePower(self):
        return self.getOrDefault(self.tweedie_variance_power)

    def setTweedieVariancePower(self, value):
        if value <= 1 or value >= 2:
            raise ValueError("tweedie_variance_power must be in range (1.0, 2.0), got: %s" % value)
        self._set(tweedie_variance_power=value)

    def getObjective(self):
        return self.getOrDefault(self.objective)

    def setObjective(self, value):
        allowed_values = ("reg:linear", "reg:logistic", "binary:logistic", "binary:logistraw",
                          "count:poisson", "multi:softmax", "multi:softprob", "rank:pairwise",
                          "reg:gamma", "reg:tweedie")

        if value not in allowed_values:
            raise ValueError("objective must be one of (%s), got: %s" %
                             (','.join(allowed_values), value))
        self._set(objective=value)

    def getNumClasses(self):
        return self.getOrDefault(self.num_class)

    def setNumClasses(self, value):
        if value < 1:
            raise ValueError("num_class must be >=1, got: %s" % value)
        self._set(num_class=value)

    def getBaseScore(self):
        return self.getOrDefault(self.base_score)

    def setBaseScore(self, value):
        self._set(base_score=value)

    def getEvalMetric(self):
        return self.getOrDefault(self.eval_metric)

    def setEvalMetric(self, value):
        allowed_values = ("rmse", "mae", "logloss", "error", "error@t", "merror",
                          "mlogloss", "auc", "ndcg", "map", "ndcg@n", "ndcg-", "ndcg@n-",
                          "map-", "map@n-")

        if value not in allowed_values:
            raise ValueError("eval_metric must be one of (%s), got: %s" %
                             (','.join(allowed_values), value))
        self._set(eval_metric=value)

    def getSeed(self):
        return self.getOrDefault(self.seed)

    def setSeed(self, value):
        self._set(seed=value)

    def getNumRound(self):
        return self.getOrDefault(self.num_round)

    def setNumRound(self, value):
        if value < 1:
            raise ValueError("num_round must be  >= 1, got: %s" % value)
        self._set(num_round=value)

    @classmethod
    def _from_java(cls, javaObject):
        return XGBoostSageMakerEstimator(sagemakerRole=None, javaObject=javaObject)
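
A minimal usage sketch for the estimator defined above follows. The instance types, counts, and the training_df DataFrame are placeholders, and it assumes valid AWS credentials plus an IAM role resolvable by IAMRoleFromConfig(); only constructor arguments and setters that appear in the listing are used.

# Minimal usage sketch (hypothetical values). Assumes valid AWS credentials, an IAM
# role resolvable by IAMRoleFromConfig(), and a LibSVM-formatted Spark DataFrame
# named training_df -- none of these are defined in the listing above.
estimator = XGBoostSageMakerEstimator(
    trainingInstanceType="ml.m4.xlarge",        # placeholder instance type
    trainingInstanceCount=1,
    endpointInstanceType="ml.m4.xlarge",        # placeholder instance type
    endpointInitialInstanceCount=1)

estimator.setObjective("binary:logistic")       # validated by setObjective above
estimator.setNumRound(50)                       # must be >= 1
estimator.setMaxDepth(5)                        # must be >= 0
estimator.setEta(0.2)                           # must be in [0.0, 1.0]

model = estimator.fit(training_df)              # launches a SageMaker Training Job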
Exemplo n.º 21
0
class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol,
                          JavaMLReadable, JavaMLWritable):
    """
    .. note:: Experimental

    Evaluator for Clustering results, which expects two input
    columns: prediction and features. The metric computes the Silhouette
    measure using the squared Euclidean distance.

    The Silhouette is a measure for the validation of the consistency
    within clusters. It ranges between 1 and -1, where a value close to
    1 means that the points in a cluster are close to the other points
    in the same cluster and far from the points of the other clusters.

    >>> from pyspark.ml.linalg import Vectors
    >>> featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
    ...     [([0.0, 0.5], 0.0), ([0.5, 0.0], 0.0), ([10.0, 11.0], 1.0),
    ...     ([10.5, 11.5], 1.0), ([1.0, 1.0], 0.0), ([8.0, 6.0], 1.0)])
    >>> dataset = spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
    ...
    >>> evaluator = ClusteringEvaluator(predictionCol="prediction")
    >>> evaluator.evaluate(dataset)
    0.9079...
    >>> ce_path = temp_path + "/ce"
    >>> evaluator.save(ce_path)
    >>> evaluator2 = ClusteringEvaluator.load(ce_path)
    >>> str(evaluator2.getPredictionCol())
    'prediction'

    .. versionadded:: 2.3.0
    """
    metricName = Param(Params._dummy(),
                       "metricName",
                       "metric name in evaluation (silhouette)",
                       typeConverter=TypeConverters.toString)
    distanceMeasure = Param(
        Params._dummy(),
        "distanceMeasure",
        "The distance measure. " +
        "Supported options: 'squaredEuclidean' and 'cosine'.",
        typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 featuresCol="features",
                 metricName="silhouette",
                 distanceMeasure="squaredEuclidean"):
        """
        __init__(self, predictionCol="prediction", featuresCol="features", \
                 metricName="silhouette", distanceMeasure="squaredEuclidean")
        """
        super(ClusteringEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.ClusteringEvaluator", self.uid)
        self._setDefault(metricName="silhouette",
                         distanceMeasure="squaredEuclidean")
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @since("2.3.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("2.3.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("2.3.0")
    def setParams(self,
                  predictionCol="prediction",
                  featuresCol="features",
                  metricName="silhouette",
                  distanceMeasure="squaredEuclidean"):
        """
        setParams(self, predictionCol="prediction", featuresCol="features", \
                  metricName="silhouette", distanceMeasure="squaredEuclidean")
        Sets params for clustering evaluator.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    @since("2.4.0")
    def setDistanceMeasure(self, value):
        """
        Sets the value of :py:attr:`distanceMeasure`.
        """
        return self._set(distanceMeasure=value)

    @since("2.4.0")
    def getDistanceMeasure(self):
        """
        Gets the value of `distanceMeasure`
        """
        return self.getOrDefault(self.distanceMeasure)
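
As a hypothetical continuation of the doctest above, the same dataset can be re-evaluated with cosine distance by mutating the evaluator through setDistanceMeasure; the dataset and the active SparkSession are assumed to come from the doctest.

# Hypothetical continuation of the doctest above: re-evaluate the same dataset
# with cosine distance instead of the default squared Euclidean distance.
evaluator = ClusteringEvaluator(predictionCol="prediction", featuresCol="features")
evaluator.setDistanceMeasure("cosine")    # default is "squaredEuclidean"
silhouette = evaluator.evaluate(dataset)  # Silhouette score, between -1 and 1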
Exemplo n.º 22
0
 def __init__(self):
     super(MockTransformer, self).__init__()
     self.fake = Param(self, "fake", "fake")
     self.dataset_index = None
     self.fake_param_value = None
Exemplo n.º 23
0
class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
    """
    Evaluator for Regression, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
    ...   (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = RegressionEvaluator(predictionCol="raw")
    >>> evaluator.evaluate(dataset)
    2.842...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
    0.993...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
    2.649...
    """
    # a placeholder to make it appear in the generated doc
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (mse|rmse|r2|mae)")

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="rmse"):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="rmse")
        """
        super(RegressionEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
        #: param for metric name in evaluation (mse|rmse|r2|mae)
        self.metricName = Param(self, "metricName",
                                "metric name in evaluation (mse|rmse|r2|mae)")
        self._setDefault(predictionCol="prediction",
                         labelCol="label",
                         metricName="rmse")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        self._paramMap[self.metricName] = value
        return self

    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="rmse"):
        """
        setParams(self, predictionCol="prediction", labelCol="label",
                  metricName="rmse")
        Sets params for regression evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)
Exemplo n.º 24
0
 def __init__(self):
     super(MockEstimator, self).__init__()
     self.fake = Param(self, "fake", "fake")
     self.dataset_index = None
     self.fake_param_value = None
     self.model = None
Exemplo n.º 25
0
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol,
                                    HasRawPredictionCol):
    """
    Evaluator for binary classification, which expects two input columns: rawPrediction and label.
    The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
    1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).

    >>> from pyspark.mllib.linalg import Vectors
    >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
    ...    [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw")
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.83...

    .. versionadded:: 1.4.0
    """

    metricName = Param(Params._dummy(),
                       "metricName",
                       "metric name in evaluation (areaUnderROC|areaUnderPR)",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 rawPredictionCol="rawPrediction",
                 labelCol="label",
                 metricName="areaUnderROC"):
        """
        __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
                 metricName="areaUnderROC")
        """
        super(BinaryClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator",
            self.uid)
        self._setDefault(rawPredictionCol="rawPrediction",
                         labelCol="label",
                         metricName="areaUnderROC")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        self._set(metricName=value)
        return self

    @since("1.4.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("1.4.0")
    def setParams(self,
                  rawPredictionCol="rawPrediction",
                  labelCol="label",
                  metricName="areaUnderROC"):
        """
        setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
                  metricName="areaUnderROC")
        Sets params for binary classification evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)
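
The doctest overrides the metric through a param map; a hypothetical alternative, reusing the dataset built in the doctest, is to mutate the evaluator itself with setMetricName.

# Hypothetical alternative to the param-map override shown in the doctest:
# switch metrics by mutating the evaluator, then evaluate again.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw", metricName="areaUnderPR")
area_under_pr = evaluator.evaluate(dataset)
evaluator.setMetricName("areaUnderROC")     # back to the default metric
area_under_roc = evaluator.evaluate(dataset)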
Exemplo n.º 26
0
class CrossValidator(Estimator, ValidatorParams):
    """
    .. note:: Experimental

    K-fold cross validation.

    >>> from pyspark.ml.classification import LogisticRegression
    >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
    >>> from pyspark.mllib.linalg import Vectors
    >>> dataset = sqlContext.createDataFrame(
    ...     [(Vectors.dense([0.0]), 0.0),
    ...      (Vectors.dense([0.4]), 1.0),
    ...      (Vectors.dense([0.5]), 0.0),
    ...      (Vectors.dense([0.6]), 1.0),
    ...      (Vectors.dense([1.0]), 1.0)] * 10,
    ...     ["features", "label"])
    >>> lr = LogisticRegression()
    >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    >>> evaluator = BinaryClassificationEvaluator()
    >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    >>> cvModel = cv.fit(dataset)
    >>> evaluator.evaluate(cvModel.transform(dataset))
    0.8333...

    .. versionadded:: 1.4.0
    """

    numFolds = Param(Params._dummy(),
                     "numFolds",
                     "number of folds for cross validation",
                     typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self,
                 estimator=None,
                 estimatorParamMaps=None,
                 evaluator=None,
                 numFolds=3,
                 seed=None):
        """
        __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\
                 seed=None)
        """
        super(CrossValidator, self).__init__()
        self._setDefault(numFolds=3)
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self,
                  estimator=None,
                  estimatorParamMaps=None,
                  evaluator=None,
                  numFolds=3,
                  seed=None):
        """
        setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\
                  seed=None):
        Sets params for cross validator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    @since("1.4.0")
    def setNumFolds(self, value):
        """
        Sets the value of :py:attr:`numFolds`.
        """
        return self._set(numFolds=value)

    @since("1.4.0")
    def getNumFolds(self):
        """
        Gets the value of numFolds or its default value.
        """
        return self.getOrDefault(self.numFolds)

    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels
        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] <
                                                       validateUB)
            validation = df.filter(condition)
            train = df.filter(~condition)
            for j in range(numModels):
                model = est.fit(train, epm[j])
                # TODO: duplicate evaluator to take extra params from input
                metric = eva.evaluate(model.transform(validation, epm[j]))
                metrics[j] += metric

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics))

    @since("1.4.0")
    def copy(self, extra=None):
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This creates a deep copy of
        the embedded paramMap, and copies the embedded and extra parameters over.

        :param extra: Extra parameters to copy to the new instance
        :return: Copy of this instance
        """
        if extra is None:
            extra = dict()
        newCV = Params.copy(self, extra)
        if self.isSet(self.estimator):
            newCV.setEstimator(self.getEstimator().copy(extra))
        # estimatorParamMaps remain the same
        if self.isSet(self.evaluator):
            newCV.setEvaluator(self.getEvaluator().copy(extra))
        return newCV
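
To make the fold assignment in _fit concrete, the standalone sketch below prints the half-open ranges of the appended random column that select each validation fold; it mirrors the validateLB/validateUB arithmetic above and needs nothing beyond plain Python.

# Standalone illustration of the fold boundaries computed in _fit above: each row's
# value in the random column (uniform on [0, 1)) falls into exactly one
# [validateLB, validateUB) range per iteration, so every row is validated once.
nFolds = 3
h = 1.0 / nFolds
for i in range(nFolds):
    validateLB = i * h
    validateUB = (i + 1) * h
    print("fold %d: validation rows have rand in [%.2f, %.2f)" % (i, validateLB, validateUB))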
Exemplo n.º 27
0
class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol,
                                        HasPredictionCol):
    """
    Evaluator for Multiclass Classification, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "precision"})
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "recall"})
    0.66...

    .. versionadded:: 1.5.0
    """
    metricName = Param(
        Params._dummy(),
        "metricName", "metric name in evaluation "
        "(f1|precision|recall|weightedPrecision|weightedRecall)",
        typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="f1"):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1")
        """
        super(MulticlassClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator",
            self.uid)
        self._setDefault(predictionCol="prediction",
                         labelCol="label",
                         metricName="f1")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @since("1.5.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        self._set(metricName=value)
        return self

    @since("1.5.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("1.5.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="f1"):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1")
        Sets params for multiclass classification evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)
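
As a hypothetical follow-up to the doctest, the weighted metrics listed in the metricName description can be queried in the same way, either through the constructor or a param-map override on the same dataset.

# Hypothetical follow-up to the doctest: query weighted metrics on the same dataset.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="weightedRecall")
weighted_recall = evaluator.evaluate(dataset)
weighted_precision = evaluator.evaluate(dataset, {evaluator.metricName: "weightedPrecision"})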
Exemplo n.º 28
0
class TrainValidationSplit(Estimator, ValidatorParams):
    """
    .. note:: Experimental

    Validation for hyper-parameter tuning. Randomly splits the input dataset into train and
    validation sets, and uses evaluation metric on the validation set to select the best model.
    Similar to :class:`CrossValidator`, but only splits the set once.

    >>> from pyspark.ml.classification import LogisticRegression
    >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
    >>> from pyspark.mllib.linalg import Vectors
    >>> dataset = sqlContext.createDataFrame(
    ...     [(Vectors.dense([0.0]), 0.0),
    ...      (Vectors.dense([0.4]), 1.0),
    ...      (Vectors.dense([0.5]), 0.0),
    ...      (Vectors.dense([0.6]), 1.0),
    ...      (Vectors.dense([1.0]), 1.0)] * 10,
    ...     ["features", "label"])
    >>> lr = LogisticRegression()
    >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    >>> evaluator = BinaryClassificationEvaluator()
    >>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    >>> tvsModel = tvs.fit(dataset)
    >>> evaluator.evaluate(tvsModel.transform(dataset))
    0.8333...

    .. versionadded:: 2.0.0
    """

    trainRatio = Param(Params._dummy(),
                       "trainRatio",
                       "Param for ratio between train and validation data. "
                       "Must be between 0 and 1.",
                       typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(self,
                 estimator=None,
                 estimatorParamMaps=None,
                 evaluator=None,
                 trainRatio=0.75,
                 seed=None):
        """
        __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\
                 seed=None)
        """
        super(TrainValidationSplit, self).__init__()
        self._setDefault(trainRatio=0.75)
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @since("2.0.0")
    @keyword_only
    def setParams(self,
                  estimator=None,
                  estimatorParamMaps=None,
                  evaluator=None,
                  trainRatio=0.75,
                  seed=None):
        """
        setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\
                  seed=None):
        Sets params for the train validation split.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    @since("2.0.0")
    def setTrainRatio(self, value):
        """
        Sets the value of :py:attr:`trainRatio`.
        """
        return self._set(trainRatio=value)

    @since("2.0.0")
    def getTrainRatio(self):
        """
        Gets the value of trainRatio or its default value.
        """
        return self.getOrDefault(self.trainRatio)

    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        tRatio = self.getOrDefault(self.trainRatio)
        seed = self.getOrDefault(self.seed)
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = np.zeros(numModels)
        condition = (df[randCol] >= tRatio)
        validation = df.filter(condition)
        train = df.filter(~condition)
        for j in range(numModels):
            model = est.fit(train, epm[j])
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric
        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(TrainValidationSplitModel(bestModel))

    @since("2.0.0")
    def copy(self, extra=None):
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This creates a deep copy of
        the embedded paramMap, and copies the embedded and extra parameters over.

        :param extra: Extra parameters to copy to the new instance
        :return: Copy of this instance
        """
        if extra is None:
            extra = dict()
        newTVS = Params.copy(self, extra)
        if self.isSet(self.estimator):
            newTVS.setEstimator(self.getEstimator().copy(extra))
        # estimatorParamMaps remain the same
        if self.isSet(self.evaluator):
            newTVS.setEvaluator(self.getEvaluator().copy(extra))
        return newTVS
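
The split in _fit holds out roughly (1 - trainRatio) of the rows exactly once; the standalone sketch below reproduces the filtering condition on a few hypothetical values of the appended random column.

# Standalone illustration of the single split performed in _fit above, using
# hypothetical values of the random column.
trainRatio = 0.75
rand_values = [0.10, 0.42, 0.76, 0.88, 0.55]
validation = [r for r in rand_values if r >= trainRatio]        # same condition as _fit
train = [r for r in rand_values if not (r >= trainRatio)]
print(train, validation)    # [0.1, 0.42, 0.55] [0.76, 0.88]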
Exemplo n.º 29
0
 def __init__(self):
     super(HasInducedError, self).__init__()
     self.inducedError = Param(
         self, "inducedError",
         "Uniformly-distributed error added to feature")
Exemplo n.º 30
0
class _ValidatorParams(HasSeed):
    """
    Common params for TrainValidationSplit and CrossValidator.
    """

    estimator = Param(Params._dummy(), "estimator",
                      "estimator to be cross-validated")
    estimatorParamMaps = Param(Params._dummy(), "estimatorParamMaps",
                               "estimator param maps")
    evaluator = Param(
        Params._dummy(), "evaluator",
        "evaluator used to select hyper-parameters that maximize the validator metric"
    )

    @since("2.0.0")
    def getEstimator(self):
        """
        Gets the value of estimator or its default value.
        """
        return self.getOrDefault(self.estimator)

    @since("2.0.0")
    def getEstimatorParamMaps(self):
        """
        Gets the value of estimatorParamMaps or its default value.
        """
        return self.getOrDefault(self.estimatorParamMaps)

    @since("2.0.0")
    def getEvaluator(self):
        """
        Gets the value of evaluator or its default value.
        """
        return self.getOrDefault(self.evaluator)

    @classmethod
    def _from_java_impl(cls, java_stage):
        """
        Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.
        """

        # Load information from java_stage to the instance.
        estimator = JavaParams._from_java(java_stage.getEstimator())
        evaluator = JavaParams._from_java(java_stage.getEvaluator())
        epms = [
            estimator._transfer_param_map_from_java(epm)
            for epm in java_stage.getEstimatorParamMaps()
        ]
        return estimator, epms, evaluator

    def _to_java_impl(self):
        """
        Return Java estimator, estimatorParamMaps, and evaluator from this Python instance.
        """

        gateway = SparkContext._gateway
        cls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap

        java_epms = gateway.new_array(cls, len(self.getEstimatorParamMaps()))
        for idx, epm in enumerate(self.getEstimatorParamMaps()):
            java_epms[idx] = self.getEstimator()._transfer_param_map_to_java(
                epm)

        java_estimator = self.getEstimator()._to_java()
        java_evaluator = self.getEvaluator()._to_java()
        return java_estimator, java_epms, java_evaluator