Example #1
	def putOption(self, pipelineStage, key, value):
		javaKey = _py2java(self.sc, key)
		javaValue = _py2java(self.sc, value)
		if pipelineStage is None:
			self.javaPmmlBuilder.putOption(javaKey, javaValue)
		else:
			javaPipelineStage = pipelineStage._to_java()
			self.javaPmmlBuilder.putOption(javaPipelineStage, javaKey, javaValue)
		return self
Example #2
def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper.  Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
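A minimal usage sketch for this helper, assuming an active SparkContext; `java_model` and the method name `summary` are hypothetical stand-ins for any Py4J-backed object and method it exposes:

    # Hedged sketch: invoke a named method on a wrapped Java object, letting the
    # helper convert arguments and the return value between Python and Java.
    from pyspark import SparkContext

    sc = SparkContext._active_spark_context          # assumes a running context
    result = _call_java(sc, java_model, "summary")   # java_model is hypothetical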
Example #3
    def test(dataset, featuresCol, labelCol):
        """
        Perform a Pearson's independence test using dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
        return _java2py(sc, javaTestObj.test(*args))
Example #4
    def _call_java(self, name: str, *args: Any) -> Any:
        m = getattr(self._java_obj, name)
        sc = SparkContext._active_spark_context
        assert sc is not None

        java_args = [_py2java(sc, arg) for arg in args]
        return _java2py(sc, m(*java_args))
Example #5
File: tuning.py  Project: Ditto0/Sparks
    def _to_java(self):
        """
        Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.
        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context
        # TODO: persist validation metrics as well
        _java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.tuning.TrainValidationSplitModel", self.uid,
            self.bestModel._to_java(), _py2java(sc, []))
        estimator, epms, evaluator = super(TrainValidationSplitModel,
                                           self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)

        if self.subModels is not None:
            java_sub_models = [
                sub_model._to_java() for sub_model in self.subModels
            ]
            _java_obj.setSubModels(java_sub_models)

        return _java_obj
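For context, `_to_java` is driven by ML persistence rather than called directly; a hedged round-trip sketch, assuming `tvs_model` is an already fitted TrainValidationSplitModel and the path is writable:

    from pyspark.ml.tuning import TrainValidationSplitModel

    # Saving delegates to the Java writer, which receives the object built by _to_java().
    tvs_model.write().overwrite().save("/tmp/tvs_model")
    restored = TrainValidationSplitModel.load("/tmp/tvs_model")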
Example #6
File: stat.py  Project: Brett-A/spark
    def test(dataset, featuresCol, labelCol):
        """
        Perform a Pearson's independence test using dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
        return _java2py(sc, javaTestObj.test(*args))
Example #7
def to_java_params(sc, model, pyParamMap):
    paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
    for param, value in pyParamMap.items():
        java_param = model._java_obj.getParam(param.name)
        java_value = _py2java(sc, value)
        paramMap.put([java_param.w(java_value)])
    return paramMap
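A usage sketch under assumptions: `sc` is an active SparkContext, `train_df` exists, `lr_model` is a fitted JavaModel (so it exposes `_java_obj`), and the keys of the Python param map are `Param` objects owned by that model:

    from pyspark.ml.classification import LogisticRegression

    lr_model = LogisticRegression(maxIter=5).fit(train_df)      # train_df is assumed
    py_param_map = {lr_model.threshold: 0.6}                    # Param -> Python value
    java_param_map = to_java_params(sc, lr_model, py_param_map)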
Example #8
    def _to_java(self):
        """
        Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context
        _java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.tuning.CrossValidatorModel", self.uid,
            self.bestModel._to_java(), _py2java(sc, self.avgMetrics))
        estimator, epms, evaluator = super(CrossValidatorModel,
                                           self)._to_java_impl()

        params = {
            "evaluator": evaluator,
            "estimator": estimator,
            "estimatorParamMaps": epms,
            "numFolds": self.getNumFolds(),
            "foldCol": self.getFoldCol(),
            "seed": self.getSeed(),
        }
        for param_name, param_val in params.items():
            java_param = _java_obj.getParam(param_name)
            pair = java_param.w(param_val)
            _java_obj.set(pair)

        if self.subModels is not None:
            java_sub_models = [[
                sub_model._to_java() for sub_model in fold_sub_models
            ] for fold_sub_models in self.subModels]
            _java_obj.setSubModels(java_sub_models)
        return _java_obj
Example #9
def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper.  Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
Example #10
    def meta_estimator_transfer_param_maps_to_java(pyEstimator, pyParamMaps):
        pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
        stagePairs = list(
            map(lambda stage: (stage, stage._to_java()), pyStages))
        sc = SparkContext._active_spark_context

        paramMapCls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap
        javaParamMaps = SparkContext._gateway.new_array(
            paramMapCls, len(pyParamMaps))

        for idx, pyParamMap in enumerate(pyParamMaps):
            javaParamMap = JavaWrapper._new_java_obj(
                "org.apache.spark.ml.param.ParamMap")
            for pyParam, pyValue in pyParamMap.items():
                javaParam = None
                for pyStage, javaStage in stagePairs:
                    if pyStage._testOwnParam(pyParam.parent, pyParam.name):
                        javaParam = javaStage.getParam(pyParam.name)
                        break
                if javaParam is None:
                    raise ValueError(
                        'Resolve param in estimatorParamMaps failed: ' +
                        str(pyParam))
                if isinstance(pyValue, Params) and hasattr(
                        pyValue, "_to_java"):
                    javaValue = pyValue._to_java()
                else:
                    javaValue = _py2java(sc, pyValue)
                pair = javaParam.w(javaValue)
                javaParamMap.put([pair])
            javaParamMaps[idx] = javaParamMap
        return javaParamMaps
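The incoming param maps are the same structures produced by ParamGridBuilder; a hedged sketch of feeding this converter, with the meta-estimator `ovr` as an assumption and the staticmethod shown above called as a plain function purely for illustration:

    from pyspark.ml.classification import LogisticRegression, OneVsRest
    from pyspark.ml.tuning import ParamGridBuilder

    lr = LogisticRegression()
    ovr = OneVsRest(classifier=lr)             # meta-estimator with a nested stage
    grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
    java_maps = meta_estimator_transfer_param_maps_to_java(ovr, grid)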
Example #11
def bundle(spark_session, spark_df_schema, spark_pipeline_model):
    #spark_df_as_java = _py2java(spark_session, spark_df)
    #spark_df_schema_as_java = spark_df_as_java.schema.__call__()
    spark_df_schema_as_json = spark_df_schema.json()
    with open('model.schema', 'wb') as pkl_file:
        pickle.dump(spark_df_schema_as_json, pkl_file)

    spark_pipeline_model.write().overwrite().save('model.parquet')

    ## SERVE FROM HERE
    with open('model.schema', 'rb') as pkl_file:
        from pyspark.sql.types import _parse_datatype_json_string
        restored_spark_df_schema_as_json = pickle.load(pkl_file)
        restored_spark_df_schema = _parse_datatype_json_string(
            restored_spark_df_schema_as_json)
        restored_spark_df_schema_as_java = _py2java(spark_session,
                                                    restored_spark_df_schema)

    restored_spark_pipeline_model = PipelineModel.read().load('model.parquet')
    restored_spark_pipeline_model_as_java = restored_spark_pipeline_model._to_java(
    )

    return spark_session._jvm.org.jpmml.sparkml.ConverterUtil.toPMMLByteArray(
        restored_spark_df_schema_as_java,
        restored_spark_pipeline_model_as_java)
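A hedged calling sketch; the SparkSession `spark`, the training DataFrame `train_df`, and the fitted `pipeline_model` are assumptions, and JPMML-SparkML must be on the driver classpath:

    pmml_bytes = bundle(spark, train_df.schema, pipeline_model)
    with open("model.pmml", "wb") as f:
        f.write(pmml_bytes)          # toPMMLByteArray returns a byte array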
Example #12
 def corr(dataset, column, method="pearson"):
     """
     Compute the correlation matrix with specified method using dataset.
     """
     sc = SparkContext._active_spark_context
     javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
     args = [_py2java(sc, arg) for arg in (dataset, column, method)]
     return _java2py(sc, javaCorrObj.corr(*args))
Example #13
 def __init__(self, spark_df, spark_session=None):
     if spark_session is None:
         spark_session = SparkSession.builder.getOrCreate()
     super().__init__(
         _py2java(spark_session.sparkContext, spark_df),
         SQLContext(spark_session.sparkContext, spark_session),
     )
     self._validate()
Example #14
 def test(dataset, featuresCol, labelCol):
     """
     Perform a Pearson's independence test using dataset.
     """
     sc = SparkContext._active_spark_context
     javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
     args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
     return _java2py(sc, javaTestObj.test(*args))
Example #15
File: stat.py  Project: LY3918/spark
 def corr(dataset, column, method="pearson"):
     """
     Compute the correlation matrix with specified method using dataset.
     """
     sc = SparkContext._active_spark_context
     javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
     args = [_py2java(sc, arg) for arg in (dataset, column, method)]
     return _java2py(sc, javaCorrObj.corr(*args))
Example #16
File: stat.py  Project: LY3918/spark
 def test(dataset, featuresCol, labelCol):
     """
     Perform a Pearson's independence test using dataset.
     """
     sc = SparkContext._active_spark_context
     javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
     args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
     return _java2py(sc, javaTestObj.test(*args))
Example #17
def _new_java_obj(sc, java_class, *args):
    """
    Construct a new Java object.
    """
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return java_obj(*java_args)
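A minimal sketch, assuming an active SparkContext; any fully qualified class reachable through the Py4J gateway could be substituted for the classes below:

    sc = SparkContext._active_spark_context
    # The helper walks the dotted name through the JVM view, then calls the constructor.
    java_list = _new_java_obj(sc, "java.util.ArrayList")
    java_buf = _new_java_obj(sc, "java.lang.StringBuilder", "seed text")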
Example #18
 def _make_java_param_pair(self, param, value):
     """
     Makes a Java param pair.
     """
     sc = SparkContext._active_spark_context
     param = self._resolveParam(param)
     java_param = self._java_obj.getParam(param.name)
     java_value = _py2java(sc, value)
     return java_param.w(java_value)
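For context, a hedged sketch of how such a pair is typically consumed, assuming `model` is a fitted JavaModel and `maxIter` is one of its params; this mirrors the `_java_obj.set(pair)` pattern used in Example #8:

    pair = model._make_java_param_pair(model.maxIter, 10)   # Param and value are illustrative
    model._java_obj.set(pair)                               # push the pair into the Java params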
Example #19
def _new_java_obj(sc, java_class, *args):
    """
    Construct a new Java object.
    """
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return java_obj(*java_args)
Example #20
    def test(dataset: DataFrame, sampleCol: str, distName: str,
             *params: float) -> DataFrame:
        """
        Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
        equality. Currently supports the normal distribution, taking as parameters the mean and
        standard deviation.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            a Dataset or a DataFrame containing the sample of data to test.
        sampleCol : str
            Name of sample column in dataset, of any numerical type.
        distName : str
            a `string` name for a theoretical distribution, currently only support "norm".
        params : float
            a list of `float` values specifying the parameters to be used for the theoretical
            distribution. For "norm" distribution, the parameters includes mean and variance.

        Returns
        -------
        A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
        This DataFrame will contain a single Row with the following fields:

        - `pValue: Double`
        - `statistic: Double`

        Examples
        --------
        >>> from pyspark.ml.stat import KolmogorovSmirnovTest
        >>> dataset = [[-1.0], [0.0], [1.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        >>> dataset = [[2.0], [3.0], [4.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
        dataset = _py2java(sc, dataset)
        params = [float(param) for param in params]  # type: ignore[assignment]
        return _java2py(
            sc,
            javaTestObj.test(dataset, sampleCol, distName,
                             _jvm().PythonUtils.toSeq(params)))
Example #21
 def _make_java_param_pair(self, param, value):
     """
     Makes a Java param pair.
     """
     sc = SparkContext._active_spark_context
     param = self._resolveParam(param)
     java_param = self._java_obj.getParam(param.name)
     java_value = _py2java(sc, value)
     return java_param.w(java_value)
Example #22
	def __init__(self, sc, df, pipelineModel):
		javaDf = _py2java(sc, df)
		javaSchema = javaDf.schema.__call__()
		javaPipelineModel = pipelineModel._to_java()
		javaPmmlBuilder = sc._jvm.org.jpmml.sparkml.PMMLBuilder(javaSchema, javaPipelineModel)
		if(not isinstance(javaPmmlBuilder, JavaObject)):
			raise RuntimeError("JPMML-SparkML not found on classpath")
		self.sc = sc
		self.javaPmmlBuilder = javaPmmlBuilder
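Putting Examples #1 and #22 together, a hedged construction sketch; `df`, the fitted `pipeline_model`, and the option key/value are illustrative, and JPMML-SparkML must be on the classpath for the constructor not to raise:

    builder = PMMLBuilder(spark.sparkContext, df, pipeline_model)
    builder.putOption(None, "compact", True)   # builder-level option; key/value for illustration only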
Example #23
 def _new_java_obj(java_class, *args):
     """
     Returns a new Java object.
     """
     sc = SparkContext._active_spark_context
     java_obj = _jvm()
     for name in java_class.split("."):
         java_obj = getattr(java_obj, name)
     java_args = [_py2java(sc, arg) for arg in args]
     return java_obj(*java_args)
Example #24
 def _new_java_obj(java_class, *args):
     """
     Returns a new Java object.
     """
     sc = SparkContext._active_spark_context
     java_obj = _jvm()
     for name in java_class.split("."):
         java_obj = getattr(java_obj, name)
     java_args = [_py2java(sc, arg) for arg in args]
     return java_obj(*java_args)
Example #25
def toPMMLBytes(sc, df, pipelineModel):
    javaDF = _py2java(sc, df)
    javaSchema = javaDF.schema.__call__()

    javaPipelineModel = pipelineModel._to_java()

    javaConverter = sc._jvm.org.jpmml.sparkml.ConverterUtil
    if (not isinstance(javaConverter, JavaClass)):
        raise RuntimeError("JPMML-SparkML not found on classpath")
    return javaConverter.toPMMLByteArray(javaSchema, javaPipelineModel)
Example #26
    def _make_java_param_pair(self, param: Param[T], value: T) -> "JavaObject":
        """
        Makes a Java param pair.
        """
        sc = SparkContext._active_spark_context
        assert sc is not None and self._java_obj is not None

        param = self._resolveParam(param)
        java_param = self._java_obj.getParam(param.name)
        java_value = _py2java(sc, value)
        return java_param.w(java_value)
Example #27
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform an F Regression test using dataset.

        :param dataset:
          DataFrame of continuous labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `fValue: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import FValueTest
        >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                        0.51418671, 0.61632374, 0.96565515])],
        ...            [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                        0.59784822, 0.12394819, 0.53783355])],
        ...            [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                        0.40492506, 0.18957493, 0.5440016])],
        ...            [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                        0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> fValueResult = FValueTest.test(dataset, 'features', 'label')
        >>> row = fValueResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539])
        >>> row[0].pValues
        DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838])
        >>> fValueResult = FValueTest.test(dataset, 'features', 'label', True)
        >>> row = fValueResult.orderBy("featureIndex").collect()
        >>> row[0].fValue
        3.7409548308350593
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.FValueTest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #28
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform an ANOVA test using dataset.

        :param dataset:
          DataFrame of categorical labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `fValue: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ANOVATest
        >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                 0.51418671, 0.61632374, 0.96565515])],
        ...            [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                 0.59784822, 0.12394819, 0.53783355])],
        ...            [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                 0.40492506, 0.18957493, 0.5440016])],
        ...            [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                 0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> anovaResult = ANOVATest.test(dataset, 'features', 'label')
        >>> row = anovaResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512])
        >>> row[0].pValues
        DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029])
        >>> anovaResult = ANOVATest.test(dataset, 'features', 'label', True)
        >>> row = anovaResult.orderBy("featureIndex").collect()
        >>> row[0].fValue
        4.026438671875297
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ANOVATest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #29
    def corr(dataset: DataFrame,
             column: str,
             method: str = "pearson") -> DataFrame:
        """
        Compute the correlation matrix with specified method using dataset.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            A DataFrame.
        column : str
            The name of the column of vectors for which the correlation coefficient needs
            to be computed. This must be a column of the dataset, and it must contain
            Vector objects.
        method : str, optional
            String specifying the method to use for computing correlation.
            Supported: `pearson` (default), `spearman`.

        Returns
        -------
        A DataFrame that contains the correlation matrix of the column of vectors. This
        DataFrame contains a single row and a single column of name `METHODNAME(COLUMN)`.

        Examples
        --------
        >>> from pyspark.ml.linalg import DenseMatrix, Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #30
    def _new_java_obj(java_class: str, *args: Any) -> "JavaObject":
        """
        Returns a new Java object.
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        java_obj = _jvm()
        for name in java_class.split("."):
            java_obj = getattr(java_obj, name)
        java_args = [_py2java(sc, arg) for arg in args]
        return java_obj(*java_args)
Example #31
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform a Pearson's independence test using dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `statistic: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        .. versionchanged:: 3.1.0
           Added optional ``flatten`` argument.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label', True)
        >>> row = chiSqResult.orderBy("featureIndex").collect()
        >>> row[0].statistic
        4.0
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #32
 def _make_java_param_pair(self, param, value):
     """
     Makes a Java param pair.
     """
     sc = SparkContext._active_spark_context
     param = self._resolveParam(param)
     java_param = sc._jvm.org.apache.spark.ml.param.Param(param.parent, param.name, param.doc)
     if isinstance(value, Params) and hasattr(value, "_to_java"):
         # Convert JavaEstimator/JavaTransformer object or Estimator/Transformer object which
         # implements `_to_java` method (such as OneVsRest, Pipeline object) to java object.
         # used in the case of an estimator having another estimator as a parameter
         # the reason why this is not in _py2java in common.py is that importing
         # Estimator and Model in common.py results in a circular import with inherit_doc
         java_value = value._to_java()
     else:
         java_value = _py2java(sc, value)
     return java_param.w(java_value)
Example #33
File: tuning.py  Project: zzq1/Spark
    def _to_java(self):
        """
        Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context
        # TODO: persist average metrics as well
        _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                             self.uid,
                                             self.bestModel._to_java(),
                                             _py2java(sc, []))
        estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)
        return _java_obj
Example #34
File: stat.py  Project: Brett-A/spark
    def test(dataset, sampleCol, distName, *params):
        """
        Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
        equality. Currently supports the normal distribution, taking as parameters the mean and
        standard deviation.

        :param dataset:
          a Dataset or a DataFrame containing the sample of data to test.
        :param sampleCol:
          Name of sample column in dataset, of any numerical type.
        :param distName:
          a `string` name for a theoretical distribution, currently only support "norm".
        :param params:
          a list of `Double` values specifying the parameters to be used for the theoretical
          distribution. For "norm" distribution, the parameters includes mean and variance.
        :return:
          A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
          This DataFrame will contain a single Row with the following fields:
          - `pValue: Double`
          - `statistic: Double`

        >>> from pyspark.ml.stat import KolmogorovSmirnovTest
        >>> dataset = [[-1.0], [0.0], [1.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        >>> dataset = [[2.0], [3.0], [4.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
        dataset = _py2java(sc, dataset)
        params = [float(param) for param in params]
        return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName,
                                             _jvm().PythonUtils.toSeq(params)))
Example #35
File: tuning.py  Project: Altiscale/spark
    def _to_java(self):
        """
        Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.
        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context
        # TODO: persist validation metrics as well
        _java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.tuning.TrainValidationSplitModel",
            self.uid,
            self.bestModel._to_java(),
            _py2java(sc, []))
        estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)
        return _java_obj
Example #36
File: stat.py  Project: Brett-A/spark
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with specified method using dataset.

        :param dataset:
          A Dataset or a DataFrame.
        :param column:
          The name of the column of vectors for which the correlation coefficient needs
          to be computed. This must be a column of the dataset, and it must contain
          Vector objects.
        :param method:
          String specifying the method to use for computing correlation.
          Supported: `pearson` (default), `spearman`.
        :return:
          A DataFrame that contains the correlation matrix of the column of vectors. This
          DataFrame contains a single row and a single column of name
          '$METHODNAME($COLUMN)'.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #37
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with specified method using dataset.

        :param dataset:
          A Dataset or a DataFrame.
        :param column:
          The name of the column of vectors for which the correlation coefficient needs
          to be computed. This must be a column of the dataset, and it must contain
          Vector objects.
        :param method:
          String specifying the method to use for computing correlation.
          Supported: `pearson` (default), `spearman`.
        :return:
          A DataFrame that contains the correlation matrix of the column of vectors. This
          DataFrame contains a single row and a single column of name
          '$METHODNAME($COLUMN)'.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #38
File: tuning.py  Project: zhaohc10/spark
    def _to_java(self):
        """
        Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context
        _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                             self.uid,
                                             self.bestModel._to_java(),
                                             _py2java(sc, self.avgMetrics))
        estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)

        if self.subModels is not None:
            java_sub_models = [[sub_model._to_java() for sub_model in fold_sub_models]
                               for fold_sub_models in self.subModels]
            _java_obj.setSubModels(java_sub_models)
        return _java_obj
Example #39
    def _to_java(self):
        """
        Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.

        Returns
        -------
        py4j.java_gateway.JavaObject
            Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context
        _java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.tuning.TrainValidationSplitModel", self.uid,
            self.bestModel._to_java(), _py2java(sc, self.validationMetrics))
        estimator, epms, evaluator = super(TrainValidationSplitModel,
                                           self)._to_java_impl()

        params = {
            "evaluator": evaluator,
            "estimator": estimator,
            "estimatorParamMaps": epms,
            "trainRatio": self.getTrainRatio(),
            "seed": self.getSeed(),
        }
        for param_name, param_val in params.items():
            java_param = _java_obj.getParam(param_name)
            pair = java_param.w(param_val)
            _java_obj.set(pair)

        if self.subModels is not None:
            java_sub_models = [
                sub_model._to_java() for sub_model in self.subModels
            ]
            _java_obj.setSubModels(java_sub_models)

        return _java_obj
Example #40
File: tuning.py  Project: BaiBenny/spark
    def _to_java(self):
        """
        Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context
        # TODO: persist average metrics as well
        _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                             self.uid,
                                             self.bestModel._to_java(),
                                             _py2java(sc, []))
        estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)

        if self.subModels is not None:
            java_sub_models = [[sub_model._to_java() for sub_model in fold_sub_models]
                               for fold_sub_models in self.subModels]
            _java_obj.setSubModels(java_sub_models)
        return _java_obj
Example #41
def toPMMLBytes(sc, data, pipelineModel):
	javaData = _py2java(sc, data)
	javaSchema = javaData.schema.__call__()
	javaPipelineModel = pipelineModel._to_java()
	return sc._jvm.org.jpmml.sparkml.ConverterUtil.toPMMLByteArray(javaSchema, javaPipelineModel)
Example #42
 def _call_java(self, name, *args):
     m = getattr(self._java_obj, name)
     sc = SparkContext._active_spark_context
     java_args = [_py2java(sc, arg) for arg in args]
     return _java2py(sc, m(*java_args))
Example #43
 def _call_java(self, name, *args):
     m = getattr(self._java_obj, name)
     sc = SparkContext._active_spark_context
     java_args = [_py2java(sc, arg) for arg in args]
     return _java2py(sc, m(*java_args))