def _from_java(cls, JavaObject):
    sc = SparkContext._active_spark_context
    bucket = _java2py(sc, JavaObject.bucket())
    object_path = _java2py(sc, JavaObject.objectPath())
    return S3DataPath(bucket, object_path)
def _transfer_params_from_java(self):
    """
    Transforms the embedded params from the companion Java object.
    """
    sc = SparkContext._active_spark_context
    for param in self.params:
        if self._java_obj.hasParam(param.name):
            java_param = self._java_obj.getParam(param.name)
            # SPARK-14931: Only check set params back to avoid default params mismatch.
            complex_param_class = sc._gateway.jvm.com.microsoft.azure.synapse.ml.core.serialize.ComplexParam._java_lang_class
            is_complex_param = complex_param_class.isAssignableFrom(java_param.getClass())
            service_param_class = sc._gateway.jvm.org.apache.spark.ml.param.ServiceParam._java_lang_class
            is_service_param = service_param_class.isAssignableFrom(java_param.getClass())
            if self._java_obj.isSet(java_param):
                if is_complex_param:
                    value = self._java_obj.getOrDefault(java_param)
                elif is_service_param:
                    jvObj = self._java_obj.getOrDefault(java_param)
                    if jvObj.isLeft():
                        value = _java2py(sc, jvObj.value())
                    else:
                        value = None
                else:
                    value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                self._set(**{param.name: value})
def meta_estimator_transfer_param_maps_from_java(pyEstimator, javaParamMaps):
    pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
    stagePairs = list(map(lambda stage: (stage, stage._to_java()), pyStages))
    sc = SparkContext._active_spark_context
    pyParamMaps = []
    for javaParamMap in javaParamMaps:
        pyParamMap = dict()
        for javaPair in javaParamMap.toList():
            javaParam = javaPair.param()
            pyParam = None
            for pyStage, javaStage in stagePairs:
                if pyStage._testOwnParam(javaParam.parent(), javaParam.name()):
                    pyParam = pyStage.getParam(javaParam.name())
            if pyParam is None:
                raise ValueError('Resolve param in estimatorParamMaps failed: ' +
                                 javaParam.parent() + '.' + javaParam.name())
            javaValue = javaPair.value()
            if sc._jvm.Class.forName("org.apache.spark.ml.util.DefaultParamsWritable") \
                    .isInstance(javaValue):
                pyValue = JavaParams._from_java(javaValue)
            else:
                pyValue = _java2py(sc, javaValue)
            pyParamMap[pyParam] = pyValue
        pyParamMaps.append(pyParamMap)
    return pyParamMaps
def _call_java(sc, java_obj, name, *args):
    """ Method copied from pyspark.ml.wrapper. Uses private Spark APIs. """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
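# Hedged usage sketch, not part of the snippet above: it shows how the helper forwards a
# method call to a model's companion JVM object, converting arguments and the result.
# The `model` parameter is an assumption (any pyspark.ml JavaWrapper-backed object); the
# JVM method name "toString" is used because it exists on every Java object.
from pyspark import SparkContext

def describe_java_backing(model):
    # `model._java_obj` is the Py4J handle kept by pyspark.ml JavaWrapper subclasses.
    sc = SparkContext._active_spark_context
    return _call_java(sc, model._java_obj, "toString")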
def _from_java(cls, java_stage):
    """
    Given a Java CrossValidatorModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    sc = SparkContext._active_spark_context
    bestModel = JavaParams._from_java(java_stage.bestModel())
    avgMetrics = _java2py(sc, java_stage.avgMetrics())
    estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)

    py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics)
    params = {
        "evaluator": evaluator,
        "estimator": estimator,
        "estimatorParamMaps": epms,
        "numFolds": java_stage.getNumFolds(),
        "foldCol": java_stage.getFoldCol(),
        "seed": java_stage.getSeed(),
    }
    for param_name, param_val in params.items():
        py_stage = py_stage._set(**{param_name: param_val})

    if java_stage.hasSubModels():
        py_stage.subModels = [
            [JavaParams._from_java(sub_model) for sub_model in fold_sub_models]
            for fold_sub_models in java_stage.subModels()
        ]

    py_stage._resetUid(java_stage.uid())
    return py_stage
def meta_estimator_transfer_param_maps_from_java(pyEstimator, javaParamMaps):
    pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
    stagePairs = list(map(lambda stage: (stage, stage._to_java()), pyStages))
    sc = SparkContext._active_spark_context
    pyParamMaps = []
    for javaParamMap in javaParamMaps:
        pyParamMap = dict()
        for javaPair in javaParamMap.toList():
            javaParam = javaPair.param()
            pyParam = None
            for pyStage, javaStage in stagePairs:
                if pyStage._testOwnParam(javaParam.parent(), javaParam.name()):
                    pyParam = pyStage.getParam(javaParam.name())
            if pyParam is None:
                raise ValueError('Resolve param in estimatorParamMaps failed: ' +
                                 javaParam.parent() + '.' + javaParam.name())
            javaValue = javaPair.value()
            if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(javaValue):
                # Note: JavaParams._from_java supports both JavaEstimator/JavaTransformer classes
                # and Estimator/Transformer classes that implement the `_from_java` static method
                # (such as OneVsRest and Pipeline).
                pyValue = JavaParams._from_java(javaValue)
            else:
                pyValue = _java2py(sc, javaValue)
            pyParamMap[pyParam] = pyValue
        pyParamMaps.append(pyParamMap)
    return pyParamMaps
def test(dataset, featuresCol, labelCol):
    """
    Perform a Pearson's independence test using dataset.

    :param dataset:
      DataFrame of categorical labels and categorical features.
      Real-valued features will be treated as categorical for each distinct value.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :return:
      DataFrame containing the test result for every feature against the label.
      This DataFrame will contain a single Row with the following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[Int]`
      - `statistics: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ChiSquareTest
    >>> dataset = [[0, Vectors.dense([0, 0, 1])],
    ...            [0, Vectors.dense([1, 0, 1])],
    ...            [1, Vectors.dense([2, 1, 1])],
    ...            [1, Vectors.dense([3, 1, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
    >>> chiSqResult.select("degreesOfFreedom").collect()[0]
    Row(degreesOfFreedom=[3, 1, 0])
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
    return _java2py(sc, javaTestObj.test(*args))
def _transfer_params_from_java(self):
    """
    Transforms the embedded params from the companion Java object.
    """
    sc = SparkContext._active_spark_context
    for param in self.params:
        if self._java_obj.hasParam(param.name):
            java_param = self._java_obj.getParam(param.name)
            # SPARK-14931: Only check set params back to avoid default params mismatch.
            if self._java_obj.isSet(java_param):
                value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                self._set(**{param.name: value})
            # SPARK-10931: Temporary fix for params that have a default in Java
            if self._java_obj.hasDefault(java_param) and not self.isDefined(param):
                value = _java2py(sc, self._java_obj.getDefault(java_param)).get()
                self._setDefault(**{param.name: value})
def _from_java(cls, java_stage):
    """
    Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    sc = SparkContext._active_spark_context
    bestModel = JavaParams._from_java(java_stage.bestModel())
    validationMetrics = _java2py(sc, java_stage.validationMetrics())
    estimator, epms, evaluator = super(TrainValidationSplitModel, cls)._from_java_impl(java_stage)
    # Create a new instance of this stage.
    py_stage = cls(bestModel=bestModel,
                   validationMetrics=validationMetrics)._set(estimator=estimator)
    py_stage = py_stage._set(estimatorParamMaps=epms)._set(evaluator=evaluator)

    if java_stage.hasSubModels():
        py_stage.subModels = [JavaParams._from_java(sub_model)
                              for sub_model in java_stage.subModels()]

    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java(cls, java_stage):
    """
    Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    sc = SparkContext._active_spark_context
    bestModel = JavaParams._from_java(java_stage.bestModel())
    validationMetrics = _java2py(sc, java_stage.validationMetrics())
    estimator, epms, evaluator = super(TrainValidationSplitModel, cls)._from_java_impl(java_stage)
    # Create a new instance of this stage.
    py_stage = cls(bestModel=bestModel, validationMetrics=validationMetrics)
    params = {
        "evaluator": evaluator,
        "estimator": estimator,
        "estimatorParamMaps": epms,
        "trainRatio": java_stage.getTrainRatio(),
        "seed": java_stage.getSeed(),
    }
    for param_name, param_val in params.items():
        py_stage = py_stage._set(**{param_name: param_val})

    if java_stage.hasSubModels():
        py_stage.subModels = [JavaParams._from_java(sub_model)
                              for sub_model in java_stage.subModels()]

    py_stage._resetUid(java_stage.uid())
    return py_stage
def _call_java(self, name: str, *args: Any) -> Any:
    m = getattr(self._java_obj, name)
    sc = SparkContext._active_spark_context
    assert sc is not None
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
def corr(dataset, column, method="pearson"):
    """
    Compute the correlation matrix with specified method using dataset.
    """
    sc = SparkContext._active_spark_context
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def test(dataset, featuresCol, labelCol):
    """
    Perform a Pearson's independence test using dataset.
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
    return _java2py(sc, javaTestObj.test(*args))
def test(dataset: DataFrame, sampleCol: str, distName: str, *params: float) -> DataFrame:
    """
    Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
    equality. Currently supports the normal distribution, taking as parameters the mean
    and standard deviation.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    dataset : :py:class:`pyspark.sql.DataFrame`
        a Dataset or a DataFrame containing the sample of data to test.
    sampleCol : str
        Name of sample column in dataset, of any numerical type.
    distName : str
        a `string` name for a theoretical distribution; currently only "norm" is supported.
    params : float
        a list of `float` values specifying the parameters to be used for the theoretical
        distribution. For the "norm" distribution, the parameters include mean and variance.

    Returns
    -------
    A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
    This DataFrame will contain a single Row with the following fields:

    - `pValue: Double`
    - `statistic: Double`

    Examples
    --------
    >>> from pyspark.ml.stat import KolmogorovSmirnovTest
    >>> dataset = [[-1.0], [0.0], [1.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    >>> dataset = [[2.0], [3.0], [4.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    """
    sc = SparkContext._active_spark_context
    assert sc is not None
    javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
    dataset = _py2java(sc, dataset)
    params = [float(param) for param in params]  # type: ignore[assignment]
    return _java2py(
        sc, javaTestObj.test(dataset, sampleCol, distName, _jvm().PythonUtils.toSeq(params)))
def _transfer_param_map_from_java(self, javaParamMap):
    """
    Transforms a Java ParamMap into a Python ParamMap.
    """
    sc = SparkContext._active_spark_context
    paramMap = dict()
    for pair in javaParamMap.toList():
        param = pair.param()
        if self.hasParam(str(param.name())):
            paramMap[self.getParam(param.name())] = _java2py(sc, pair.value())
    return paramMap
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform an F Regression test using dataset.

    :param dataset:
      DataFrame of continuous labels and continuous features.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the
      following fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `fValue: float`
      If flatten is False, this DataFrame will contain a single Row with the
      following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `fValues: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import FValueTest
    >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
    ...                                        0.51418671, 0.61632374, 0.96565515])],
    ...            [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
    ...                                        0.59784822, 0.12394819, 0.53783355])],
    ...            [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
    ...                                        0.40492506, 0.18957493, 0.5440016])],
    ...            [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
    ...                                        0.62102109, 0.05471483, 0.96449167])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> fValueResult = FValueTest.test(dataset, 'features', 'label')
    >>> row = fValueResult.select("fValues", "pValues").collect()
    >>> row[0].fValues
    DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539])
    >>> row[0].pValues
    DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838])
    >>> fValueResult = FValueTest.test(dataset, 'features', 'label', True)
    >>> row = fValueResult.orderBy("featureIndex").collect()
    >>> row[0].fValue
    3.7409548308350593
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.FValueTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform an ANOVA test using dataset.

    :param dataset:
      DataFrame of categorical labels and continuous features.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the
      following fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `fValue: float`
      If flatten is False, this DataFrame will contain a single Row with the
      following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `fValues: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ANOVATest
    >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
    ...                                 0.51418671, 0.61632374, 0.96565515])],
    ...            [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
    ...                                 0.59784822, 0.12394819, 0.53783355])],
    ...            [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
    ...                                 0.40492506, 0.18957493, 0.5440016])],
    ...            [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
    ...                                 0.62102109, 0.05471483, 0.96449167])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> anovaResult = ANOVATest.test(dataset, 'features', 'label')
    >>> row = anovaResult.select("fValues", "pValues").collect()
    >>> row[0].fValues
    DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512])
    >>> row[0].pValues
    DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029])
    >>> anovaResult = ANOVATest.test(dataset, 'features', 'label', True)
    >>> row = anovaResult.orderBy("featureIndex").collect()
    >>> row[0].fValue
    4.026438671875297
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ANOVATest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def corr(dataset: DataFrame, column: str, method: str = "pearson") -> DataFrame:
    """
    Compute the correlation matrix with specified method using dataset.

    .. versionadded:: 2.2.0

    Parameters
    ----------
    dataset : :py:class:`pyspark.sql.DataFrame`
        A DataFrame.
    column : str
        The name of the column of vectors for which the correlation coefficient needs
        to be computed. This must be a column of the dataset, and it must contain
        Vector objects.
    method : str, optional
        String specifying the method to use for computing correlation.
        Supported: `pearson` (default), `spearman`.

    Returns
    -------
    A DataFrame that contains the correlation matrix of the column of vectors. This
    DataFrame contains a single row and a single column of name `METHODNAME(COLUMN)`.

    Examples
    --------
    >>> from pyspark.ml.linalg import DenseMatrix, Vectors
    >>> from pyspark.ml.stat import Correlation
    >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
    ...            [Vectors.dense([4, 5, 0, 3])],
    ...            [Vectors.dense([6, 7, 0, 8])],
    ...            [Vectors.dense([9, 0, 0, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ['features'])
    >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
    >>> print(str(pearsonCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                 [ 0.0556...,  1.        ,         NaN,  0.9135...],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4004...,  0.9135...,         NaN,  1.        ]])
    >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
    >>> print(str(spearmanCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                 [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
    """
    sc = SparkContext._active_spark_context
    assert sc is not None
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform a Pearson's independence test using dataset.

    :param dataset:
      DataFrame of categorical labels and categorical features.
      Real-valued features will be treated as categorical for each distinct value.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the
      following fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `statistic: float`
      If flatten is False, this DataFrame will contain a single Row with the
      following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `statistics: Vector`
      Each of these fields has one value per feature.

    .. versionchanged:: 3.1.0
       Added optional ``flatten`` argument.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ChiSquareTest
    >>> dataset = [[0, Vectors.dense([0, 0, 1])],
    ...            [0, Vectors.dense([1, 0, 1])],
    ...            [1, Vectors.dense([2, 1, 1])],
    ...            [1, Vectors.dense([3, 1, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
    >>> chiSqResult.select("degreesOfFreedom").collect()[0]
    Row(degreesOfFreedom=[3, 1, 0])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label', True)
    >>> row = chiSqResult.orderBy("featureIndex").collect()
    >>> row[0].statistic
    4.0
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def _transfer_params_from_java(self) -> None:
    """
    Transforms the embedded params from the companion Java object.
    """
    sc = SparkContext._active_spark_context
    assert sc is not None and self._java_obj is not None
    for param in self.params:
        if self._java_obj.hasParam(param.name):
            java_param = self._java_obj.getParam(param.name)
            # SPARK-14931: Only check set params back to avoid default params mismatch.
            if self._java_obj.isSet(java_param):
                java_value = self._java_obj.getOrDefault(java_param)
                if param.typeConverter.__name__.startswith("toList"):
                    value = [_java2py(sc, x) for x in list(java_value)]
                else:
                    value = _java2py(sc, java_value)
                self._set(**{param.name: value})
            # SPARK-10931: Temporary fix for params that have a default in Java
            if self._java_obj.hasDefault(java_param) and not self.isDefined(param):
                value = _java2py(sc, self._java_obj.getDefault(java_param)).get()
                self._setDefault(**{param.name: value})
def test(dataset, sampleCol, distName, *params):
    """
    Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
    equality. Currently supports the normal distribution, taking as parameters the mean
    and standard deviation.

    :param dataset:
      a Dataset or a DataFrame containing the sample of data to test.
    :param sampleCol:
      Name of sample column in dataset, of any numerical type.
    :param distName:
      a `string` name for a theoretical distribution; currently only "norm" is supported.
    :param params:
      a list of `Double` values specifying the parameters to be used for the theoretical
      distribution. For the "norm" distribution, the parameters include mean and variance.
    :return:
      A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled
      data. This DataFrame will contain a single Row with the following fields:
      - `pValue: Double`
      - `statistic: Double`

    >>> from pyspark.ml.stat import KolmogorovSmirnovTest
    >>> dataset = [[-1.0], [0.0], [1.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    >>> dataset = [[2.0], [3.0], [4.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
    dataset = _py2java(sc, dataset)
    params = [float(param) for param in params]
    return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName,
                                         _jvm().PythonUtils.toSeq(params)))
def _transfer_param_map_from_java(self, javaParamMap):
    """
    Transforms a Java ParamMap into a Python ParamMap.
    """
    sc = SparkContext._active_spark_context
    paramMap = dict()
    for pair in javaParamMap.toList():
        param = pair.param()
        if self.hasParam(str(param.name())):
            java_obj = pair.value()
            if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(java_obj):
                # Note: JavaParams._from_java supports both JavaEstimator/JavaTransformer classes
                # and Estimator/Transformer classes that implement the `_from_java` static method
                # (such as OneVsRest and Pipeline).
                py_obj = JavaParams._from_java(java_obj)
            else:
                py_obj = _java2py(sc, java_obj)
            paramMap[self.getParam(param.name())] = py_obj
    return paramMap
def _transfer_params_from_java(self):
    """
    Transforms the embedded params from the companion Java object.
    """
    sc = SparkContext._active_spark_context
    for param in self.params:
        if self._java_obj.hasParam(param.name):
            java_param = self._java_obj.getParam(param.name)
            # SPARK-14931: Only check set params back to avoid default params mismatch.
            complex_param_class = sc._gateway.jvm.org.apache.spark.ml.param.ComplexParam._java_lang_class
            is_complex_param = complex_param_class.isAssignableFrom(java_param.getClass())
            if self._java_obj.isSet(java_param):
                if is_complex_param:
                    value = self._java_obj.getOrDefault(java_param)
                else:
                    value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                self._set(**{param.name: value})
def corr(dataset, column, method="pearson"):
    """
    Compute the correlation matrix with specified method using dataset.

    :param dataset:
      A Dataset or a DataFrame.
    :param column:
      The name of the column of vectors for which the correlation coefficient needs
      to be computed. This must be a column of the dataset, and it must contain
      Vector objects.
    :param method:
      String specifying the method to use for computing correlation.
      Supported: `pearson` (default), `spearman`.
    :return:
      A DataFrame that contains the correlation matrix of the column of vectors. This
      DataFrame contains a single row and a single column of name '$METHODNAME($COLUMN)'.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import Correlation
    >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
    ...            [Vectors.dense([4, 5, 0, 3])],
    ...            [Vectors.dense([6, 7, 0, 8])],
    ...            [Vectors.dense([9, 0, 0, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ['features'])
    >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
    >>> print(str(pearsonCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                 [ 0.0556...,  1.        ,         NaN,  0.9135...],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4004...,  0.9135...,         NaN,  1.        ]])
    >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
    >>> print(str(spearmanCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                 [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
    """
    sc = SparkContext._active_spark_context
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def _from_java(cls, java_stage):
    """
    Given a Java CrossValidatorModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    sc = SparkContext._active_spark_context
    bestModel = JavaParams._from_java(java_stage.bestModel())
    avgMetrics = _java2py(sc, java_stage.avgMetrics())
    estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)

    py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics).setEstimator(estimator)
    py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)

    if java_stage.hasSubModels():
        py_stage.subModels = [[JavaParams._from_java(sub_model)
                               for sub_model in fold_sub_models]
                              for fold_sub_models in java_stage.subModels()]

    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java(cls, java_object):
    # primitives and spark data types are converted automatically by
    # _java2py(), in those cases there is nothing to do
    if type(java_object) != py4j.java_gateway.JavaObject:
        return java_object

    # construct a mapping of our python wrapped classes to
    # java/scala classes
    wrapped_classes = {}
    for cls in SageMakerJavaWrapper.__subclasses__():
        wrapped_classes[cls._wrapped_class] = cls

    class_name = java_object.getClass().getName()

    # SageMakerJavaWrapper classes know how to convert themselves from a Java Object;
    # otherwise hand over to _java2py and hope for the best.
    if class_name in wrapped_classes:
        return wrapped_classes[class_name]._from_java(java_object)
    elif class_name.startswith("scala.None"):
        return None
    else:
        sc = SparkContext._active_spark_context
        return _java2py(sc, java_object)
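# Hedged sketch, not from the original source: it illustrates the contract the dispatch
# above relies on. A SageMakerJavaWrapper subclass declares the fully qualified name of
# its companion JVM class in `_wrapped_class` and knows how to rebuild itself from the
# Py4J handle. The class name, package, and `uri()` accessor below are hypothetical.
class ExampleDataPath(SageMakerJavaWrapper):
    _wrapped_class = "com.example.sagemaker.ExampleDataPath"  # hypothetical JVM class

    def __init__(self, uri):
        self.uri = uri

    @classmethod
    def _from_java(cls, java_object):
        # Read a simple field off the JVM object and wrap it in the Python class.
        return cls(java_object.uri())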
from pyspark.sql import SparkSession
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session

# ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit --driver-class-path ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Exercise4_06/Exercise4_06.py
if __name__ == "__main__":
    session: SparkSession = create_session(2, "PySpark <> JVM")
    session.sparkContext.setLogLevel('ERROR')

    python_rdd = session.sparkContext.range(0, 5)
    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(python_rdd._jrdd, True)
    mapped_java_rdd = session.sparkContext._jvm.Exercise4_06.ScalaObject.executeInScala(java_rdd)
    mapped_python_rdd = _java2py(session.sparkContext, mapped_java_rdd)

    print(mapped_python_rdd.collect())
def _call_java(self, name, *args):
    m = getattr(self._java_obj, name)
    sc = SparkContext._active_spark_context
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
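# Hedged sketch, not from the original source: a minimal Py4J-backed wrapper built around
# the same instance-level pattern as _call_java above. The wrapped JVM object is an
# assumption; "toString" is used as the delegated method because it exists on every
# Java object, so the sketch stays within known Py4J behavior.
from pyspark import SparkContext
from pyspark.ml.common import _java2py, _py2java

class JavaObjectWrapper:
    def __init__(self, java_obj):
        # Py4J handle to the companion JVM object.
        self._java_obj = java_obj

    def _call_java(self, name, *args):
        m = getattr(self._java_obj, name)
        sc = SparkContext._active_spark_context
        java_args = [_py2java(sc, arg) for arg in args]
        return _java2py(sc, m(*java_args))

    def describe(self):
        # Delegates to the JVM-side toString() of the wrapped object.
        return self._call_java("toString")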
from sys import argv
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

# ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit --driver-class-path ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar:/Users/a/.m2/repository/com/google/guava/guava/28.2-jre/guava-28.2-jre.jar:/Users/a/.m2/repository/org/apache/commons/commons-compress/1.20/commons-compress-1.20.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Activity4_03/Activity4_03.py ~/Output_Act4_3
if __name__ == "__main__":
    output_dir = argv[1]
    session = create_session(3, 'WARC Parser')

    warc_records = extract_raw_records(sample_warc_loc, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')
    plaintexts_rdd = warc_records.map(lambda record: record.html_source)

    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(plaintexts_rdd._jrdd, True)
    tagged_java_rdd = session.sparkContext._jvm.Activity4_03.Activity4_03.tagJavaRDD(java_rdd)
    tagged_python_rdd = _java2py(session.sparkContext, tagged_java_rdd)

    tagged_python_rdd.saveAsTextFile(output_dir)
def feature_aggregated_shap(self, input_cols):
    return _java2py(self._sc, self._shapley_model.getShapValuesFromModel(input_cols))
def calculate(self):
    return _java2py(self._sc, self._shapley_model.calculate())