from pyspark import SparkContext
from pyspark.ml.common import _java2py, _py2java
from pyspark.ml.wrapper import _jvm
from pyspark.sql import DataFrame


def test(dataset: DataFrame, sampleCol: str, distName: str, *params: float) -> DataFrame:
    """
    Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
    equality. Currently supports the normal distribution, taking as parameters the mean and
    standard deviation.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    dataset : :py:class:`pyspark.sql.DataFrame`
        a Dataset or a DataFrame containing the sample of data to test.
    sampleCol : str
        Name of sample column in dataset, of any numerical type.
    distName : str
        a `string` name for a theoretical distribution; currently only "norm" is supported.
    params : float
        a list of `float` values specifying the parameters to be used for
        the theoretical distribution. For the "norm" distribution, the parameters
        include mean and standard deviation.

    Returns
    -------
    A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
    This DataFrame will contain a single Row with the following fields:

    - `pValue: Double`
    - `statistic: Double`

    Examples
    --------
    >>> from pyspark.ml.stat import KolmogorovSmirnovTest
    >>> dataset = [[-1.0], [0.0], [1.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    >>> dataset = [[2.0], [3.0], [4.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    """
    sc = SparkContext._active_spark_context
    assert sc is not None
    javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
    dataset = _py2java(sc, dataset)
    # The JVM API expects the distribution parameters as a Scala Seq of doubles.
    params = [float(param) for param in params]  # type: ignore[assignment]
    return _java2py(
        sc, javaTestObj.test(dataset, sampleCol, distName, _jvm().PythonUtils.toSeq(params)))
def test(dataset, featuresCol, labelCol):
    """
    Perform Pearson's independence test using the dataset.

    :param dataset:
      DataFrame of categorical labels and categorical features.
      Real-valued features will be treated as categorical for each distinct value.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :return:
      DataFrame containing the test result for every feature against the label.
      This DataFrame will contain a single Row with the following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[Int]`
      - `statistics: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ChiSquareTest
    >>> dataset = [[0, Vectors.dense([0, 0, 1])],
    ...            [0, Vectors.dense([1, 0, 1])],
    ...            [1, Vectors.dense([2, 1, 1])],
    ...            [1, Vectors.dense([3, 1, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
    >>> chiSqResult.select("degreesOfFreedom").collect()[0]
    Row(degreesOfFreedom=[3, 1, 0])
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
    return _java2py(sc, javaTestObj.test(*args))
def test(dataset, sampleCol, distName, *params):
    """
    Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
    equality. Currently supports the normal distribution, taking as parameters the mean and
    standard deviation.

    :param dataset:
      a Dataset or a DataFrame containing the sample of data to test.
    :param sampleCol:
      Name of sample column in dataset, of any numerical type.
    :param distName:
      a `string` name for a theoretical distribution; currently only "norm" is supported.
    :param params:
      a list of `Double` values specifying the parameters to be used for
      the theoretical distribution. For the "norm" distribution, the parameters
      include mean and standard deviation.
    :return:
      A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
      This DataFrame will contain a single Row with the following fields:
      - `pValue: Double`
      - `statistic: Double`

    >>> from pyspark.ml.stat import KolmogorovSmirnovTest
    >>> dataset = [[-1.0], [0.0], [1.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    >>> dataset = [[2.0], [3.0], [4.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
    dataset = _py2java(sc, dataset)
    # The JVM API expects the distribution parameters as a Scala Seq of doubles.
    params = [float(param) for param in params]
    return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName,
                                         _jvm().PythonUtils.toSeq(params)))
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform an F regression test using the dataset.

    :param dataset:
      DataFrame of continuous labels and continuous features.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the
      following fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `fValue: float`
      If flatten is False, this DataFrame will contain a single Row with the
      following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `fValues: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import FValueTest
    >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
    ...                                        0.51418671, 0.61632374, 0.96565515])],
    ...            [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
    ...                                        0.59784822, 0.12394819, 0.53783355])],
    ...            [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
    ...                                        0.40492506, 0.18957493, 0.5440016])],
    ...            [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
    ...                                        0.62102109, 0.05471483, 0.96449167])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> fValueResult = FValueTest.test(dataset, 'features', 'label')
    >>> row = fValueResult.select("fValues", "pValues").collect()
    >>> row[0].fValues
    DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539])
    >>> row[0].pValues
    DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838])
    >>> fValueResult = FValueTest.test(dataset, 'features', 'label', True)
    >>> row = fValueResult.orderBy("featureIndex").collect()
    >>> row[0].fValue
    3.7409548308350593
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.FValueTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform an ANOVA test using the dataset.

    :param dataset:
      DataFrame of categorical labels and continuous features.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the
      following fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `fValue: float`
      If flatten is False, this DataFrame will contain a single Row with the
      following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `fValues: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ANOVATest
    >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
    ...                                 0.51418671, 0.61632374, 0.96565515])],
    ...            [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
    ...                                 0.59784822, 0.12394819, 0.53783355])],
    ...            [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
    ...                                 0.40492506, 0.18957493, 0.5440016])],
    ...            [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
    ...                                 0.62102109, 0.05471483, 0.96449167])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> anovaResult = ANOVATest.test(dataset, 'features', 'label')
    >>> row = anovaResult.select("fValues", "pValues").collect()
    >>> row[0].fValues
    DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512])
    >>> row[0].pValues
    DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029])
    >>> anovaResult = ANOVATest.test(dataset, 'features', 'label', True)
    >>> row = anovaResult.orderBy("featureIndex").collect()
    >>> row[0].fValue
    4.026438671875297
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ANOVATest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def corr(dataset: DataFrame, column: str, method: str = "pearson") -> DataFrame:
    """
    Compute the correlation matrix for the given vector column of the dataset,
    using the specified method.

    .. versionadded:: 2.2.0

    Parameters
    ----------
    dataset : :py:class:`pyspark.sql.DataFrame`
        A DataFrame.
    column : str
        The name of the column of vectors for which the correlation coefficient needs
        to be computed. This must be a column of the dataset, and it must contain
        Vector objects.
    method : str, optional
        String specifying the method to use for computing correlation.
        Supported: `pearson` (default), `spearman`.

    Returns
    -------
    A DataFrame that contains the correlation matrix of the column of vectors. This
    DataFrame contains a single row and a single column of name `METHODNAME(COLUMN)`.

    Examples
    --------
    >>> from pyspark.ml.linalg import DenseMatrix, Vectors
    >>> from pyspark.ml.stat import Correlation
    >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
    ...            [Vectors.dense([4, 5, 0, 3])],
    ...            [Vectors.dense([6, 7, 0, 8])],
    ...            [Vectors.dense([9, 0, 0, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ['features'])
    >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
    >>> print(str(pearsonCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                 [ 0.0556...,  1.        ,         NaN,  0.9135...],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4004...,  0.9135...,         NaN,  1.        ]])
    >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
    >>> print(str(spearmanCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                 [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
    """
    sc = SparkContext._active_spark_context
    assert sc is not None
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform Pearson's independence test using the dataset.

    :param dataset:
      DataFrame of categorical labels and categorical features.
      Real-valued features will be treated as categorical for each distinct value.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the
      following fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `statistic: float`
      If flatten is False, this DataFrame will contain a single Row with the
      following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `statistics: Vector`
      Each of these fields has one value per feature.

    .. versionchanged:: 3.1.0
       Added optional ``flatten`` argument.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ChiSquareTest
    >>> dataset = [[0, Vectors.dense([0, 0, 1])],
    ...            [0, Vectors.dense([1, 0, 1])],
    ...            [1, Vectors.dense([2, 1, 1])],
    ...            [1, Vectors.dense([3, 1, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
    >>> chiSqResult.select("degreesOfFreedom").collect()[0]
    Row(degreesOfFreedom=[3, 1, 0])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label', True)
    >>> row = chiSqResult.orderBy("featureIndex").collect()
    >>> row[0].statistic
    4.0
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def corr(dataset, column, method="pearson"):
    """
    Compute the correlation matrix for the given vector column of the dataset,
    using the specified method.

    :param dataset:
      A Dataset or a DataFrame.
    :param column:
      The name of the column of vectors for which the correlation coefficient needs
      to be computed. This must be a column of the dataset, and it must contain
      Vector objects.
    :param method:
      String specifying the method to use for computing correlation.
      Supported: `pearson` (default), `spearman`.
    :return:
      A DataFrame that contains the correlation matrix of the column of vectors. This
      DataFrame contains a single row and a single column of name '$METHODNAME($COLUMN)'.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import Correlation
    >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
    ...            [Vectors.dense([4, 5, 0, 3])],
    ...            [Vectors.dense([6, 7, 0, 8])],
    ...            [Vectors.dense([9, 0, 0, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ['features'])
    >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
    >>> print(str(pearsonCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                 [ 0.0556...,  1.        ,         NaN,  0.9135...],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4004...,  0.9135...,         NaN,  1.        ]])
    >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
    >>> print(str(spearmanCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                 [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
    """
    sc = SparkContext._active_spark_context
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def _load_java_obj(cls, java_class):
    """Load the peer Java object of the ML instance."""
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    return java_obj
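
# A minimal sketch (an addition, not part of the original module) showing that
# `_load_java_obj` resolves a dotted class name the same way as the chained
# attribute access used in the wrappers above. `_demo_load_java_obj` is a
# hypothetical helper added purely for illustration; `cls` is unused by
# `_load_java_obj`, so any value may be passed for it.
def _demo_load_java_obj():
    # Walking the dotted name attribute-by-attribute...
    walked = _load_java_obj(None, "org.apache.spark.ml.stat.ChiSquareTest")
    # ...yields the same JVM class handle as writing the chain out directly.
    chained = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    return walked, chained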
def dataframeToLocalFiles(df, localPath):
    """Invoke the JVM helper `xuwch.sparkmpi.demo1.Util.dataframeToLocalFile` on
    `df` and `localPath`, and wrap the Java DataFrame it returns back into a
    Python :py:class:`DataFrame`."""
    javaMethod = _jvm().xuwch.sparkmpi.demo1.Util.dataframeToLocalFile
    # Pass the underlying Java DataFrame; the helper returns another Java DataFrame.
    jdf = javaMethod(df._jdf, localPath)
    return DataFrame(jdf, df.sql_ctx)
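
# A hedged usage sketch for `dataframeToLocalFiles`, kept as comments because it
# requires the custom `xuwch.sparkmpi.demo1.Util` class to be on the driver's
# classpath. The `spark` session and the output path are assumptions made for
# illustration only:
#
#     df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
#     out = dataframeToLocalFiles(df, "/tmp/sparkmpi-demo")  # hypothetical path
#     out.show()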