def corr(x, y=None, method=None):
    """
    Compute the correlation (matrix) for the input RDD(s) using the
    specified method.
    Methods currently supported: I{pearson (default), spearman}.

    If a single RDD of Vectors is passed in, a correlation matrix
    comparing the columns in the input RDD is returned. Use C{method=}
    to specify the method to be used for single RDD input.
    If two RDDs of floats are passed in, a single float is returned.

    :param x: an RDD of Vectors (when C{y} is omitted) or an RDD of
              floats (when C{y} is given).
    :param y: optional RDD of floats to correlate with C{x}. Must not be
              a string; the method name is never accepted in this slot.
    :param method: name of the correlation method, forwarded unchanged
                   to the backend.
    :return: the backend result converted with C{toArray()} when C{y}
             is omitted, otherwise the backend result as-is.
    :raises TypeError: if C{y} is a string.

    >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
    >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
    >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
    >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
    True
    >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
    True
    >>> Statistics.corr(x, y, "spearman")
    0.5
    >>> from math import isnan
    >>> isnan(Statistics.corr(x, zeros))
    True
    >>> from pyspark.mllib.linalg import Vectors
    >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
    ...                       Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])])
    >>> pearsonCorr = Statistics.corr(rdd)
    >>> print str(pearsonCorr).replace('nan', 'NaN')
    [[ 1.          0.05564149         NaN  0.40047142]
     [ 0.05564149  1.                 NaN  0.91359586]
     [        NaN         NaN  1.                 NaN]
     [ 0.40047142  0.91359586         NaN  1.        ]]
    >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
    >>> print str(spearmanCorr).replace('nan', 'NaN')
    [[ 1.          0.10540926         NaN  0.4       ]
     [ 0.10540926  1.                 NaN  0.9486833 ]
     [        NaN         NaN  1.                 NaN]
     [ 0.4         0.9486833          NaN  1.        ]]
    >>> try:
    ...     Statistics.corr(rdd, "spearman")
    ...     print "Method name as second argument without 'method=' shouldn't be allowed."
    ... except TypeError:
    ...     pass
    """
    # The second positional slot belongs to ``y``. A string there means the
    # caller tried to pass the method name positionally, which is rejected so
    # that it is never silently treated as data. isinstance() also catches
    # str subclasses, which an exact type comparison would let through.
    if isinstance(y, str):
        raise TypeError("Use 'method=' to specify method name.")

    # Decide between matrix and scalar output on whether ``y`` was supplied,
    # using an explicit None test rather than the truthiness of the argument.
    if y is None:
        return callMLlibFunc("corr", x.map(_convert_to_vector), method).toArray()
    return callMLlibFunc("corr", x.map(float), y.map(float), method)
def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604):
    """
    Run the bisecting k-means algorithm and return the resulting model.

    :param rdd:
      Training points as an `RDD` of `Vector` or convertible
      sequence types.
    :param k:
      The desired number of leaf clusters. The actual number could
      be smaller if there are no divisible leaf clusters.
      (default: 4)
    :param maxIterations:
      Maximum number of iterations allowed to split clusters.
      (default: 20)
    :param minDivisibleClusterSize:
      Minimum number of points (if >= 1.0) or the minimum proportion
      of points (if < 1.0) of a divisible cluster.
      (default: 1)
    :param seed:
      Random seed value for cluster initialization.
      (default: -1888008604 from classOf[BisectingKMeans].getName.##)
    :return: a `BisectingKMeansModel` wrapping the trained Java model.
    """
    # Every element is converted to a Vector before crossing into the JVM.
    points = rdd.map(_convert_to_vector)
    trained = callMLlibFunc(
        "trainBisectingKMeans",
        points,
        k,
        maxIterations,
        minDivisibleClusterSize,
        seed,
    )
    return BisectingKMeansModel(trained)
def update(self, data, decayFactor, timeUnit):
    """
    Update the centroids according to a new batch of data.

    The model is modified in place: `self.centers` and
    `self._clusterWeights` are replaced with the values returned by the
    backend.

    :param data:
      RDD with new data for the model update.
    :param decayFactor:
      Forgetfulness of the previous centroids. Converted with `float()`.
    :param timeUnit:
      Can be "batches" or "points". If points, then the decay factor
      is raised to the power of number of new points and if batches,
      then decay factor will be used as is.
    :return: this model, to allow call chaining.
    :raises TypeError: if `data` is not an RDD.
    :raises ValueError: if `timeUnit` is neither "batches" nor "points".
    """
    if not isinstance(data, RDD):
        raise TypeError("Data should be of an RDD, got %s." % type(data))
    points = data.map(_convert_to_vector)
    decay = float(decayFactor)

    if timeUnit not in ("batches", "points"):
        raise ValueError(
            "timeUnit should be 'batches' or 'points', got %s." % timeUnit)

    # The current centers are sent as Vectors alongside their weights.
    current_centers = []
    for center in self.centers:
        current_centers.append(_convert_to_vector(center))

    result = callMLlibFunc(
        "updateStreamingKMeansModel",
        current_centers,
        self._clusterWeights,
        points,
        decay,
        timeUnit,
    )
    # The backend result is indexed as (centers, weights).
    self.centers = array(result[0])
    self._clusterWeights = list(result[1])
    return self
def colStats(rdd):
    """
    Compute column-wise summary statistics for the input RDD[Vector].

    :param rdd: an RDD[Vector] for which column-wise summary statistics
                are to be computed.
    :return: :class:`MultivariateStatisticalSummary` object containing
             column-wise summary statistics.

    >>> from pyspark.mllib.linalg import Vectors
    >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
    ...                       Vectors.dense([4, 5, 0,  3]),
    ...                       Vectors.dense([6, 7, 0,  8])])
    >>> cStats = Statistics.colStats(rdd)
    >>> cStats.mean()
    array([ 4.,  4.,  0.,  3.])
    >>> cStats.variance()
    array([  4.,  13.,   0.,  25.])
    >>> cStats.count()
    3L
    >>> cStats.numNonzeros()
    array([ 3.,  2.,  0.,  3.])
    >>> cStats.max()
    array([ 6.,  7.,  0.,  8.])
    >>> cStats.min()
    array([ 2.,  0.,  0., -2.])
    """
    vectors = rdd.map(_convert_to_vector)
    java_summary = callMLlibFunc("colStats", vectors)
    return MultivariateStatisticalSummary(java_summary)
def __init__(self, rows, numRows=0, numCols=0):
    """
    Note: This docstring is not shown publicly.

    Wrap a Java RowMatrix.

    Publicly, `rows` is required to be an RDD. For internal usage,
    `rows` may instead be a Java RowMatrix object, which is then wrapped
    directly. This assists in clean matrix conversions.

    :param rows: an RDD whose elements are convertible to vectors, or a
                 Java RowMatrix object.
    :param numRows: row count forwarded to the backend as a long.
    :param numCols: column count forwarded to the backend as an int.
    :raises TypeError: if `rows` is neither an RDD nor a Java RowMatrix.

    >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
    >>> mat = RowMatrix(rows)

    >>> mat_diff = RowMatrix(rows)
    >>> (mat_diff._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    False

    >>> mat_same = RowMatrix(mat._java_matrix_wrapper._java_model)
    >>> (mat_same._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    True
    """
    if isinstance(rows, RDD):
        vectors = rows.map(_convert_to_vector)
        wrapped = callMLlibFunc("createRowMatrix", vectors, long(numRows), int(numCols))
    else:
        # Internal path: accept an already-built Java RowMatrix as-is.
        is_java_row_matrix = (
            isinstance(rows, JavaObject)
            and rows.getClass().getSimpleName() == "RowMatrix"
        )
        if not is_java_row_matrix:
            raise TypeError("rows should be an RDD of vectors, got %s" % type(rows))
        wrapped = rows

    self._java_matrix_wrapper = JavaModelWrapper(wrapped)
def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of i.i.d. samples from the Gamma
    distribution with the input shape and scale.

    :param sc: SparkContext used to create the RDD.
    :param shape: shape (> 0) parameter for the Gamma distribution
    :param scale: scale (> 0) parameter for the Gamma distribution
    :param size: Size of the RDD.
    :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
    :param seed: Random seed (default: a random long integer).
    :return: RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale).

    >>> from math import sqrt
    >>> shape = 1.0
    >>> scale = 2.0
    >>> expMean = shape * scale
    >>> expStd = sqrt(shape * scale * scale)
    >>> x = RandomRDDs.gammaRDD(sc, shape, scale, 1000, seed=2)
    >>> stats = x.stats()
    >>> stats.count()
    1000
    >>> abs(stats.mean() - expMean) < 0.5
    True
    >>> abs(stats.stdev() - expStd) < 0.5
    True
    """
    java_context = sc._jsc
    # Both distribution parameters are coerced to float before the call.
    shape_value = float(shape)
    scale_value = float(scale)
    return callMLlibFunc(
        "gammaRDD", java_context, shape_value, scale_value, size, numPartitions, seed
    )
def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Load labeled points saved using RDD.saveAsTextFile.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file
                 system URI
    :param minPartitions: min number of partitions; when falsy,
                          `min(sc.defaultParallelism, 2)` is used
    :return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
    ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    >>> type(loaded[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.1,(3,[0,2],[-1.23,4.56e-07]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (0.0,[1.01,2.02,3.03])
    """
    # Any falsy value (None, 0) falls back to the computed default.
    if not minPartitions:
        minPartitions = min(sc.defaultParallelism, 2)
    return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions)
def train(cls, rdd, k, maxIterations=100, initMode="random"):
    r"""
    Train a power iteration clustering (PIC) model.

    :param rdd:
      An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
      affinity matrix, which is the matrix A in the PIC paper. The
      similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric
      matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\. For any (i, j) with
      nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or
      (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored,
      because it is assumed s\ :sub:`ij`\ = 0.0.
    :param k:
      Number of clusters.
    :param maxIterations:
      Maximum number of iterations of the PIC algorithm.
      (default: 100)
    :param initMode:
      Initialization mode. This can be either "random" to use
      a random vector as vertex properties, or "degree" to use
      normalized sum similarities.
      (default: "random")
    :return: a `PowerIterationClusteringModel` wrapping the trained
             Java model.
    """
    affinities = rdd.map(_convert_to_vector)
    # The two counts are coerced to int before the call.
    num_clusters = int(k)
    iteration_cap = int(maxIterations)
    java_model = callMLlibFunc(
        "trainPowerIterationClusteringModel",
        affinities,
        num_clusters,
        iteration_cap,
        initMode,
    )
    return PowerIterationClusteringModel(java_model)
def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,
          seed=None):
    """
    Train a matrix factorization model given an RDD of ratings by users
    for a subset of products. The ratings matrix is approximated as the
    product of two lower-rank matrices of a given rank (number of
    features). To solve for these features, ALS is run iteratively with
    a configurable level of parallelism.

    :param ratings:
      RDD of `Rating` or (userID, productID, rating) tuple.
    :param rank:
      Rank of the feature matrices computed (number of features).
    :param iterations:
      Number of iterations of ALS.
      (default: 5)
    :param lambda_:
      Regularization parameter.
      (default: 0.01)
    :param blocks:
      Number of blocks used to parallelize the computation. A value
      of -1 will use an auto-configured number of blocks.
      (default: -1)
    :param nonnegative:
      A value of True will solve least-squares with nonnegativity
      constraints.
      (default: False)
    :param seed:
      Random seed for initial matrix factorization model. A value
      of None will use system time as the seed.
      (default: None)
    :return: a `MatrixFactorizationModel` wrapping the trained Java model.
    """
    prepared_ratings = cls._prepare(ratings)
    java_model = callMLlibFunc(
        "trainALSModel",
        prepared_ratings,
        rank,
        iterations,
        lambda_,
        blocks,
        nonnegative,
        seed,
    )
    return MatrixFactorizationModel(java_model)
def fit(self, data):
    """
    Compute a `PCAModel` that contains the principal components of
    the input vectors.

    :param data: source vectors
    :return: a `PCAModel` wrapping the fitted Java model.
    """
    # self.k is forwarded to the backend ahead of the data.
    fitted = callMLlibFunc("fitPCA", self.k, data)
    return PCAModel(fitted)
def train(
    cls,
    rdd,
    k,
    maxIterations=100,
    runs=1,
    initializationMode="k-means||",
    seed=None,
    initializationSteps=5,
    epsilon=1e-4,
    initialModel=None,
):
    """
    Train a k-means clustering model.

    :param rdd: training points; each element is converted to a vector.
    :param k: forwarded to the backend trainer (NOTE(review): presumably
              the number of clusters -- confirm against the JVM API).
    :param maxIterations: forwarded to the backend trainer. (default: 100)
    :param runs: forwarded to the backend trainer. (default: 1)
    :param initializationMode: forwarded to the backend trainer.
                               (default: "k-means||")
    :param seed: forwarded to the backend trainer. (default: None)
    :param initializationSteps: forwarded to the backend trainer.
                                (default: 5)
    :param epsilon: forwarded to the backend trainer. (default: 1e-4)
    :param initialModel: optional `KMeansModel` whose `clusterCenters`
                         are sent to the backend; when None an empty
                         list is sent instead. (default: None)
    :return: a `KMeansModel` built from the trained cluster centers.
    :raises TypeError: if `initialModel` is given but is not a
                       `KMeansModel`.
    """
    clusterInitialModel = []
    if initialModel is not None:
        if not isinstance(initialModel, KMeansModel):
            # A wrong argument type is a TypeError. It remains catchable by
            # callers that previously caught the generic Exception.
            raise TypeError(
                "initialModel is of " + str(type(initialModel)) + ". It needs "
                "to be of <type 'KMeansModel'>"
            )
        clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]

    model = callMLlibFunc(
        "trainKMeansModel",
        rdd.map(_convert_to_vector),
        k,
        maxIterations,
        runs,
        initializationMode,
        seed,
        initializationSteps,
        epsilon,
        clusterInitialModel,
    )
    # The centers are fetched from the Java model and converted to arrays
    # for the Python-side model.
    centers = callJavaFunc(rdd.context, model.clusterCenters)
    return KMeansModel([c.toArray() for c in centers])
def __init__(self, predictionAndLabels):
    """
    Build the ranking metrics wrapper from `predictionAndLabels`.

    The input is turned into a DataFrame (with a schema inferred from the
    input itself) and its Java DataFrame is handed to the backend.

    :param predictionAndLabels: an object exposing a `ctx` attribute that
        `SQLContext.getOrCreate` accepts (NOTE(review): presumably an RDD
        of (prediction, labels) pairs -- confirm against callers).
    """
    sql_context = SQLContext.getOrCreate(predictionAndLabels.ctx)
    inferred_schema = sql_context._inferSchema(predictionAndLabels)
    frame = sql_context.createDataFrame(predictionAndLabels, schema=inferred_schema)
    java_metrics = callMLlibFunc("newRankingMetrics", frame._jdf)
    super(RankingMetrics, self).__init__(java_metrics)
def train(
    cls,
    rdd,
    k,
    maxIterations=100,
    runs=1,
    initializationMode="k-means||",
    seed=None,
    initializationSteps=5,
    epsilon=1e-4,
):
    """
    Train a k-means clustering model.

    All arguments after `rdd` are forwarded unchanged to the backend
    trainer; the elements of `rdd` are converted to vectors first.

    :return: a `KMeansModel` built from the trained cluster centers.
    """
    vectors = rdd.map(_convert_to_vector)
    java_model = callMLlibFunc(
        "trainKMeansModel",
        vectors,
        k,
        maxIterations,
        runs,
        initializationMode,
        seed,
        initializationSteps,
        epsilon,
    )
    java_centers = callJavaFunc(rdd.context, java_model.clusterCenters)

    # Each center is converted to an array for the Python-side model.
    center_arrays = []
    for center in java_centers:
        center_arrays.append(center.toArray())
    return KMeansModel(center_arrays)
def uniformRDD(sc, size, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of i.i.d. samples from the
    uniform distribution U(0.0, 1.0).

    To transform the distribution in the generated RDD from U(0.0, 1.0)
    to U(a, b), use
    C{RandomRDDs.uniformRDD(sc, n, p, seed).map(lambda v: a + (b - a) * v)}

    :param sc: SparkContext used to create the RDD.
    :param size: Size of the RDD.
    :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
    :param seed: Random seed (default: a random long integer).
    :return: RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`.

    >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
    >>> len(x)
    100
    >>> max(x) <= 1.0 and min(x) >= 0.0
    True
    >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
    4
    >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
    >>> parts == sc.defaultParallelism
    True
    """
    java_context = sc._jsc
    return callMLlibFunc("uniformRDD", java_context, size, numPartitions, seed)
def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None):
    """
    Train a Gaussian Mixture clustering model.

    :param rdd: training points; each element is converted to a vector.
    :param k: number of mixture components; exactly `k` Gaussians are
              built from the backend result.
    :param convergenceTol: forwarded to the backend trainer. (default: 1e-3)
    :param maxIterations: forwarded to the backend trainer. (default: 100)
    :param seed: forwarded to the backend trainer. (default: None)
    :return: a `GaussianMixtureModel` holding the weights and one
             `MultivariateGaussian` per component.
    """
    vectors = rdd.map(_convert_to_vector)
    weight, mu, sigma = callMLlibFunc(
        "trainGaussianMixture", vectors, k, convergenceTol, maxIterations, seed
    )

    # Pair the i-th mean with the i-th covariance for each of the k components.
    gaussians = []
    for i in range(k):
        gaussians.append(MultivariateGaussian(mu[i], sigma[i]))
    return GaussianMixtureModel(weight, gaussians)
def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000):
    """
    Find the complete set of frequent sequential patterns in the
    input sequences of itemsets.

    :param data:
      The input data set, each element contains a sequence of
      itemsets.
    :param minSupport:
      The minimal support level of the sequential pattern, any
      pattern that appears more than (minSupport *
      size-of-the-dataset) times will be output.
      (default: 0.1)
    :param maxPatternLength:
      The maximal length of the sequential pattern.
      (default: 10)
    :param maxLocalProjDBSize:
      The maximum number of items (including delimiters used in the
      internal storage format) allowed in a projected database before
      local processing. If a projected database exceeds this size,
      another iteration of distributed prefix growth is run.
      (default: 32000000)
    :return: a `PrefixSpanModel` wrapping the trained Java model.
    """
    java_model = callMLlibFunc(
        "trainPrefixSpanModel",
        data,
        minSupport,
        maxPatternLength,
        maxLocalProjDBSize,
    )
    return PrefixSpanModel(java_model)
def normalRDD(sc, size, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of i.i.d. samples from the standard normal
    distribution.

    To transform the distribution in the generated RDD from standard normal
    to some other normal N(mean, sigma^2), use
    C{RandomRDDs.normalRDD(sc, n, p, seed).map(lambda v: mean + sigma * v)}

    :param sc: SparkContext used to create the RDD.
    :param size: Size of the RDD.
    :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
    :param seed: Random seed (default: a random long integer).
    :return: RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0).

    >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1)
    >>> stats = x.stats()
    >>> stats.count()
    1000
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    java_context = sc._jsc
    return callMLlibFunc("normalRDD", java_context, size, numPartitions, seed)
def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of i.i.d. samples from the log normal
    distribution with the input mean and standard deviation.

    :param sc: SparkContext used to create the RDD.
    :param mean: mean for the log Normal distribution
    :param std: std for the log Normal distribution
    :param size: Size of the RDD.
    :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
    :param seed: Random seed (default: a random long integer).
    :return: RDD of float comprised of i.i.d. samples ~ log N(mean, std).

    >>> from math import sqrt, exp
    >>> mean = 0.0
    >>> std = 1.0
    >>> expMean = exp(mean + 0.5 * std * std)
    >>> expStd = sqrt((exp(std * std) - 1.0) * exp(2.0 * mean + std * std))
    >>> x = RandomRDDs.logNormalRDD(sc, mean, std, 1000, seed=2)
    >>> stats = x.stats()
    >>> stats.count()
    1000
    >>> abs(stats.mean() - expMean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(stats.stdev() - expStd) < 0.5
    True
    """
    java_context = sc._jsc
    # Both distribution parameters are coerced to float before the call.
    mean_value = float(mean)
    std_value = float(std)
    return callMLlibFunc(
        "logNormalRDD", java_context, mean_value, std_value, size, numPartitions, seed
    )
def _train(
    cls,
    data,
    algo,
    numClasses,
    categoricalFeaturesInfo,
    numTrees,
    featureSubsetStrategy,
    impurity,
    maxDepth,
    maxBins,
    seed,
):
    """
    Validate the inputs and train a random forest through the backend.

    The first element of `data` must be a `LabeledPoint`, and
    `featureSubsetStrategy` must be one of
    `cls.supportedFeatureSubsetStrategies`. When `seed` is None a random
    integer in [0, 2**30] is drawn. All other arguments are forwarded
    unchanged.

    :return: a `RandomForestModel` wrapping the trained Java model.
    :raises ValueError: if `featureSubsetStrategy` is not supported.
    """
    sample = data.first()
    assert isinstance(sample, LabeledPoint), "the data should be RDD of LabeledPoint"

    supported = cls.supportedFeatureSubsetStrategies
    if featureSubsetStrategy not in supported:
        raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)

    # Draw a seed only when the caller did not provide one.
    effective_seed = random.randint(0, 1 << 30) if seed is None else seed

    java_model = callMLlibFunc(
        "trainRandomForestModel",
        data,
        algo,
        numClasses,
        categoricalFeaturesInfo,
        numTrees,
        featureSubsetStrategy,
        impurity,
        maxDepth,
        maxBins,
        effective_seed,
    )
    return RandomForestModel(java_model)
def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of vectors containing i.i.d. samples drawn
    from the log normal distribution.

    :param sc: SparkContext used to create the RDD.
    :param mean: Mean of the log normal distribution
    :param std: Standard Deviation of the log normal distribution
    :param numRows: Number of Vectors in the RDD.
    :param numCols: Number of elements in each Vector.
    :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
    :param seed: Random seed (default: a random long integer).
    :return: RDD of Vector with vectors containing i.i.d. samples ~ log `N(mean, std)`.

    >>> import numpy as np
    >>> from math import sqrt, exp
    >>> mean = 0.0
    >>> std = 1.0
    >>> expMean = exp(mean + 0.5 * std * std)
    >>> expStd = sqrt((exp(std * std) - 1.0) * exp(2.0 * mean + std * std))
    >>> m = RandomRDDs.logNormalVectorRDD(sc, mean, std, 100, 100, seed=1).collect()
    >>> mat = np.matrix(m)
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - expMean) < 0.1
    True
    >>> abs(mat.std() - expStd) < 0.1
    True
    """
    java_context = sc._jsc
    # Both distribution parameters are coerced to float before the call.
    mean_value = float(mean)
    std_value = float(std)
    return callMLlibFunc(
        "logNormalVectorRDD",
        java_context,
        mean_value,
        std_value,
        numRows,
        numCols,
        numPartitions,
        seed,
    )
def _train(
    cls,
    data,
    type,
    numClasses,
    features,
    impurity="gini",
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
):
    """
    Check that `data` holds `LabeledPoint` elements and train a decision
    tree through the backend.

    Only the first element of `data` is inspected. Every argument is
    forwarded to the backend trainer unchanged.

    :return: a `DecisionTreeModel` wrapping the trained Java model.
    """
    sample = data.first()
    assert isinstance(sample, LabeledPoint), "the data should be RDD of LabeledPoint"

    java_model = callMLlibFunc(
        "trainDecisionTreeModel",
        data,
        type,
        numClasses,
        features,
        impurity,
        maxDepth,
        maxBins,
        minInstancesPerNode,
        minInfoGain,
    )
    return DecisionTreeModel(java_model)
def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of vectors containing i.i.d. samples drawn
    from the Exponential distribution with the input mean.

    :param sc: SparkContext used to create the RDD.
    :param mean: Mean, or 1 / lambda, for the Exponential distribution.
    :param numRows: Number of Vectors in the RDD.
    :param numCols: Number of elements in each Vector.
    :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`)
    :param seed: Random seed (default: a random long integer).
    :return: RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean).

    >>> import numpy as np
    >>> mean = 0.5
    >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
    >>> mat = np.mat(rdd.collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(mat.std() - sqrt(mean)) < 0.5
    True
    """
    java_context = sc._jsc
    # The mean is coerced to float before the call.
    mean_value = float(mean)
    return callMLlibFunc(
        "exponentialVectorRDD",
        java_context,
        mean_value,
        numRows,
        numCols,
        numPartitions,
        seed,
    )
def gammaVectorRDD(sc, shape, scale, numRows, numCols, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of vectors containing i.i.d. samples drawn
    from the Gamma distribution.

    :param sc: SparkContext used to create the RDD.
    :param shape: Shape (> 0) of the Gamma distribution
    :param scale: Scale (> 0) of the Gamma distribution
    :param numRows: Number of Vectors in the RDD.
    :param numCols: Number of elements in each Vector.
    :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
    :param seed: Random seed (default: a random long integer).
    :return: RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale).

    >>> import numpy as np
    >>> from math import sqrt
    >>> shape = 1.0
    >>> scale = 2.0
    >>> expMean = shape * scale
    >>> expStd = sqrt(shape * scale * scale)
    >>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - expMean) < 0.1
    True
    >>> abs(mat.std() - expStd) < 0.1
    True
    """
    java_context = sc._jsc
    # Both distribution parameters are coerced to float before the call.
    shape_value = float(shape)
    scale_value = float(scale)
    return callMLlibFunc(
        "gammaVectorRDD",
        java_context,
        shape_value,
        scale_value,
        numRows,
        numCols,
        numPartitions,
        seed,
    )
def _train(cls, data, algo, categoricalFeaturesInfo, loss, numIterations, learningRate,
           maxDepth, maxBins):
    """
    Check that `data` holds `LabeledPoint` elements and train a
    gradient-boosted trees model through the backend.

    Only the first element of `data` is inspected. Every argument is
    forwarded to the backend trainer unchanged.

    :return: a `GradientBoostedTreesModel` wrapping the trained Java model.
    """
    sample = data.first()
    assert isinstance(sample, LabeledPoint), "the data should be RDD of LabeledPoint"

    java_model = callMLlibFunc(
        "trainGradientBoostedTreesModel",
        data,
        algo,
        categoricalFeaturesInfo,
        loss,
        numIterations,
        learningRate,
        maxDepth,
        maxBins,
    )
    return GradientBoostedTreesModel(java_model)
def trainImplicit(
    cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None
):
    """
    Train a matrix factorization model through the backend's implicit
    ALS trainer.

    `ratings` is first passed through `cls._prepare`; every other
    argument is forwarded to the backend unchanged.

    :return: a `MatrixFactorizationModel` wrapping the trained Java model.
    """
    prepared_ratings = cls._prepare(ratings)
    java_model = callMLlibFunc(
        "trainImplicitALSModel",
        prepared_ratings,
        rank,
        iterations,
        lambda_,
        blocks,
        alpha,
        nonnegative,
        seed,
    )
    return MatrixFactorizationModel(java_model)
def fit(self, dataset):
    """
    Compute the inverse document frequency.

    :param dataset: an RDD of term frequency vectors
    :return: an `IDFModel` wrapping the fitted Java model.
    """
    # self.minDocFreq is forwarded to the backend ahead of the data.
    fitted = callMLlibFunc("fitIDF", self.minDocFreq, dataset)
    return IDFModel(fitted)
def computeCost(self, rdd):
    """
    Return the K-means cost (sum of squared distances of points to
    their nearest center) for this model on the given data.

    :param rdd: the points to evaluate; each element is converted to
                a vector.
    :return: the cost value computed by the backend.
    """
    points = rdd.map(_convert_to_vector)

    # This model's centers are sent as vectors alongside the points.
    center_vectors = []
    for center in self.centers:
        center_vectors.append(_convert_to_vector(center))

    return callMLlibFunc("computeCostKmeansModel", points, center_vectors)
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"):
    """
    Train a k-means clustering model.

    The elements of `rdd` are converted to vectors and handed to the
    backend as a cached Java object RDD; the remaining arguments are
    forwarded unchanged.

    :return: a `KMeansModel` built from the trained cluster centers.
    """
    vectors = rdd.map(_convert_to_vector)
    # cache serialized data to avoid objects over head in JVM
    cached_vectors = _to_java_object_rdd(vectors, cache=True)

    java_model = callMLlibFunc(
        "trainKMeansModel", cached_vectors, k, maxIterations, runs, initializationMode
    )
    java_centers = callJavaFunc(rdd.context, java_model.clusterCenters)

    center_arrays = []
    for center in java_centers:
        center_arrays.append(center.toArray())
    return KMeansModel(center_arrays)
def generateLinearRDD(sc, nexamples, nfeatures, eps, nParts=2, intercept=0.0):
    """
    Generate a RDD of LabeledPoints.

    Each numeric argument is coerced before the backend call:
    `nexamples`, `nfeatures` and `nParts` to int, `eps` and `intercept`
    to float.

    :param sc: forwarded to the backend as-is.
    :return: the RDD produced by the backend.
    """
    example_count = int(nexamples)
    feature_count = int(nfeatures)
    epsilon = float(eps)
    partition_count = int(nParts)
    intercept_value = float(intercept)
    return callMLlibFunc(
        "generateLinearRDDWrapper",
        sc,
        example_count,
        feature_count,
        epsilon,
        partition_count,
        intercept_value,
    )
def kolmogorovSmirnovTest(data, distName="norm", *params):
    """
    .. note:: Experimental

    Perform the Kolmogorov-Smirnov (KS) test for data sampled from
    a continuous distribution. It tests the null hypothesis that
    the data is generated from a particular distribution.

    The given data is sorted and the Empirical Cumulative
    Distribution Function (ECDF) is calculated which for a given
    point is the number of points having a CDF value lesser than it
    divided by the total number of points.

    Since the data is sorted, this is a step function that rises by
    (1 / length of data) for every ordered point.

    The KS statistic gives us the maximum distance between the
    ECDF and the CDF. Intuitively if this statistic is large, the
    probability that the null hypothesis is true becomes small.
    For specific details of the implementation, please have a look
    at the Scala documentation.

    :param data: RDD, samples from the data
    :param distName: string, currently only "norm" is supported.
                     (Normal distribution) to calculate the
                     theoretical distribution of the data.
    :param params: additional values which need to be provided for
                   a certain distribution.
                   If not provided, the default values are used.
    :return: KolmogorovSmirnovTestResult object containing the test
             statistic, degrees of freedom, p-value,
             the method used, and the null hypothesis.
    :raises TypeError: if `data` is not an RDD or `distName` is not
                       a string.

    >>> kstest = Statistics.kolmogorovSmirnovTest
    >>> data = sc.parallelize([-1.0, 0.0, 1.0])
    >>> ksmodel = kstest(data, "norm")
    >>> print(round(ksmodel.pValue, 3))
    1.0
    >>> print(round(ksmodel.statistic, 3))
    0.175
    >>> ksmodel.nullHypothesis
    u'Sample follows theoretical distribution'

    >>> data = sc.parallelize([2.0, 3.0, 4.0])
    >>> ksmodel = kstest(data, "norm", 3.0, 1.0)
    >>> print(round(ksmodel.pValue, 3))
    1.0
    >>> print(round(ksmodel.statistic, 3))
    0.175
    """
    if not isinstance(data, RDD):
        raise TypeError("data should be an RDD, got %s." % type(data))
    if not isinstance(distName, basestring):
        raise TypeError("distName should be a string, got %s." % type(distName))

    # The extra distribution parameters are sent as a list of floats.
    distribution_args = []
    for value in params:
        distribution_args.append(float(value))

    java_result = callMLlibFunc("kolmogorovSmirnovTest", data, distName, distribution_args)
    return KolmogorovSmirnovTestResult(java_result)
def __init__(self, rows, numRows=0, numCols=0):
    """
    Note: This docstring is not shown publicly.

    Wrap a Java IndexedRowMatrix.

    Publicly, `rows` is required to be an RDD. For internal usage,
    `rows` may instead be a Java IndexedRowMatrix object, which is then
    wrapped directly. This assists in clean matrix conversions.

    :param rows: an RDD of IndexedRows or (long, vector) tuples, or a
                 Java IndexedRowMatrix object.
    :param numRows: row count forwarded to the backend as a long.
    :param numCols: column count forwarded to the backend as an int.
    :raises TypeError: if `rows` is neither an RDD nor a Java
                       IndexedRowMatrix.

    >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
    ...                        IndexedRow(1, [4, 5, 6])])
    >>> mat = IndexedRowMatrix(rows)

    >>> mat_diff = IndexedRowMatrix(rows)
    >>> (mat_diff._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    False

    >>> mat_same = IndexedRowMatrix(mat._java_matrix_wrapper._java_model)
    >>> (mat_same._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    True
    """
    if isinstance(rows, RDD):
        indexed_rows = rows.map(_convert_to_indexed_row)
        # We use DataFrames for serialization of IndexedRows from
        # Python, so first convert the RDD to a DataFrame on this
        # side. This will convert each IndexedRow to a Row
        # containing the 'index' and 'vector' values, which can
        # both be easily serialized. We will convert back to
        # IndexedRows on the Scala side.
        frame = indexed_rows.toDF()
        wrapped = callMLlibFunc("createIndexedRowMatrix", frame, long(numRows), int(numCols))
    else:
        # Internal path: accept an already-built Java IndexedRowMatrix as-is.
        is_java_indexed_row_matrix = (
            isinstance(rows, JavaObject)
            and rows.getClass().getSimpleName() == "IndexedRowMatrix"
        )
        if not is_java_indexed_row_matrix:
            raise TypeError(
                "rows should be an RDD of IndexedRows or (long, vector) tuples, "
                "got %s" % type(rows))
        wrapped = rows

    self._java_matrix_wrapper = JavaModelWrapper(wrapped)
def uniformRDD(
    sc: SparkContext,
    size: int,
    numPartitions: Optional[int] = None,
    seed: Optional[int] = None,
) -> RDD[float]:
    """
    Generate an RDD comprised of i.i.d. samples from the
    uniform distribution U(0.0, 1.0).

    To transform the distribution in the generated RDD from U(0.0, 1.0)
    to U(a, b), use
    ``RandomRDDs.uniformRDD(sc, n, p, seed).map(lambda v: a + (b - a) * v)``

    .. versionadded:: 1.1.0

    Parameters
    ----------
    sc : :py:class:`pyspark.SparkContext`
        used to create the RDD.
    size : int
        Size of the RDD.
    numPartitions : int, optional
        Number of partitions in the RDD (default: `sc.defaultParallelism`).
    seed : int, optional
        Random seed (default: a random long integer).

    Returns
    -------
    :py:class:`pyspark.RDD`
        RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`.

    Examples
    --------
    >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
    >>> len(x)
    100
    >>> max(x) <= 1.0 and min(x) >= 0.0
    True
    >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
    4
    >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
    >>> parts == sc.defaultParallelism
    True
    """
    java_context = sc._jsc
    return callMLlibFunc("uniformRDD", java_context, size, numPartitions, seed)
def normalVectorRDD(
    sc: SparkContext,
    numRows: int,
    numCols: int,
    numPartitions: Optional[int] = None,
    seed: Optional[int] = None,
) -> RDD[Vector]:
    """
    Generate an RDD comprised of vectors containing i.i.d. samples drawn
    from the standard normal distribution.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    sc : :py:class:`pyspark.SparkContext`
        SparkContext used to create the RDD.
    numRows : int
        Number of Vectors in the RDD.
    numCols : int
        Number of elements in each Vector.
    numPartitions : int, optional
        Number of partitions in the RDD (default: `sc.defaultParallelism`).
    seed : int, optional
        Random seed (default: a random long integer).

    Returns
    -------
    :py:class:`pyspark.RDD`
        RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`.

    Examples
    --------
    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - 0.0) < 0.1
    True
    >>> abs(mat.std() - 1.0) < 0.1
    True
    """
    java_context = sc._jsc
    return callMLlibFunc(
        "normalVectorRDD", java_context, numRows, numCols, numPartitions, seed
    )
def uniformVectorRDD(
    sc: SparkContext,
    numRows: int,
    numCols: int,
    numPartitions: Optional[int] = None,
    seed: Optional[int] = None,
) -> RDD[Vector]:
    """
    Generate an RDD comprised of vectors containing i.i.d. samples drawn
    from the uniform distribution U(0.0, 1.0).

    .. versionadded:: 1.1.0

    Parameters
    ----------
    sc : :py:class:`pyspark.SparkContext`
        SparkContext used to create the RDD.
    numRows : int
        Number of Vectors in the RDD.
    numCols : int
        Number of elements in each Vector.
    numPartitions : int, optional
        Number of partitions in the RDD.
    seed : int, optional
        Seed for the RNG that generates the seed for the generator in each partition.

    Returns
    -------
    :py:class:`pyspark.RDD`
        RDD of Vector with vectors containing i.i.d. samples ~ `U(0.0, 1.0)`.

    Examples
    --------
    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
    >>> mat.shape
    (10, 10)
    >>> mat.max() <= 1.0 and mat.min() >= 0.0
    True
    >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
    4
    """
    java_context = sc._jsc
    return callMLlibFunc(
        "uniformVectorRDD", java_context, numRows, numCols, numPartitions, seed
    )
def predictSoft(self, x):
    """
    Find the membership of point 'x' or each point in RDD 'x' to all
    mixture components.

    :param x: A feature vector or an RDD of vectors representing
              data points.
    :return: The membership value to all mixture components for
             vector 'x' or each vector in RDD 'x'.
    """
    # Single point: delegate to the wrapped model directly.
    if not isinstance(x, RDD):
        return self.call("predictSoft", _convert_to_vector(x)).toArray()

    # RDD input: send the mixture parameters along with the converted points.
    component_params = [(g.mu, g.sigma) for g in self.gaussians]
    means, sigmas = zip(*component_params)
    memberships = callMLlibFunc(
        "predictSoftGMM",
        x.map(_convert_to_vector),
        _convert_to_vector(self.weights),
        means,
        sigmas,
    )
    # Each result row becomes an array of doubles.
    return memberships.map(lambda row: pyarray.array('d', row))
def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of vectors containing i.i.d. samples drawn
    from the Poisson distribution with the input mean.

    :param sc: object whose `_jsc` attribute is handed to the backend
               (NOTE(review): presumably the SparkContext -- confirm).
    :param mean: Mean of the Poisson distribution; coerced to float.
    :param numRows: forwarded to the backend unchanged.
    :param numCols: forwarded to the backend unchanged.
    :param numPartitions: forwarded to the backend unchanged. (default: None)
    :param seed: forwarded to the backend unchanged. (default: None)
    :return: the RDD produced by the backend.

    >>> import numpy as np
    >>> mean = 100.0
    >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)
    >>> mat = np.mat(rdd.collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(mat.std() - sqrt(mean)) < 0.5
    True
    """
    # Coerce the mean to float, as the other distribution generators do for
    # their parameters, so an integer mean (e.g. 100) reaches the backend as
    # a floating-point value.
    return callMLlibFunc("poissonVectorRDD", sc._jsc, float(mean), numRows, numCols,
                         numPartitions, seed)
def load(cls, sc, path):
    """Load the LDAModel from disk.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    sc : :py:class:`pyspark.SparkContext`
    path : str
        Path to where the model is stored.

    Returns
    -------
    LDAModel
        Wrapper around the Java model loaded by the backend.

    Raises
    ------
    TypeError
        If `sc` is not a SparkContext or `path` is not a string.
    """
    # Validate both arguments before touching the backend.
    if not isinstance(sc, SparkContext):
        raise TypeError("sc should be a SparkContext, got type %s" % type(sc))
    if not isinstance(path, str):
        raise TypeError("path should be a string, got type %s" % type(path))

    java_model = callMLlibFunc("loadLDAModel", sc, path)
    return LDAModel(java_model)
def computeCost(self, rdd):
    """
    Return the K-means cost (sum of squared distances of points to
    their nearest center) for this model on the given
    data.

    .. versionadded:: 1.4.0

    Parameters
    ----------
    rdd : :py:class:`pyspark.RDD`
        The RDD of points to compute the cost on.

    Returns
    -------
    The cost value computed by the backend.
    """
    points = rdd.map(_convert_to_vector)

    # This model's centers are sent as vectors alongside the points.
    center_vectors = []
    for center in self.centers:
        center_vectors.append(_convert_to_vector(center))

    return callMLlibFunc("computeCostKmeansModel", points, center_vectors)
def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
    """
    Generate an RDD comprised of vectors containing i.i.d. samples drawn
    from the Exponential distribution with the input mean.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    sc : :py:class:`pyspark.SparkContext`
        SparkContext used to create the RDD.
    mean : float
        Mean, or 1 / lambda, for the Exponential distribution.
    numRows : int
        Number of Vectors in the RDD.
    numCols : int
        Number of elements in each Vector.
    numPartitions : int, optional
        Number of partitions in the RDD (default: `sc.defaultParallelism`)
    seed : int, optional
        Random seed (default: a random long integer).

    Returns
    -------
    :py:class:`pyspark.RDD`
        RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean).

    Examples
    --------
    >>> import numpy as np
    >>> mean = 0.5
    >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
    >>> mat = np.mat(rdd.collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(mat.std() - sqrt(mean)) < 0.5
    True
    """
    java_context = sc._jsc
    # The mean is coerced to float before the call.
    mean_value = float(mean)
    return callMLlibFunc(
        "exponentialVectorRDD",
        java_context,
        mean_value,
        numRows,
        numCols,
        numPartitions,
        seed,
    )
def train(cls, data, lambda_=1.0):
    """
    Train a Naive Bayes model given an RDD of (label, features) vectors.

    This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
    handle all kinds of discrete data. For example, by converting
    documents into TF-IDF vectors, it can be used for document
    classification. By making every vector a 0-1 vector, it can also be
    used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

    :param data: RDD of LabeledPoint. Only the first element is
        type-checked.
    :param lambda_: The smoothing parameter
    :return: A NaiveBayesModel built from the labels, pi and theta
        returned by the ``trainNaiveBayes`` backend call.
    :raises ValueError: If the first element of `data` is not a
        LabeledPoint.
    """
    # Cheap sanity check: inspect a single element rather than the whole RDD.
    sample = data.first()
    if not isinstance(sample, LabeledPoint):
        raise ValueError("`data` should be an RDD of LabeledPoint")

    trained = callMLlibFunc("trainNaiveBayes", data, lambda_)
    labels, pi, theta = trained
    label_array = labels.toArray()
    pi_array = pi.toArray()
    theta_array = numpy.array(theta)
    return NaiveBayesModel(label_array, pi_array, theta_array)
def __init__(self, entries, numRows=0, numCols=0):
    """
    Note: This docstring is not shown publicly.

    Create a wrapper over a Java CoordinateMatrix.

    Publicly, we require that `entries` be an RDD. However, for
    internal usage, `entries` can also be a Java CoordinateMatrix
    object, in which case we can wrap it directly. This
    assists in clean matrix conversions.

    Raises TypeError when `entries` is neither of those.

    >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
    ...                           MatrixEntry(6, 4, 2.1)])
    >>> mat = CoordinateMatrix(entries)

    >>> mat_diff = CoordinateMatrix(entries)
    >>> (mat_diff._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    False

    >>> mat_same = CoordinateMatrix(mat._java_matrix_wrapper._java_model)
    >>> (mat_same._java_matrix_wrapper._java_model ==
    ...  mat._java_matrix_wrapper._java_model)
    True
    """
    if isinstance(entries, RDD):
        # MatrixEntry values are shipped to the JVM as a DataFrame: each
        # entry becomes a Row holding 'i', 'j' and 'value', which are
        # easy to serialize. The Scala side turns the rows back into
        # MatrixEntry inputs.
        converted = entries.map(_convert_to_matrix_entry)
        entries_df = converted.toDF()
        java_matrix = callMLlibFunc(
            "createCoordinateMatrix", entries_df, long(numRows), long(numCols)
        )
    elif isinstance(entries, JavaObject) and (
        entries.getClass().getSimpleName() == "CoordinateMatrix"
    ):
        # Already a Java CoordinateMatrix: wrap it as-is.
        java_matrix = entries
    else:
        raise TypeError(
            "entries should be an RDD of MatrixEntry entries or "
            "(long, long, float) tuples, got %s" % type(entries)
        )

    self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
def train(cls, rdd, k, maxIterations=100, initMode="random"):
    """
    Train a PowerIterationClusteringModel.

    :param rdd: an RDD of (i, j, s,,ij,,) tuples representing the
        affinity matrix, which is the matrix A in the PIC paper.
        The similarity s,,ij,, must be nonnegative.
        This is a symmetric matrix and hence s,,ij,, = s,,ji,,.
        For any (i, j) with nonzero similarity, there should be
        either (i, j, s,,ij,,) or (j, i, s,,ji,,) in the input.
        Tuples with i = j are ignored, because we assume
        s,,ij,, = 0.0.
    :param k: Number of clusters. Coerced with ``int()``.
    :param maxIterations: Maximum number of iterations of the
        PIC algorithm. Coerced with ``int()``.
    :param initMode: Initialization mode.
    :return: A PowerIterationClusteringModel wrapping the backend result.
    """
    # Conversions happen in argument order so failures surface in the
    # same sequence as a direct call would produce.
    affinities = rdd.map(_convert_to_vector)
    num_clusters = int(k)
    iteration_cap = int(maxIterations)
    java_model = callMLlibFunc(
        "trainPowerIterationClusteringModel",
        affinities,
        num_clusters,
        iteration_cap,
        initMode,
    )
    return PowerIterationClusteringModel(java_model)
def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initialModel=None):
    """
    Train a Gaussian Mixture clustering model.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    rdd : :py:class:`pyspark.RDD`
        Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector`
        or convertible sequence types.
    k : int
        Number of independent Gaussians in the mixture model.
    convergenceTol : float, optional
        Maximum change in log-likelihood at which convergence is
        considered to have occurred.
        (default: 1e-3)
    maxIterations : int, optional
        Maximum number of iterations allowed.
        (default: 100)
    seed : int, optional
        Random seed for initial Gaussian distribution. Set as None to
        generate seed based on system time.
        (default: None)
    initialModel : GaussianMixtureModel, optional
        Initial GMM starting point, bypassing the random
        initialization.
        (default: None)

    Returns
    -------
    GaussianMixtureModel
        Wrapper around the object returned by the backend call.

    Raises
    ------
    Exception
        If `initialModel` is given and its ``k`` differs from `k`.
    """
    if initialModel is None:
        # No warm start: the backend receives None for all three pieces.
        start_weights = None
        start_mu = None
        start_sigma = None
    else:
        if initialModel.k != k:
            raise Exception("Mismatched cluster count, initialModel.k = %s, however k = %s"
                            % (initialModel.k, k))
        # Unpack the initial model into plain lists of weights, means and
        # covariances, one entry per mixture component.
        start_weights = list(initialModel.weights)
        start_mu = [initialModel.gaussians[i].mu for i in range(initialModel.k)]
        start_sigma = [initialModel.gaussians[i].sigma for i in range(initialModel.k)]

    points = rdd.map(_convert_to_vector)
    java_model = callMLlibFunc(
        "trainGaussianMixtureModel",
        points,
        k,
        convergenceTol,
        maxIterations,
        seed,
        start_weights,
        start_mu,
        start_sigma,
    )
    return GaussianMixtureModel(java_model)
def train(cls, data, isotonic=True):
    """
    Train an isotonic regression model on the given data.

    .. versionadded:: 1.4.0

    Parameters
    ----------
    data : :py:class:`pyspark.RDD`
        RDD of (label, feature, weight) tuples.
    isotonic : bool, optional
        Whether this is isotonic (which is default) or antitonic.
        (default: True)

    Returns
    -------
    IsotonicRegressionModel
        Built from the boundaries and predictions returned by the
        backend, together with the `isotonic` argument as given.
    """
    vectors = data.map(_convert_to_vector)
    # The backend receives a strict bool; the model keeps the caller's
    # original `isotonic` value.
    increasing = bool(isotonic)
    fitted = callMLlibFunc("trainIsotonicRegressionModel", vectors, increasing)
    boundaries, predictions = fitted
    return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)
def train(
    self,
    rdd: RDD["VectorLike"],
    k: int = 4,
    maxIterations: int = 20,
    minDivisibleClusterSize: float = 1.0,
    seed: int = -1888008604,
) -> BisectingKMeansModel:
    """
    Run the bisecting k-means algorithm and return the model.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    rdd : :py:class:`pyspark.RDD`
        Training points as an `RDD` of `Vector` or convertible
        sequence types.
    k : int, optional
        The desired number of leaf clusters. The actual number could
        be smaller if there are no divisible leaf clusters.
        (default: 4)
    maxIterations : int, optional
        Maximum number of iterations allowed to split clusters.
        (default: 20)
    minDivisibleClusterSize : float, optional
        Minimum number of points (if >= 1.0) or the minimum proportion
        of points (if < 1.0) of a divisible cluster.
        (default: 1)
    seed : int, optional
        Random seed value for cluster initialization.
        (default: -1888008604 from classOf[BisectingKMeans].getName.##)

    Returns
    -------
    BisectingKMeansModel
        Wrapper around the object returned by the backend call.
    """
    # Normalise every training point to a Vector before handing off.
    points = rdd.map(_convert_to_vector)
    trained = callMLlibFunc(
        "trainBisectingKMeans", points, k, maxIterations, minDivisibleClusterSize, seed
    )
    return BisectingKMeansModel(trained)
def normalRDD(sc: SparkContext, size: int, numPartitions: Optional[int] = None, seed: Optional[int] = None) -> RDD[float]:
    """
    Generates an RDD comprised of i.i.d. samples from the standard normal
    distribution.

    To transform the distribution in the generated RDD from standard normal
    to some other normal N(mean, sigma^2), use
    ``RandomRDDs.normalRDD(sc, n, p, seed).map(lambda v: mean + sigma * v)``

    .. versionadded:: 1.1.0

    Parameters
    ----------
    sc : :py:class:`pyspark.SparkContext`
        used to create the RDD.
    size : int
        Size of the RDD.
    numPartitions : int, optional
        Number of partitions in the RDD (default: `sc.defaultParallelism`).
    seed : int, optional
        Random seed (default: a random long integer).

    Returns
    -------
    :py:class:`pyspark.RDD`
        RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0).

    Examples
    --------
    >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1)
    >>> stats = x.stats()
    >>> stats.count()
    1000
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    # The backend is handed the underlying Java context, not the Python one.
    java_context = sc._jsc
    return callMLlibFunc("normalRDD", java_context, size, numPartitions, seed)
def generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps):
    """
    Generate labeled points through the ``generateLinearInputWrapper``
    backend call.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    intercept : float
        bias factor, the term c in X'w + c
    weights : :py:class:`pyspark.mllib.linalg.Vector` or convertible
        feature vector, the term w in X'w + c
    xMean : :py:class:`pyspark.mllib.linalg.Vector` or convertible
        Point around which the data X is centered.
    xVariance : :py:class:`pyspark.mllib.linalg.Vector` or convertible
        Variance of the given data
    nPoints : int
        Number of points to be generated
    seed : int
        Random Seed
    eps : float
        Used to scale the noise. If eps is set high,
        the amount of gaussian noise added is more.

    Returns
    -------
    list
        of :py:class:`pyspark.mllib.regression.LabeledPoint` of
        length nPoints
    """
    # Flatten the vector-like arguments into plain lists of floats.
    weight_values = list(map(float, weights))
    mean_values = list(map(float, xMean))
    variance_values = list(map(float, xVariance))

    # Scalars are coerced to the exact numeric kinds passed to the backend.
    generated = callMLlibFunc(
        "generateLinearInputWrapper",
        float(intercept),
        weight_values,
        mean_values,
        variance_values,
        int(nPoints),
        int(seed),
        float(eps),
    )
    return list(generated)
def normalRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the standard normal
    distribution.

    To transform the distribution in the generated RDD from standard normal
    to some other normal N(mean, sigma^2), use
    C{RandomRDDs.normalRDD(sc, n, p, seed).map(lambda v: mean + sigma * v)}

    :param sc: SparkContext whose ``_jsc`` handle is passed to the backend.
    :param size: Forwarded unchanged to the ``normalRDD`` backend call.
    :param numPartitions: Forwarded unchanged (default: None).
    :param seed: Forwarded unchanged (default: None).

    >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    # The backend is handed the underlying Java context, not the Python one.
    java_context = sc._jsc
    samples = callMLlibFunc("normalRDD", java_context, size, numPartitions, seed)
    return samples
def generateLinearRDD(
    sc: SparkContext,
    nexamples: int,
    nfeatures: int,
    eps: float,
    nParts: int = 2,
    intercept: float = 0.0,
) -> RDD["LabeledPoint"]:
    """
    Generate an RDD of LabeledPoints.

    Parameters
    ----------
    sc : SparkContext
        Passed to the backend call as-is.
    nexamples : int
        Coerced with ``int()``; presumably the number of points to
        generate (TODO confirm against the Scala wrapper).
    nfeatures : int
        Coerced with ``int()``; presumably the number of features per
        point (TODO confirm).
    eps : float
        Coerced with ``float()``; presumably the noise scale (TODO confirm).
    nParts : int, optional
        Coerced with ``int()``; presumably the number of partitions
        (TODO confirm). (default: 2)
    intercept : float, optional
        Coerced with ``float()``. (default: 0.0)

    Returns
    -------
    The result of the ``generateLinearRDDWrapper`` backend call.
    """
    # Coerce every numeric argument up front, in call order.
    example_count = int(nexamples)
    feature_count = int(nfeatures)
    noise_scale = float(eps)
    partition_count = int(nParts)
    bias = float(intercept)
    return callMLlibFunc(
        "generateLinearRDDWrapper",
        sc,
        example_count,
        feature_count,
        noise_scale,
        partition_count,
        bias,
    )
def generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps):
    """
    Generate labeled points through the ``generateLinearInputWrapper``
    backend call.

    :param intercept: bias factor, the term c in X'w + c
    :param weights: feature vector, the term w in X'w + c
    :param xMean: Point around which the data X is centered.
    :param xVariance: Variance of the given data
    :param nPoints: Number of points to be generated
    :param seed: Random Seed
    :param eps: Used to scale the noise. If eps is set high,
        the amount of gaussian noise added is more.
    :return: a list of LabeledPoints of length nPoints
    """
    # Flatten the vector-like arguments into plain lists of floats.
    weight_values = list(map(float, weights))
    mean_values = list(map(float, xMean))
    variance_values = list(map(float, xVariance))

    # Scalars are coerced to the exact numeric kinds passed to the backend.
    generated = callMLlibFunc(
        "generateLinearInputWrapper",
        float(intercept),
        weight_values,
        mean_values,
        variance_values,
        int(nPoints),
        int(seed),
        float(eps),
    )
    return list(generated)
def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,
          topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"):
    """Train a LDA model.

    All arguments are forwarded unchanged to the ``trainLDAModel``
    backend call.

    :param rdd: RDD of data points
    :param k: Number of clusters you want. Default to 10
    :param maxIterations: Number of iterations. Default to 20
    :param docConcentration: Concentration parameter (commonly named
        "alpha") for the prior placed on documents' distributions
        over topics ("theta"). Default to -1.0
    :param topicConcentration: Concentration parameter (commonly named
        "beta" or "eta") for the prior placed on topics'
        distributions over terms. Default to -1.0
    :param seed: Random Seed. Default to None
    :param checkpointInterval: Period (in iterations) between
        checkpoints. Default to 10
    :param optimizer: LDAOptimizer used to perform the actual
        calculation. Currently "em", "online" are supported.
        Default to "em".
    :return: An LDAModel wrapping the backend result.
    """
    java_model = callMLlibFunc(
        "trainLDAModel",
        rdd,
        k,
        maxIterations,
        docConcentration,
        topicConcentration,
        seed,
        checkpointInterval,
        optimizer,
    )
    return LDAModel(java_model)
def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01,
                  nonnegative=False, seed=None):
    """
    Train a matrix factorization model given an RDD of 'implicit
    preferences' given by users to some products, in the form of
    (userID, productID, preference) pairs. We approximate the ratings
    matrix as the product of two lower-rank matrices of a given rank
    (number of features). To solve for these features, we run a given
    number of iterations of ALS. This is done using a level of
    parallelism given by `blocks`.

    :param ratings: RDD of preferences; preprocessed with
        ``cls._prepare`` before the backend call.
    :param rank: Rank of the factor matrices (number of features).
    :param iterations: Number of iterations of ALS. (default: 5)
    :param lambda_: Forwarded unchanged to the backend. (default: 0.01)
    :param blocks: Level of parallelism. (default: -1)
    :param alpha: Forwarded unchanged to the backend; presumably the
        implicit-feedback confidence constant (TODO confirm).
        (default: 0.01)
    :param nonnegative: Forwarded unchanged to the backend.
        (default: False)
    :param seed: Forwarded unchanged to the backend. (default: None)
    :return: A MatrixFactorizationModel wrapping the backend result.
    """
    prepared = cls._prepare(ratings)
    java_model = callMLlibFunc(
        "trainImplicitALSModel",
        prepared,
        rank,
        iterations,
        lambda_,
        blocks,
        alpha,
        nonnegative,
        seed,
    )
    return MatrixFactorizationModel(java_model)
def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604):
    """
    Run the bisecting k-means algorithm and return the model.

    :param rdd: input RDD to be trained on; each element is passed
        through ``_convert_to_vector`` first
    :param k: The desired number of leaf clusters (default: 4).
        The actual number could be smaller if there are no divisible
        leaf clusters.
    :param maxIterations: the max number of k-means iterations to
        split clusters (default: 20)
    :param minDivisibleClusterSize: the minimum number of points
        (if >= 1.0) or the minimum proportion of points (if < 1.0)
        of a divisible cluster (default: 1)
    :param seed: a random seed
        (default: -1888008604 from classOf[BisectingKMeans].getName.##)
    :return: A BisectingKMeansModel wrapping the backend result.
    """
    # Normalise every training point to a Vector before handing off.
    points = rdd.map(_convert_to_vector)
    trained = callMLlibFunc(
        "trainBisectingKMeans", points, k, maxIterations, minDivisibleClusterSize, seed
    )
    return BisectingKMeansModel(trained)
def entries(self):
    """
    Entries of the CoordinateMatrix stored as an RDD of
    MatrixEntries.

    The Java side returns the entries as a DataFrame; each Row is
    mapped back to a MatrixEntry built from its first three fields.

    >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2),
    ...                                        MatrixEntry(6, 4, 2.1)]))
    >>> entries = mat.entries
    >>> entries.first()
    MatrixEntry(0, 0, 1.2)
    """
    # We use DataFrames for serialization of MatrixEntry entries
    # from Java, so we first convert the RDD of entries to a
    # DataFrame on the Scala/Java side. Then we map each Row in
    # the DataFrame back to a MatrixEntry on this side.
    entries_df = callMLlibFunc("getMatrixEntries", self._java_matrix_wrapper._java_model)
    # Map over the DataFrame's underlying RDD: DataFrame.map itself was
    # removed from PySpark in Spark 2.0, while `.rdd` is available on both
    # older and newer releases.
    return entries_df.rdd.map(lambda row: MatrixEntry(row[0], row[1], row[2]))
def train(
    cls,
    rdd: RDD[Tuple[int, int, float]],
    k: int,
    maxIterations: int = 100,
    initMode: str = "random",
) -> PowerIterationClusteringModel:
    r"""
    Train PowerIterationClusteringModel

    .. versionadded:: 1.5.0

    Parameters
    ----------
    rdd : :py:class:`pyspark.RDD`
        An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
        affinity matrix, which is the matrix A in the PIC paper.  The
        similarity s\ :sub:`ij`\ must be nonnegative.  This is a symmetric
        matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\  For any (i, j) with
        nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or
        (j, i, s\ :sub:`ji`\) in the input.  Tuples with i = j are ignored,
        because it is assumed s\ :sub:`ij`\ = 0.0.
    k : int
        Number of clusters.
    maxIterations : int, optional
        Maximum number of iterations of the PIC algorithm.
        (default: 100)
    initMode : str, optional
        Initialization mode. This can be either "random" to use
        a random vector as vertex properties, or "degree" to use
        normalized sum similarities.
        (default: "random")

    Returns
    -------
    PowerIterationClusteringModel
        Wrapper around the object returned by the backend call.
    """
    # Conversions happen in argument order so failures surface in the
    # same sequence as a direct call would produce.
    affinities = rdd.map(_convert_to_vector)
    num_clusters = int(k)
    iteration_cap = int(maxIterations)
    java_model = callMLlibFunc(
        "trainPowerIterationClusteringModel",
        affinities,
        num_clusters,
        iteration_cap,
        initMode,
    )
    return PowerIterationClusteringModel(java_model)
def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,
          topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"):
    """Train a LDA model.

    All arguments are forwarded unchanged to the ``trainLDAModel``
    backend call.

    :param rdd:
      RDD of documents, which are tuples of document IDs and term
      (word) count vectors. The term count vectors are "bags of
      words" with a fixed-size vocabulary (where the vocabulary size
      is the length of the vector). Document IDs must be unique
      and >= 0.
    :param k:
      Number of topics to infer, i.e., the number of soft cluster
      centers.
      (default: 10)
    :param maxIterations:
      Maximum number of iterations allowed.
      (default: 20)
    :param docConcentration:
      Concentration parameter (commonly named "alpha") for the prior
      placed on documents' distributions over topics ("theta").
      (default: -1.0)
    :param topicConcentration:
      Concentration parameter (commonly named "beta" or "eta") for
      the prior placed on topics' distributions over terms.
      (default: -1.0)
    :param seed:
      Random seed for cluster initialization. Set as None to generate
      seed based on system time.
      (default: None)
    :param checkpointInterval:
      Period (in iterations) between checkpoints.
      (default: 10)
    :param optimizer:
      LDAOptimizer used to perform the actual calculation. Currently
      "em", "online" are supported.
      (default: "em")
    :return: An LDAModel wrapping the backend result.
    """
    java_model = callMLlibFunc(
        "trainLDAModel",
        rdd,
        k,
        maxIterations,
        docConcentration,
        topicConcentration,
        seed,
        checkpointInterval,
        optimizer,
    )
    return LDAModel(java_model)
def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,
          seed=None):
    """
    Train a matrix factorization model given an RDD of ratings by users
    for a subset of products. The ratings matrix is approximated as the
    product of two lower-rank matrices of a given rank (number of
    features). To solve for these features, ALS is run iteratively with
    a configurable level of parallelism.

    :param ratings:
      RDD of `Rating` or (userID, productID, rating) tuple.
      Preprocessed with ``cls._prepare`` before the backend call.
    :param rank:
      Number of features to use (also referred to as the number of
      latent factors).
    :param iterations:
      Number of iterations of ALS.
      (default: 5)
    :param lambda_:
      Regularization parameter.
      (default: 0.01)
    :param blocks:
      Number of blocks used to parallelize the computation. A value
      of -1 will use an auto-configured number of blocks.
      (default: -1)
    :param nonnegative:
      A value of True will solve least-squares with nonnegativity
      constraints.
      (default: False)
    :param seed:
      Random seed for initial matrix factorization model. A value
      of None will use system time as the seed.
      (default: None)
    :return: A MatrixFactorizationModel wrapping the backend result.
    """
    prepared = cls._prepare(ratings)
    java_model = callMLlibFunc(
        "trainALSModel", prepared, rank, iterations, lambda_, blocks, nonnegative, seed
    )
    return MatrixFactorizationModel(java_model)
def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000):
    """
    Finds the complete set of frequent sequential patterns in the
    input sequences of itemsets.

    All arguments are forwarded unchanged to the
    ``trainPrefixSpanModel`` backend call.

    :param data: The input data set, each element contains a
        sequence of itemsets.
    :param minSupport: The minimal support level of the sequential
        pattern; any pattern that appears more than
        (minSupport * size-of-the-dataset) times will be output.
        (default: `0.1`)
    :param maxPatternLength: The maximal length of the sequential
        pattern; any pattern whose length is less than
        maxPatternLength will be output.
        (default: `10`)
    :param maxLocalProjDBSize: The maximum number of items (including
        delimiters used in the internal storage format) allowed in a
        projected database before local processing. If a projected
        database exceeds this size, another iteration of distributed
        prefix growth is run.
        (default: `32000000`)
    :return: A PrefixSpanModel wrapping the backend result.
    """
    java_model = callMLlibFunc(
        "trainPrefixSpanModel",
        data,
        minSupport,
        maxPatternLength,
        maxLocalProjDBSize,
    )
    return PrefixSpanModel(java_model)
def fit(self, dataset):
    """
    Computes the mean and variance and stores as a model to be used
    for later scaling.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    dataset : :py:class:`pyspark.RDD`
        The data used to compute the mean and variance
        to build the transformation model. Each element is passed
        through ``_convert_to_vector`` first.

    Returns
    -------
    :py:class:`StandardScalerModel`
    """
    vectors = dataset.map(_convert_to_vector)
    # The scaler's own withMean / withStd settings drive the fit.
    java_model = callMLlibFunc("fitStandardScaler", self.withMean, self.withStd, vectors)
    return StandardScalerModel(java_model)
def blocks(self):
    """
    The RDD of sub-matrix blocks
    ((blockRowIndex, blockColIndex), sub-matrix) that form this
    distributed matrix.

    >>> mat = BlockMatrix(
    ...     sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
    ...                     ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2)
    >>> blocks = mat.blocks
    >>> blocks.first()
    ((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0))
    """
    def _row_to_block(row):
        # Rebuild the ((rowIndex, colIndex), sub-matrix) pair from a Row
        # whose first field holds the two indices and whose second field
        # holds the sub-matrix.
        index = row[0]
        return ((index[0], index[1]), row[1])

    # Sub-matrix blocks cross the Java boundary as a DataFrame: the
    # Scala/Java side converts the RDD of blocks to a DataFrame, and each
    # Row is turned back into a block on this side.
    java_model = self._java_matrix_wrapper._java_model
    blocks_df = callMLlibFunc("getMatrixBlocks", java_model)
    return blocks_df.rdd.map(_row_to_block)