def distribution_data():
    """Print column-wise summary statistics of the feature vectors.

    Reads the module-level `data` RDD of LabeledPoints, builds a
    distributed RowMatrix from the feature vectors, and prints the
    per-column mean, min, max, and variance.
    """
    vectors = data.map(lambda p: p.features)
    # Each element of the RDD becomes one row of the RowMatrix.
    matrix = RowMatrix(vectors)
    matrixSummary = matrix.computeColumnSummaryStatistics()
    # Single-argument print(...) behaves identically under Python 2 and 3;
    # the original bare `print` statements were Python 2-only and
    # inconsistent with the print() calls used elsewhere in this file.
    print("mean of each column:")
    print(matrixSummary.mean())
    print("min of each column:")
    print(matrixSummary.min())
    print("max of each column:")
    print(matrixSummary.max())
    print("variance of each column:")
    print(matrixSummary.variance())
def get_gmm(self, k, sample_fraction=None, retry=True):
    """Return a GaussianMixtureModel with `k` components.

    For k == 1 the single component is taken directly from the column
    mean and covariance of the training data (no EM fitting, so no
    log-likelihood is available). For k > 1 an ML-side mixture model is
    fitted via `fit_ml_model` and its components are converted.

    :param k: number of mixture components.
    :param sample_fraction: if truthy, train on a random sample of this
        fraction of the data instead of the full set.
    :param retry: forwarded to `fit_ml_model` (used only when k > 1).
    """
    if k == 1:
        # Degenerate case: one Gaussian is just the sample mean/covariance.
        training = self.mllib_training_data
        if sample_fraction:
            training = training.sample(False, sample_fraction)
        stats_matrix = RowMatrix(training)
        single = Gaussian(
            stats_matrix.computeColumnSummaryStatistics().mean(),
            stats_matrix.computeCovariance().toArray(),
        )
        # No EM run happened, hence log-likelihood is None.
        return GaussianMixtureModel([1.0], [single], None)

    fitted = self.fit_ml_model(k, sample_fraction=sample_fraction, retry=retry)
    components = [
        Gaussian(row.mean, row.cov.toArray())
        for row in fitted.gaussiansDF.collect()
    ]
    return GaussianMixtureModel(
        fitted.weights, components, fitted.summary.logLikelihood
    )
# Print the naive-Bayes model trained earlier (outside this chunk).
print(nb_model)
# Train a decision-tree classifier: 2 classes, no categorical features ({}).
dt_model = DecisionTree().trainClassifier(data, 2, {})
print("decision tree model :")
print(dt_model)
# --- start predict: compare one prediction against its true label ---
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
print("logistic model prediction :" + str(lr_prediction))
print("the true label :" + str(data_point.label))
# --- analyze data: column statistics over the feature matrix ---
vectors = data.map(lambda lp: lp.features)
matrix = RowMatrix(vectors)
matrix_summary = matrix.computeColumnSummaryStatistics()
print("the col mean of matrix :")
print(matrix_summary.mean())
print("the col min of matrix :")
print(matrix_summary.min())
print("the col max of matrix :")
print(matrix_summary.max())
print("the col variance of matrix :")
print(matrix_summary.variance())
print("the col num non zero of matrix :")
print(matrix_summary.numNonzeros())
# Standardize features to zero mean / unit variance; `labels` and
# `features_transformed` are presumably consumed by later code — keep names.
scaler = StandardScaler(withMean=True, withStd=True).fit(vectors)
labels = data.map(lambda lp: lp.label)
features_transformed = scaler.transform(vectors)
# Parse tab-separated "user \t item \t rating" lines into Rating objects.
ratings = rating_raw_data.map(lambda line: line.split("\t")).map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))
# Cache the ratings RDD: ALS iterates over it many times.
ratings.cache()
# ALS.train(ratings, rank=50, iterations=10, lambda_=0.1)
alsModel = ALS.train(ratings, 50, 10, 0.1)

# NOTE(review): the original used Python 2-only tuple-unpacking lambdas
# (`lambda (id, factor): ...`, removed by PEP 3113); index-based access
# below is behaviorally identical and works on both Python 2 and 3.
movieFactors = alsModel.productFeatures().map(
    lambda kv: (kv[0], Vectors.dense(kv[1])))
movieVectors = movieFactors.map(lambda kv: kv[1])
userFactors = alsModel.userFeatures().map(
    lambda kv: (kv[0], Vectors.dense(kv[1])))
userVectors = userFactors.map(lambda kv: kv[1])

# Column statistics over the latent-factor matrices (mean/variance per
# factor dimension) — summaries are kept for inspection; the old
# commented-out debug prints (one of which duplicated "user factors mean")
# have been removed.
movieMatrix = RowMatrix(movieVectors)
movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
userMatrix = RowMatrix(userVectors)
userMatrixSummary = userMatrix.computeColumnSummaryStatistics()

# Name kept as-is (typo and all) — later code may reference `numCLusters`.
numCLusters = 5