예제 #1
0
def distribution_data():
    """Print per-column summary statistics of the feature vectors in ``data``.

    Reads the ``data`` name from the enclosing scope, maps every record to
    its ``features`` attribute, wraps the result in a ``RowMatrix`` and
    prints the mean, min, max and variance reported by
    ``computeColumnSummaryStatistics()``.

    Returns:
        None. The statistics are only printed.
    """
    vectors = data.map(lambda p: p.features)
    # Build a RowMatrix whose rows are the feature vectors of the data.
    matrix = RowMatrix(vectors)
    matrixSummary = matrix.computeColumnSummaryStatistics()
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function, so this block no
    # longer fails to parse on Python 3.
    print("mean of each column:")
    print(matrixSummary.mean())
    print("min of each column:")
    print(matrixSummary.min())
    print("max of each column:")
    print(matrixSummary.max())
    print("variance of each column:")
    print(matrixSummary.variance())
예제 #2
0
파일: model.py 프로젝트: brettbevers/miner
    def get_gmm(self, k, sample_fraction=None, retry=True):
        """Return a ``GaussianMixtureModel`` with ``k`` components.

        When ``k`` is 1, no mixture is fitted: the single Gaussian is built
        from the column means and covariance of ``self.mllib_training_data``
        (sampled without replacement when ``sample_fraction`` is truthy),
        its weight is ``1.0`` and the log-likelihood is ``None``.

        For any other ``k`` the work is delegated to ``self.fit_ml_model``
        and the weights, Gaussians and log-likelihood are read from the
        fitted model.

        Args:
            k: Number of mixture components.
            sample_fraction: Optional fraction of the training data to use.
            retry: Forwarded to ``fit_ml_model``; not used when ``k`` is 1.
        """
        if k == 1:
            training = self.mllib_training_data
            if sample_fraction:
                training = training.sample(False, sample_fraction)
            rows = RowMatrix(training)
            center = rows.computeColumnSummaryStatistics().mean()
            covariance = rows.computeCovariance().toArray()
            return GaussianMixtureModel([1.0],
                                        [Gaussian(center, covariance)],
                                        None)

        fitted = self.fit_ml_model(k,
                                   sample_fraction=sample_fraction,
                                   retry=retry)
        mixture_weights = fitted.weights
        components = []
        for component in fitted.gaussiansDF.collect():
            components.append(Gaussian(component.mean,
                                       component.cov.toArray()))
        total_log_likelihood = fitted.summary.logLikelihood
        return GaussianMixtureModel(mixture_weights, components,
                                    total_log_likelihood)
예제 #3
0
# Show the model bound to `nb_model` (defined before this excerpt).
print(nb_model)

# Train a decision-tree classifier on `data`.
# NOTE(review): the positional arguments are presumably numClasses=2 and an
# empty categoricalFeaturesInfo dict — confirm against the DecisionTree import.
dt_model = DecisionTree().trainClassifier(data, 2, {})
print("decision tree model :")
print(dt_model)

# Predict with `lr_model` on the features of the first record and print the
# prediction next to that record's label.
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
print("logistic model prediction :" + str(lr_prediction))
print("the true label :" + str(data_point.label))

# Column-wise summary statistics of the feature vectors: mean, min, max,
# variance and the number of non-zero entries per column.
vectors = data.map(lambda lp: lp.features)
matrix = RowMatrix(vectors)
matrix_summary = matrix.computeColumnSummaryStatistics()
print("the col mean of matrix :")
print(matrix_summary.mean())
print("the col min of matrix :")
print(matrix_summary.min())
print("the col max of matrix :")
print(matrix_summary.max())
print("the col variance of matrix :")
print(matrix_summary.variance())
print("the col num non zero of matrix :")
print(matrix_summary.numNonzeros())

# Fit a StandardScaler (withMean=True, withStd=True) on the feature vectors
# and apply it to them. `labels` is extracted here but not used within the
# visible lines.
scaler = StandardScaler(withMean=True, withStd=True).fit(vectors)
labels = data.map(lambda lp: lp.label)
features_transformed = scaler.transform(vectors)
예제 #4
0
# Parse tab-separated lines into Rating records: the first two fields are
# converted to int and the third to float.
ratings = rating_raw_data.map(lambda line: line.split("\t")).map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))
# print(ratings.take(5))
ratings.cache()

# NOTE(review): the positional arguments are presumably rank=50,
# iterations=10 and lambda_=0.1 — confirm against the ALS.train signature.
alsModel = ALS.train(ratings, 50, 10, 0.1)

# Each element is an (id, factor) pair. The pair is indexed rather than
# unpacked in the lambda signature, because tuple parameters
# (`lambda (id, factor): ...`) are a SyntaxError on Python 3 and the name
# `id` shadowed the builtin.
movieFactors = alsModel.productFeatures().map(
    lambda pair: (pair[0], Vectors.dense(pair[1])))
movieVectors = movieFactors.map(lambda pair: pair[1])
userFactors = alsModel.userFeatures().map(
    lambda pair: (pair[0], Vectors.dense(pair[1])))
userVectors = userFactors.map(lambda pair: pair[1])

# Column summary statistics over the movie factor vectors.
movieMatrix = RowMatrix(movieVectors)
movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()

# Column summary statistics over the user factor vectors.
userMatrix = RowMatrix(userVectors)
userMatrixSummary = userMatrix.computeColumnSummaryStatistics()

# Optional debug output (disabled):
# print("movie factors mean: ")
# print(movieMatrixSummary.mean())
# print("movie factors variance: ")
# print(movieMatrixSummary.variance())

# print("user factors mean: ")
# print(userMatrixSummary.mean())
# print("user factors variance: ")
# print(userMatrixSummary.variance())

# Name kept exactly as in the original (including its capitalisation), since
# code outside this excerpt may reference it.
numCLusters = 5