Example #1
import numpy
import pyspark.sql
from pyspark.mllib.linalg.distributed import RowMatrix


def within_group_scatter(data: pyspark.sql.DataFrame, features, response,
                         targets):
    """Compute the pooled within-group scatter matrix over the given targets."""
    p = len(features)
    sw = numpy.zeros((p, p))
    for target in targets:
        # Restrict to the rows belonging to the current group.
        df_t = data.filter("{} == '{}'".format(response, target))
        X_t = RowMatrix(df_t.select(features).rdd.map(numpy.array))
        # Scale the group covariance back to a scatter matrix: S = (n - 1) * Cov.
        sw += X_t.computeCovariance().toArray() * (df_t.count() - 1)
    return sw
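
A minimal usage sketch, assuming a local SparkSession; the column names, labels and values are made up for illustration:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, 2.0, "a"), (2.0, 3.0, "a"), (3.0, 1.0, "b"), (4.0, 2.0, "b")],
    ["x1", "x2", "label"])
# Pooled 2x2 within-group scatter over both classes.
print(within_group_scatter(df, ["x1", "x2"], "label", ["a", "b"]))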
Example #2
    def get_gmm(self, k, sample_fraction=None, retry=True):
        if k == 1:
            # A one-component "mixture" needs no EM fit: the single Gaussian
            # is just the sample mean and covariance of the training data.
            if sample_fraction:
                # Optionally subsample (without replacement) to speed this up.
                data = self.mllib_training_data.sample(False, sample_fraction)
            else:
                data = self.mllib_training_data
            row_matrix = RowMatrix(data)
            mean = row_matrix.computeColumnSummaryStatistics().mean()
            cov = row_matrix.computeCovariance().toArray()
            weights = [1.0]
            gaussians = [Gaussian(mean, cov)]
            log_likelihood = None
        else:
            # For k > 1, delegate to the ML GaussianMixture fit and repackage
            # its components.
            m = self.fit_ml_model(k,
                                  sample_fraction=sample_fraction,
                                  retry=retry)
            weights = m.weights
            gaussians = [
                Gaussian(g.mean, g.cov.toArray())
                for g in m.gaussiansDF.collect()
            ]
            log_likelihood = m.summary.logLikelihood

        return GaussianMixtureModel(weights, gaussians, log_likelihood)
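
The snippet leaves Gaussian and GaussianMixtureModel undefined; the enclosing module presumably provides its own result containers. A minimal sketch of what they might look like, with field names inferred from the call sites above (an assumption, not the module's actual definitions):

from collections import namedtuple

# Hypothetical containers standing in for the module's own definitions.
Gaussian = namedtuple("Gaussian", ["mean", "cov"])
GaussianMixtureModel = namedtuple(
    "GaussianMixtureModel", ["weights", "gaussians", "log_likelihood"])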
Example #3
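The script below uses rdd, nrows and ncols without defining them; presumably they were parsed earlier from the same Matrix Market file. A minimal sketch of that setup, assuming a coordinate-format .mtx file whose size line reads "nrows ncols nnz" (the parsing details are an assumption):

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
# Drop Matrix Market comment lines ("%..."); the first remaining line
# gives the matrix dimensions.
lines = (sc.textFile('./data/lpi_ceria3d_b.mtx')
           .filter(lambda l: not l.startswith('%')))
header = lines.first()
nrows, ncols = [int(x) for x in header.split()[:2]]
# The remaining lines are "i j value" entries with 1-based indices.
rdd = (lines.filter(lambda l: l != header)
            .map(lambda l: l.split())
            .map(lambda t: (int(t[0]) - 1, int(t[1]) - 1, float(t[2]))))
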
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import CoordinateMatrix, RowMatrix

#matrix = Matrices.dense(nrows, ncols, rdd)
print("ncols: %d, nrows: %d" % (ncols, nrows))
# Build a distributed CoordinateMatrix from (i, j, value) entry tuples.
coord_mat = CoordinateMatrix(rdd.map(tuple))
print("num rows in matrix %d" % coord_mat.numRows())

print("finished using pyspark")
#________________________________________________

print("now use SparkSession")

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Re-read the same file as a space-delimited DataFrame.
df_2 = spark.read.option("delimiter", " ").csv('./data/lpi_ceria3d_b.mtx',
                                               header=False,
                                               inferSchema=True)
df_2.printSchema()

#coord_mat_2 = CoordinateMatrix(df_2.rdd.map(tuple))
# Treat each parsed line as one row vector of a distributed RowMatrix.
row_mat = RowMatrix(df_2.rdd.map(tuple))
print("num rows in row matrix %d, num cols %d" %
      (row_mat.numRows(), row_mat.numCols()))

print("print covariance")
print(row_mat.computeCovariance())

# A local 3x1 dense matrix to right-multiply the distributed matrix by.
dm = Matrices.dense(3, 1, [4, 5, 6])

print("multiply row matrix")
result = row_mat.multiply(dm)
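
The product is itself a distributed RowMatrix; a quick way to inspect a few rows of the result:

# Pull a handful of result rows back to the driver for inspection.
for row in result.rows.take(3):
    print(row)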