# Example #1
    def run(self):
        """Fit a clustering model (GMM or KMeans) on a Parquet feature set
        and write the labelled predictions out as a single CSV file.

        All configuration is read from ``self.settings`` (tf_path, algorithm,
        seed, k, result_path, target).

        Raises:
            ValueError: if ``settings.algorithm`` is neither 'GMM' nor 'KMeans'.
        """
        tf_path = self.settings.tf_path
        algorithm = self.settings.algorithm
        seed = int(self.settings.seed)
        k = int(self.settings.k)
        result_path = self.settings.result_path
        target = self.settings.target

        spark = SparkSession.builder.getOrCreate()

        # Debug breadcrumb: record which SparkContext / session served this run.
        with open("train_spark.txt", "w") as file:
            file.write("spark context" + str(spark.sparkContext))
            file.write("===SeessionID===")
            file.write(str(id(spark)))

        df = spark.read.option("header", "true") \
            .option("inferSchema", "true") \
            .parquet(tf_path)
        # BUG FIX: DataFrame.repartition() returns a NEW DataFrame; the
        # original code discarded the result, so repartitioning never
        # actually took effect.
        df = df.repartition(10)

        # MODELING — build the chosen estimator, then share the
        # explainParams banner and fit logic between both branches.
        if algorithm == 'GMM':
            estimator = GaussianMixture().setK(k).setFeaturesCol(
                "features").setSeed(seed)
        elif algorithm == 'KMeans':
            estimator = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
        else:
            raise ValueError("no alg")
        print("=====" * 8)
        print(estimator.explainParams())
        print("=====" * 8)
        model = estimator.fit(df)

        prediction = model.transform(df)

        # NOTE(review): pickle.load is unsafe on untrusted data; acceptable
        # here only if feature_info.pickle is produced by a trusted earlier
        # pipeline stage — confirm.
        with open("./feature_info.pickle", "rb") as handle:
            features_info = pickle.load(handle)

        # Single output file (coalesce(1)) with the selected feature columns,
        # the target column, and the cluster assignment.
        prediction.select(features_info["numeric_features"] +
                          features_info["category_features"] +
                          [target, 'prediction']).coalesce(1).write.mode(
                              'overwrite').csv(result_path, header=True)
        print("Result file is successfully generated at: ", result_path)
# COMMAND ----------

summary = bkmModel.summary
# FIX: Python 2 print statement was a SyntaxError under Python 3.
print(summary.clusterSizes)  # number of points per cluster
# NOTE(review): KMeansModel.computeCost is deprecated since Spark 3.0;
# prefer ClusteringEvaluator for new code.
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
# Gaussian mixture model with 5 components.
gmm = GaussianMixture().setK(5)
# FIX: Python 2 print statement was a SyntaxError under Python 3.
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------

summary = model.summary
# FIX: Python 2 print statement was a SyntaxError under Python 3.
print(model.weights)  # mixing weights, one per Gaussian component
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer

# Split the free-text Description column into a word array.
# Constructor keyword args set the same Params as the setter chain.
tkn = Tokenizer(inputCol="Description", outputCol="DescOut")
# COMMAND ----------

summary = bkmModel.summary
# FIX: Python 2 print statement was a SyntaxError under Python 3.
print(summary.clusterSizes)  # number of points per cluster
# NOTE(review): KMeansModel.computeCost is deprecated since Spark 3.0;
# prefer ClusteringEvaluator for new code.
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
# Gaussian mixture model with 5 components.
gmm = GaussianMixture().setK(5)
# FIX: Python 2 print statement was a SyntaxError under Python 3.
print(gmm.explainParams())
model = gmm.fit(sales)


# COMMAND ----------

summary = model.summary
# FIX: Python 2 print statement was a SyntaxError under Python 3.
print(model.weights)  # mixing weights, one per Gaussian component
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------
selected = ["home_lat", "home_lon"]

# Assemble the feature vector:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler().setInputCols(selected).setOutputCol("features")
assembled = assembler.transform(students)


# ## Fit a Gaussian mixture model

# Specify a Gaussian mixture model with two clusters:
from pyspark.ml.clustering import GaussianMixture
gm = GaussianMixture().setFeaturesCol("features").setK(2).setSeed(12345)

# Examine the hyperparameters:
print(gm.explainParams())

# Fit the Gaussian mixture model:
gmm = gm.fit(assembled)
type(gmm)


# ## Examine the Gaussian mixture model

# Examine the mixing weights:
gmm.weights

# Examine the (multivariate) Gaussian distributions:
gmm.gaussiansDF.head(5)

# Examine the model summary: