Example #1
    def run(self):

        tf_path = self.settings.tf_path
        algorithm = self.settings.algorithm
        seed = int(self.settings.seed)
        k = int(self.settings.k)
        result_path = self.settings.result_path
        target = self.settings.target

        spark = SparkSession.builder.getOrCreate()

        with open("train_spark.txt", "w") as file:
            file.write("spark context" + str(spark.sparkContext))
            file.write("===SeessionID===")
            file.write(str(id(spark)))

        df = spark.read.parquet(tf_path)
        # repartition returns a new DataFrame, so reassign the result
        df = df.repartition(10)

        # MODELING
        if algorithm == 'GMM':
            gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(
                seed)
            print("=====" * 8)
            print(gmm.explainParams())
            print("=====" * 8)
            model = gmm.fit(df)
        elif algorithm == 'KMeans':
            kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
            print("=====" * 8)
            print(kmm.explainParams())
            print("=====" * 8)
            model = kmm.fit(df)
        else:
            raise ValueError("Unsupported algorithm: {}".format(algorithm))

        prediction = model.transform(df)

        with open("./feature_info.pickle", "rb") as handle:
            features_info = pickle.load(handle)

        prediction.select(features_info["numeric_features"] +
                          features_info["category_features"] +
                          [target, 'prediction']) \
            .coalesce(1) \
            .write.mode('overwrite') \
            .csv(result_path, header=True)
        print("Result file is successfully generated at: ", result_path)
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()


# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)


# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/databricks-datasets/definitive-guide/data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()

# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)

# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

feature_info = {
    "numeric_features": numeric_features,
    "category_features": category_features
}

# MODELING
if algorithm == 'GMM':
    gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(gmm.explainParams())
    print("=====" * 8)
    model = gmm.fit(processed)
elif algorithm == 'KMeans':
    kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(kmm.explainParams())
    print("=====" * 8)
    model = kmm.fit(processed)
else:
    raise ValueError("Unsupported algorithm: {}".format(algorithm))

prediction = model.transform(processed)

prediction.select(feature_info["numeric_features"] +
                  feature_info["category_features"] +
                  [target, 'prediction']) \
    .coalesce(1) \
    .write.mode('overwrite') \
    .csv(result_path, header=True)
print("Result file is successfully generated at: ", result_path)

end = time.time()
elapsed = end - start
print("Elapsed time: {:.2f} seconds".format(elapsed))
Example #5
# Standardization may be superfluous in this case.
rides_standardized.describe("origin_lat", "origin_lon", "dest_lat", "dest_lon").show()

# Spark MLlib does not provide a transformer to unscale the features.  In order
# to create meaningful plots below, we will proceed with unscaled features.

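# A minimal sketch (not part of the original script): assuming the features were
# standardized with a `StandardScaler` whose fitted model is available as
# `scaler_model` (hypothetical name), its `mean` and `std` vectors make it possible
# to undo the scaling by hand:
scaled_means = scaler_model.mean.toArray()  # per-feature means used for centering
scaled_stds = scaler_model.std.toArray()    # per-feature standard deviations
# A scaled value x maps back to original units as: x * scaled_stds + scaled_means
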

# ## Specify and fit a k-means model

# Use the `KMeans` class constructor to specify a k-means model:
from pyspark.ml.clustering import KMeans
kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=3)
type(kmeans)

# Use the `explainParams` method to get a full list of the arguments:
print(kmeans.explainParams())

# Use the `fit` method to fit the k-means model:
kmeans_model = kmeans.fit(rides_standardized)
type(kmeans_model)

# **Note:** Euclidean distance may not be appropriate in this case.

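# A minimal sketch (not in the original script): besides the default "euclidean"
# measure, `KMeans` accepts `distanceMeasure="cosine"` in Spark 2.4+.  Neither is a
# true great-circle distance for latitude/longitude data, so this is illustrative only.
kmeans_cosine = KMeans(featuresCol="features", predictionCol="cluster", k=3,
                       distanceMeasure="cosine")
print(kmeans_cosine.getDistanceMeasure())
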

# ## Evaluate the k-means model

# Compute cluster costs:
kmeans_model.computeCost(rides_standardized)

# **Note:** The `computeCost` value is generally not informative on its own.  It is
# more useful when comparing multiple clustering models.
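
# A minimal sketch (not in the original script) of such a comparison, reusing the
# `computeCost` call from above over a few candidate values of k:
for candidate_k in [2, 3, 4, 5]:
    candidate_model = KMeans(featuresCol="features", predictionCol="cluster",
                             k=candidate_k).fit(rides_standardized)
    print(candidate_k, candidate_model.computeCost(rides_standardized))
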
# In[62]:

from pyspark.ml.clustering import KMeans as MlKMeans

firstMlKMeans = MlKMeans(
    featuresCol="features", predictionCol="prediction", k=2, 
    initMode="k-means||", maxIter=20)
type(firstMlKMeans)


# The model classes in the `pyspark.ml` package provide an `explainParams` method, which prints an explanation of each model parameter.

# In[63]:

print(firstMlKMeans.explainParams())


# Let's fit the model.

# In[64]:

firstMlModel = firstMlKMeans.fit(ca1mlFeaturizedDF)
type(firstMlModel)


# In[65]:

firstMlModel.clusterCenters()