def clustering_tuning(self):
    """Fit GaussianMixture models over a range of k on the preprocessed data.

    Reads the preprocessed CSV, assembles all columns into a "features"
    vector, then fits one GMM per k in 50..400 (step 10).

    Returns:
        Spark DataFrame with "features" and "newPrediction" columns for the
        LAST k fitted (kept from the original behavior; intermediate models
        are discarded).
    """
    df_raw = pd.read_csv(f"{self.DEFAULT_PREPROCESSING_OUTPUT}", header=None)
    spark = SparkSession \
        .builder \
        .appName("PySparkKMeans") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame(df_raw)
    assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
    # df_sample = df.sample(withReplacement=False, fraction=0.1)
    df_vec = assembler.transform(df).select("features")

    K_lst = list(range(50, 401, 10))
    transformed = None
    # FIX: iterate the list directly; range(K_lst) raised TypeError.
    for k in K_lst:
        gm = GaussianMixture(k=k, tol=1, seed=10)
        gm.setMaxIter(500)
        model = gm.fit(df_vec)
        model.setPredictionCol("newPrediction")
        # FIX: transform the assembled frame (which has the "features"
        # column the model was trained on), not the raw df.
        transformed = model.transform(df_vec).select("features", "newPrediction")
    # FIX: removed pandas-only reset_index(); Spark DataFrames have no such
    # method and the call raised AttributeError.
    return transformed
def run(self):
    """Train a GMM or KMeans model on prepared features and write predictions.

    Reads settings (paths, algorithm, seed, k, target) from self.settings,
    loads the parquet feature table, fits the selected clustering model, and
    writes the selected feature columns plus target and 'prediction' to a
    single CSV at result_path.

    Raises:
        ValueError: if self.settings.algorithm is neither 'GMM' nor 'KMeans'.
    """
    tf_path = self.settings.tf_path
    algorithm = self.settings.algorithm
    seed = int(self.settings.seed)
    k = int(self.settings.k)
    result_path = self.settings.result_path
    target = self.settings.target

    spark = SparkSession.builder.getOrCreate()
    # Debug breadcrumb: record which Spark session/context served this run.
    with open("train_spark.txt", "w") as file:
        file.write("spark context" + str(spark.sparkContext))
        file.write("===SeessionID===")
        file.write(str(id(spark)))

    df = spark.read.option("header", "true") \
        .option("inferSchema", "true") \
        .parquet(tf_path)
    # FIX: repartition() returns a new DataFrame; the original discarded the
    # result, so the repartitioning silently had no effect.
    df = df.repartition(10)

    # MODELING
    if algorithm == 'GMM':
        gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(
            seed)
        print("=====" * 8)
        print(gmm.explainParams())
        print("=====" * 8)
        model = gmm.fit(df)
    elif algorithm == 'KMeans':
        kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(kmm.explainParams())
        print("=====" * 8)
        model = kmm.fit(df)
    else:
        raise ValueError("no alg")

    prediction = model.transform(df)
    # Feature metadata produced by the preprocessing step.
    with open("./feature_info.pickle", "rb") as handle:
        features_info = pickle.load(handle)
    # coalesce(1) => a single CSV part file at result_path.
    prediction.select(features_info["numeric_features"] +
                      features_info["category_features"] +
                      [target, 'prediction']).coalesce(1).write.mode(
                          'overwrite').csv(result_path, header=True)
    print("Result file is successfully generated at: ", result_path)
def test_gaussian_mixture_summary(self):
    """A fitted GaussianMixtureModel exposes a complete training summary."""
    rows = [(Vectors.dense(1.0),),
            (Vectors.dense(5.0),),
            (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    frame = self.spark.createDataFrame(rows, ["features"])
    model = GaussianMixture(k=2).fit(frame)
    self.assertTrue(model.hasSummary)
    summary = model.summary
    # DataFrame-valued members of the summary.
    self.assertIsInstance(summary.predictions, DataFrame)
    self.assertIsInstance(summary.probability, DataFrame)
    self.assertIsInstance(summary.cluster, DataFrame)
    # Column-name metadata.
    self.assertEqual(summary.probabilityCol, "probability")
    self.assertEqual(summary.featuresCol, "features")
    self.assertEqual(summary.predictionCol, "prediction")
    # k components requested => k cluster sizes reported.
    self.assertEqual(len(summary.clusterSizes), 2)
    self.assertEqual(summary.k, 2)
    self.assertEqual(summary.numIter, 3)
def train(df, hiperparameter):
    """Fit a Gaussian Mixture model on *df*.

    Args:
        df: input DataFrame containing the configured features column.
        hiperparameter: dict of hyperparameters with keys featuresCol,
            predictionCol, k, probabilityCol, tol, maxIter, seed.

    Returns:
        The fitted GaussianMixtureModel.
    """
    param_names = ('featuresCol', 'predictionCol', 'k', 'probabilityCol',
                   'tol', 'maxIter', 'seed')
    # Build the estimator from the same seven configured hyperparameters.
    estimator = GaussianMixture(
        **{name: hiperparameter[name] for name in param_names})
    return estimator.fit(df)
def test_gaussian_mixture_summary(self):
    # Verify that a fitted GaussianMixtureModel exposes a training summary
    # with the expected column names, DataFrame members, and cluster counts.
    data = [(Vectors.dense(1.0), ),
            (Vectors.dense(5.0), ),
            (Vectors.dense(10.0), ),
            (Vectors.sparse(1, [], []), )]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    # The summary is only attached to the model returned by fit().
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    # k=2 components requested, so two cluster sizes are reported.
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    # NOTE(review): numIter == 3 assumes EM converges identically on this
    # fixture — confirm stability across Spark versions.
    self.assertEqual(s.numIter, 3)
def gaussian_mixture():
    """End-to-end GaussianMixture demo: fit a 3-component GMM on six 2-d
    points, inspect the summary, transform, and round-trip save/load for
    both the estimator and the fitted model.

    Note: bare expressions like `model.hasSummary` are example probes; the
    trailing `# True` comments record their expected values.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    # Six 2-d points: two near the origin, two in the positive quadrant,
    # two in the negative quadrant.
    data = [(Vectors.dense([-0.1, -0.05]), ),
            (Vectors.dense([-0.01, -0.1]), ),
            (Vectors.dense([0.9, 0.8]), ),
            (Vectors.dense([0.75, 0.935]), ),
            (Vectors.dense([-0.83, -0.68]), ),
            (Vectors.dense([-0.91, -0.76]), )]
    df = spark.createDataFrame(data, ["features"])
    gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10)
    model = gm.fit(df)
    model.hasSummary  # True
    summary = model.summary
    summary.k  # 3
    summary.clusterSizes  # [2, 2, 2]
    weights = model.weights
    len(weights)  # 3
    model.gaussiansDF.show()
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    # Points in the same region are expected to share a component.
    rows[4].prediction == rows[5].prediction  # True
    rows[2].prediction == rows[3].prediction  # True
    # Persist and reload the (unfitted) estimator...
    temp_path = "./"
    gmm_path = temp_path + "/gmm"
    gm.save(gmm_path)
    gm2 = GaussianMixture.load(gmm_path)
    gm2.getK()  # 3
    # ...and the fitted model. The training summary is not persisted.
    model_path = temp_path + "/gmm_model"
    model.save(model_path)
    model2 = GaussianMixtureModel.load(model_path)
    model2.hasSummary  # False
    model2.weights == model.weights  # True
    model2.gaussiansDF.show()
def main(args):
    """Fit a 2-component GMM on a LIBSVM dataset and record elapsed time.

    Args:
        args: CLI-style list — args[1] app name, args[2] Spark master URL,
            args[3] path to the LIBSVM data file.
    """
    spark = SparkSession \
        .builder \
        .master(args[2]) \
        .appName(args[1]) \
        .getOrCreate()

    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])
    training_data, test_data = data.randomSplit([0.7, 0.3], seed=1234)

    # Train on the 70% split, predict on the held-out 30%.
    model = GaussianMixture().setK(2).fit(training_data)
    predictions = model.transform(test_data)

    appendTime(sys.argv, start_computing_time)
    spark.stop()
summary = bkmModel.summary
# FIX: converted Python-2-only "print x" statements to print() calls; the
# statement form is a SyntaxError on Python 3 and the file already mixes in
# print() calls elsewhere.
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
# Time BisectingKMeans over four runs with random seeds, then average.
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)

############## GMM #################
from pyspark.ml.clustering import GaussianMixture
times = []
# Same four-run timing protocol for GaussianMixture (k=8).
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
gmmTime = average(times)

# preparation of data for non pyspark implementations:
# collect (screen_name, features) to the driver and split into parallel lists.
clusterData = tsneDataFrame.select("screen_name", "features").collect()
screenames = [x[0] for x in clusterData]
clData = [x[1] for x in clusterData]
clusData = np.array(clData)
# NOTE(review): assumes each feature vector is 2-d (t-SNE output) — confirm
# upstream before relying on x/y below.
x = [cl[0] for cl in clData]
y = [cl[1] for cl in clData]

################### DBSCAN ###################
from sklearn.cluster import DBSCAN
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
from pyspark.sql import SparkSession

"""
A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
  bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""

if __name__ == "__main__":
    session = SparkSession \
        .builder \
        .appName("GaussianMixtureExample") \
        .getOrCreate()

    # $example on$
    # loads data
    input_df = session.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Two components, fixed seed for reproducible example output.
    estimator = GaussianMixture().setK(2).setSeed(538009335)
    fitted = estimator.fit(input_df)

    print("Gaussians shown as a DataFrame: ")
    fitted.gaussiansDF.show(truncate=False)
    # $example off$

    session.stop()
dataset = outputFeatureDf
# Elbow-style sweep: within-set sum of squared errors for each k.
kValues = [2, 3, 4, 5, 6, 7, 8]
bwssse = []
for k in kValues:
    bkmeans = BisectingKMeans().setK(k).setSeed(122)
    bmodel = bkmeans.fit(dataset)
    bwssse.append(bmodel.computeCost(dataset))
for i in bwssse:
    print(i)

# In[31]:

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture(predictionCol="prediction").setK(2).setSeed(538009335)
gmmmodel = gmm.fit(outputFeatureDf)
print("Gaussians shown as a DataFrame: ")
gmmmodel.gaussiansDF.show()

# In[32]:

from sklearn.metrics.cluster import completeness_score
transformed = gmmmodel.transform(dataset)
labels = labeldf.collect()
label_array = [int(i[0]) for i in labels]
preds = transformed.select('prediction').collect()
preds_array = [int(i.prediction) for i in preds]
# FIX: completeness_score's signature is (labels_true, labels_pred); the
# arguments were swapped, which actually computed homogeneity.
completeness_score(label_array, preds_array)

# In[51]:
# _*_ coding:utf-8 _*_
"""Fit a 2-component GaussianMixture on Spark's sample k-means data and
print the per-component Gaussian parameters."""
from pyspark.sql import SparkSession
from pyspark.ml.clustering import GaussianMixture

spark = SparkSession.builder.appName("GaussianMixture").getOrCreate()

data_dir = "/export/home/ry/spark-2.2.1-bin-hadoop2.7/data/mllib/"
dataset = spark.read.format("libsvm").load(data_dir + "sample_kmeans_data.txt")

# k=2 components; all other parameters keep their defaults.
model = GaussianMixture(k=2).fit(dataset)

print("Gaussian: ")
model.gaussiansDF.show()
# FIX: fit the scaler on the training data only and reuse that fitted model
# for both test sets. Refitting the scaler on each test set leaks test-set
# statistics and scales the test features inconsistently with the model's
# training input.
scaled_model = stand_scaled.fit(train_df)
train_df = scaled_model.transform(train_df)
test1_df = scaled_model.transform(test1_df)
test2_df = scaled_model.transform(test2_df)

# Two-component GMM: occupancy is binary, so clusters map onto 0/1.
gm = GaussianMixture(featuresCol="features", k=2, seed=2, maxIter=20)
gmodel = gm.fit(train_df)
if gmodel.hasSummary:
    print("Cluster sizes", gmodel.summary.clusterSizes)
    print("Clusters ", gmodel.summary.k)  # FIX: typo "Clsuters"

test1_df = gmodel.transform(test1_df)
test1_df.select("features", "Occupancy", "prediction").show(5)
test2_df = gmodel.transform(test2_df)
test2_df.select("features", "Occupancy", "prediction").show(5)

# Count rows where the cluster id disagrees with the Occupancy label.
count1 = test1_df.filter(" prediction!=Occupancy").count()
total1 = test1_df.count()
count2 = test2_df.filter(" prediction!=Occupancy").count()
transformed = lda_model.transform(dataset)
transformed.display()

# COMMAND ----------

# MAGIC %md #####Topic Modeling using Latent Dirichlet Allocation

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture

# Fit a 3-component GMM on the pre-computed retail feature vectors.
train_df = spark.read.table("retail_features").selectExpr(
    "selected_features as features")
gmm = GaussianMixture(k=3, featuresCol='features')
gmm_model = gmm.fit(train_df)
# One row per component: mean vector and covariance matrix.
gmm_model.gaussiansDF.display()

# COMMAND ----------

# MAGIC %md #### 2. Association Rules
# MAGIC #####Collaborative Filtering - Alternating Least Squares

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

ratings_df = (spark.read.table("retail_features").selectExpr(
# In[21]: print("The lower bound on the log likelihood of the entire corpus: " + str(ll)) print("The upper bound on perplexity: " + str(lp)) # Describe topics. topics = model.describeTopics(3) print("The topics described by their top-weighted terms:") topics.show() # Shows the result transformed = model.transform(featurized_data) transformed.show() # In[22]: from pyspark.ml.clustering import GaussianMixture gmm = GaussianMixture().setK(3).setSeed(538009335) model = gmm.fit(featurized_data) print("Gaussians shown as a DataFrame: ") model.gaussiansDF.show(truncate=False)
# COMMAND ----------

summary = bkmModel.summary
# FIX: converted Python-2-only "print x" statements to print() calls; the
# statement form is a SyntaxError on Python 3 and the file already mixes in
# print() calls elsewhere.
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
# One shared estimator; a fresh model is fitted for each (weekday, hour)
# slice of the trip data.
gmm = GaussianMixture(k=1000)
result = []
with Timer('clustering', 'Computing clusters'):
    for weekday in range(7):
        for hour in range(24):
            with Timer('clustering', 'Computing clusters for {}x{}'.format(weekday, hour)):
                df_h = df.filter(df.weekday == weekday).filter(df.hour == hour)
                va = VectorAssembler(
                    inputCols=["pickup_latitude", "pickup_longitude"],
                    outputCol="features")
                df_t = va.transform(df_h)
                model = gmm.fit(df_t)
                df_p = model.transform(df_t)
                # Collect predictions to the driver for pandas/scipy work.
                df_pp = df_p.select('pickup_latitude', 'pickup_longitude',
                                    'prediction', 'score').toPandas()
                # Rank clusters by total score, highest first.
                df_scores = df_pp.groupby(
                    ['prediction'])['score'].sum().sort_values(ascending=False)
                # NOTE(review): selecting two columns from a groupby with a
                # bare tuple is deprecated/removed in newer pandas — should
                # be groupby(['prediction'])[['pickup_longitude',
                # 'pickup_latitude']]; confirm the pinned pandas version.
                df_points = df_pp.groupby(['prediction'])['pickup_longitude',
                                                          'pickup_latitude']
                for cluster, c_score in df_scores.items():
                    points = df_points.get_group(cluster)
                    try:
                        hull = ConvexHull(points.values)
                    # NOTE(review): bare except silently skips degenerate
                    # clusters (fewer than 3 points / collinear); prefer
                    # catching scipy.spatial.QhullError explicitly.
                    except:
                        continue
    .read \
    .format("libsvm") \
    .load("data\data_libsvm.txt")
# NOTE(review): "data\data_libsvm.txt" relies on "\d" not being a recognized
# escape — prefer a raw string or forward slashes; confirm intended path.
data.show()

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
numTraining = trainingData.count()
numTest = testData.count()
print("numTraining = ",numTraining, " numTest =", numTest)

# Train a Gaussian mixture model (the earlier comment said "Latent
# Dirichlet allocation", but this is a GMM with k=200 components).
gmm = GaussianMixture(k=200, tol=0.0001,maxIter=10, seed=1)
model=gmm.fit(trainingData)
if model.hasSummary:
    summary=model.summary
    print("k=",summary.k)
    print("cluster sizes=",summary.clusterSizes)
    print("logLikelihood=",summary.logLikelihood)
    print("len weights=",len(model.weights))

# Make predictions.
predictions = model.transform(testData)
predictions.show(5, truncate=False)

#print("Gaussians shown as a DataFrame: ")
#print(model.gaussiansDF.select("mean").head())
from pyspark.ml.clustering import GaussianMixture

# Scatter plot of the two crime types for visual reference.
g = sns.lmplot(x='X Coordinate', y='Y Coordinate', hue='Primary Type',
               data=crime_df, fit_reg=False, size=10,
               palette={'NARCOTICS': 'tomato', 'THEFT': 'skyblue'})

# for each type of crime
for crime_type, colour in [('NARCOTICS', 'r'), ('THEFT', 'b')]:
    crime_subset = (
        crime_with_features_sdf
        .filter(sf.col('Primary Type') == crime_type)
    )

    # fit a GMM with 30 components on this crime type's records
    gmm = GaussianMixture(k=30)
    model = gmm.fit(crime_subset)

    # extract the centers of the gaussians (collected into pandas)
    centers = (
        model
        .gaussiansDF
        .toPandas()
    )

    # Put the transformed data in a variable below
    crimes_with_predictions = model.transform(crime_subset)

    # 2.
    # Write code here
    # NOTE(review): this chained expression continues beyond this excerpt.
    ranked_gaussians = (
        crimes_with_predictions
        .withColumn('probability', get_probability_udf('probability'))
# Load the ratings CSV (from an S3-style object body) and mean-center ratings.
df = pd.read_csv(obj['Body'])
df.rating = (df.rating - df.rating.mean())
ratings = spark.createDataFrame(df)

# use the model that has min RMSE
num_iter,param = 200,0.2
als = ALS(maxIter=num_iter, regParam=param, userCol="user_id",
          itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings)
# ALS latent-factor matrices become the GMM clustering inputs.
user_feature = model.userFactors
book_feature = model.itemFactors

# Cluster user factors into k=20 groups; write assignments over JDBC.
k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(user_feature)
transformed = model.transform(user_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)
# NOTE(review): 'jdbc:%s' % url+'yelp' parses as ('jdbc:%s' % url) + 'yelp'
# — confirm 'yelp' is meant to be appended after formatting, not part of url.
df.write.jdbc(url='jdbc:%s' % url+'yelp', table='book_gm_user_feature20',
              mode='overwrite', properties=properties)

# Same k=20 clustering for the book (item) factors.
k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(book_feature)
transformed = model.transform(book_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)
# Fit the feature pipeline and persist the transformed features as parquet.
tot_pipeline = Pipeline(stages=[features_processed])
processed = tot_pipeline.fit(df).transform(df)
processed.write.mode("overwrite").parquet(tf_path)
# Column-name metadata carried alongside the features.
feature_info = {
    "numeric_features": numeric_features,
    "category_features": category_features
}

# MODELING
# Select the clustering estimator by name; both read the assembled
# "features" column and use a fixed seed for reproducibility.
if algorithm == 'GMM':
    gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(gmm.explainParams())
    print("=====" * 8)
    model = gmm.fit(processed)
elif algorithm == 'KMeans':
    kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(kmm.explainParams())
    print("=====" * 8)
    model = kmm.fit(processed)
else:
    raise ValueError("no alg")

prediction = model.transform(processed)
# coalesce(1) => single CSV part file with feature columns, target, cluster.
prediction.select(
    feature_info["numeric_features"] + feature_info["category_features"] +
    [target, 'prediction']).coalesce(1).write.mode('overwrite').csv(
        result_path, header=True)
# negative_udf = udf(lambda x: tp_values(x, 1)) # # # # # # train_df = train_df.withColumn('pos', positive_udf(col('ST')).astype(IntegerType())) \ # .withColumn('neg', negative_udf(col('ST')).astype(IntegerType())) # # train_df.show() # # assembler = VectorAssembler(inputCols=['pos', 'neg'], outputCol='features') # train_df = assembler.transform(train_df) # train_df.show() #modelling kmeans = KMeans().setK(2).setSeed(1).setMaxIter(20) model = kmeans.fit(train_df) model.transform(test_df).show() for c in model.clusterCenters(): print(c) # bkmeans = BisectingKMeans().setK(2).setSeed(1).setMaxIter(20) model = bkmeans.fit(train_df) model.transform(test_df).show() for c in model.clusterCenters(): print(c) gaussianmixture = GaussianMixture().setK(2).setSeed(1) model = gaussianmixture.fit(train_df) model.transform(test_df).show()
from pyspark.ml.feature import VectorAssembler

# Combine the selected columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=selected, outputCol="features")
assembled = assembler.transform(students)

# ## Fit a Gaussian mixture model

# Specify a Gaussian mixture model with two clusters:
from pyspark.ml.clustering import GaussianMixture
gm = GaussianMixture(featuresCol="features", k=2, seed=12345)

# Examine the hyperparameters:
print(gm.explainParams())

# Fit the Gaussian mixture model:
gmm = gm.fit(assembled)
type(gmm)  # GaussianMixtureModel

# ## Examine the Gaussian mixture model

# Examine the mixing weights:
gmm.weights

# Examine the (multivariate) Gaussian distributions:
gmm.gaussiansDF.head(5)

# Examine the model summary:
gmm.hasSummary

# Examine the cluster sizes:
# $example on$ from pyspark.ml.clustering import GaussianMixture # $example off$ from pyspark.sql import SparkSession """ A simple example demonstrating Gaussian Mixture Model (GMM). Run with: bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py """ if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("GaussianMixtureExample")\ .getOrCreate() # $example on$ # loads data dataset = spark.read.format("libsvm").load( "data/mllib/sample_kmeans_data.txt") gmm = GaussianMixture().setK(2).setSeed(538009335) model = gmm.fit(dataset) print("Gaussians shown as a DataFrame: ") model.gaussiansDF.show(truncate=False) # $example off$ spark.stop()
from pyspark.ml.feature import VectorAssembler

# Assemble the selected columns into one "features" vector column.
assembler = VectorAssembler(inputCols=selected, outputCol="features")
assembled = assembler.transform(students)

# ## Fit a Gaussian mixture model

# Specify a Gaussian mixture model with two clusters:
from pyspark.ml.clustering import GaussianMixture
gm = GaussianMixture(featuresCol="features", k=2, seed=12345)

# Examine the hyperparameters:
print(gm.explainParams())

# Fit the Gaussian mixture model:
gmm = gm.fit(assembled)
type(gmm)  # GaussianMixtureModel

# ## Examine the Gaussian mixture model

# Examine the mixing weights:
gmm.weights

# Examine the (multivariate) Gaussian distributions:
gmm.gaussiansDF.head(5)

# Examine the model summary:
gmm.hasSummary

# Examine the cluster sizes:
df = spark.sql(q)

# Assemble the id columns into a single "FEATURE" vector for clustering.
assembler = VectorAssembler(inputCols=[
    "cid", "id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9",
    "id10", "id11", "id12", "id13", "id14", "id15", "id16"
], outputCol="FEATURE")
vd = assembler.transform(df)

cost = list()
# Two-component GMM; fixed seed, loosened tolerance (0.01) for faster EM.
gmm = GaussianMixture().setK(2).setFeaturesCol('FEATURE').setSeed(
    538009335).setTol(0.01)
model = gmm.fit(vd)
weights = model.weights
print(weights)
summary = model.summary
summary.k
logLikelihood = summary.logLikelihood
param = model.explainParams()
print(param)
# Per-component mean vector and covariance matrix.
model.gaussiansDF.select("mean").head()
model.gaussiansDF.select("cov").head()
model.gaussiansDF.show()
# Record the chosen k in the MLflow run.
mlflow.log_param("k", 2)
    .builder \
    .appName("ChiSqSelectorExample") \
    .getOrCreate()

rawData = spark.sparkContext.textFile("file:///home/tianlei/iris.txt")

def f(x):
    # Build a Row-compatible dict with a dense vector of the four iris
    # measurements parsed from one CSV record.
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]), float(x[3]))
    return rel

df = sc.textFile("file:///usr/local/spark/iris.txt").map(
    lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()

# Build a simple GaussianMixture with 3 clusters; all other parameters keep
# their default values.
gm = GaussianMixture().setK(3).setPredictionCol(
    "Prediction").setProbabilityCol("Probability")
gmm = gm.fit(df)

# After transform(), each sample carries its predicted cluster and a
# probability vector over the components (output truncated for clarity in
# the original tutorial).
result = gmm.transform(df)
result.show(150, False)

# Unlike KMeans, GMM does not report cluster centers directly; it reports
# the parameters of each mixture component (a multivariate Gaussian, stored
# as a MultivariateGaussian in org.apache.spark.ml.stat.distribution).
# `weights` gives the component weights; gaussiansDF holds each component's
# mean vector and covariance matrix.
for i in range(3):
    print("Component " + str(i) + " : weight is " + str(gmm.weights[i]) +
          "\n mu vector is " + str(gmm.gaussiansDF.select('mean').head()) +
          " \n sigma matrix is " + str(gmm.gaussiansDF.select('cov').head()))