def clustering_tuning(self):
    df_raw = pd.read_csv(f"{self.DEFAULT_PREPROCESSING_OUTPUT}", header=None)
    spark = SparkSession \
        .builder \
        .appName("PySparkKMeans") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame(df_raw)
    assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
    # df_sample = df.sample(withReplacement=False, fraction=0.1)
    df_vec = assembler.transform(df).select("features")

    K_lst = list(range(50, 401, 10))

    # GMM: fit one model per candidate k
    for k in K_lst:  # iterate over the candidate list itself, not range(K_lst)
        gm = GaussianMixture(k=k, tol=1, seed=10)
        gm.setMaxIter(500)
        model = gm.fit(df_vec)
        model.setPredictionCol("newPrediction")
        # transform the assembled vectors; the raw df has no "features" column
        transformed = model.transform(df_vec).select("features", "newPrediction")
    # Spark DataFrames have no reset_index(); return the last transform as-is.
    return transformed
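# A minimal model-selection sketch (not in the original): the loop above
# overwrites `transformed` on every iteration, so nothing is actually compared
# across k. One hedged way to pick k is to track the fitted log-likelihood per
# k and keep the best model. Names below (`ll_by_k`, `best_model`) are
# illustrative, not from the original code; assumes `df_vec` and `K_lst` as above.
ll_by_k = {}
best_model, best_ll = None, float("-inf")
for k in K_lst:
    model = GaussianMixture(k=k, tol=1, seed=10, maxIter=500).fit(df_vec)
    ll_by_k[k] = model.summary.logLikelihood
    if ll_by_k[k] > best_ll:
        best_model, best_ll = model, ll_by_k[k]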
def run(self):
    tf_path = self.settings.tf_path
    algorithm = self.settings.algorithm
    seed = int(self.settings.seed)
    k = int(self.settings.k)
    result_path = self.settings.result_path
    target = self.settings.target

    spark = SparkSession.builder.getOrCreate()
    with open("train_spark.txt", "w") as file:
        file.write("spark context" + str(spark.sparkContext))
        file.write("===SessionID===")
        file.write(str(id(spark)))

    df = spark.read.option("header", "true") \
        .option("inferSchema", "true") \
        .parquet(tf_path)
    df = df.repartition(10)  # repartition returns a new DataFrame; assign it

    # MODELING
    if algorithm == 'GMM':
        gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(gmm.explainParams())
        print("=====" * 8)
        model = gmm.fit(df)
    elif algorithm == 'KMeans':
        kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(kmm.explainParams())
        print("=====" * 8)
        model = kmm.fit(df)
    else:
        raise ValueError("no alg")

    prediction = model.transform(df)
    with open("./feature_info.pickle", "rb") as handle:
        features_info = pickle.load(handle)
    prediction.select(features_info["numeric_features"] +
                      features_info["category_features"] +
                      [target, 'prediction']).coalesce(1).write.mode(
                          'overwrite').csv(result_path, header=True)
    print("Result file is successfully generated at: ", result_path)
def gmmresults():
    df1 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")  # products.csv is loaded twice
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures", numFeatures=20000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    gmm = GaussianMixture(k=8, featuresCol='features')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, gmm])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # "display" is for Databricks; use show() for OSS Apache Spark
    results.filter(results.prediction == 1).show(200, False)
    results.show()
    results.toPandas().to_csv(
        'gmmresultsCanadaAndProductsAndDisastersAndClaritin.csv')
def main():
    parser = argparse.ArgumentParser(description='Clustering with pyspark.')
    parser.add_argument('--data-file', type=str, default='enwiki.json')
    parser.add_argument('--num-clusters', type=int, default=4)
    parser.add_argument('--seed', type=int, default=23)
    parser.add_argument('--algorithm', default='kmeans',
                        choices=['kmeans', 'hier', 'gmm'])
    parser.add_argument('--output-groundtruth', type=str,
                        default='groundtruth.csv')
    parser.add_argument('--output-cluster', type=str, default='cluster.csv')
    args = parser.parse_args()

    spark_session = SparkSession.builder.appName('clustering').getOrCreate()
    data = preprocess(spark_session, args.data_file)

    # argparse's `choices` guarantees exactly one of these branches is taken
    if args.algorithm == 'kmeans':
        alg = KMeans()
    elif args.algorithm == 'hier':
        alg = BisectingKMeans()
    elif args.algorithm == 'gmm':
        alg = GaussianMixture()

    model = train(alg, data, args.num_clusters, seed=args.seed)
    evaluate(data, model, args.algorithm, args.num_clusters,
             args.output_groundtruth, args.output_cluster)
def train(df, hiperparameter):
    '''
    Gaussian Mixture training, returning a Gaussian Mixture model.
    input: - DataFrame
           - config (hyperparameter configuration)
    return: Gaussian Mixture model
    '''
    gm = GaussianMixture(featuresCol=hiperparameter['featuresCol'],
                         predictionCol=hiperparameter['predictionCol'],
                         k=hiperparameter['k'],
                         probabilityCol=hiperparameter['probabilityCol'],
                         tol=hiperparameter['tol'],
                         maxIter=hiperparameter['maxIter'],
                         seed=hiperparameter['seed'])
    model = gm.fit(df)
    return model
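# Hedged usage sketch (not from the original): a config dict with the keys
# train() expects, mirroring GaussianMixture's defaults except for k, applied
# to an assumed DataFrame `df` that already has a 'features' vector column.
hiperparameter = {
    'featuresCol': 'features',
    'predictionCol': 'prediction',
    'probabilityCol': 'probability',
    'k': 3,
    'tol': 0.01,
    'maxIter': 100,
    'seed': 42,
}
model = train(df, hiperparameter)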
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),),
            (Vectors.dense(10.0),), (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
def main(args):
    spark = SparkSession\
        .builder\
        .master(args[2])\
        .appName(args[1])\
        .getOrCreate()

    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])
    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

    gmm = GaussianMixture().setK(2)
    model = gmm.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    appendTime(sys.argv, start_computing_time)
    spark.stop()
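# Hedged follow-up (not part of the original timing benchmark): if a quality
# metric were wanted in addition to runtime, a silhouette score could be
# computed on `predictions` inside main(), before spark.stop() is called.
# Assumes Spark >= 2.3, where pyspark.ml.evaluation.ClusteringEvaluator exists:
#
#     from pyspark.ml.evaluation import ClusteringEvaluator
#     silhouette = ClusteringEvaluator().evaluate(predictions)
#     print("Silhouette:", silhouette)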
def fit_ml_model(self, k, sample_fraction=None, retry=True):
    if sample_fraction:
        training_data = self.ml_training_data.sample(
            False, fraction=sample_fraction)
    else:
        training_data = self.ml_training_data

    result = GaussianMixture(
        k=k, maxIter=self.max_iterations).fit(training_data)
    ll = result.summary.logLikelihood

    # Retry to get a valid model if the calculated log likelihood is > 0.
    retries = 0
    while retry and ll > 0 and retries < self.fit_model_retries:
        retry_sample_fraction = sample_fraction or self.ll_sample_fraction
        retry_data = self.ml_training_data.sample(
            False, fraction=retry_sample_fraction)
        result = GaussianMixture(
            k=k, maxIter=self.max_iterations).fit(retry_data)
        ll = result.summary.logLikelihood
        retries += 1
    return result
def gaussian_mixture():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.dense([-0.1, -0.05]),),
            (Vectors.dense([-0.01, -0.1]),),
            (Vectors.dense([0.9, 0.8]),),
            (Vectors.dense([0.75, 0.935]),),
            (Vectors.dense([-0.83, -0.68]),),
            (Vectors.dense([-0.91, -0.76]),)]
    df = spark.createDataFrame(data, ["features"])

    gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10)
    model = gm.fit(df)
    model.hasSummary  # True
    summary = model.summary
    summary.k  # 3
    summary.clusterSizes  # [2, 2, 2]
    weights = model.weights
    len(weights)  # 3
    model.gaussiansDF.show()

    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[4].prediction == rows[5].prediction  # True
    rows[2].prediction == rows[3].prediction  # True

    # Save and reload both the estimator and the fitted model.
    temp_path = "./"
    gmm_path = temp_path + "/gmm"
    gm.save(gmm_path)
    gm2 = GaussianMixture.load(gmm_path)
    gm2.getK()  # 3
    model_path = temp_path + "/gmm_model"
    model.save(model_path)
    model2 = GaussianMixtureModel.load(model_path)
    model2.hasSummary  # False
    model2.weights == model.weights  # True
    model2.gaussiansDF.show()
# In[36]:

dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
bwssse = []
for k in kValues:
    bkmeans = BisectingKMeans().setK(k).setSeed(122)
    bmodel = bkmeans.fit(dataset)
    bwssse.append(bmodel.computeCost(dataset))
for i in bwssse:
    print(i)

# In[31]:

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture(predictionCol="prediction").setK(2).setSeed(538009335)
gmmmodel = gmm.fit(outputFeatureDf)
print("Gaussians shown as a DataFrame: ")
gmmmodel.gaussiansDF.show()

# In[32]:

from sklearn.metrics.cluster import completeness_score
transformed = gmmmodel.transform(dataset)
labels = labeldf.collect()
label_array = [int(i[0]) for i in labels]
preds = transformed.select('prediction').collect()
preds_array = [int(i.prediction) for i in preds]
# sklearn expects (labels_true, labels_pred), in that order
completeness_score(label_array, preds_array)

# In[51]:
                             withStd=True, withMean=True)
scaled_model = stand_scaled.fit(train_df)  # fit the scaler on training data only
train_df = scaled_model.transform(train_df)
test1_df = scaled_model.transform(test1_df)  # reuse the train-fitted scaler;
test2_df = scaled_model.transform(test2_df)  # refitting per test set would be inconsistent

gm = GaussianMixture(featuresCol="features", k=2, seed=2, maxIter=20)
gmodel = gm.fit(train_df)
if gmodel.hasSummary:
    print("Cluster sizes", gmodel.summary.clusterSizes)
    print("Clusters ", gmodel.summary.k)

test1_df = gmodel.transform(test1_df)
test1_df.select("features", "Occupancy", "prediction").show(5)
test2_df = gmodel.transform(test2_df)
test2_df.select("features", "Occupancy", "prediction").show(5)

count1 = test1_df.filter("prediction != Occupancy").count()
total1 = test1_df.count()
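# Hedged follow-up (not in the original snippet): with count1 mismatches out
# of total1 rows, the cluster/label agreement on test1 would be
accuracy1 = 1.0 - count1 / total1
print("test1 agreement:", accuracy1)
# Note: GMM cluster ids are arbitrary, so this may need label flipping
# (i.e., max(accuracy1, 1 - accuracy1)) before reading it as accuracy.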
transformed = lda_model.transform(dataset)
transformed.display()

# COMMAND ----------

# MAGIC %md ##### Clustering using Gaussian Mixture Models

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture

train_df = spark.read.table("retail_features").selectExpr(
    "selected_features as features")

gmm = GaussianMixture(k=3, featuresCol='features')
gmm_model = gmm.fit(train_df)
gmm_model.gaussiansDF.display()

# COMMAND ----------

# MAGIC %md #### 2. Association Rules
# MAGIC ##### Collaborative Filtering - Alternating Least Squares

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
# Integrate features
features_processed = VectorAssembler(inputCols=["category", "numeric"],
                                     outputCol="features")
tot_pipeline = Pipeline(stages=[features_processed])
processed = tot_pipeline.fit(df).transform(df)
processed.write.mode("overwrite").parquet(tf_path)

feature_info = {
    "numeric_features": numeric_features,
    "category_features": category_features
}

# MODELING
if algorithm == 'GMM':
    gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(gmm.explainParams())
    print("=====" * 8)
    model = gmm.fit(processed)
elif algorithm == 'KMeans':
    kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(kmm.explainParams())
    print("=====" * 8)
    model = kmm.fit(processed)
else:
    raise ValueError("no alg")

prediction = model.transform(processed)
bkmModel = bkm.fit(sales)

# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
q = "SELECT ss.ss_customer_id AS cid, count(CASE WHEN i.i_class_id=1 THEN 1 ELSE NULL END) AS id1,count(CASE WHEN i.i_class_id=3 THEN 1 ELSE NULL END) AS id3,count(CASE WHEN i.i_class_id=5 THEN 1 ELSE NULL END) AS id5,count(CASE WHEN i.i_class_id=7 THEN 1 ELSE NULL END) AS id7, count(CASE WHEN i.i_class_id=9 THEN 1 ELSE NULL END) AS id9,count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS id11,count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS id13,count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS id15,count(CASE WHEN i.i_class_id=2 THEN 1 ELSE NULL END) AS id2,count(CASE WHEN i.i_class_id=4 THEN 1 ELSE NULL END) AS id4,count(CASE WHEN i.i_class_id=6 THEN 1 ELSE NULL END) AS id6,count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS id12, count(CASE WHEN i.i_class_id=8 THEN 1 ELSE NULL END) AS id8,count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS id10,count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS id14,count(CASE WHEN i.i_class_id=16 THEN 1 ELSE NULL END) AS id16 FROM store_sales ss INNER JOIN items i ON ss.ss_item_id = i.i_item_id WHERE i.i_category_name IN ('cat#01','cat#02','cat#03','cat#04','cat#05','cat#06','cat#07','cat#08','cat#09','cat#10','cat#11','cat#12','cat#013','cat#14','cat#15') AND ss.ss_customer_id IS NOT NULL GROUP BY ss.ss_customer_id HAVING count(ss.ss_item_id) > 3" df = spark.sql(q) assembler = VectorAssembler(inputCols=[ "cid", "id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9", "id10", "id11", "id12", "id13", "id14", "id15", "id16" ], outputCol="FEATURE") vd = assembler.transform(df) cost = list() gmm = GaussianMixture().setK(2).setFeaturesCol('FEATURE').setSeed( 538009335).setTol(0.01) model = gmm.fit(vd) weights = model.weights print(weights) summary = model.summary summary.k logLikelihood = summary.logLikelihood param = model.explainParams() print(param) model.gaussiansDF.select("mean").head() model.gaussiansDF.select("cov").head() model.gaussiansDF.show()
# In[21]:

print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show()

# Shows the result
transformed = model.transform(featurized_data)
transformed.show()

# In[22]:

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(3).setSeed(538009335)
model = gmm.fit(featurized_data)
print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)
# Let's test again with the best k = 7 on the denormalized dataset
kmeans = KMeans(featuresCol="features").setK(7).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])
model = pipeline.fit(df_denormalized)
predictions = model.transform(df_denormalized)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try GaussianMixture instead of KMeans
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture(featuresCol="features").setK(14).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, gmm])
model = pipeline.fit(df)
predictions = model.transform(df)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try finding the best K using an iterative method
Ks = 15
mean_acc = np.zeros((Ks - 1))
ConfusionMx = []
for n in range(7, Ks):
    # Train Model and Predict
def SparkML(train_df, test_df=None, featuresCol='features', labelCol='label',
            binaryclass=False, multiclass=False, n_cluster=2, userCol='user',
            itemCol='item', ratingCol='rating', rank=10, userid=3, itemid=3,
            itemsCol='items', minSupport=0.3, minConfidence=0.8,
            stringIndexer=False, inputColStringIndexer=None,
            outputColStringIndexer=None, oneHotEncoder=False,
            inputColOneHotEncoder=None, outputColOneHotEncoder=None,
            vectorAssembler=False, inputColsVectorAssembler=None,
            outputColsVectorAssembler=None, vectorIndexer=False,
            inputColsVectorIndexer=None, outputColsVectorIndexer=None,
            maxCategories=None, classification=False, logisticregression=False,
            decisiontreeclassifier=False, linearsvc=False, naivebayes=False,
            randomforestclassifier=False, gbtclassifier=False, regression=False,
            linearregression=True, decisiontreeregressor=False,
            randomforestregressor=False, gbtregressor=False, clustering=False,
            kmeans=False, gaussianmixture=False, lda=False,
            recommendation=False, als=False, association=False, fpgrowth=False):

    def _stages():
        # Every pipeline below starts from the same feature-engineering
        # stages, so build them once instead of repeating the call verbatim.
        return FeaturesTransform(
            stringIndexer=stringIndexer,
            inputColStringIndexer=inputColStringIndexer,
            outputColStringIndexer=outputColStringIndexer,
            oneHotEncoder=oneHotEncoder,
            inputColOneHotEncoder=inputColOneHotEncoder,
            outputColOneHotEncoder=outputColOneHotEncoder,
            vectorAssembler=vectorAssembler,
            inputColsVectorAssembler=inputColsVectorAssembler,
            outputColsVectorAssembler=outputColsVectorAssembler,
            vectorIndexer=vectorIndexer,
            inputColsVectorIndexer=inputColsVectorIndexer,
            outputColsVectorIndexer=outputColsVectorIndexer,
            maxCategories=maxCategories)

    def _clf_evaluator():
        # Shared evaluator choice: AUC for binary problems, accuracy otherwise.
        if binaryclass:
            return BinaryClassificationEvaluator(
                rawPredictionCol="RawPrediction", labelCol=labelCol,
                metricName="areaUnderROC")
        if multiclass:
            return MulticlassClassificationEvaluator(
                labelCol=labelCol, predictionCol="Prediction",
                metricName="accuracy")

    if classification:
        if logisticregression:
            stagesList = _stages()
            LRClassifier = LogisticRegression(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', standardization=True,
                maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06,
                fitIntercept=True, threshold=0.5)
            paramGrid = ParamGridBuilder() \
                .addGrid(LRClassifier.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(LRClassifier.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            evaluator = _clf_evaluator()
            LRCV = CrossValidator(estimator=LRClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return (LRC_BestModel, LRC_Predicted, LRC_Probability,
                    LRC_Prediction, LRC_Score)
        if decisiontreeclassifier:
            stagesList = _stages()
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(DTClassifier.impurity, ["gini", "entropy"]) \
                .addGrid(DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(DTClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .build()
            evaluator = _clf_evaluator()
            DTCV = CrossValidator(estimator=DTClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return (DTC_BestModel, DTC_Predicted, DTC_Probability,
                    DTC_Prediction, DTC_Score)
        if linearsvc:
            stagesList = _stages()
            SVClassifier = LinearSVC(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', rawPredictionCol='RawPrediction',
                maxIter=100, regParam=0.0, tol=1e-06, fitIntercept=True,
                standardization=True, threshold=0.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(SVClassifier.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(SVClassifier.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            evaluator = _clf_evaluator()
            SVCV = CrossValidator(estimator=SVClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score
        if naivebayes:
            stagesList = _stages()
            NBClassifier = NaiveBayes(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', smoothing=1.0,
                modelType='multinomial', thresholds=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(NBClassifier.smoothing,
                         [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]) \
                .build()
            evaluator = _clf_evaluator()
            NBCV = CrossValidator(estimator=NBClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return (NBC_BestModel, NBC_Predicted, NBC_Probability,
                    NBC_Prediction, NBC_Score)
        if randomforestclassifier:
            stagesList = _stages()
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='gini',
                numTrees=20, featureSubsetStrategy='auto', seed=None,
                subsamplingRate=1.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(RFClassifier.impurity, ["gini", "entropy"]) \
                .addGrid(RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(RFClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(RFClassifier.numTrees, [5, 10, 20, 50, 100, 200]) \
                .addGrid(RFClassifier.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = _clf_evaluator()
            RFCV = CrossValidator(estimator=RFClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return (RFC_BestModel, RFC_Predicted, RFC_Probability,
                    RFC_Prediction, RFC_Score)
        if gbtclassifier:
            stagesList = _stages()
            GBClassifier = GBTClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, lossType='logistic',
                maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(GBClassifier.maxIter, [5, 10, 20, 50, 100, 200]) \
                .addGrid(GBClassifier.stepSize,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .addGrid(GBClassifier.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            # accuracy is used here regardless of binaryclass/multiclass
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol, predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score

    if regression:
        if linearregression:
            stagesList = _stages()
            LRegressor = LinearRegression(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', standardization=True,
                fitIntercept=True, loss='squaredError', maxIter=100,
                regParam=0.0, elasticNetParam=0.0, tol=1e-06, epsilon=1.35)
            paramGrid = ParamGridBuilder() \
                .addGrid(LRegressor.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(LRegressor.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score
        if decisiontreeregressor:
            stagesList = _stages()
            DTRegressor = DecisionTreeRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='variance',
                seed=None, varianceCol=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score
        if randomforestregressor:
            stagesList = _stages()
            RFRegressor = RandomForestRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='variance',
                subsamplingRate=1.0, seed=None, numTrees=20)
            paramGrid = ParamGridBuilder() \
                .addGrid(RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(RFRegressor.numTrees, [5, 10, 20, 50, 100, 200]) \
                .addGrid(RFRegressor.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score
        if gbtregressor:
            stagesList = _stages()
            GBRegressor = GBTRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, subsamplingRate=1.0,
                lossType='squared', maxIter=20, stepSize=0.1, seed=None,
                impurity='variance')
            paramGrid = ParamGridBuilder() \
                .addGrid(GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(GBRegressor.maxIter, [5, 10, 20, 50, 100, 200]) \
                .addGrid(GBRegressor.stepSize,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .addGrid(GBRegressor.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score

    if clustering:
        if kmeans:
            stagesList = _stages()
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction', k=n_cluster,
                              initMode='k-means||', initSteps=2, tol=0.0001,
                              maxIter=20, seed=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]) \
                .addGrid(KCluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(KCluster.seed, [i for i in range(1001)]) \
                .build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score
        if gaussianmixture:
            stagesList = _stages()
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster, tol=0.01, maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(GMCluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(GMCluster.seed, [i for i in range(1001)]) \
                .build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return (GMC_BestModel, GMC_Predicted, GMC_Probability,
                    GMC_Prediction, GMC_Score)
        if lda:
            stagesList = _stages()
            LDACluster = LDA(featuresCol=featuresCol, maxIter=20, seed=None,
                             k=n_cluster, learningOffset=1024.0,
                             learningDecay=0.51, subsamplingRate=0.05)
            paramGrid = ParamGridBuilder() \
                .addGrid(LDACluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(LDACluster.seed, [i for i in range(1001)]) \
                .addGrid(LDACluster.subsamplingRate,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .build()
            # Caveat: LDA's transform adds a topicDistribution column, not a
            # 'Prediction' column, so this silhouette evaluator will not find
            # the column it expects.
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            LDACV = CrossValidator(estimator=LDACluster, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score

    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol, itemCol=itemCol, ratingCol=ratingCol,
                       rank=rank, maxIter=10, regParam=0.1, numUserBlocks=10,
                       numItemBlocks=10, alpha=1.0, seed=1)
            ALSR_Model = ALSR.fit(train_df)
            # recommendForAllUsers/recommendForAllItems take the number of
            # recommendations as a positional argument, not a keyword.
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems

    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport, minConfidence=minConfidence,
                           itemsCol=itemsCol, predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
times = []
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)

############## GMM #################
from pyspark.ml.clustering import GaussianMixture

times = []
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
gmmTime = average(times)

# preparation of data for non-PySpark implementations
clusterData = tsneDataFrame.select("screen_name", "features").collect()
screenames = [x[0] for x in clusterData]
clData = [x[1] for x in clusterData]
clusData = np.array(clData)
x = [cl[0] for cl in clData]
y = [cl[1] for cl in clData]

################### DBSCAN ###################
# longitude are already in the same scale, we can proceed.

# Select home latitude and longitude as the features:
selected = ["home_lat", "home_lon"]

# Assemble the feature vector:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=selected, outputCol="features")
assembled = assembler.transform(students)

# ## Fit a Gaussian mixture model

# Specify a Gaussian mixture model with two clusters:
from pyspark.ml.clustering import GaussianMixture
gm = GaussianMixture(featuresCol="features", k=2, seed=12345)

# Examine the hyperparameters:
print(gm.explainParams())

# Fit the Gaussian mixture model:
gmm = gm.fit(assembled)
type(gmm)

# ## Examine the Gaussian mixture model

# Examine the mixing weights:
gmm.weights

# Examine the (multivariate) Gaussian distributions:
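# The output for the last heading was cut off here; a plausible completion
# (an assumption, not the original code), given that `gmm` is a fitted
# GaussianMixtureModel, is the per-component DataFrame of means and covariances:
gmm.gaussiansDF.show(truncate=False)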
# negative_udf = udf(lambda x: tp_values(x, 1))
#
# train_df = train_df.withColumn('pos', positive_udf(col('ST')).astype(IntegerType())) \
#     .withColumn('neg', negative_udf(col('ST')).astype(IntegerType()))
#
# train_df.show()
#
# assembler = VectorAssembler(inputCols=['pos', 'neg'], outputCol='features')
# train_df = assembler.transform(train_df)
# train_df.show()

# modelling
kmeans = KMeans().setK(2).setSeed(1).setMaxIter(20)
model = kmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)

bkmeans = BisectingKMeans().setK(2).setSeed(1).setMaxIter(20)
model = bkmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)

gaussianmixture = GaussianMixture().setK(2).setSeed(1)
model = gaussianmixture.fit(train_df)
model.transform(test_df).show()
k = args.k_clusters

if algorithm not in ['kmeans', 'gmm', 'lda', 'spectral']:
    raise ValueError('Not a valid algorithm')

ss = SparkSession.builder.getOrCreate()
df = ss.read.csv(path, header=True, inferSchema=True)
df_preprocessed = preprocessing(df, num_pca=num_pca_features)
df_preprocessed.write.parquet("preprocessed", mode="overwrite")

if algorithm == 'kmeans':
    model = KMeans(k=k).setSeed(1).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'spectral':
    model = SpectralClustering(k=k, k_nearest=7)
    predictions = model.cluster(df_preprocessed, ss,
                                repartition_num=num_nodes)
elif algorithm == 'lda':
    model = LDA(k=k, maxIter=10).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'gmm':
    model = GaussianMixture(k=k).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)

predictions.select([col for col in predictions.columns if col != 'features'])\
    .toPandas()\
    .to_csv(sys.stdout)
obj = client.get_object(Bucket='yelpdatacf', Key='book_cf.csv')
df = pd.read_csv(obj['Body'])
df.rating = df.rating - df.rating.mean()  # mean-center the ratings
ratings = spark.createDataFrame(df)

# use the model that has min RMSE
num_iter, param = 200, 0.2
als = ALS(maxIter=num_iter, regParam=param, userCol="user_id",
          itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings)
user_feature = model.userFactors
book_feature = model.itemFactors

# cluster the ALS user factors with a GMM
k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(user_feature)
transformed = model.transform(user_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)
df.write.jdbc(url='jdbc:%s' % url + 'yelp', table='book_gm_user_feature20',
              mode='overwrite', properties=properties)

# cluster the ALS item (book) factors the same way
k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(book_feature)
transformed = model.transform(book_feature).select('id', 'prediction')
rows = transformed.collect()
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
from pyspark.sql import SparkSession

"""
A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
  bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("GaussianMixtureExample")\
        .getOrCreate()

    # $example on$
    # loads data
    dataset = spark.read.format("libsvm").load(
        "data/mllib/sample_kmeans_data.txt")

    gmm = GaussianMixture().setK(2).setSeed(538009335)
    model = gmm.fit(dataset)

    print("Gaussians shown as a DataFrame: ")
    model.gaussiansDF.show(truncate=False)
    # $example off$

    spark.stop()
with Timer('read', 'Reading data'):
    df = df_base = spark.read.csv('data/yellow_tripdata_2016-01.csv',
                                  header=True, inferSchema=True)

with Timer('process', 'Cleaning invalid data'):
    df = process(df)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import StorageLevel
from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture(k=1000)
result = []
with Timer('clustering', 'Computing clusters'):
    for weekday in range(7):
        for hour in range(24):
            with Timer('clustering',
                       'Computing clusters for {}x{}'.format(weekday, hour)):
                df_h = df.filter(df.weekday == weekday).filter(df.hour == hour)
                va = VectorAssembler(
                    inputCols=["pickup_latitude", "pickup_longitude"],
                    outputCol="features")
                df_t = va.transform(df_h)
                model = gmm.fit(df_t)
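# Note: `result` is never populated in the loop above. If the intent was to
# keep each fitted model's parameters per (weekday, hour) slot, a line like
# the following could be added inside the inner `with` block (an assumption,
# not from the original):
#     result.append((weekday, hour, model.weights))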
# In[84]:

from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols=["X", "Y", "Z"],
                                  outputCol="features")

# Please instantiate a clustering algorithm from the SparkML package and
# assign it to the clust variable. Here we don't need to take care of the
# "CLASS" column, since we are in unsupervised learning mode - so let's
# pretend not to have the "CLASS" column for now - but it will become very
# handy later in assessing the clustering performance. PLEASE NOTE - IN
# REAL-WORLD SCENARIOS THERE IS NO CLASS COLUMN - THEREFORE YOU CAN'T ASSESS
# CLASSIFICATION PERFORMANCE USING THIS COLUMN

# In[85]:

from pyspark.ml.clustering import GaussianMixture
clust = GaussianMixture().setK(2).setSeed(1)

# Let's train...

# In[86]:

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, clust])
model = pipeline.fit(df)

# ...and evaluate...

# In[87]:
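# The evaluation cell was cut off above; a minimal sketch of what In[87]
# might contain (an assumption, not the original notebook): compare the GMM
# cluster assignments against the held-back CLASS column with a simple
# agreement score. Cluster ids are arbitrary, so low agreement may just mean
# the labels are flipped.
predictions = model.transform(df)
from pyspark.sql.functions import col
agreement = predictions.filter(col('prediction') == col('CLASS')).count() \
    / float(df.count())
print(agreement)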
from pyspark.ml.clustering import GaussianMixture

g = sns.lmplot(x='X Coordinate', y='Y Coordinate', hue='Primary Type',
               data=crime_df, fit_reg=False, size=10,
               palette={'NARCOTICS': 'tomato', 'THEFT': 'skyblue'})

# for each type of crime
for crime_type, colour in [('NARCOTICS', 'r'), ('THEFT', 'b')]:
    crime_subset = (
        crime_with_features_sdf
        .filter(sf.col('Primary Type') == crime_type)
    )

    # fit a GMM
    gmm = GaussianMixture(k=30)
    model = gmm.fit(crime_subset)

    # extract the centers of the gaussians
    centers = (
        model
        .gaussiansDF
        .toPandas()
    )

# Put the transformed data in a variable below
crimes_with_predictions = model.transform(crime_subset)

# 2.
# Write code here
ranked_gaussians = (
    crimes_with_predictions
    .withColumn('probability', get_probability_udf('probability'))
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
from pyspark.sql import SparkSession

"""
A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
  bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonGaussianMixtureExample")\
        .getOrCreate()

    # $example on$
    # loads data
    dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

    gmm = GaussianMixture().setK(2)
    model = gmm.fit(dataset)

    print("Gaussians: ")
    model.gaussiansDF.show()
    # $example off$

    spark.stop()
async def gaussian_mixture(self, df):
    # Note: GaussianMixture.fit() is a blocking call; wrapping it in a
    # coroutine does not by itself make the fit run asynchronously.
    return GaussianMixture().fit(df.select('features'))
# Load the data stored in LIBSVM format as a DataFrame.
data = sparkConf \
    .read \
    .format("libsvm") \
    .load("data/data_libsvm.txt")
data.show()

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
numTraining = trainingData.count()
numTest = testData.count()
print("numTraining = ", numTraining, " numTest =", numTest)

# Train a Gaussian mixture model.
gmm = GaussianMixture(k=200, tol=0.0001, maxIter=10, seed=1)
model = gmm.fit(trainingData)
if model.hasSummary:
    summary = model.summary
    print("k=", summary.k)
    print("cluster sizes=", summary.clusterSizes)
    print("logLikelihood=", summary.logLikelihood)
    print("len weights=", len(model.weights))

# Make predictions.
predictions = model.transform(testData)
predictions.show(5, truncate=False)