import argparse

from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans, BisectingKMeans, GaussianMixture


def main():
    parser = argparse.ArgumentParser(description='Clustering with pyspark.')
    parser.add_argument('--data-file', type=str, default='enwiki.json')
    parser.add_argument('--num-clusters', type=int, default=4)
    parser.add_argument('--seed', type=int, default=23)
    parser.add_argument('--algorithm', default='kmeans',
                        choices=['kmeans', 'hier', 'gmm'])
    parser.add_argument('--output-groundtruth', type=str, default='groundtruth.csv')
    parser.add_argument('--output-cluster', type=str, default='cluster.csv')
    args = parser.parse_args()

    spark_session = SparkSession.builder.appName('clustering').getOrCreate()
    # preprocess, train, and evaluate are project-local helpers.
    data = preprocess(spark_session, args.data_file)

    if args.algorithm == 'kmeans':
        alg = KMeans()
    elif args.algorithm == 'hier':
        alg = BisectingKMeans()
    elif args.algorithm == 'gmm':
        alg = GaussianMixture()

    model = train(alg, data, args.num_clusters, seed=args.seed)
    evaluate(data, model, args.algorithm, args.num_clusters,
             args.output_groundtruth, args.output_cluster)
def clustering_tuning(self):
    df_raw = pd.read_csv(self.DEFAULT_PREPROCESSING_OUTPUT, header=None)
    spark = SparkSession \
        .builder \
        .appName("PySparkKMeans") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame(df_raw)
    assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
    # df_sample = df.sample(withReplacement=False, fraction=0.1)
    df_vec = assembler.transform(df).select("features")

    K_lst = list(range(50, 401, 10))

    # gmm
    for k in K_lst:  # K_lst is already a list; range(K_lst) would raise TypeError
        gm = GaussianMixture(k=k, tol=1, seed=10)
        gm.setMaxIter(500)
        model = gm.fit(df_vec)
        model.setPredictionCol("newPrediction")
        # transform() must see the "features" column, so use df_vec, not df
        transformed = model.transform(df_vec).select("features", "newPrediction")
    # Only the model for the last k is kept. Spark DataFrames have no
    # reset_index(); the pandas-style call has been dropped.
    return transformed
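# As written, the loop above discards all but the final k. A minimal sketch,
# assuming the same df_vec and K_lst, of recording each model's log likelihood
# so that k can actually be tuned (the selection criterion is an assumption):
ll_by_k = {}
for k in K_lst:
    gm = GaussianMixture(k=k, tol=1, seed=10, maxIter=500)
    model = gm.fit(df_vec)
    ll_by_k[k] = model.summary.logLikelihood  # higher is better; beware overfitting as k grows
best_k = max(ll_by_k, key=ll_by_k.get)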
def gmmresults():
    df1 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures", numFeatures=20000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    gmm = GaussianMixture(k=8, featuresCol='features')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, gmm])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # "display" is Databricks-only; use show() on OSS Apache Spark
    results.filter(results.prediction == 1).show(200, False)
    results.show()
    results.toPandas().to_csv('gmmresultsCanadaAndProductsAndDisastersAndClaritin.csv')
def fit_ml_model(self, k, sample_fraction=None, retry=True):
    if sample_fraction:
        training_data = self.ml_training_data.sample(False, fraction=sample_fraction)
    else:
        training_data = self.ml_training_data

    result = GaussianMixture(k=k, maxIter=self.max_iterations).fit(training_data)
    ll = result.summary.logLikelihood

    # Retry to get a valid model if the calculated log likelihood is > 0.
    retries = 0
    while retry and ll > 0 and retries < self.fit_model_retries:
        retry_sample_fraction = sample_fraction or self.ll_sample_fraction
        retry_data = self.ml_training_data.sample(False, fraction=retry_sample_fraction)
        result = GaussianMixture(k=k, maxIter=self.max_iterations).fit(retry_data)
        ll = result.summary.logLikelihood
        retries += 1
    return result
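# A hypothetical call, showing the resampling retry in use. The `trainer`
# object and its configured attributes are assumptions, not from the source;
# note that a continuous-density GMM can legitimately report a positive log
# likelihood, so the retry threshold here is a heuristic for degenerate fits.
model = trainer.fit_ml_model(k=5, sample_fraction=0.1)
print(model.summary.logLikelihood)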
def run(self):
    tf_path = self.settings.tf_path
    algorithm = self.settings.algorithm
    seed = int(self.settings.seed)
    k = int(self.settings.k)
    result_path = self.settings.result_path
    target = self.settings.target

    spark = SparkSession.builder.getOrCreate()

    with open("train_spark.txt", "w") as file:
        file.write("spark context" + str(spark.sparkContext))
        file.write("===SessionID===")
        file.write(str(id(spark)))

    # header/inferSchema are CSV options and are ignored by the parquet reader
    df = spark.read.option("header", "true") \
        .option("inferSchema", "true") \
        .parquet(tf_path)
    df = df.repartition(10)  # repartition returns a new DataFrame; reassign it

    # MODELING
    if algorithm == 'GMM':
        gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(gmm.explainParams())
        print("=====" * 8)
        model = gmm.fit(df)
    elif algorithm == 'KMeans':
        kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
        print("=====" * 8)
        print(kmm.explainParams())
        print("=====" * 8)
        model = kmm.fit(df)
    else:
        raise ValueError("no alg")

    prediction = model.transform(df)

    with open("./feature_info.pickle", "rb") as handle:
        features_info = pickle.load(handle)

    prediction.select(features_info["numeric_features"] +
                      features_info["category_features"] +
                      [target, 'prediction']) \
        .coalesce(1).write.mode('overwrite').csv(result_path, header=True)
    print("Result file is successfully generated at:", result_path)
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),),
            (Vectors.dense(10.0),), (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
def train(df, hiperparameter):
    '''
    Gaussian Mixture training, returning a Gaussian Mixture model.
    input: - DataFrame
           - config (hyperparameter configuration dict)
    return: GaussianMixture model
    '''
    gm = GaussianMixture(featuresCol=hiperparameter['featuresCol'],
                         predictionCol=hiperparameter['predictionCol'],
                         k=hiperparameter['k'],
                         probabilityCol=hiperparameter['probabilityCol'],
                         tol=hiperparameter['tol'],
                         maxIter=hiperparameter['maxIter'],
                         seed=hiperparameter['seed'])
    model = gm.fit(df)
    return model
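# A usage sketch for train() above. The concrete values and the `df` variable
# are illustrative assumptions, not from the source:
hiperparameter = {
    'featuresCol': 'features',
    'predictionCol': 'prediction',
    'probabilityCol': 'probability',
    'k': 3,
    'tol': 0.01,
    'maxIter': 100,
    'seed': 42,
}
model = train(df, hiperparameter)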
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import GaussianMixture, GaussianMixtureModel


def gaussian_mixture():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.dense([-0.1, -0.05]),),
            (Vectors.dense([-0.01, -0.1]),),
            (Vectors.dense([0.9, 0.8]),),
            (Vectors.dense([0.75, 0.935]),),
            (Vectors.dense([-0.83, -0.68]),),
            (Vectors.dense([-0.91, -0.76]),)]
    df = spark.createDataFrame(data, ["features"])

    gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10)
    model = gm.fit(df)
    model.hasSummary  # True
    summary = model.summary
    summary.k  # 3
    summary.clusterSizes  # [2, 2, 2]
    weights = model.weights
    len(weights)  # 3
    model.gaussiansDF.show()

    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[4].prediction == rows[5].prediction  # True
    rows[2].prediction == rows[3].prediction  # True

    temp_path = "./"
    gmm_path = temp_path + "/gmm"
    gm.save(gmm_path)
    gm2 = GaussianMixture.load(gmm_path)
    gm2.getK()  # 3

    model_path = temp_path + "/gmm_model"
    model.save(model_path)
    model2 = GaussianMixtureModel.load(model_path)
    model2.hasSummary  # False
    model2.weights == model.weights  # True
    model2.gaussiansDF.show()
import sys
import time

from pyspark.sql import SparkSession
from pyspark.ml.clustering import GaussianMixture


def main(args):
    spark = SparkSession \
        .builder \
        .master(args[2]) \
        .appName(args[1]) \
        .getOrCreate()

    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])
    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

    gmm = GaussianMixture().setK(2)
    model = gmm.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # appendTime is a project-local timing helper
    appendTime(sys.argv, start_computing_time)

    spark.stop()
# In[36]:

dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
bwssse = []
for k in kValues:
    bkmeans = BisectingKMeans().setK(k).setSeed(122)
    bmodel = bkmeans.fit(dataset)
    bwssse.append(bmodel.computeCost(dataset))
for i in bwssse:
    print(i)

# In[31]:

from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture(predictionCol="prediction").setK(2).setSeed(538009335)
gmmmodel = gmm.fit(outputFeatureDf)
print("Gaussians shown as a DataFrame: ")
gmmmodel.gaussiansDF.show()

# In[32]:

from sklearn.metrics.cluster import completeness_score

transformed = gmmmodel.transform(dataset)
labels = labeldf.collect()
label_array = [int(i[0]) for i in labels]
preds = transformed.select('prediction').collect()
preds_array = [int(i.prediction) for i in preds]
completeness_score(label_array, preds_array)  # signature is (labels_true, labels_pred)

# In[51]:
transformed = lda_model.transform(dataset)
transformed.display()

# COMMAND ----------

# MAGIC %md #####Clustering using Gaussian Mixture Models

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture

train_df = spark.read.table("retail_features").selectExpr(
    "selected_features as features")

gmm = GaussianMixture(k=3, featuresCol='features')
gmm_model = gmm.fit(train_df)

gmm_model.gaussiansDF.display()

# COMMAND ----------

# MAGIC %md #### 2. Association Rules
# MAGIC #####Collaborative Filtering - Alternating Least Squares

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
                            withStd=True, withMean=True)
scaled_model = stand_scaled.fit(train_df)
train_df = scaled_model.transform(train_df)

# Note: refitting the scaler on each test set leaks test statistics; normally
# the scaler fit on train_df would be reused (see the sketch after this block).
scaled_model = stand_scaled.fit(test1_df)
test1_df = scaled_model.transform(test1_df)

scaled_model = stand_scaled.fit(test2_df)
test2_df = scaled_model.transform(test2_df)

gm = GaussianMixture(featuresCol="features", k=2, seed=2, maxIter=20)
gmodel = gm.fit(train_df)

if gmodel.hasSummary:
    print("Cluster sizes", gmodel.summary.clusterSizes)
    print("Clusters ", gmodel.summary.k)

test1_df = gmodel.transform(test1_df)
test1_df.select("features", "Occupancy", "prediction").show(5)

test2_df = gmodel.transform(test2_df)
test2_df.select("features", "Occupancy", "prediction").show(5)

count1 = test1_df.filter(" prediction!=Occupancy").count()
total1 = test1_df.count()
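# A minimal sketch of the leak-free pattern, assuming the same stand_scaled,
# train_df, test1_df, and test2_df as above: fit the scaler once on the
# training data, then reuse that fitted model for every test set.
scaled_model = stand_scaled.fit(train_df)
train_df = scaled_model.transform(train_df)
test1_df = scaled_model.transform(test1_df)
test2_df = scaled_model.transform(test2_df)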
bkmModel = bkm.fit(sales)

# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points
# kmModel: the KMeans model fit in an earlier cell
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
# Let's test again with the best k = 7 on the denormalized dataset
kmeans = KMeans(featuresCol="features").setK(7).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])
model = pipeline.fit(df_denormalized)
predictions = model.transform(df_denormalized)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try GaussianMixture instead of KMeans
from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture(featuresCol="features").setK(14).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, gmm])
model = pipeline.fit(df)
predictions = model.transform(df)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try finding the best K using an iterative method (see the sketch below)
Ks = 15
mean_acc = np.zeros((Ks - 1))
ConfusionMx = []
for n in range(7, Ks):
    # Train Model and Predict
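# The loop above is truncated. A sketch of how its body might score each
# candidate K with the silhouette metric; the variable names reuse the
# snippet's, but the loop body itself is an assumption:
for n in range(7, Ks):
    gmm = GaussianMixture(featuresCol="features").setK(n).setSeed(1)
    model = Pipeline(stages=[vectorAssembler, gmm]).fit(df)
    silhouette = ClusteringEvaluator().evaluate(model.transform(df))
    mean_acc[n - 1] = silhouette
best_k = int(np.argmax(mean_acc[6:])) + 7  # only entries for n >= 7 were filled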
# are drawn from one of k Gaussian sub-distributions, each with its own
# probability. The spark.ml implementation uses the expectation-maximization
# algorithm to induce the maximum-likelihood model given a set of samples. The
# implementation has the following parameters:
#
# - k is the number of desired clusters.
# - convergenceTol is the maximum change in log-likelihood at which we
#   consider convergence achieved.
# - maxIterations is the maximum number of iterations to perform without
#   reaching convergence.
# - initialModel is an optional starting point from which to start the EM
#   algorithm. If this parameter is omitted, a random starting point will be
#   constructed from the data.

spark = SparkSession.builder.appName("GaussianMixture").getOrCreate()

# Load data.
dataset = spark.read.format("libsvm") \
    .load("sample_kmeans_data.txt") \
    .toDF("id", "features")

gmm = GaussianMixture().setK(2).setSeed(1)
model = gmm.fit(dataset)

print("Gaussians shown as a DataFrame:")
model.gaussiansDF.show(truncate=False)

clustered = model.transform(dataset)
clustered.show()

spark.stop()
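# The parameter names in the comment block follow the older RDD-based MLlib
# API; on the DataFrame-based estimator used above they correspond to k, tol,
# and maxIter. A sketch, reusing the dataset from above, with the convergence
# parameters spelled out under their spark.ml names:
gmm = GaussianMixture(k=2, tol=0.01, maxIter=100, seed=1)
model = gmm.fit(dataset)
print(model.summary.logLikelihood)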
# Load the data stored in LIBSVM format as a DataFrame.
# "sparkConf" is assumed to be the SparkSession created earlier in the script.
data = sparkConf \
    .read \
    .format("libsvm") \
    .load("data\data_libsvm.txt")
data.show()

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
numTraining = trainingData.count()
numTest = testData.count()
print("numTraining = ", numTraining, " numTest =", numTest)

# Train a Gaussian mixture model.
gmm = GaussianMixture(k=200, tol=0.0001, maxIter=10, seed=1)
model = gmm.fit(trainingData)

if model.hasSummary:
    summary = model.summary
    print("k=", summary.k)
    print("cluster sizes=", summary.clusterSizes)
    print("logLikelihood=", summary.logLikelihood)
    print("len weights=", len(model.weights))

# Make predictions.
predictions = model.transform(testData)
predictions.show(5, truncate=False)
with Timer('read', 'Reading data'):
    df = df_base = spark.read.csv('data/yellow_tripdata_2016-01.csv',
                                  header=True, inferSchema=True)

with Timer('process', 'Cleaning invalid data'):
    df = process(df)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import StorageLevel
from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture(k=1000)
result = []

with Timer('clustering', 'Computing clusters'):
    for weekday in range(7):
        for hour in range(24):
            with Timer('clustering',
                       'Computing clusters for {}x{}'.format(weekday, hour)):
                df_h = df.filter(df.weekday == weekday).filter(df.hour == hour)
                va = VectorAssembler(
                    inputCols=["pickup_latitude", "pickup_longitude"],
                    outputCol="features")
                df_t = va.transform(df_h)
                model = gmm.fit(df_t)
# longitude are already in the same scale, we can proceed.

# Select home latitude and longitude as the features:
selected = ["home_lat", "home_lon"]

# Assemble the feature vector:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=selected, outputCol="features")
assembled = assembler.transform(students)

# ## Fit a Gaussian mixture model

# Specify a Gaussian mixture model with two clusters:
from pyspark.ml.clustering import GaussianMixture
gm = GaussianMixture(featuresCol="features", k=2, seed=12345)

# Examine the hyperparameters:
print(gm.explainParams())

# Fit the Gaussian mixture model:
gmm = gm.fit(assembled)
type(gmm)

# ## Examine the Gaussian mixture model

# Examine the mixing weights:
gmm.weights

# Examine the (multivariate) Gaussian distributions:
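# The snippet ends before the distributions are shown; the natural next step
# with this model object (a sketch, not from the source) is:
gmm.gaussiansDF.show(truncate=False)  # one row per component: mean vector and covariance matrix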
k = args.k_clusters

if algorithm not in ['kmeans', 'gmm', 'lda', 'spectral']:
    raise ValueError('Not a valid algorithm')

ss = SparkSession.builder.getOrCreate()
df = ss.read.csv(path, header=True, inferSchema=True)

df_preprocessed = preprocessing(df, num_pca=num_pca_features)
df_preprocessed.write.parquet("preprocessed", mode="Overwrite")

if algorithm == 'kmeans':
    model = KMeans(k=k).setSeed(1).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'spectral':
    model = SpectralClustering(k=k, k_nearest=7)
    predictions = model.cluster(df_preprocessed, ss, repartition_num=num_nodes)
elif algorithm == 'lda':
    model = LDA(k=k, maxIter=10).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'gmm':
    model = GaussianMixture(k=k).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)

predictions.select([col for col in predictions.columns if col != 'features']) \
    .toPandas() \
    .to_csv(sys.stdout)
obj = client.get_object(Bucket='yelpdatacf', Key='book_cf.csv')
df = pd.read_csv(obj['Body'])
df.rating = (df.rating - df.rating.mean())
ratings = spark.createDataFrame(df)

# use the model that has min RMSE
num_iter, param = 200, 0.2
als = ALS(maxIter=num_iter, regParam=param, userCol="user_id",
          itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings)
user_feature = model.userFactors
book_feature = model.itemFactors

k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(user_feature)
transformed = model.transform(user_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)
df.write.jdbc(url='jdbc:%s' % url + 'yelp', table='book_gm_user_feature20',
              mode='overwrite', properties=properties)

k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(book_feature)
transformed = model.transform(book_feature).select('id', 'prediction')
rows = transformed.collect()
# negative_udf = udf(lambda x: tp_values(x, 1))
#
# train_df = train_df.withColumn('pos', positive_udf(col('ST')).astype(IntegerType())) \
#     .withColumn('neg', negative_udf(col('ST')).astype(IntegerType()))
#
# train_df.show()
#
# assembler = VectorAssembler(inputCols=['pos', 'neg'], outputCol='features')
# train_df = assembler.transform(train_df)
# train_df.show()

# modelling
kmeans = KMeans().setK(2).setSeed(1).setMaxIter(20)
model = kmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)

bkmeans = BisectingKMeans().setK(2).setSeed(1).setMaxIter(20)
model = bkmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)

gaussianmixture = GaussianMixture().setK(2).setSeed(1)
model = gaussianmixture.fit(train_df)
model.transform(test_df).show()
times = []
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)

############## GMM #################
from pyspark.ml.clustering import GaussianMixture

times = []
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
gmmTime = average(times)

# preparation of data for non-pyspark implementations
clusterData = tsneDataFrame.select("screen_name", "features").collect()
screenames = [x[0] for x in clusterData]
clData = [x[1] for x in clusterData]
clusData = np.array(clData)
x = [cl[0] for cl in clData]
y = [cl[1] for cl in clData]

################### DBSCAN ###################
from pyspark.ml.clustering import GaussianMixture

g = sns.lmplot(x='X Coordinate', y='Y Coordinate', hue='Primary Type',
               data=crime_df, fit_reg=False, size=10,
               palette={'NARCOTICS': 'tomato', 'THEFT': 'skyblue'})

# for each type of crime
for crime_type, colour in [('NARCOTICS', 'r'), ('THEFT', 'b')]:
    crime_subset = (
        crime_with_features_sdf
        .filter(sf.col('Primary Type') == crime_type)
    )

    # fit a GMM
    gmm = GaussianMixture(k=30)
    model = gmm.fit(crime_subset)

    # extract the centers of the gaussians
    centers = (
        model
        .gaussiansDF
        .toPandas()
    )

# Put the transformed data in a variable below
crimes_with_predictions = model.transform(crime_subset)

# 2.
# Write code here
ranked_gaussians = (
    crimes_with_predictions
    .withColumn('probability', get_probability_udf('probability'))
q = """
    SELECT ss.ss_customer_id AS cid,
           count(CASE WHEN i.i_class_id=1  THEN 1 ELSE NULL END) AS id1,
           count(CASE WHEN i.i_class_id=3  THEN 1 ELSE NULL END) AS id3,
           count(CASE WHEN i.i_class_id=5  THEN 1 ELSE NULL END) AS id5,
           count(CASE WHEN i.i_class_id=7  THEN 1 ELSE NULL END) AS id7,
           count(CASE WHEN i.i_class_id=9  THEN 1 ELSE NULL END) AS id9,
           count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS id11,
           count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS id13,
           count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS id15,
           count(CASE WHEN i.i_class_id=2  THEN 1 ELSE NULL END) AS id2,
           count(CASE WHEN i.i_class_id=4  THEN 1 ELSE NULL END) AS id4,
           count(CASE WHEN i.i_class_id=6  THEN 1 ELSE NULL END) AS id6,
           count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS id12,
           count(CASE WHEN i.i_class_id=8  THEN 1 ELSE NULL END) AS id8,
           count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS id10,
           count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS id14,
           count(CASE WHEN i.i_class_id=16 THEN 1 ELSE NULL END) AS id16
    FROM store_sales ss
    INNER JOIN items i ON ss.ss_item_id = i.i_item_id
    WHERE i.i_category_name IN ('cat#01','cat#02','cat#03','cat#04','cat#05',
                                'cat#06','cat#07','cat#08','cat#09','cat#10',
                                'cat#11','cat#12','cat#13','cat#14','cat#15')
      AND ss.ss_customer_id IS NOT NULL
    GROUP BY ss.ss_customer_id
    HAVING count(ss.ss_item_id) > 3
"""
df = spark.sql(q)
assembler = VectorAssembler(inputCols=[
    "cid", "id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8",
    "id9", "id10", "id11", "id12", "id13", "id14", "id15", "id16"
], outputCol="FEATURE")
vd = assembler.transform(df)

cost = list()
gmm = GaussianMixture().setK(2).setFeaturesCol('FEATURE') \
    .setSeed(538009335).setTol(0.01)
model = gmm.fit(vd)
weights = model.weights
print(weights)
summary = model.summary
summary.k
logLikelihood = summary.logLikelihood
param = model.explainParams()
print(param)
model.gaussiansDF.select("mean").head()
model.gaussiansDF.select("cov").head()
model.gaussiansDF.show()
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
from pyspark.sql import SparkSession

"""
A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
  bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("GaussianMixtureExample")\
        .getOrCreate()

    # $example on$
    # loads data
    dataset = spark.read.format("libsvm").load(
        "data/mllib/sample_kmeans_data.txt")

    gmm = GaussianMixture().setK(2).setSeed(538009335)
    model = gmm.fit(dataset)

    print("Gaussians shown as a DataFrame: ")
    model.gaussiansDF.show(truncate=False)
    # $example off$

    spark.stop()
def SparkML(train_df, test_df=None, featuresCol='features', labelCol='label',
            binaryclass=False, multiclass=False, n_cluster=2,
            userCol='user', itemCol='item', ratingCol='rating', rank=10,
            userid=3, itemid=3,
            itemsCol='items', minSupport=0.3, minConfidence=0.8,
            stringIndexer=False, inputColStringIndexer=None,
            outputColStringIndexer=None,
            oneHotEncoder=False, inputColOneHotEncoder=None,
            outputColOneHotEncoder=None,
            vectorAssembler=False, inputColsVectorAssembler=None,
            outputColsVectorAssembler=None,
            vectorIndexer=False, inputColsVectorIndexer=None,
            outputColsVectorIndexer=None, maxCategories=None,
            classification=False, logisticregression=False,
            decisiontreeclassifier=False, linearsvc=False, naivebayes=False,
            randomforestclassifier=False, gbtclassifier=False,
            regression=False, linearregression=True,
            decisiontreeregressor=False, randomforestregressor=False,
            gbtregressor=False,
            clustering=False, kmeans=False, gaussianmixture=False, lda=False,
            recommendation=False, als=False,
            association=False, fpgrowth=False):

    def transform_stages():
        # Every estimator branch rebuilds the same feature-transform stages;
        # the repeated FeaturesTransform(...) call is factored out here.
        return FeaturesTransform(
            stringIndexer=stringIndexer,
            inputColStringIndexer=inputColStringIndexer,
            outputColStringIndexer=outputColStringIndexer,
            oneHotEncoder=oneHotEncoder,
            inputColOneHotEncoder=inputColOneHotEncoder,
            outputColOneHotEncoder=outputColOneHotEncoder,
            vectorAssembler=vectorAssembler,
            inputColsVectorAssembler=inputColsVectorAssembler,
            outputColsVectorAssembler=outputColsVectorAssembler,
            vectorIndexer=vectorIndexer,
            inputColsVectorIndexer=inputColsVectorIndexer,
            outputColsVectorIndexer=outputColsVectorIndexer,
            maxCategories=maxCategories)

    if classification:
        if logisticregression:
            stagesList = transform_stages()
            LRClassifier = LogisticRegression(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', standardization=True,
                maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06,
                fitIntercept=True, threshold=0.5)
            paramGrid = ParamGridBuilder() \
                .addGrid(LRClassifier.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(LRClassifier.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction", labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol, predictionCol="Prediction",
                    metricName="accuracy")
            LRCV = CrossValidator(estimator=LRClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score
        if decisiontreeclassifier:
            stagesList = transform_stages()
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(DTClassifier.impurity, ["gini", "entropy"]) \
                .addGrid(DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(DTClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction", labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol, predictionCol="Prediction",
                    metricName="accuracy")
            DTCV = CrossValidator(estimator=DTClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score
        if linearsvc:
            stagesList = transform_stages()
            SVClassifier = LinearSVC(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', rawPredictionCol='RawPrediction',
                maxIter=100, regParam=0.0, tol=1e-06, fitIntercept=True,
                standardization=True, threshold=0.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(SVClassifier.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(SVClassifier.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction", labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol, predictionCol="Prediction",
                    metricName="accuracy")
            SVCV = CrossValidator(estimator=SVClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score
        if naivebayes:
            stagesList = transform_stages()
            NBClassifier = NaiveBayes(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', smoothing=1.0,
                modelType='multinomial', thresholds=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(NBClassifier.smoothing,
                         [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]) \
                .build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction", labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol, predictionCol="Prediction",
                    metricName="accuracy")
            NBCV = CrossValidator(estimator=NBClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score
        if randomforestclassifier:
            stagesList = transform_stages()
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='gini',
                numTrees=20, featureSubsetStrategy='auto', seed=None,
                subsamplingRate=1.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(RFClassifier.impurity, ["gini", "entropy"]) \
                .addGrid(RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(RFClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(RFClassifier.numTrees, [5, 10, 20, 50, 100, 200]) \
                .addGrid(RFClassifier.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction", labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol, predictionCol="Prediction",
                    metricName="accuracy")
            RFCV = CrossValidator(estimator=RFClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score
        if gbtclassifier:
            stagesList = transform_stages()
            GBClassifier = GBTClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, lossType='logistic',
                maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(GBClassifier.maxIter, [5, 10, 20, 50, 100, 200]) \
                .addGrid(GBClassifier.stepSize,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .addGrid(GBClassifier.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol, predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score
    if regression:
        if linearregression:
            stagesList = transform_stages()
            LRegressor = LinearRegression(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', standardization=True,
                fitIntercept=True, loss='squaredError', maxIter=100,
                regParam=0.0, elasticNetParam=0.0, tol=1e-06, epsilon=1.35)
            paramGrid = ParamGridBuilder() \
                .addGrid(LRegressor.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(LRegressor.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score
        if decisiontreeregressor:
            stagesList = transform_stages()
            DTRegressor = DecisionTreeRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='variance',
                seed=None, varianceCol=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score
        if randomforestregressor:
            stagesList = transform_stages()
            RFRegressor = RandomForestRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='variance',
                subsamplingRate=1.0, seed=None, numTrees=20)
            paramGrid = ParamGridBuilder() \
                .addGrid(RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(RFRegressor.numTrees, [5, 10, 20, 50, 100, 200]) \
                .addGrid(RFRegressor.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score
        if gbtregressor:
            stagesList = transform_stages()
            GBRegressor = GBTRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, subsamplingRate=1.0,
                lossType='squared', maxIter=20, stepSize=0.1, seed=None,
                impurity='variance')
            paramGrid = ParamGridBuilder() \
                .addGrid(GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(GBRegressor.maxIter, [5, 10, 20, 50, 100, 200]) \
                .addGrid(GBRegressor.stepSize,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .addGrid(GBRegressor.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score
    if clustering:
        if kmeans:
            stagesList = transform_stages()
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction', k=n_cluster,
                              initMode='k-means||', initSteps=2, tol=0.0001,
                              maxIter=20, seed=None)
            # Note: gridding seed over 1001 values multiplies the
            # cross-validated search space a thousandfold.
            paramGrid = ParamGridBuilder() \
                .addGrid(KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]) \
                .addGrid(KCluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(KCluster.seed, list(range(1001))) \
                .build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score
        if gaussianmixture:
            stagesList = transform_stages()
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster, tol=0.01, maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(GMCluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(GMCluster.seed, list(range(1001))) \
                .build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score
        if lda:
            stagesList = transform_stages()
            LDACluster = LDA(featuresCol=featuresCol, maxIter=20, seed=None,
                             k=n_cluster, learningOffset=1024.0,
                             learningDecay=0.51, subsamplingRate=0.05)
            paramGrid = ParamGridBuilder() \
                .addGrid(LDACluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(LDACluster.seed, list(range(1001))) \
                .addGrid(LDACluster.subsamplingRate,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .build()
            # Note: LDA's transform does not add a 'Prediction' column, so
            # this silhouette evaluator setup assumes one is produced upstream.
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            LDACV = CrossValidator(estimator=LDACluster, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score
    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol, itemCol=itemCol, ratingCol=ratingCol,
                       rank=rank, maxIter=10, regParam=0.1, numUserBlocks=10,
                       numItemBlocks=10, alpha=1.0, seed=1)
            ALSR_Model = ALSR.fit(train_df)
            # recommendForAllUsers/recommendForAllItems take the number of
            # recommendations (numItems/numUsers), not an id keyword.
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems
    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport, minConfidence=minConfidence,
                           itemsCol=itemsCol, predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
async def gaussian_mixture(self, df):
    # Note: fit() is a blocking call; this coroutine does not await anything.
    return GaussianMixture().fit(df.select('features'))
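# A hedged usage sketch (the surrounding class and event loop are assumptions):
# since fit() blocks, callers may prefer to push it onto an executor thread so
# the event loop stays responsive.
import asyncio

from pyspark.ml.clustering import GaussianMixture


async def build_model(self, df):
    loop = asyncio.get_running_loop()
    # Run the blocking Spark fit off the event-loop thread.
    return await loop.run_in_executor(
        None, lambda: GaussianMixture().fit(df.select('features')))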
spark = SparkSession \
    .builder \
    .appName("ChiSqSelectorExample") \
    .getOrCreate()

rawData = spark.sparkContext.textFile("file:///home/tianlei/iris.txt")


def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]), float(x[1]),
                                    float(x[2]), float(x[3]))
    return rel


df = spark.sparkContext.textFile("file:///usr/local/spark/iris.txt").map(
    lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()

# Build a simple GaussianMixture object with the number of clusters set to 3,
# leaving the other parameters at their defaults.
gm = GaussianMixture().setK(3).setPredictionCol(
    "Prediction").setProbabilityCol("Probability")
gmm = gm.fit(df)

# After calling transform() on the dataset, print it: each sample's predicted
# cluster and its probability vector can be seen (for clarity, most rows are
# omitted here; only a few are shown):
result = gmm.transform(df)
result.show(150, False)

# Once the model is obtained, its parameters can be inspected. Unlike KMeans,
# a GMM does not report cluster centers directly; it reports the parameters of
# each mixture component (a multivariate Gaussian). In the ML implementation
# each component is stored as a MultivariateGaussian (in the
# org.apache.spark.ml.stat.distribution package); the weights member of
# GaussianMixtureModel gives the component weights, and the gaussians member
# gives each component's parameters (mean vector and covariance matrix):
rows = gmm.gaussiansDF.collect()  # head() in the loop would repeat the first component
for i in range(3):
    print("Component " + str(i) + " : weight is " + str(gmm.weights[i]) +
          "\n mu vector is " + str(rows[i]['mean']) +
          " \n sigma matrix is " + str(rows[i]['cov']))