def main():
    parser = argparse.ArgumentParser(description='Clustering with pyspark.')

    parser.add_argument('--data-file', type=str, default='enwiki.json')
    parser.add_argument('--num-clusters', type=int, default=4)
    parser.add_argument('--seed', type=int, default=23)
    parser.add_argument('--algorithm',
                        default='kmeans',
                        choices=['kmeans', 'hier', 'gmm'])
    parser.add_argument('--output-groundtruth',
                        type=str,
                        default='groundtruth.csv')
    parser.add_argument('--output-cluster', type=str, default='cluster.csv')

    args = parser.parse_args()

    spark_session = SparkSession.builder.appName('clustering').getOrCreate()

    data = preprocess(spark_session, args.data_file)

    if args.algorithm == 'kmeans':
        alg = KMeans()
    elif args.algorithm == 'hier':
        alg = BisectingKMeans()
    elif args.algorithm == 'gmm':
        alg = GaussianMixture()

    model = train(alg, data, args.num_clusters, seed=args.seed)
    evaluate(data, model, args.algorithm, args.num_clusters,
             args.output_groundtruth, args.output_cluster)
Example #2
    def clustering_tuning(self):
        df_raw = pd.read_csv(f"{self.DEFAULT_PREPROCESSING_OUTPUT}",
                             header=None)

        spark = SparkSession \
            .builder \
            .appName("PySparkKMeans") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()

        df = spark.createDataFrame(df_raw)
        assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
        # df_sample = df.sample(withReplacement=False, fraction=0.1)
        df_vec = assembler.transform(df).select("features")

        K_lst = list(range(50, 401, 10))

        # gmm
        for k in K_lst:  # iterate over the candidate values, not range(list)
            gm = GaussianMixture(k=k, tol=1, seed=10)
            gm.setMaxIter(500)

            model = gm.fit(df_vec)
            model.setPredictionCol("newPrediction")
            # transform the assembled vectors (df has no "features" column)
            transformed = model.transform(df_vec).select("features",
                                                         "newPrediction")

        # Only the model for the last k in K_lst survives the loop.
        # reset_index() is a pandas method, so convert the Spark DataFrame first.
        transformed = transformed.toPandas().reset_index()
        return transformed
Example #3
def gmmresults():
    df1 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")  # note: same file as df2
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=20000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    gmm = GaussianMixture(k=8, featuresCol='features')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, gmm])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()
    # Note: "display" is for Databricks; use show() for OSS Apache Spark
    results.filter(results.prediction == 1).show(200, False)
    results.show()
    results.toPandas().to_csv(
        'gmmresultsCanadaAndProductsAndDisastersAndClaritin.csv')
Example #4
    def fit_ml_model(self, k, sample_fraction=None, retry=True):
        if sample_fraction:
            training_data = self.ml_training_data.sample(
                False, fraction=sample_fraction)
        else:
            training_data = self.ml_training_data

        result = GaussianMixture(
            k=k, maxIter=self.max_iterations).fit(training_data)
        ll = result.summary.logLikelihood

        # Retry to get a valid model if the calculated log likelihood is > 0
        # (a positive log likelihood typically signals a degenerate fit, e.g. a
        # component whose covariance has collapsed onto a handful of points).
        retries = 0
        while retry and ll > 0 and retries < self.fit_model_retries:
            retry_sample_fraction = sample_fraction or self.ll_sample_fraction
            retry_data = self.ml_training_data.sample(
                False, fraction=retry_sample_fraction)
            result = GaussianMixture(
                k=k, maxIter=self.max_iterations).fit(retry_data)
            ll = result.summary.logLikelihood
            retries += 1

        return result
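A brief usage sketch, assuming an instance (here called clusterer, a name not
in the original) of the class that defines fit_ml_model together with the
ml_training_data, max_iterations, fit_model_retries and ll_sample_fraction
attributes it relies on:

model = clusterer.fit_ml_model(k=5, sample_fraction=0.1)  # clusterer is hypothetical
print(model.summary.logLikelihood)  # the quantity the retry loop guards on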
Example #5
    def run(self):

        tf_path = self.settings.tf_path
        algorithm = self.settings.algorithm
        seed = int(self.settings.seed)
        k = int(self.settings.k)
        result_path = self.settings.result_path
        target = self.settings.target

        spark = SparkSession.builder.getOrCreate()

        with open("train_spark.txt", "w") as file:
            file.write("spark context" + str(spark.sparkContext))
            file.write("===SeessionID===")
            file.write(str(id(spark)))

        df = spark.read.option("header", "true") \
            .option("inferSchema", "true") \
            .parquet(tf_path)  # header/inferSchema are CSV options; no-ops for parquet
        df = df.repartition(10)  # repartition returns a new DataFrame, so reassign

        # MODELING
        if algorithm == 'GMM':
            gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(
                seed)
            print("=====" * 8)
            print(gmm.explainParams())
            print("=====" * 8)
            model = gmm.fit(df)
        elif algorithm == 'KMeans':
            kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
            print("=====" * 8)
            print(kmm.explainParams())
            print("=====" * 8)
            model = kmm.fit(df)
        else:
            raise ValueError("no alg")

        prediction = model.transform(df)

        with open("./feature_info.pickle", "rb") as handle:
            features_info = pickle.load(handle)

        prediction.select(features_info["numeric_features"] +
                          features_info["category_features"] +
                          [target, 'prediction']).coalesce(1).write.mode(
                              'overwrite').csv(result_path, header=True)
        print("Result file is successfully generated at: ", result_path)
Example #6
    def test_gaussian_mixture_summary(self):
        data = [(Vectors.dense(1.0), ), (Vectors.dense(5.0), ),
                (Vectors.dense(10.0), ), (Vectors.sparse(1, [], []), )]
        df = self.spark.createDataFrame(data, ["features"])
        gmm = GaussianMixture(k=2)
        model = gmm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.probabilityCol, "probability")
        self.assertTrue(isinstance(s.probability, DataFrame))
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertTrue(isinstance(s.cluster, DataFrame))
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 3)
Example #7
def train(df, hiperparameter):
    '''
    Gaussian Mixture training, returning a Gaussian Mixture model.
    input: - DataFrame
           - config (hyperparameter configuration)

    return: Gaussian Mixture model
    '''
    gm = GaussianMixture(featuresCol=hiperparameter['featuresCol'],
                         predictionCol=hiperparameter['predictionCol'],
                         k=hiperparameter['k'],
                         probabilityCol=hiperparameter['probabilityCol'],
                         tol=hiperparameter['tol'],
                         maxIter=hiperparameter['maxIter'],
                         seed=hiperparameter['seed'])
    model = gm.fit(df)
    return model
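A minimal usage sketch for train() above. The dict keys mirror the pyspark.ml
GaussianMixture parameters; the column names and values here are illustrative
assumptions, not taken from the original project:

hiperparameter = {
    'featuresCol': 'features',      # assumes a VectorAssembler produced this column
    'predictionCol': 'prediction',
    'probabilityCol': 'probability',
    'k': 3,
    'tol': 0.01,
    'maxIter': 100,
    'seed': 42,
}
model = train(assembled_df, hiperparameter)  # assembled_df: hypothetical input DataFrame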
Example #8
def gaussian_mixture():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    data = [(Vectors.dense([-0.1, -0.05]), ), (Vectors.dense([-0.01, -0.1]), ),
            (Vectors.dense([0.9, 0.8]), ), (Vectors.dense([0.75, 0.935]), ),
            (Vectors.dense([-0.83, -0.68]), ), (Vectors.dense([-0.91,
                                                               -0.76]), )]
    df = spark.createDataFrame(data, ["features"])
    gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10)
    model = gm.fit(df)
    model.hasSummary
    # True
    summary = model.summary
    summary.k
    # 3
    summary.clusterSizes
    # [2, 2, 2]
    weights = model.weights
    len(weights)
    # 3
    model.gaussiansDF.show()
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[4].prediction == rows[5].prediction
    # True
    rows[2].prediction == rows[3].prediction
    # True
    temp_path = "."
    gmm_path = temp_path + "/gmm"
    gm.save(gmm_path)
    gm2 = GaussianMixture.load(gmm_path)
    gm2.getK()
    # 3
    model_path = temp_path + "/gmm_model"
    model.save(model_path)
    model2 = GaussianMixtureModel.load(model_path)
    model2.hasSummary
    # False
    model2.weights == model.weights
    # True
    model2.gaussiansDF.show()
Example #9
def main(args):
    spark=SparkSession\
            .builder\
            .master(args[2])\
            .appName(args[1])\
            .getOrCreate()
    
    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])

    (trainingData, testData) = data.randomSplit([0.7, 0.3],seed=1234)

    gmm = GaussianMixture().setK(2)
    model = gmm.fit(trainingData)
    
    # Make predictions
    predictions = model.transform(testData)

    appendTime(sys.argv,start_computing_time)

    spark.stop()
Example #10
# In[36]:

dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
bwssse = []
for k in kValues:
    bkmeans = BisectingKMeans().setK(k).setSeed(122)
    bmodel = bkmeans.fit(dataset)
    bwssse.append(bmodel.computeCost(dataset))  # computeCost is deprecated in Spark 3.0+; ClusteringEvaluator replaces it
for i in bwssse:
    print(i)

# In[31]:

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture(predictionCol="prediction").setK(2).setSeed(538009335)
gmmmodel = gmm.fit(outputFeatureDf)
print("Gaussians shown as a DataFrame: ")
gmmmodel.gaussiansDF.show()

# In[32]:

from sklearn.metrics.cluster import completeness_score
transformed = gmmmodel.transform(dataset)
labels = labeldf.collect()
label_array = [int(i[0]) for i in labels]
preds = transformed.select('prediction').collect()
preds_array = [int(i.prediction) for i in preds]
completeness_score(label_array, preds_array)  # signature is (labels_true, labels_pred)

Example #11
transformed = lda_model.transform(dataset)
transformed.display()

# COMMAND ----------

# MAGIC %md ##### Clustering using Gaussian Mixture Models

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture

train_df = spark.read.table("retail_features").selectExpr(
    "selected_features as features")

gmm = GaussianMixture(k=3, featuresCol='features')
gmm_model = gmm.fit(train_df)

gmm_model.gaussiansDF.display()

# COMMAND ----------

# MAGIC %md #### 2. Association Rules
# MAGIC #####Collaborative Filtering - Alternating Least Squares

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
Example #12
File: Gaussian.py  Project: AAB94/RP
# The snippet begins mid-call; a plausible reconstruction of the cut-off
# StandardScaler construction (the column names are assumptions):
stand_scaled = StandardScaler(inputCol="features",
                              outputCol="scaledFeatures",
                              withStd=True,
                              withMean=True)

scaled_model = stand_scaled.fit(train_df)

train_df = scaled_model.transform(train_df)

# NOTE: the original refits the scaler on each test set; reusing the
# train-fitted scaled_model is the more standard choice.
scaled_model = stand_scaled.fit(test1_df)

test1_df = scaled_model.transform(test1_df)

scaled_model = stand_scaled.fit(test2_df)

test2_df = scaled_model.transform(test2_df)

gm = GaussianMixture(featuresCol="features", k=2, seed=2, maxIter=20)

gmodel = gm.fit(train_df)

if gmodel.hasSummary:
    print("Cluster sizes", gmodel.summary.clusterSizes)
    print("Clsuters ", gmodel.summary.k)

test1_df = gmodel.transform(test1_df)
test1_df.select("features", "Occupancy", "prediction").show(5)

test2_df = gmodel.transform(test2_df)
test2_df.select("features", "Occupancy", "prediction").show(5)

count1 = test1_df.filter("prediction != Occupancy").count()
total1 = test1_df.count()
Example #13
bkmModel = bkm.fit(sales)

# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)  # kmModel comes from an earlier cell
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
Example #14
# Let's test again with the best k = 7 on the denormalized dataset

kmeans = KMeans(featuresCol="features").setK(7).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])
model = pipeline.fit(df_denormalized)
predictions = model.transform(df_denormalized)

evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try GaussianMixture instead of KMeans
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture(featuresCol="features").setK(14).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, gmm])
model = pipeline.fit(df)
predictions = model.transform(df)
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try finding the best K using an iterative method
Ks = 15
mean_acc = np.zeros((Ks - 1))
ConfusionMx = []
for n in range(7, Ks):

    #Train Model and Predict
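The snippet breaks off inside this loop. One plausible way to finish the
search, reusing the vectorAssembler pipeline and silhouette scoring already
shown in this example (a sketch, not the original author's loop body):

for n in range(7, Ks):
    # Train a KMeans model for candidate k = n and score it by silhouette
    kmeans_n = KMeans(featuresCol="features").setK(n).setSeed(1)
    pipeline_n = Pipeline(stages=[vectorAssembler, kmeans_n])
    predictions_n = pipeline_n.fit(df).transform(df)
    silhouette_n = ClusteringEvaluator().evaluate(predictions_n)
    print("k = {}: silhouette = {}".format(n, silhouette_n))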
Example #15
# A Gaussian Mixture Model represents a composite distribution whereby points
# are drawn from one of k Gaussian sub-distributions, each with its own
# probability. The spark.ml implementation uses the expectation-maximization
# algorithm to induce the maximum-likelihood model given a set of samples. The
# implementation has the following parameters:
#   - k is the number of desired clusters.
#   - convergenceTol is the maximum change in log-likelihood at which we
#     consider convergence achieved.
#   - maxIterations is the maximum number of iterations to perform without
#     reaching convergence.
#   - initialModel is an optional starting point from which to start the EM
#     algorithm. If this parameter is omitted, a random starting point will be
#     constructed from the data.
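The parameter names above (convergenceTol, maxIterations, initialModel) come
from the older RDD-based spark.mllib API. In pyspark.ml, which the code below
uses, the counterparts are tol and maxIter, and there is no initialModel
parameter. A minimal sketch:

from pyspark.ml.clustering import GaussianMixture

# k, tol, maxIter and seed are the pyspark.ml counterparts of the
# parameters described in the comment above.
gmm_sketch = GaussianMixture(k=2, tol=0.01, maxIter=100, seed=1)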

spark = SparkSession.builder.appName("GaussianMixture").getOrCreate()

# Load data.
dataset = spark.read.format("libsvm") \
                    .load("sample_kmeans_data.txt") \
                    .toDF("id", "features")

gmm = GaussianMixture().setK(2).setSeed(1)
model = gmm.fit(dataset)

print("Gaussians shown as a DataFrame:")
model.gaussiansDF.show(truncate=False)

clustered = model.transform(dataset)
clustered.show()

spark.stop()
Example #16
    # Load the data stored in LIBSVM format as a DataFrame.
    data = sparkConf \
        .read \
        .format("libsvm") \
        .load("data\data_libsvm.txt")
    data.show()

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    numTraining = trainingData.count()
    numTest = testData.count()
    print("numTraining = ",numTraining, " numTest =", numTest)

    # Train a Gaussian mixture model (the original comment said LDA, but the
    # code below fits a GaussianMixture).
    gmm = GaussianMixture(k=200, tol=0.0001, maxIter=10, seed=1)

    model=gmm.fit(trainingData)

    if model.hasSummary:
        summary = model.summary
        print("k =", summary.k)
        print("cluster sizes =", summary.clusterSizes)
        print("logLikelihood =", summary.logLikelihood)
        print("len weights =", len(model.weights))


    # Make predictions.
    predictions = model.transform(testData)
    predictions.show(5, truncate=False)
Example #17
with Timer('read', 'Reading data'):
    df = df_base = spark.read.csv('data/yellow_tripdata_2016-01.csv',
                                  header=True,
                                  inferSchema=True)

with Timer('process', 'Cleaning invalid data'):
    df = process(df)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import StorageLevel

from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture(k=1000)

result = []
with Timer('clustering', 'Computing clusters'):
    for weekday in range(7):
        for hour in range(24):
            with Timer('clustering',
                       'Computing clusters for {}x{}'.format(weekday, hour)):
                df_h = df.filter(df.weekday == weekday).filter(df.hour == hour)
                va = VectorAssembler(
                    inputCols=["pickup_latitude", "pickup_longitude"],
                    outputCol="features")
                df_t = va.transform(df_h)

                model = gmm.fit(df_t)
Example #18
# Since latitude and longitude are already in the same scale, we can proceed.

# Select home latitude and longitude as the features:
selected = ["home_lat", "home_lon"]

# Assemble the feature vector:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=selected, outputCol="features")
assembled = assembler.transform(students)


# ## Fit a Gaussian mixture model

# Specify a Gaussian mixture model with two clusters:
from pyspark.ml.clustering import GaussianMixture
gm = GaussianMixture(featuresCol="features", k=2, seed=12345)

# Examine the hyperparameters:
print(gm.explainParams())

# Fit the Gaussian mixture model:
gmm = gm.fit(assembled)
type(gmm)


# ## Examine the Gaussian mixture model

# Examine the mixing weights:
gmm.weights

# Examine the (multivariate) Gaussian distributions:
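The snippet is truncated here; given the gaussiansDF accessor used throughout
these examples, the continuation was presumably along these lines:

gmm.gaussiansDF.show(truncate=False)  # assumed continuation, not in the original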
Example #19
k = args.k_clusters

if algorithm not in ['kmeans', 'gmm', 'lda', 'spectral']:
    raise ValueError('Not a valid algorithm')

ss = SparkSession.builder.getOrCreate()

df = ss.read.csv(path, header=True, inferSchema=True)

df_preprocessed = preprocessing(df, num_pca=num_pca_features)

df_preprocessed.write.parquet("preprocessed", mode="Overwrite")

if algorithm == 'kmeans':
    model = KMeans(k=k).setSeed(1).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'spectral':
    model = SpectralClustering(k=k, k_nearest=7)
    predictions = model.cluster(df_preprocessed, ss, repartition_num=num_nodes)
elif algorithm == 'lda':
    model = LDA(k=k, maxIter=10).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'gmm':
    model = GaussianMixture(k=k).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)


predictions.select([col for col in predictions.columns if col != 'features'])\
           .toPandas()\
           .to_csv(sys.stdout)
Example #20
obj = client.get_object(Bucket='yelpdatacf', Key='book_cf.csv')
df = pd.read_csv(obj['Body'])

df.rating = (df.rating - df.rating.mean())
ratings = spark.createDataFrame(df)

# use the model that has min RMSE
num_iter, param = 200, 0.2
als = ALS(maxIter=num_iter, regParam=param, userCol="user_id",
          itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings)

user_feature = model.userFactors
book_feature = model.itemFactors

k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(user_feature)
transformed = model.transform(user_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)


df.write.jdbc(url='jdbc:%s' % url + 'yelp',
              table='book_gm_user_feature20', mode='overwrite',
              properties=properties)


k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(book_feature)
transformed = model.transform(book_feature).select('id', 'prediction')
rows = transformed.collect()
Example #21
# negative_udf = udf(lambda x: tp_values(x, 1))
# #
#
# #
# train_df = train_df.withColumn('pos', positive_udf(col('ST')).astype(IntegerType())) \
#     .withColumn('neg', negative_udf(col('ST')).astype(IntegerType()))
#
# train_df.show()
#
# assembler = VectorAssembler(inputCols=['pos', 'neg'], outputCol='features')
# train_df = assembler.transform(train_df)
# train_df.show()

#modelling

kmeans = KMeans().setK(2).setSeed(1).setMaxIter(20)
model = kmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)
#

bkmeans = BisectingKMeans().setK(2).setSeed(1).setMaxIter(20)
model = bkmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)

gaussianmixture = GaussianMixture().setK(2).setSeed(1)
model = gaussianmixture.fit(train_df)
model.transform(test_df).show()
Example #22
times = []
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)  # average: presumably numpy.average, imported elsewhere

##############       GMM      #################
from pyspark.ml.clustering import GaussianMixture
times = []
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
gmmTime = average(times)

#preparation of data for non pyspark implementations
clusterData = tsneDataFrame.select("screen_name", "features").collect()
screenames = [x[0] for x in clusterData]
clData = [x[1] for x in clusterData]
clusData = np.array(clData)
x = [cl[0] for cl in clData]
y = [cl[1] for cl in clData]

###################   DBSCAN   ###################
Example #23
from pyspark.ml.clustering import GaussianMixture

g = sns.lmplot(x='X Coordinate', y='Y Coordinate', hue='Primary Type', data=crime_df,
               fit_reg=False, size=10, palette={'NARCOTICS': 'tomato', 'THEFT': 'skyblue'})

# for each type of crime
for crime_type, colour in [('NARCOTICS', 'r'), ('THEFT', 'b')]:
    crime_subset = (
        crime_with_features_sdf
        .filter(sf.col('Primary Type') == crime_type)
    )

    # fit a GMM
    gmm = GaussianMixture(k=30)
    model = gmm.fit(crime_subset)

    # extract the centers of the gaussians
    centers = (
        model
        .gaussiansDF
        .toPandas()
    )

    # Put the transformed data in a variable below
    crimes_with_predictions = model.transform(crime_subset)

    # 2.
    # Write code here
    ranked_gaussians = (
        crimes_with_predictions
        .withColumn('probability', get_probability_udf('probability'))
Example #24
q = "SELECT ss.ss_customer_id AS cid, count(CASE WHEN i.i_class_id=1  THEN 1 ELSE NULL END) AS id1,count(CASE WHEN i.i_class_id=3  THEN 1 ELSE NULL END) AS id3,count(CASE WHEN i.i_class_id=5  THEN 1 ELSE NULL END) AS id5,count(CASE WHEN i.i_class_id=7  THEN 1 ELSE NULL END) AS id7, count(CASE WHEN i.i_class_id=9  THEN 1 ELSE NULL END) AS id9,count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS id11,count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS id13,count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS id15,count(CASE WHEN i.i_class_id=2  THEN 1 ELSE NULL END) AS id2,count(CASE WHEN i.i_class_id=4  THEN 1 ELSE NULL END) AS id4,count(CASE WHEN i.i_class_id=6  THEN 1 ELSE NULL END) AS id6,count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS id12, count(CASE WHEN i.i_class_id=8  THEN 1 ELSE NULL END) AS id8,count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS id10,count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS id14,count(CASE WHEN i.i_class_id=16 THEN 1 ELSE NULL END) AS id16 FROM store_sales ss INNER JOIN items i ON ss.ss_item_id = i.i_item_id WHERE i.i_category_name IN ('cat#01','cat#02','cat#03','cat#04','cat#05','cat#06','cat#07','cat#08','cat#09','cat#10','cat#11','cat#12','cat#013','cat#14','cat#15') AND ss.ss_customer_id IS NOT NULL GROUP BY ss.ss_customer_id HAVING count(ss.ss_item_id) > 3"

df = spark.sql(q)

assembler = VectorAssembler(inputCols=[
    "cid", "id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9",
    "id10", "id11", "id12", "id13", "id14", "id15", "id16"
],
                            outputCol="FEATURE")

vd = assembler.transform(df)

cost = list()

gmm = GaussianMixture().setK(2).setFeaturesCol('FEATURE').setSeed(
    538009335).setTol(0.01)
model = gmm.fit(vd)

weights = model.weights
print(weights)
summary = model.summary
summary.k
logLikelihood = summary.logLikelihood

param = model.explainParams()

print(param)

model.gaussiansDF.select("mean").head()
model.gaussiansDF.select("cov").head()
model.gaussiansDF.show()
Example #25
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
from pyspark.sql import SparkSession
"""
A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
  bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("GaussianMixtureExample")\
        .getOrCreate()

    # $example on$
    # loads data
    dataset = spark.read.format("libsvm").load(
        "data/mllib/sample_kmeans_data.txt")

    gmm = GaussianMixture().setK(2).setSeed(538009335)
    model = gmm.fit(dataset)

    print("Gaussians shown as a DataFrame: ")
    model.gaussiansDF.show(truncate=False)
    # $example off$

    spark.stop()
Example #26
def SparkML(train_df,
            test_df=None,
            featuresCol='features',
            labelCol='label',
            binaryclass=False,
            multiclass=False,
            n_cluster=2,
            userCol='user',
            itemCol='item',
            ratingCol='rating',
            rank=10,
            userid=3,
            itemid=3,
            itemsCol='items',
            minSupport=0.3,
            minConfidence=0.8,
            stringIndexer=False,
            inputColStringIndexer=None,
            outputColStringIndexer=None,
            oneHotEncoder=False,
            inputColOneHotEncoder=None,
            outputColOneHotEncoder=None,
            vectorAssembler=False,
            inputColsVectorAssembler=None,
            outputColsVectorAssembler=None,
            vectorIndexer=False,
            inputColsVectorIndexer=None,
            outputColsVectorIndexer=None,
            maxCategories=None,
            classification=False,
            logisticregression=False,
            decisiontreeclassifier=False,
            linearsvc=False,
            naivebayes=False,
            randomforestclassifier=False,
            gbtclassifier=False,
            regression=False,
            linearregression=True,
            decisiontreeregressor=False,
            randomforestregressor=False,
            gbtregressor=False,
            clustering=False,
            kmeans=False,
            gaussianmixture=False,
            lda=False,
            recommendation=False,
            als=False,
            association=False,
            fpgrowth=False):
    if classification:
        if logisticregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRClassifier = LogisticRegression(featuresCol=featuresCol,
                                              labelCol=labelCol,
                                              predictionCol='Prediction',
                                              probabilityCol='Probability',
                                              rawPredictionCol='RawPrediction',
                                              standardization=True,
                                              maxIter=100,
                                              regParam=0.0,
                                              elasticNetParam=0.0,
                                              tol=1e-06,
                                              fitIntercept=True,
                                              threshold=0.5)
            paramGrid = ParamGridBuilder().addGrid(
                LRClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            LRCV = CrossValidator(estimator=LRClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score
        if decisiontreeclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTClassifier.impurity, ["gini", "entropy"]).addGrid(
                    DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        DTClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            DTCV = CrossValidator(estimator=DTClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score
        if linearsvc:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            SVClassifier = LinearSVC(featuresCol=featuresCol,
                                     labelCol=labelCol,
                                     predictionCol='Prediction',
                                     rawPredictionCol='RawPrediction',
                                     maxIter=100,
                                     regParam=0.0,
                                     tol=1e-06,
                                     fitIntercept=True,
                                     standardization=True,
                                     threshold=0.0)
            paramGrid = ParamGridBuilder().addGrid(
                SVClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    SVClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            SVCV = CrossValidator(estimator=SVClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score
        if naivebayes:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            NBClassifier = NaiveBayes(featuresCol=featuresCol,
                                      labelCol=labelCol,
                                      predictionCol='Prediction',
                                      probabilityCol='Probability',
                                      rawPredictionCol='RawPrediction',
                                      smoothing=1.0,
                                      modelType='multinomial',
                                      thresholds=None)
            paramGrid = ParamGridBuilder().addGrid(
                NBClassifier.smoothing,
                [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            NBCV = CrossValidator(estimator=NBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score
        if randomforestclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                numTrees=20,
                featureSubsetStrategy='auto',
                seed=None,
                subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                RFClassifier.impurity, ["gini", "entropy"]).addGrid(
                    RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        RFClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).addGrid(
                            RFClassifier.numTrees,
                            [5, 10, 20, 50, 100, 200]).addGrid(
                                RFClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            RFCV = CrossValidator(estimator=RFClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score
        if gbtclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBClassifier = GBTClassifier(featuresCol=featuresCol,
                                         labelCol=labelCol,
                                         predictionCol='Prediction',
                                         maxDepth=5,
                                         maxBins=32,
                                         minInstancesPerNode=1,
                                         minInfoGain=0.0,
                                         lossType='logistic',
                                         maxIter=20,
                                         stepSize=0.1,
                                         seed=None,
                                         subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBClassifier.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBClassifier.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol,
                predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score
    if regression:
        if linearregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRegressor = LinearRegression(featuresCol=featuresCol,
                                          labelCol=labelCol,
                                          predictionCol='Prediction',
                                          standardization=True,
                                          fitIntercept=True,
                                          loss='squaredError',
                                          maxIter=100,
                                          regParam=0.0,
                                          elasticNetParam=0.0,
                                          tol=1e-06,
                                          epsilon=1.35)
            paramGrid = ParamGridBuilder().addGrid(
                LRegressor.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRegressor.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score
        if decisiontreeregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTRegressor = DecisionTreeRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                seed=None,
                                                varianceCol=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score
        if randomforestregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFRegressor = RandomForestRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                subsamplingRate=1.0,
                                                seed=None,
                                                numTrees=20)
            paramGrid = ParamGridBuilder().addGrid(
                RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        RFRegressor.numTrees,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            RFRegressor.subsamplingRate,
                            [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score
        if gbtregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBRegressor = GBTRegressor(featuresCol=featuresCol,
                                       labelCol=labelCol,
                                       predictionCol='Prediction',
                                       maxDepth=5,
                                       maxBins=32,
                                       minInstancesPerNode=1,
                                       minInfoGain=0.0,
                                       subsamplingRate=1.0,
                                       lossType='squared',
                                       maxIter=20,
                                       stepSize=0.1,
                                       seed=None,
                                       impurity='variance')
            paramGrid = ParamGridBuilder().addGrid(
                GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBRegressor.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBRegressor.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBRegressor.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score
    if clustering:
        if kmeans:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction',
                              k=n_cluster,
                              initMode='k-means||',
                              initSteps=2,
                              tol=0.0001,
                              maxIter=20,
                              seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]).addGrid(
                    KCluster.maxIter,
                    [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                        KCluster.seed, [i for i in range(1001)]).build()
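            # Note: this grid enumerates 7 * 8 * 1001 = 56,056 candidate
            # models per fold; the 1,001-value seed grid dominates the cost,
            # and the seed is rarely worth tuning beyond a handful of values.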
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score
        if gaussianmixture:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster,
                                        tol=0.01,
                                        maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                GMCluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    GMCluster.seed, [i for i in range(1001)]).build()
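            # Note: the seed grid again dominates the search (8 * 1001 =
            # 8,008 candidates per fold); a few seed values would suffice.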
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score
        if lda:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LDACluster = LDA(featuresCol=featuresCol,
                             maxIter=20,
                             seed=None,
                             k=n_cluster,
                             learningOffset=1024.0,
                             learningDecay=0.51,
                             subsamplingRate=0.05)
            paramGrid = ParamGridBuilder().addGrid(
                LDACluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    LDACluster.seed, [i for i in range(1001)]).addGrid(
                        LDACluster.subsamplingRate,
                        [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).build()
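            # Caveat: pyspark's LDA has no predictionCol; transform() emits a
            # 'topicDistribution' vector, so the ClusteringEvaluator below
            # will not find the 'Prediction' column it expects. Deriving a
            # hard assignment (e.g. the argmax topic) or scoring with
            # model.logPerplexity()/logLikelihood() is needed instead.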
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            LDACV = CrossValidator(estimator=LDACluster,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score
    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol,
                       itemCol=itemCol,
                       ratingCol=ratingCol,
                       rank=rank,
                       maxIter=10,
                       regParam=0.1,
                       numUserBlocks=10,
                       numItemBlocks=10,
                       alpha=1.0,
                       seed=1)
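            # Note: alpha only applies to implicit-feedback ALS; with the
            # default implicitPrefs=False it has no effect.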
            ALSR_Model = ALSR.fit(train_df)
            # recommendForAllUsers/recommendForAllItems take the number of
            # recommendations to return (numItems/numUsers), not an id; here
            # 'userid' and 'itemid' are treated as those counts.
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(numItems=userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(numUsers=itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems
    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport,
                           minConfidence=minConfidence,
                           itemsCol=itemsCol,
                           predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
    async def gaussian_mixture(self, df):
        return GaussianMixture().fit(df.select('features'))
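# A minimal standalone sketch of the FPGrowth path above, using an assumed toy
# transactions DataFrame (the names below are illustrative, not from the
# original): mine frequent itemsets and derive association rules.
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

spark = SparkSession.builder.appName("FPGrowthSketch").getOrCreate()
transactions = spark.createDataFrame(
    [(0, ["a", "b"]), (1, ["a", "b", "c"]), (2, ["a", "c"])],
    ["id", "items"])
fpg = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
fpg_model = fpg.fit(transactions)
fpg_model.freqItemsets.show()       # itemsets with support >= 0.5
fpg_model.associationRules.show()   # rules with confidence >= 0.6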
示例#28
    # Imports assumed for this example (the original snippet omitted them):
    from pyspark.sql import SparkSession, Row
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.clustering import GaussianMixture

    spark = SparkSession \
        .builder \
        .appName("GaussianMixtureExample") \
        .getOrCreate()
    raw_data = spark.sparkContext.textFile("file:///home/tianlei/iris.txt")

    def f(x):
        rel = {}
        rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]),
                                        float(x[3]))
        return rel

    # Build a DataFrame of dense feature vectors from the iris data.
    df = raw_data.map(lambda line: line.split(',')).map(
        lambda p: Row(**f(p))).toDF()
    # Build a simple GaussianMixture object with the number of clusters set
    # to 3; all other parameters keep their default values.
    gm = GaussianMixture().setK(3).setPredictionCol(
        "Prediction").setProbabilityCol("Probability")
    gmm = gm.fit(df)
    # After processing the dataset with transform(), print it to see each
    # sample's predicted cluster and its probability distribution vector
    # (for clarity, most rows are omitted and only a few are selected):
    result = gmm.transform(df)
    result.show(150, False)
    # Once the model is trained, its parameters can be inspected. Unlike
    # KMeans, GMM does not directly give cluster centers; instead it gives
    # the parameters of each mixture component (a multivariate Gaussian
    # distribution). In the ML implementation, each mixture component is
    # stored as a MultivariateGaussian (in the
    # org.apache.spark.ml.stat.distribution package). Use the weights member
    # of GaussianMixtureModel to get each component's weight, and the
    # gaussiansDF member to get each component's parameters (mean vector and
    # covariance matrix):
    gaussians = gmm.gaussiansDF.collect()  # one Row per mixture component
    for i in range(3):
        print("Component " + str(i) + " : weight is " + str(gmm.weights[i]) +
              "\n mu vector is " + str(gaussians[i]['mean']) +
              " \n sigma matrix is " + str(gaussians[i]['cov']))
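    # Optional extension (a sketch, not part of the original example): score
    # the clustering with a silhouette measure via ClusteringEvaluator.
    from pyspark.ml.evaluation import ClusteringEvaluator

    evaluator = ClusteringEvaluator(predictionCol="Prediction",
                                    featuresCol="features",
                                    metricName="silhouette")
    print("silhouette = " + str(evaluator.evaluate(result)))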