Example #1
 def test_bisecting_kmeans(self):
     from numpy import array
     from pyspark.mllib.clustering import BisectingKMeans
     data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
     bskm = BisectingKMeans()
     model = bskm.train(self.sc.parallelize(data, 2), k=4)
     p = array([0.0, 0.0])
     rdd_p = self.sc.parallelize([p])
     self.assertEqual(model.predict(p), model.predict(rdd_p).first())
     self.assertEqual(model.computeCost(p), model.computeCost(rdd_p))
     self.assertEqual(model.k, len(model.clusterCenters))
Example #2
def doublekmeans(data, year):
    # relies on module-level globals: sc (SparkContext), clean_data (the full
    # pandas DataFrame), np (numpy), and KMeans / BisectingKMeans from
    # pyspark.mllib.clustering
    data = data.loc[year, 'pm2.5']
    # k-means on the selected year's pm2.5 readings
    data = sc.parallelize(data)
    cluster_no = 2
    maxIter = 30
    clusters = KMeans.train(data,cluster_no,maxIter)
    #find 1.0 labels
    tdata = data.collect()
    cluster_info = np.zeros(len(tdata))
    label = []
    for i in range(0,len(tdata)):
        cluster_info[i]=clusters.predict(np.array(tdata[i]))
        if cluster_info[i]==1.0:
            label.append(i)
    #selecting the rows labelled 1.0 and preparing the data
    data1 = clean_data.drop(['Is','Ir','cbwd','No','month','year','hour','day'],axis = 1)
    data2 = data1.iloc[label]
    data2 = sc.parallelize(data2.values)  # .as_matrix() is deprecated; .values gives the ndarray
    
    #bisecting kmeans
    data2.collect()  # materializes the RDD; the result is not used
    cluster_no = 2
    maxiter = 30
    model = BisectingKMeans.train(data2,cluster_no,maxiter)
    
    return clusters.centers,model.centers, model.computeCost(data2)
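For context, a minimal sketch of the module-level setup this function assumes; the file name and the exact loading code are assumptions, not taken from the original source:

import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, BisectingKMeans

sc = SparkContext(appName="doublekmeans")
# clean_data: the full pandas DataFrame (e.g. the Beijing pm2.5 dataset),
# indexed by year so that data.loc[year, 'pm2.5'] selects one year's readings;
# 'year' is kept as a column as well because doublekmeans drops it explicitly
clean_data = pd.read_csv("pm25.csv").set_index("year", drop=False)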
Example #3
def BisectingKMeans_clustering(data):
    # requires `from numpy import array` and
    # `from pyspark.mllib.clustering import BisectingKMeans` at module level

    # data is an RDD of space-separated text lines; parse each into a numpy array
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    print("\n-----------------------------------------------------------------------------")
    print("\n          Cluster Centers (BisectingKMeans)")
    print("\n-----------------------------------------------------------------------------")

    print(model.clusterCenters)
Example #4
def bisecting_k_means(unclustered_data,
                      number_of_clusters,
                      max_iterations=5,
                      seed=None,
                      min_divisible_cluster_size=1.0):

    if number_of_clusters < 1:
        raise ValueError("While clustering with BisectingKMeans, "
                         "the given number of clusters is not positive")

    model = BisectingKMeans.train(
        rdd=unclustered_data,
        k=number_of_clusters,
        maxIterations=max_iterations,
        seed=seed,
        minDivisibleClusterSize=min_divisible_cluster_size)
    cost = model.computeCost(unclustered_data)
    return [model, cost]
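A minimal usage sketch, assuming a live SparkContext named sc and from numpy import array; the sample points are illustrative:

points = sc.parallelize([array([0.0, 0.0]), array([1.0, 1.0]),
                         array([9.0, 8.0]), array([8.0, 9.0])])
model, cost = bisecting_k_means(points, number_of_clusters=2)
print(model.clusterCenters, cost)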
Example #5
def main():
    # relies on module-level names defined elsewhere: sc, dataFile, baseFile,
    # executor_num, EVALUATION, load_data and select_fingerprints
    compounds = load_data(sc, dataFile)
    compounds.partitionBy(executor_num)  # NOTE: the result is discarded; RDDs are immutable, so this line has no effect
    fingerprints = select_fingerprints(compounds).cache()

    fp_only = fingerprints.map(lambda t: t[2])  # (id, smi, fp, name) -> fp; tuple-unpacking lambdas are Python 2 only

    for x in [1500, 2000]:
        start_time = time.time()
        model = BisectingKMeans.train(fp_only, k=x)
        #print(model.clusterCenters)
        #print("Clusters " ,len(model.clusterCenters))

        cost = model.computeCost(fp_only)
        #model.save(sc, baseFile + '/btreemodel')
        print("Bisecting " + str(cost))

        #model.clusterCenters.foreach(lambda ctr : print("Cluster Center"))

        all_fps = fingerprints.collect()
        cluster_assignment = []
        end_time1 = time.time()
        print("Clustering Time taken ", x, end_time1 - start_time)
        for fp in all_fps:
            cluster_assignment.append('{} {} {}'.format(
                fp[1], fp[3], model.predict(fp[2])))
            #print ( "FP ", fp[0], " SMI: ", fp[1], " ", model.predict(fp[2]))

        end_time = time.time()
        print("Total Time taken ", x, end_time - start_time)
        if EVALUATION:
            header = sc.parallelize(["smiles Name Cluster"])
            clusters = sc.parallelize(cluster_assignment)

            output_file = header.union(clusters)
            #output_file.foreach(output)
            # output_file.saveAsTextFile("../mols/resultsSpark/result")
            current_time_milli = int(round(time.time() * 1000))
            outputextension = str(current_time_milli)
            output_file.coalesce(1).saveAsTextFile(baseFile + "/output" +
                                                   str(x) + "/result" +
                                                   outputextension)

    sc.stop()
Example #6
# assumes module-level imports along the lines of: import datetime as dt,
# import dateutil.parser as par, pyspark.ml.linalg, a timer function,
# MinMaxScaler / Vectors / BisectingKMeans, plus a SparkSession named spark
# and a SparkContext named sc
today = dt.datetime.today()
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[
            0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])
scaler = MinMaxScaler(inputCol="_1", outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(
    lambda x: Vectors.dense(x))
num_clusters = 4

#Input into the Algorithm
km = BisectingKMeans()

start = timer()
kme = km.train(vector_df, k=num_clusters, maxIterations=20, seed=2018)
end = timer()
print(end - start)
centers = kme.clusterCenters

err = vector_df.map(lambda x: (x[0], findCenter(x[0], centers))).collect()

per_clus = [0] * num_clusters
per_clus_num = [0] * num_clusters

#Silhouette Value comparison
ag = 0
agi = 1200
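findCenter is not defined in this fragment; a plausible sketch of such a helper (hypothetical, assuming numpy is imported as np) that returns the index of the nearest cluster center:

def findCenter(point, centers):
    # index of the cluster center closest to point, by Euclidean distance
    distances = [np.linalg.norm(np.asarray(point) - np.asarray(c)) for c in centers]
    return int(np.argmin(distances))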
Example #7
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel

# spark-submit mllib_k_means.py

# Bisecting K-Means
# K-Means is one of the most commonly used clustering algorithms: it is simple, easy to understand, and fast
# K-Means is an unsupervised classification (clustering) algorithm
if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")

    data = sc.textFile("kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    cost = model.computeCost(parsedData)
    print("Final centers: " + str(model.clusterCenters))
    print("Bisecting K-means Cost = " + str(cost))
    sc.stop()
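kmeans_data.txt is expected to contain one space-separated point per line; the sample file shipped with Spark (data/mllib/kmeans_data.txt) looks like this:

0.0 0.0 0.0
0.1 0.1 0.1
0.2 0.2 0.2
9.0 9.0 9.0
9.1 9.1 9.1
9.2 9.2 9.2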
Example #8

from __future__ import print_function

# $example on$
from numpy import array
# $example off$

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonBisectingKMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    # Evaluate clustering
    cost = model.computeCost(parsedData)
    print("Bisecting K-means Cost = " + str(cost))
    # $example off$

    sc.stop()
Example #9
tfidf = idf.transform(tf)

# In[84]:

# tfidf.collect()

# In[85]:

if algorithm == "K":
    clusters = KMeans.train(tfidf,
                            8,
                            maxIterations=20,
                            initializationMode="random",
                            seed=42)
else:
    clusters = BisectingKMeans.train(tfidf, 8, maxIterations=20, seed=42)
    clusterCenters = clusters.clusterCenters

# In[ ]:

# In[86]:

documentModel = documents1.zip(tfidf)
# cluster_broadcast = sc.broadcast(clusters)

# In[87]:


def findErrorWC(document, clusters):
    documentWords = document[0]
    documentTfidf = document[1]