Example #1
 def test_bisecting_kmeans(self):
     from numpy import array
     from pyspark.mllib.clustering import BisectingKMeans
     data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
     bskm = BisectingKMeans()
     model = bskm.train(self.sc.parallelize(data, 2), k=4)
     p = array([0.0, 0.0])
     rdd_p = self.sc.parallelize([p])
     self.assertEqual(model.predict(p), model.predict(rdd_p).first())
     self.assertEqual(model.computeCost(p), model.computeCost(rdd_p))
     self.assertEqual(model.k, len(model.clusterCenters))
Example #2
def doublekmeans(data, year):
    # relies on module-level globals: sc (SparkContext), clean_data (the full
    # pandas DataFrame), np (numpy), and KMeans / BisectingKMeans from
    # pyspark.mllib.clustering
    data = data.loc[year, 'pm2.5']
    # k-means on the selected year's pm2.5 readings
    data = sc.parallelize(data)
    cluster_no = 2
    maxIter = 30
    clusters = KMeans.train(data,cluster_no,maxIter)
    #find 1.0 labels
    tdata = data.collect()
    cluster_info = np.zeros(len(tdata))
    label = []
    for i in range(0,len(tdata)):
        cluster_info[i]=clusters.predict(np.array(tdata[i]))
        if cluster_info[i]==1.0:
            label.append(i)
    #selecting the rows labelled 1.0 and preparing the data
    data1 = clean_data.drop(['Is','Ir','cbwd','No','month','year','hour','day'],axis = 1)
    data2 = data1.iloc[label]
    data2 = sc.parallelize(data2.values)  # .as_matrix() is deprecated; .values gives the ndarray
    
    #bisecting kmeans
    data2.collect()  # materializes the RDD; the result is not used
    cluster_no = 2
    maxiter = 30
    model = BisectingKMeans.train(data2,cluster_no,maxiter)
    
    return clusters.centers,model.centers, model.computeCost(data2)
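For context, a minimal sketch of the module-level setup this function assumes; the file name and the exact loading code are assumptions, not taken from the original source:

import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, BisectingKMeans

sc = SparkContext(appName="doublekmeans")
# clean_data: the full pandas DataFrame (e.g. the Beijing pm2.5 dataset),
# indexed by year so that data.loc[year, 'pm2.5'] selects one year's readings;
# 'year' is kept as a column as well because doublekmeans drops it explicitly
clean_data = pd.read_csv("pm25.csv").set_index("year", drop=False)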
Example #3
def BisectingKMeans_clustering(data):
    # requires `from numpy import array` and
    # `from pyspark.mllib.clustering import BisectingKMeans` at module level

    # data is an RDD of space-separated text lines; parse each into a numpy array
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    print("\n-----------------------------------------------------------------------------")
    print("\n          Cluster Centers (BisectingKMeans)")
    print("\n-----------------------------------------------------------------------------")

    print(model.clusterCenters)
Example #4
def bisecting_k_means(unclustered_data,
                      number_of_clusters,
                      max_iterations=5,
                      seed=None,
                      min_divisible_cluster_size=1.0):

    if number_of_clusters < 1:
        raise ValueError("While clustering with BisectingKMeans, "
                         "the given number of clusters is not positive")

    model = BisectingKMeans.train(
        rdd=unclustered_data,
        k=number_of_clusters,
        maxIterations=max_iterations,
        seed=seed,
        minDivisibleClusterSize=min_divisible_cluster_size)
    cost = model.computeCost(unclustered_data)
    return [model, cost]
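A minimal usage sketch, assuming a live SparkContext named sc and from numpy import array; the sample points are illustrative:

points = sc.parallelize([array([0.0, 0.0]), array([1.0, 1.0]),
                         array([9.0, 8.0]), array([8.0, 9.0])])
model, cost = bisecting_k_means(points, number_of_clusters=2)
print(model.clusterCenters, cost)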
Example #5
def main():
    # relies on module-level names defined elsewhere: sc, dataFile, baseFile,
    # executor_num, EVALUATION, load_data and select_fingerprints
    compounds = load_data(sc, dataFile)
    compounds.partitionBy(executor_num)  # NOTE: the result is discarded; RDDs are immutable, so this line has no effect
    fingerprints = select_fingerprints(compounds).cache()

    fp_only = fingerprints.map(lambda t: t[2])  # (id, smi, fp, name) -> fp; tuple-unpacking lambdas are Python 2 only

    for x in [1500, 2000]:
        start_time = time.time()
        model = BisectingKMeans.train(fp_only, k=x)
        #print(model.clusterCenters)
        #print("Clusters " ,len(model.clusterCenters))

        cost = model.computeCost(fp_only)
        #model.save(sc, baseFile + '/btreemodel')
        print("Bisecting " + str(cost))

        #model.clusterCenters.foreach(lambda ctr : print("Cluster Center"))

        all_fps = fingerprints.collect()
        cluster_assignment = []
        end_time1 = time.time()
        print("Clustering Time taken ", x, end_time1 - start_time)
        for fp in all_fps:
            cluster_assignment.append('{} {} {}'.format(
                fp[1], fp[3], model.predict(fp[2])))
            #print ( "FP ", fp[0], " SMI: ", fp[1], " ", model.predict(fp[2]))

        end_time = time.time()
        print("Total Time taken ", x, end_time - start_time)
        if EVALUATION:
            header = sc.parallelize(["smiles Name Cluster"])
            clusters = sc.parallelize(cluster_assignment)

            output_file = header.union(clusters)
            #output_file.foreach(output)
            # output_file.saveAsTextFile("../mols/resultsSpark/result")
            current_time_milli = int(round(time.time() * 1000))
            outputextension = str(current_time_milli)
            output_file.coalesce(1).saveAsTextFile(baseFile + "/output" +
                                                   str(x) + "/result" +
                                                   outputextension)

    sc.stop()
Example #6
# assumes module-level imports along the lines of: import datetime as dt,
# import dateutil.parser as par, pyspark.ml.linalg, a timer function,
# MinMaxScaler / Vectors / BisectingKMeans, plus a SparkSession named spark
# and a SparkContext named sc
today = dt.datetime.today()
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[
            0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])
scaler = MinMaxScaler(inputCol="_1", outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(
    lambda x: Vectors.dense(x))
num_clusters = 4

#Input into the Algorithm
km = BisectingKMeans()

start = timer()
kme = km.train(vector_df, k=num_clusters, maxIterations=20, seed=2018)
end = timer()
print(end - start)
centers = kme.clusterCenters

err = vector_df.map(lambda x: (x[0], findCenter(x[0], centers))).collect()

per_clus = [0] * num_clusters
per_clus_num = [0] * num_clusters

#Silhouette Value comparison
ag = 0
agi = 1200
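findCenter is not defined in this fragment; a plausible sketch of such a helper (hypothetical, assuming numpy is imported as np) that returns the index of the nearest cluster center:

def findCenter(point, centers):
    # index of the cluster center closest to point, by Euclidean distance
    distances = [np.linalg.norm(np.asarray(point) - np.asarray(c)) for c in centers]
    return int(np.argmin(distances))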
Example #7
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel

# spark-submit mllib_k_means.py

# Bisecting K-Means
# K-Means is one of the most commonly used clustering algorithms: it is simple, easy to understand, and fast
# K-Means is an unsupervised classification (clustering) algorithm
if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")

    data = sc.textFile("kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    cost = model.computeCost(parsedData)
    print("Final centers: " + str(model.clusterCenters))
    print("Bisecting K-means Cost = " + str(cost))
    sc.stop()
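kmeans_data.txt is expected to contain one space-separated point per line; the sample file shipped with Spark (data/mllib/kmeans_data.txt) looks like this:

0.0 0.0 0.0
0.1 0.1 0.1
0.2 0.2 0.2
9.0 9.0 9.0
9.1 9.1 9.1
9.2 9.2 9.2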
Example #8

from __future__ import print_function

# $example on$
from numpy import array
# $example off$

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonBisectingKMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    # Evaluate clustering
    cost = model.computeCost(parsedData)
    print("Bisecting K-means Cost = " + str(cost))
    # $example off$

    sc.stop()
Example #9
tfidf = idf.transform(tf)

# In[84]:

# tfidf.collect()

# In[85]:

if algorithm == "K":
    clusters = KMeans.train(tfidf,
                            8,
                            maxIterations=20,
                            initializationMode="random",
                            seed=42)
else:
    clusters = BisectingKMeans.train(tfidf, 8, maxIterations=20, seed=42)
    clusterCenters = clusters.clusterCenters

# In[ ]:

# In[86]:

documentModel = documents1.zip(tfidf)
# cluster_broadcast = sc.broadcast(clusters)

# In[87]:


def findErrorWC(document, clusters):
    documentWords = document[0]
    documentTfidf = document[1]