Example No. 1
    def test_gmm_deterministic(self):
        from pyspark.mllib.clustering import GaussianMixture

        x = range(0, 100, 10)
        y = range(0, 100, 10)
        data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
        clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=10, seed=63)
        clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=10, seed=63)
        for c1, c2 in zip(clusters1.weights, clusters2.weights):
            self.assertEqual(round(c1, 7), round(c2, 7))
Example No. 2
    def test_gmm_with_initial_model(self):
        from pyspark.mllib.clustering import GaussianMixture

        data = self.sc.parallelize([(-10, -5), (-9, -4), (10, 5), (9, 4)])

        gmm1 = GaussianMixture.train(data, 2, convergenceTol=0.001, maxIterations=10, seed=63)
        gmm2 = GaussianMixture.train(
            data, 2, convergenceTol=0.001, maxIterations=10, seed=63, initialModel=gmm1
        )
        self.assertAlmostEqual((gmm1.weights - gmm2.weights).sum(), 0.0)
Example No. 3
    def test_gmm_deterministic(self):
        from pyspark.mllib.clustering import GaussianMixture

        x = range(0, 100, 10)
        y = range(0, 100, 10)
        data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
        clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=100, seed=63)
        clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=100, seed=63)
        for c1, c2 in zip(clusters1.weights, clusters2.weights):
            self.assertEqual(round(c1, 7), round(c2, 7))
Example No. 4
    def test_gmm_with_initial_model(self):
        from pyspark.mllib.clustering import GaussianMixture
        data = self.sc.parallelize([
            (-10, -5), (-9, -4), (10, 5), (9, 4)
        ])

        gmm1 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                     maxIterations=10, seed=63)
        gmm2 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                     maxIterations=10, seed=63, initialModel=gmm1)
        self.assertAlmostEqual((gmm1.weights - gmm2.weights).sum(), 0.0)
Example No. 5
def gmm_spark(sc, X=None, clusters=3):
    if X is None:
        X = users_as_parallelizable_sparse_data(users)
    X = sc.parallelize(X)
    gmm = GaussianMixture.train(X, k=clusters)
    for i in range(clusters):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu, "sigma = ", gmm.gaussians[i].sigma.toArray())
Example No. 6
def gmm_spark(sc, X=None, clusters=3):
    if X is None:
        X = users_as_parallelizable_sparse_data(users)
    X = sc.parallelize(X)
    gmm = GaussianMixture.train(X, k=clusters)
    for i in range(clusters):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())
Example No. 7
    def test_gmm(self):
        from pyspark.mllib.clustering import GaussianMixture

        data = self.sc.parallelize([[1, 2], [8, 9], [-4, -3], [-6, -7]])
        clusters = GaussianMixture.train(data, 2, convergenceTol=0.001, maxIterations=100, seed=56)
        labels = clusters.predict(data).collect()
        self.assertEqual(labels[0], labels[1])
        self.assertEqual(labels[2], labels[3])
Example No. 8
def main():
    sc = SparkContext(master="local", appName="K-Means")
    try:
        # csv = sc.textFile(sys.argv[1]) if input via cmd
        csv = sc.textFile("kmeans_data.csv")
    except IOError:
        print('No such file')
        exit(1)

    parsedData = csv.map(parseLine)
    trueValue = csv.map(getTrueValue)
    # print for debugging
    print("number of features: ", len(parsedData.collect()[0]))
    # Build the model (cluster the data), K = 2
    clusters = KMeans.train(parsedData,
                            2,
                            maxIterations=50,
                            initializationMode="random")
    g_clusters = GaussianMixture.train(parsedData, 2)
    centers = clusters.clusterCenters
    # g_centers = g_clusters.clusterCenters
    print("Final k centers:", centers)  # print for debugging purpose
    # print("Final k centers for expectation maximization:", g_centers)

    # for each data point, generate its cluster label:
    predictedLabels = parsedData.map(
        lambda point: closestCluster(point, centers))
    # g_predictedLabels = parsedData.map(lambda point: closestCluster(point, g_centers))
    g_predictedLabels = g_clusters.predict(parsedData)
    results = predictedLabels.collect()
    g_results = g_predictedLabels.collect()
    true = trueValue.collect()
    accuracy_count = 0  # count how many data points having correct labels
    # output in results.txt: i-th row: true label, predicted label for i-th data point:
    g_accuracy_count = 0
    with open("results.txt", "w") as f:
        f.write("true\tpredicted\n")
        for i in range(len(results)):
            f.write(str(true[i]) + "\t" + str(results[i]) + "\n")
            if int(true[i]) == int(results[i]):
                accuracy_count += 1
            if int(true[i]) == int(g_results[i]):
                g_accuracy_count += 1

    accuracy = accuracy_count / len(results)
    g_accuracy = g_accuracy_count / len(results)
    if accuracy < 0.5:  # our predicted label IDs might be opposite
        accuracy = 1 - accuracy

    if g_accuracy < 0.5:
        g_accuracy = 1 - g_accuracy
    print("accuracy is :", accuracy)
    print("EM accuracy is : ", g_accuracy)
    sc.stop()
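
The label flip above (accuracy = 1 - accuracy) only repairs swapped cluster IDs in the two-cluster case. As a rough sketch of a more general remedy, the hypothetical helper below maps each predicted cluster ID to the majority true label among its members before scoring; it assumes two equal-length Python lists such as true and results from the script above.

from collections import Counter

def majority_vote_accuracy(true_labels, predicted_labels):
    # group the true labels by the cluster each point was assigned to
    by_cluster = {}
    for t, p in zip(true_labels, predicted_labels):
        by_cluster.setdefault(p, []).append(t)
    # relabel every cluster with the most common true label inside it
    mapping = {p: Counter(ts).most_common(1)[0][0] for p, ts in by_cluster.items()}
    hits = sum(1 for t, p in zip(true_labels, predicted_labels) if mapping[p] == t)
    return hits / len(true_labels)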
Example No. 9
 def test_gmm(self):
     from pyspark.mllib.clustering import GaussianMixture
     data = self.sc.parallelize([
         [1, 2],
         [8, 9],
         [-4, -3],
         [-6, -7],
     ])
     clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                      maxIterations=100, seed=56)
     labels = clusters.predict(data).collect()
     self.assertEqual(labels[0], labels[1])
     self.assertEqual(labels[2], labels[3])
Example No. 10
def cluster(sc, sample):
    sample = sc.parallelize(sample)
    testdata = sc.parallelize([[5, 5]])

    ######
    #    kmeansmodel = KMeans.train(sample,3)

    #    print kmeansmodel.centers
    #    print kmeansmodel.predict([5,5])

    gmmmodel = GaussianMixture.train(sample, 3, maxIterations=10)

    #    print gmmmodel.weights
    print(gmmmodel.predict(testdata).collect())
Example No. 11
def find_outliers_Gaussian(sequence, distance_factor=6):
    df = sequence
    df_vector = df.map(lambda x: np.array(float(x)))

    gmm = GaussianMixture.train(df_vector, 1)

    mu, sigma = list(zip(*[(g.mu, g.sigma) for g in gmm.gaussians]))

    m = mu[0].values
    s = sqrt(sigma[0].values)

    l = np.array(df_vector.collect())
    d = abs(l - m)
    outliers = list(set(list(l[d >= distance_factor * s])))
    filtered = sequence.filter(lambda x: x not in outliers)
    return outliers, filtered
Example No. 12
def find_collective_outliers_KGaussians(sequence,
                                        k=5,
                                        proportion=0.1,
                                        ratio=10):
    df = sequence
    df_vector = df.map(lambda x: np.array(float(x)))
    gmm = GaussianMixture.train(df_vector, k)
    labels = gmm.predict(df_vector)
    w = gmm.weights
    l = []
    point_label = df_vector.zip(labels)
    mus, sigmas = list(zip(*[(g.mu, g.sigma) for g in gmm.gaussians]))
    m = []
    for i in range(k):
        m.append(float(mus[i].values))
    m1 = m[:]
    removed = []
    not_removed = []
    for i in range(len(w)):
        w_i = w[i]
        if w_i < proportion / k:
            removed.append(i)
        else:
            not_removed.append(i)
    for e in removed:
        l = l + point_label.filter(lambda x: x[1] == e).map(
            lambda x: float(x[0])).collect()
    m = np.array(m)
    if not_removed:
        m = m[not_removed]
    n = list(m).index(max(m))
    try:
        p = not_removed[n]
        m = sorted(m)
        a = m[0]
        b = m[-2]
        c = m[-1]
        if ratio * b / a < c / b:
            l = l + point_label.filter(lambda x: x[1] == p).map(
                lambda x: float(x[0])).collect()
        #return l+m1+list(w)
        return l
    except IndexError:
        return []
Example No. 13
 def __gauss_clustering(self):
     # get the whole population without fitness value, then flat it
     rdd_aux = self.__rdd.flatMap(lambda x: x.get_population(fitness=False))
     # train the gauss cluster
     gauss_cluster = GaussianMixture.train(
         rdd_aux, self.__colonies, maxIterations=self.__cluster_iterations)
     # create a new rdd with the labels
     rdd_labels = gauss_cluster.predict(rdd_aux)
     # zip each result with its class
     rdd_aux = rdd_labels.zip(rdd_aux)
     # input serialization
     cols = self.__colonies
     self.__sc.broadcast(cols)
     # divide into partitions
     rdd_aux = rdd_aux.partitionBy(cols, partitionFunc=lambda x: x).glom()
     # remove the index of each element
     rdd_aux = rdd_aux.map(lambda x: [y[1] for y in x])
     # input serialization
     evaluation = self.__evaluation
     generation = self.__generation
     cross = self.__cross
     mutation = self.__mutation
     selection = self.__selection
     survival = self.__survival
     mut_ratio = self.__mut_ratio
     survival_ratio = self.__survival_ratio
     control_obj = self.__control_obj
     # create the new colonies
     self.__rdd = rdd_aux.map(
         lambda x: Colony(evaluation,
                          generation,
                          cross=cross,
                          mutation=mutation,
                          selection=selection,
                          mut_ratio=mut_ratio,
                          survival_ratio=survival_ratio,
                          survival=survival,
                          control_obj=control_obj,
                          population=x))
Example No. 14
def gaussian_mixture(unclustered_data,
                     number_of_clusters,
                     max_iterations=100,
                     seed=None,
                     initial_model=None):

    if number_of_clusters < 1:
        raise ValueError("While clustering with GaussianMixture, \
                the given number of clusters is not positive")

    gmm = GaussianMixture.train(rdd=unclustered_data,
                                k=number_of_clusters,
                                maxIterations=max_iterations,
                                seed=seed,
                                initialModel=initial_model)
    parameters = []
    for i in range(number_of_clusters):
        parameters.append({
            "weight": gmm.weights[i],
            "mu": gmm.gaussians[i].mu,
            "sigma": gmm.gaussians[i].sigma.toArray()
        })
    return [gmm, parameters]
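
A minimal usage sketch for the wrapper above; the SparkContext sc and the toy points are assumptions for illustration only.

# hypothetical toy data: two well-separated 2-D blobs
points = sc.parallelize([[0.0, 0.1], [0.2, -0.1], [10.0, 9.9], [9.8, 10.2]])
model, parameters = gaussian_mixture(points, number_of_clusters=2, max_iterations=50, seed=42)
for p in parameters:
    print(p["weight"], p["mu"], p["sigma"])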
Example No. 15
def find_outliers_KGuaussians(sequence,
                              k=2,
                              proportion=0.95,
                              distance_factor=3):
    #currently please take input as a list and return a list
    #for now, k=2
    df = sequence
    df_vector = df.map(lambda x: np.array(float(x)))

    gmm = GaussianMixture.train(df_vector, k)
    labels = gmm.predict(df_vector).collect()

    labels = np.array(labels)
    n = len(labels)
    c0 = len(labels[labels == 0])
    c1 = n - c0

    mus, sigmas = list(zip(*[(g.mu, g.sigma) for g in gmm.gaussians]))

    m = []
    s = []
    for i in range(k):
        m.append(float(mus[i].values))
        s.append(float(sigmas[i].values))

    m = np.array(m)
    s = np.array(s)

    l = np.array(df_vector.collect())
    if abs(m[0] - m[1]) > distance_factor * (sqrt(s[0]) + sqrt(s[1])):
        if c0 / n > proportion:
            return list(l[labels == 1])
        elif c1 / n > proportion:
            return list(l[labels == 0])
        else:
            return []
    return []
Example No. 16
# -*- coding:utf-8 -*-
"""
Program: GMM
Description: Example of calling Spark's built-in GMM algorithm
Author: zhenglei - [email protected]
Date: 2016-01-14 13:38:58
Last modified: 2016-01-14 13:50:11
Python release: 2.7
"""
# Use Spark's built-in clustering to reproduce the chapter 10 example
# from "Machine Learning in Action"
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture

if __name__ == '__main__':
    sc = SparkContext()
    datas = sc.textFile('testSet.txt')
    clusters_num = 4
    parseData = datas.map(lambda x: array([float(y) for y in x.split('\t')]))
    model = GaussianMixture.train(parseData, clusters_num, maxIterations=10)
    clusters = [[] for i in range(clusters_num)]
    labels = model.predict(parseData).collect()
    points = parseData.collect()  # collect once instead of inside the loop
    for i in range(len(labels)):
        clusters[labels[i]].append(points[i])
    print(clusters)
    sc.stop()
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, labelnameflag, fromweb, src_filename
    , jobname ): 

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path
        
    #data_folder = hdfs_feat_dir + "/"
    #local_out_dir = local_out_dir + "/"
    #if os.path.exists(local_out_dir): 
    #    shutil.rmtree(local_out_dir) # to keep smaplelist file
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)
            
    # init Spark context ====
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path]) 

    # start here =================================================================== ===============
    t0 = time()
        
    
    ### Need to check if PCA available here ===========================
    libsvm_data_file = os.path.join(hdfs_feat_dir , src_filename) # need to set k numb in filename somehow
    print "INFO: libsvm_data_file=", libsvm_data_file
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file).cache()
    # load sample RDD from text file   
    # format (LabeledPoint,hash) from str2LabeledPoint_hash() 
    feature_count=0
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, '')
    
    # get label as a list
    labels_list_all = samples_rdd.map(lambda p: int(p[0].label)).collect()
    total_sample_count=len(labels_list_all)
    parsedData =samples_rdd.map(lambda p: p[0].features).cache()
    #for i in parsedData.collect(): #p.features: pyspark.mllib.linalg.SparseVector
    #    print "pd=",type(i),",i=",i

    t1 = time()
    print 'INFO: running time: %f' %(t1-t0)
    t0 = t1
    
    ###############################################
    ########## build learning model ###############
    ###############################################
    
    ### get the parameters###
    print "INFO: ============Learning Algorithm and Parameters============="
    para_dict = json.loads(ml_opts_jstr)
    flag_model = para_dict['learning_algorithm'] # kmeans
    iteration_num = eval(para_dict['iterations'])
    k=2
    if 'k' in para_dict:
        k = eval(para_dict['k'])

    print "INFO: Learning Algorithm:", flag_model
    print "INFO: iterations=", iteration_num
    #print "training_sample_number=", training_sample_number
    
    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
 
        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
 
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic 
        
    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list
    
    ### build model ###
    
    if flag_model == "kmeans":
        print "=================== Kmeans ============"
        model = KMeans.train(parsedData, k, maxIterations=iteration_num)   
        t_cost= model.computeCost(parsedData)
        print "INFO: cost for training set =", str(t_cost)
        clusterCenters=model.clusterCenters
        print "INFO: clusterCenters t=", type(clusterCenters)  #list
    elif flag_model == "gaussian_mixture_model": # didn't work some native lib issue
        print "=================== Gaussian_Mixture_Model ============"
        model = GaussianMixture.train(parsedData, k, maxIterations=iteration_num)   
        print "INFO: model.weights =", model.weights
    else:
        print "INFO: Training model selection error: no valid ML model selected!"
        return
        
    ### Save model
    save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str
    try:
        hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(save_dir)
        #print "all files removed"
    except IOError as e:
        print "ERROR: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "ERROR: Unexpected error:", sys.exc_info()[0] 
    
    print "INFO: model saved at hdfs=",save_dir
    print "INFO: model type=",type(model)," model=",model
    model.save(sc, save_dir)
        
    ###load model if needed 
    #sameModel = SVMModel.load(sc, save_dir)

    ### 
    # (true label, keams label, features list, hash)
    all_data=samples_rdd.map(lambda t: ( t[0].label, model.predict(t[0].features), t[0].features, t[1] ) ).collect() 
    true_label_arr = np.asarray([int(x) for x,_,_,_ in all_data])
    labels_kmeans = np.asarray([int(x) for _,x,_,_ in all_data])
    hash_list = np.asarray([x for _,_,_,x in all_data])
    print "INFO: all_data len=",len(all_data),"all_data t=",type(labels_list_all)
    print "INFO: true_label_arr.shape=",true_label_arr.shape,"labels_kmeans.shape=",labels_kmeans.shape
    print "INFO: true_label_arr t=",type(true_label_arr),"labels_kmeans t=",type(labels_kmeans)
    mtx_center=np.asarray(clusterCenters)
    features_array_reduced=np.asarray([x.toArray() for _,_,x,_ in all_data])
    print "INFO: mtx_center t=",type(mtx_center),"mtx_center.shape=",mtx_center.shape
    print "INFO: features_array_reduced t=",type(features_array_reduced),"features_array_reduced.shape",features_array_reduced.shape

    #Adjusted Mutual Information between two clusterings
    amis=adjusted_mutual_info_score(labels_list_all,labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis  
    #Similarity measure between two clusterings
    ars=adjusted_rand_score(labels_list_all,labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars   

    
    accuracy=0.0
   
    t1 = time()
    print 'INFO: training run time: %f' %(t1-t0)
    t0 = t1

    ###############################################
    ########## plot histogram               ######
    ###############################################
    n_clusters=k
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4*plot_col_num, 3*int(math.ceil(n_clusters*1.0/plot_col_num)))
    

    print "INFO: n_clusters=",n_clusters,",label_dic=",label_dic
    print "INFO: plot_col_num=",plot_col_num,",figsize=",figsize,",local_out_dir=",local_out_dir
    
    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters, names = label_dic
                        , plot_col_num = plot_col_num, figsize=figsize, folder = local_out_dir, rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters, names = label_dic
                        , plot_col_num = plot_col_num, figsize=figsize, normalize = True, folder = local_out_dir, rid=row_id_str)
    

    ####plot "reverse" histogram with labels ####
    num_bars = max(true_label_arr) + 1
    figsize = (4*plot_col_num, 3*int(math.ceil(num_bars*1.0/plot_col_num)))
    
    _, p_cluster = ml_plot_kmeans_histogram_subfigures(labels_kmeans, true_label_arr, num_bars, names = label_dic
                        , plot_col_num = plot_col_num, figsize=figsize, reverse = True, folder = local_out_dir, rid=row_id_str)


    #### plot dot figures ####
    # dot plot for Kmeans   ===========
    filename=os.path.join(local_out_dir ,row_id_str+'_cluster.png')   
    filename_3d=os.path.join(local_out_dir ,row_id_str+'_cluster_3d.json')  
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters, figsize=(10,7), filename=filename
        , title='KMeans', filename_3d=filename_3d)
        
    # dot plot for True Labels  ===========
    filename=os.path.join(local_out_dir ,row_id_str+'_cluster_tl.png')      
    filename_3d=os.path.join(local_out_dir ,row_id_str+'_cluster_3d_tl.json')  
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_arr, mtx_center, n_clusters, figsize=(10,7), filename=filename
        , title='True Labels', filename_3d=filename_3d)

    dataset_info={"training_fraction":1, "class_count":n_clusters,"dataset_count":total_sample_count}
    
    # only update db for web request
    if fromweb=="1": 
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', total_feature_numb='"+str(feature_count) \
            +"', perf_measures='{}" \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret=exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '"+str(accuracy*100)+"%"

    
    print 'INFO: Finished!'
    return 0
            "   opp_score " \
            "FROM team_avgs"
    query = "SELECT " \
            "   team_id, " \
            "   team_name, " \
            "   AVG(t1_rush), " \
            "   AVG(t1_pass), " \
            "   AVG(t2_rush), " \
            "   AVG(t2_pass) " \
            "FROM full_game_stats " \
            "JOIN team ON 1=1 " \
            "   AND full_game_stats.t1_id = team.team_id " \
            "GROUP BY team_id, team_name"
    curs.execute(query)
    sql_dat = curs.fetchall()
    team_ids = [row[0] for row in sql_dat]
    team_names = [row[1] for row in sql_dat]
    features = [row[2:] for row in sql_dat]

    data = sc.parallelize(features, 1)
    model = GaussianMixture.train(data, k=10)
    cluster_labels = model.predict(data).collect()


    labels = list(zip(team_ids, team_names, cluster_labels))
    df = spark.createDataFrame(labels,
                        ["team_id", "team_name", "cluster_id"])
    df.createOrReplaceTempView("model")
    for k in range(10):
        spark.sql("SELECT * FROM model WHERE cluster_id = {}".format(k)).show()
Example No. 19
                                maxIterations=100,
                                initialModel=KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]
    #path = "hdfs://masterNode:9000/user/spark/MODELOS-marcelo/KMEANS-2"
    #kmeans_model.save(sc,path)
    # Gauss KMeans
    start = time()
    gauss_model = GaussianMixture.train(data, k, maxIterations=20)
    end = time()
    elapsed_time = end - start
    gauss_output = [
        "====================== Gauss KMeans ====================\n"
    ]
    for i in range(k):
        v1 = ("weight = ", gauss_model.weights[i])
        v2 = ("mu = ", gauss_model.gaussians[i].mu)
        v3 = ("sigma = ", gauss_model.gaussians[i].sigma.toArray())
        gauss_output.append((v1, v2, v3))
    tiempo = "Tiempo: " + str(elapsed_time)
    gauss_output.append(tiempo)

    kmeans_info = sc.parallelize(kmeans_output)
    gauss_info = sc.parallelize(gauss_output)
Example No. 20
from pyspark.mllib.clustering import GaussianMixture
from pyspark import SparkContext
from scipy.stats import mvn
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time

DIR = "/home/adrianj/Desktop/MachineLearning/Resources/"
FILE_PATH = DIR+"atemporalTest.txt"
NUM_GAUSSIANS = 500

sc = SparkContext(appName="GMM Trainer")
data = sc.textFile(FILE_PATH)
parsedData = data.map(lambda line: np.array([float(x) for x in line.strip().split(' ')]))
gmm = GaussianMixture.train(parsedData, NUM_GAUSSIANS, seed=10)

print("Dumping to "+DIR+"GMMA/...")
#fig = plt.figure()
#ax = fig.gca(projection='3d')
# Record the model
gmm.save(sc, DIR+"GMMA/")
'''
for i in range(NUM_GAUSSIANS):
	
	mu = gmm.gaussians[i].mu
	sigma = (gmm.gaussians[i].sigma).toArray()
	weight = gmm.weights[i]
	#a, b = np.random.multivariate_normal(mu, sigma, 5000).T
	#surf = ax.scatter(a, b, c, zdir='z')
	#plt.plot(a, b, "x")
from numpy import array

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/gmm_data.txt")
    parsedData = data.map(
        lambda line: array([float(x) for x in line.strip().split(' ')]))

    # Build the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, 2)

    # Save and load model
    gmm.save(
        sc,
        "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel"
    )
    sameModel = GaussianMixtureModel\
        .load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(2):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())
    # $example off$
Example No. 23
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel

if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext

    ### Local default options
    k=2 # "k" (int) Set the number of Gaussians in the mixture model.  Default: 2
    convergenceTol=0.001 # "convergenceTol" (double) Set the largest change in log-likelihood at which convergence is considered to have occurred.
    maxIterations=150 # "maxIterations" (int) Set the maximum number of iterations to run. Default: 100
    seed=None # "seed" (long) Set the random seed

    # Load and parse the data    
    data = sc.textFile("/var/mdp-cloud/gmm_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(' ')])) 
    # filteredData = data.filter(lambda arr: int(arr[1]) != 0)	

    # Build and save the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, k, convergenceTol=convergenceTol, maxIterations=maxIterations, seed=seed)
    # gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    # gmm.save(sc, "GaussianMixtureModel_CV")
    # The following line would load the model
    # sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(k):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())

    sc.stop()
Example No. 24
    ### Local default options
    k = 2  # "k" (int) Set the number of Gaussians in the mixture model.  Default: 2
    convergenceTol = 0.001  # "convergenceTol" (double) Set the largest change in log-likelihood at which convergence is considered to have occurred.
    maxIterations = 150  # "maxIterations" (int) Set the maximum number of iterations to run. Default: 100
    seed = None  # "seed" (long) Set the random seed

    # Load and parse the data
    data = sc.textFile("/var/mdp-cloud/gmm_data.txt")
    parsedData = data.map(
        lambda line: array([float(x) for x in line.strip().split(' ')]))
    # filteredData = data.filter(lambda arr: int(arr[1]) != 0)

    # Build and save the model (cluster the data)
    gmm = GaussianMixture.train(parsedData,
                                k,
                                convergenceTol=convergenceTol,
                                maxIterations=maxIterations,
                                seed=seed)
    # gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    # gmm.save(sc, "GaussianMixtureModel_CV")
    # The following line would load the model
    # sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(k):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())

    sc.stop()
Example No. 25
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[
            0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])
scaler = MinMaxScaler(inputCol="_1",\
         outputCol="scaled_1")
# Getting the input data
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(
    lambda x: Vectors.dense(x))

# Initialize GMM
start = timer()
gmm = GaussianMixture.train(vector_df, k=4, maxIterations=20, seed=2018)
end = timer()
print(end - start)
df = pandas.DataFrame({'features': [], 'cluster': []})
i = 0
for v in vector_df.collect():
    df.loc[i] = [[float(v[0]), float(v[1]), float(v[2])], int(gmm.predict(v))]
    i += 1

print(df)

err = spark.createDataFrame(df).rdd.map(lambda x: (x[0], int(x[1]))).collect()
num_clusters = 4

per_clus = [0] * num_clusters
per_clus_num = [0] * num_clusters
Example No. 26
            elements = repo.get(pk_aids)
            for element in elements:
                for col_index, col in enumerate(cols):
                    if element.get(col) is not None:
                        rows[index].get(pk_aids)[col_index] = element.get(col)
                        print(element.get(col))
    for index, row in enumerate(rows):
        for pk_aids in row:
            if rows[index].get(pk_aids) is not None:
                if index == 0:
                    data = rows[index].get(pk_aids)
                else:
                    data = np.concatenate((data, rows[index].get(pk_aids)),
                                          axis=0)
    print(data)
    #Parameters:
    #data – RDD of data points
    #k – Number of components
    #convergenceTol – Threshold value to check the convergence criteria. Defaults to 1e-3
    #maxIterations – Number of iterations. Default to 100
    #seed – Random Seed
    #initialModel – GaussianMixtureModel for initializing learning
    model = GaussianMixture.train(data,
                                  10,
                                  convergenceTol=0.0001,
                                  maxIterations=50)

    labels = model.predict(data).collect()

    print
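
The parameter notes above also list an initialModel argument that this snippet does not use. A hedged sketch of warm-starting a second, stricter run from a previously fitted model (assuming the same data RDD as above) could look like this:

# coarse first pass, then refine starting from the fitted model
rough = GaussianMixture.train(data, 10, convergenceTol=0.01, maxIterations=10)
refined = GaussianMixture.train(data, 10, convergenceTol=0.0001,
                                maxIterations=50, initialModel=rough)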
Example No. 27
            gmm.gaussians[i].mu, gmm.gaussians[i].sigma.toArray()).pdf(x)
    # prob_x = gmm.predictSoft([x])
    # rs = np.prod(prob_x)
    return rs


if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext
    # $example on$
    # Load and parse the data
    data = sc.textFile(sys.argv[1])
    parsedData = data.map(lambda line: array(
        ([float(x) for x in line.strip().split(",")])[index]))

    # Build the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, n_clusters)

    # Save and load model
    if (os.path.isdir('GMMResult')):
        shutil.rmtree('GMMResult')
    gmm.save(sc, "GMMResult")
    sameModel = GaussianMixtureModel.load(sc, "GMMResult")

    # output parameters of model
    for i in range(n_clusters):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())

    datfull = data.map(lambda line: array(
        ([float(x) for x in line.strip().split(",")])))
    dat = datfull.take(datfull.count())
Example No. 28
import numpy as np


def parse(data):
    list = []
    for i in range(len(data)):
        value = float(data[i][1:-1])
        list.append(value)
    return (list)


parsedata = outdata.map(lambda line: line.encode('utf-8').split(",")).map(
    lambda l: parse(l))

start_time = time.time()
gmm = GaussianMixture.train(parsedata, 80)  # train() both builds and fits the MLlib model
print(time.time() - start_time)

#testing Gaussian mixture model for python
start_time = time.time()
#print sample1

gmix = mixture.GMM(n_components=90, covariance_type='full')
gmix.fit(parsedata.collect())  # scikit-learn needs the data locally, not as an RDD
#gmix.predict(parsedInSample1)
end_time = time.time()
gmpython = end_time - start_time
print(gmpython)

Example No. 29
#    print data1.take(5)
# Without converting the features into dense vectors, transformation with zero mean will raise
# exception on sparse vector.
# data2 will be unit variance and zero mean.
    data2 = label.zip(scaler1.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    parsedData = data2.map (lambda x: x[1])
    parsedData.cache()
    modelList = [];
    d = dict()

    noClusters = 5
    convergenceTol = 1e-3
    maxIterations = 1000
    seed = random.getrandbits(19)
# Build the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, noClusters, convergenceTol,
                                  maxIterations, seed)
# output parameters of model
    for i in range(noClusters):
        print ("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
            "sigma = ", gmm.gaussians[i].sigma.toArray())
    """
    for clusterSize in range(2, 21, 2):
    # Build the model (cluster the data)
        clusters = KMeans.train(parsedData, clusterSize, maxIterations=10,runs=10, initializationMode="random")
        modelList.append(clusters)

    # Evaluate clustering by computing Within Set Sum of Squared Errors
        def error(point):
            center = clusters.centers[clusters.predict(point)]
            return sqrt(sum([x**2 for x in (point - center)]))
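
The comment at the top of this snippet notes that scaling to zero mean raises an exception on sparse vectors, which is why the features are densified before the transform. A small self-contained sketch of that pattern, with an assumed SparkContext sc and made-up sparse features:

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

features = sc.parallelize([Vectors.sparse(3, {0: 1.0}), Vectors.sparse(3, {2: 4.0})])
# zero-mean scaling needs dense vectors, so convert before fitting the scaler
dense = features.map(lambda v: Vectors.dense(v.toArray()))
scaler = StandardScaler(withMean=True, withStd=True).fit(dense)
scaled = scaler.transform(dense)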
Example No. 30
    :param convergenceTol:   Convergence threshold. Default to 1e-3
    :param maxIterations:    Number of EM iterations to perform. Default to 100
    :param seed:             Random seed
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('inputFile', help='Input File')
    parser.add_argument('k', type=int, help='Number of clusters')
    parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
    parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
    parser.add_argument('--seed', default=random.getrandbits(19),
                        type=int, help='Random seed')
    args = parser.parse_args()

    conf = SparkConf().setAppName("GMM")
    sc = SparkContext(conf=conf)

    lines = sc.textFile(args.inputFile)
    data = lines.map(parseVector)
    model = GaussianMixture.train(data, args.k, args.convergenceTol,
                                  args.maxIterations, args.seed)
    for i in range(args.k):
        print(("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu,
               "sigma = ", model.gaussians[i].sigma.toArray()))
    print("\n")
    print(("The membership value of each vector to all mixture components (first 100): ",
           model.predictSoft(data).take(100)))
    print("\n")
    print(("Cluster labels (first 100): ", model.predict(data).take(100)))
    sc.stop()
Example No. 31
    row_num = info_df.filter(info_df.high == 'IT').count()

    for index, repo in enumerate(repos):
        for pk_aids in repo:
            elements = repo.get(pk_aids)
            for element in elements:
                for col_index, col in enumerate(cols):
                    if element.get(col) is not None:
                        rows[index].get(pk_aids)[col_index]=element.get(col)
                        print(element.get(col))
    for index, row in enumerate(rows):
        for pk_aids in row:
            if rows[index].get(pk_aids) is not None:
                if index == 0:
                    data = rows[index].get(pk_aids)
                else:
                    data = np.concatenate((data, rows[index].get(pk_aids)), axis=0)
    print(data)
    #Parameters:
    #data – RDD of data points
    #k – Number of components
    #convergenceTol – Threshold value to check the convergence criteria. Defaults to 1e-3
    #maxIterations – Number of iterations. Default to 100
    #seed – Random Seed
    #initialModel – GaussianMixtureModel for initializing learning
    model = GaussianMixture.train(data, 10, convergenceTol=0.0001,maxIterations=50)

    labels = model.predict(data).collect()

    print
Example No. 32
df = pd.DataFrame(l, index = ['gp1_P', 'gp2_P', 'gp3_P', 'gp4_P', 'gp5_P', 'gp6_P'],
                  columns = ['gp1_R', 'gp2_R', 'gp3_R', 'gp4_R', 'gp5_R', 'gp6_R'])
df


# ### Interpretation (to be finished)
# With KMeans, 2 groups stand out: 4 and 6.
# The gp1_P group gathers 123 of the individuals and clearly mixes gp1_R / gp2_R / gp3_R.
# ## Gaussian Mixture 

# In[12]:

from pyspark.mllib.clustering import GaussianMixture

# Build the model with the same dataTrain as KMeans
gmm = GaussianMixture.train(dataTrain, 6)

# output the model parameters
for i in range(6):
    print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
        "sigma = ", gmm.gaussians[i].sigma.toArray())


# ### Interpretation (to be finished)

# # Evaluation measures (in progress)

# In[30]:

from pyspark.mllib.evaluation import MultilabelMetrics
from numpy import array
from pyspark import SparkContext
import matplotlib.pyplot as plt
import numpy as np
#plt.figure()


sc=SparkContext()

data=sc.textFile("./coord.txt")
#test_plot=np.genfromtxt("./coord.txt",delimiter=',',dtype=float)
#plt.plot(test_plot[:,1],test_plot[:,0],'ro')
#plt.show()
parsedData=data.map(lambda line: array([float(x) for x in line.strip().split(',')]))
l=3
gmm = GaussianMixture.train(parsedData,l)
#x=np.zeros(90000)
#y=np.zeros(90000)

#for i in range(0,l):
	#print "w= ",gmm.weights[i]
	#print "sigma= ",gmm.gaussians[i].sigma.toArray()
	#print "mu= ",gmm.gaussians[i].mu
	
#x1=gmm.weights[0]*np.random.multivariate_normal(gmm.gaussians[0].mu,gmm.gaussians[0].sigma.toArray(),90000)
#x2=gmm.weights[1]*np.random.multivariate_normal(gmm.gaussians[1].mu,gmm.gaussians[1].sigma.toArray(),90000)		


file  = open("./GMM.txt",'w')
for j in range(0,l):
	file.write(str(gmm.weights[j])+'\n')
Example No. 34
                        default=1e-3,
                        type=float,
                        help='convergence threshold')
    parser.add_argument('--maxIterations',
                        default=100,
                        type=int,
                        help='Number of iterations')
    parser.add_argument('--seed',
                        default=random.getrandbits(19),
                        type=int,
                        help='Random seed')
    args = parser.parse_args()

    conf = SparkConf().setAppName("GMM")
    sc = SparkContext(conf=conf)

    lines = sc.textFile(args.inputFile)
    data = lines.map(parseVector)
    model = GaussianMixture.train(data, args.k, args.convergenceTol,
                                  args.maxIterations, args.seed)
    for i in range(args.k):
        print(("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu,
               "sigma = ", model.gaussians[i].sigma.toArray()))
    print("\n")
    print((
        "The membership value of each vector to all mixture components (first 100): ",
        model.predictSoft(data).take(100)))
    print("\n")
    print(("Cluster labels (first 100): ", model.predict(data).take(100)))
    sc.stop()