Example #1
    def perform_training(sc: SparkContext, params_dict: dict):
        normal_ekg_data_path = None if 'normal_ekg_data_path' not in params_dict else params_dict[
            'normal_ekg_data_path']
        min_num_of_clusters = 5 if 'min_num_of_clusters' not in params_dict else int(params_dict['min_num_of_clusters'])
        max_num_of_clusters = 20 if 'max_num_of_clusters' not in params_dict else int(
            params_dict['max_num_of_clusters'])
        boundary_ratio = 0.8 if 'boundary_ratio' not in params_dict else float(params_dict['boundary_ratio'])

        ekg_rdd_data = sc.textFile(normal_ekg_data_path).map(
            lambda line: np.array([float(val) for val in line.split(',')]))

        # ekg_rdd_data.foreach(Plotter.plot_signal_window)
        k_range = range(min_num_of_clusters, max_num_of_clusters, 1)
        prev_cost = float(np.inf)
        final_km = KMeansModel(ekg_rdd_data.takeSample(False, 1))
        cost_ratios = []
        found_best = False
        for k in k_range:
            km = KMeans.train(ekg_rdd_data, k)
            # cost equals the sum of squared distances from each sample to its nearest cluster centre
            cost = km.computeCost(ekg_rdd_data)
            ratio = cost / prev_cost
            prev_cost = cost
            cost_ratios.append(ratio)
            if ratio > boundary_ratio and not found_best:
                final_km = km
                found_best = True

        Plotter.plot_elbow(cost_ratios, k_range)
        return final_km
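
A minimal invocation sketch for the routine above, assuming perform_training is callable as a plain function; the path and parameter values are illustrative only:

from pyspark import SparkContext

sc = SparkContext(appName="ekg_kmeans_training")
params = {
    'normal_ekg_data_path': 'data/normal_ekg_windows.csv',  # hypothetical path
    'min_num_of_clusters': 5,
    'max_num_of_clusters': 20,
    'boundary_ratio': 0.8,
}
final_model = perform_training(sc, params)
print(final_model.clusterCenters)
sc.stop()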
Example #2
def kmeans():
    """
    使用mllib对Spark安装包mllib的测试数据集做K-means聚类,由于train方法:
        Training points as an `RDD` of `Vector` or convertible
    所以需对数据集格式化:
        初始数据集 --> ['0.0 0.0 0.0', '0.1 0.1 0.1', '0.2 0.2 0.2']
        格式化后数据集 --> [array([0., 0., 0.]), array([0.1, 0.1, 0.1]), array([0.2, 0.2, 0.2])]
    :return:
    """
    data_rdd = sc.textFile('{}/mllib/kmeans_data.txt'.format(current_dir))
    parsed_data_rdd = data_rdd.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the clustering model
    clusters = KMeans.train(parsed_data_rdd, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = parsed_data_rdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save the trained model
    model_path = "{}/kmeans_model".format(current_dir)
    if not os.path.exists(model_path):
        clusters.save(sc, model_path)

    trained_model = KMeansModel.load(
        sc, "{}/kmeans_model".format(current_dir)
    )
    return trained_model
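
The manual WSSSE computation above can also be expressed with the model's built-in cost function; a short sketch, assuming clusters and parsed_data_rdd from the function above:

wssse = clusters.computeCost(parsed_data_rdd)  # sum of squared distances to the nearest centre (Spark >= 1.4)
print("Within Set Sum of Squared Error = " + str(wssse))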
Example #3
def logo_feature_cluster(train_feature_list, train_name_list, clusternum):
    '''Training'''
    model = KMeans.train(sc.parallelize(train_feature_list),
                         clusternum,
                         maxIterations=10,
                         initializationMode="random",
                         seed=50,
                         initializationSteps=5,
                         epsilon=1e-4)
    model_path = tempfile.mkdtemp()
    model.save(sc, model_path)
    model = KMeansModel.load(sc, model_path)
    '''Prediction'''
    predict = model.predict(sc.parallelize(train_feature_list))
    # print(predict.collect())
    try:
        rmtree(model_path)
    except OSError:
        pass

    logo_result_path = os.path.join(
        result_path, "logo_image_result" + str(clusternum) + ".txt")
    writeResultTofile(logo_result_path, train_name_list, predict.collect())
    '''Calinski-Harabasz clustering evaluation score'''
    # evaluationCH = metrics.calinski_harabaz_score(train_feature_list, predict.collect())
    # ch = str(round(evaluationCH, 2))
    # print("Calinski-Harabasz score: " + ch)
    # with open(result_path + "Calinski-Harabasz.txt", 'a') as a:
    #     a.write(str(clusternum)+":" + ch + "\n")
    '''Silhouette-Coefficient clustering evaluation score'''
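    # The Silhouette section is left unimplemented above; a hedged sketch mirroring
    # the Calinski-Harabasz block, assuming scikit-learn is available and that
    # train_feature_list fits in driver memory:
    # evaluationSil = metrics.silhouette_score(train_feature_list, predict.collect())
    # sil = str(round(evaluationSil, 2))
    # with open(result_path + "Silhouette-Coefficient.txt", 'a') as a:
    #     a.write(str(clusternum) + ":" + sil + "\n")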
def loadModel():
	clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
	classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)

	if pv.outputDebugMsg:
		Utils.logMessage("\nLoad cluster & classification model finished")
	return clusterModel, classificationModel
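
A hedged usage sketch; pv (project settings) and the SparkContext sc are assumed to be defined elsewhere in the original module:

clusterModel, classificationModel = loadModel()
print(clusterModel.clusterCenters)
print(classificationModel.numNodes())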
def kmeans_w2v_predict():
    # appName='kmeans_w2v_predict'
    # sc = SparkContext(appName=appName)
    # from pyspark.sql import SQLContext
    from pyspark.mllib.clustering import KMeans, KMeansModel
    # sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet(
        "hdfs:///user/rmusters/lambert_w2v_data_jan")
    # df = data.toDF("text", "filtered_text", "split_text", "vectors", "id")
    df = data.toDF("tokens", "vectors", "id")
    df = df.where(df.vectors.isNotNull())
    data = df.rdd
    model = KMeansModel.load(sc,
                             "hdfs:///user/rmusters/lambert_kmeans_w2v_jan")

    # data = data.map(lambda (text, filtered_text, split_text, vectors, id): (text, filtered_text, split_text, vectors, model.predict(vectors), id))
    # df = data.toDF(["text", "filtered_text", "split_text", "vectors", "cluster", "id"])
    data = data.map(lambda (tokens, vectors, id):
                    (tokens, vectors, model.predict(vectors), id))
    df = data.toDF(["tokens", "vectors", "cluster", "id"])
    df = df.select("cluster", "id")
    df = df.sort(df.cluster.asc())
    df.write.format("com.databricks.spark.csv").mode("overwrite").save(
        "lambert_w2v_data_cluster.csv")
    # df.save("hdfs:///user/rmusters/lambert_w2v_data_cluster.csv", "com.databricks.spark.csv")
    df.write.parquet("hdfs:///user/rmusters/lambert_w2v_data_cluster",
                     mode="overwrite")
Example #6
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    if not fs.exists(
            sc._jvm.org.apache.hadoop.fs.Path(HDFS_PATH + str(g_cache.user) +
                                              '/model/' + params['path'])):
        raise Exception("Invalid file path, path not exists!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    else:
        raise Exception("Invalid model type!")
    return True, model
Example #7
def getCluster(price, crime, male, female, white, black, asian, hispanic,
               young, mid_age, senior):
    KModel = KMeansModel.load(sc, "project/data/output/KMeansModel")
    cluster = KModel.predict([
        price, crime, male, female, white, black, asian, hispanic, young,
        mid_age, senior
    ])
    return cluster
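
A hedged call sketch; the eleven inputs are illustrative values only, in the same order as the training features:

cluster_id = getCluster(price=450000, crime=0.12, male=0.49, female=0.51,
                        white=0.60, black=0.10, asian=0.20, hispanic=0.10,
                        young=0.30, mid_age=0.50, senior=0.20)
print("Assigned cluster: " + str(cluster_id))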
Example #8
    def KMeans_Processing(self, columns):
        data_point = np.array(self.df_PD[columns])
        model = KMeansModel.load(
            self.sc, self.baseDir + '/fraudModel/Model/' + 'KMeans')
        result = np.array(
            model.predict(self.sc.parallelize(data_point)).collect())
        self.df_PD.insert(len(list(self.df_PD.columns)), 'KMeans_feature',
                          result)
Example #9
def assign_pooling(data):

    image_name, feature_matrix = data[0]
    clusterCenters = data[1]

    feature_matrix = np.array(feature_matrix)

    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        bow[k] = max(bow[k], dist)

    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def assign_pooling(row, clusterCenters, pooling):
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    clusterCenters = clusterCenters.value
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    #print(image_name + " in group: " + str(group))
    return [(image_name, group)]
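
A driver-side sketch of how this row-based variant might be wired up, assuming a trained model kmeans_model and a DataFrame features_df with fileName and features columns (all of these names are hypothetical):

import functools

centers_bc = sc.broadcast(kmeans_model.clusterCenters)  # broadcast the centers once
pooled = features_df.rdd.flatMap(
    functools.partial(assign_pooling, clusterCenters=centers_bc, pooling="max"))
print(pooled.take(5))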
Example #11
def main():
    sc = SparkContext(appName="tileMapper")
    print("I do all the input output jazz")

    ###########################################################################
    big_image = sc.binaryFiles("Reference/108103_sm.jpg")
    tile_avgs = big_image.flatMap(extract_opencv_tiles())
    #buckets = tile_avgs.collect()
    #print("Bucket",buckets)
    tileMap = tile_avgs.map(
        lambda l: [item for sublist in l for item in sublist])
    tileList = tileMap.collect()
    print("Tile Map", tileMap)
    print("Tile Map", tileMap.collect())
    print("Tile List", tileList)
    print("Tile LIst", type(tileList))
    ############################################################################

    clusterIndex = getIndex()
    kmModel = KMeansModel.load(sc, "myModelPath")
    readyToCombine = []
    currentRow = None
    noOfRow = 0
    noOfCol = 0
    firstTile = tileList[0]
    tileSize = firstTile[1]
    #Randomly Get small images using kmeans match
    for tile in tileList:
        if tile[0] == currentRow:
            smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                      tileSize, clusterIndex)
            readyToCombine.append(smallImg)
            noOfCol = noOfCol + 1
        else:
            currentRow = tile[0]
            noOfCol = 1
            noOfRow = noOfRow + 1
            smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                      tileSize, clusterIndex)
            readyToCombine.append(smallImg)
    #Put small images into the big image canvas

    canvas = np.zeros((noOfRow * tileSize, noOfCol * tileSize, 3), np.uint8)

    #Print Image
    print("No. of Col", noOfCol)
    print("No. of Row", noOfRow)
    #print("Before Print, Check Once again",readyToCombine)
    mosaicImage = printImage(readyToCombine, canvas, noOfCol, noOfRow,
                             tileSize)

    print("Finished processing of image")
    cv2.imwrite('mosaicImageYeah.jpg', mosaicImage)
Example #12
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()

    # Load Model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)

    centers = sameModel.clusterCenters
    print("Cluster Centers: ")
    for n, center in enumerate(centers):
        out_f = OUTPUT_DATA + str(n) + "Cluster.csv"
        numpy.savetxt(out_f, center, newline=";")
        print(center)
def main():
    modelname = sys.argv[1]
    tiffname = sys.argv[2]
    outputname = sys.argv[3]
    sc = SparkContext()
    model = KMeansModel.load(sc, modelname)
    dataset = gdal.Open(tiffname, GA_ReadOnly)
    x, y, data = train.tiff_to_array(dataset, train.weights)
    driver = dataset.GetDriver().ShortName
    clusterdata = sc.parallelize(data)
    result = np.array(clusterdata.map(lambda point: model.predict(point)).collect())
    write_to_tif(outputname, x, y, result, driver)
Example #14
def assign_pooling(data):

    row = data[0]
    clusterCenters = data[1]
    pooling = data[2]

    image_name = row['fileName']
    feature_matrix = np.array(row['features'])

    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
Example #15
def kmeans_classification(sc, c_tag, util):
    print 'data retrieving'
    _data_ = data_retriever(c_tag)
    print len(_data_)
    #rids = [x[0] for x in _data_]

    blc = nlpb.nlpblockbase()

    __ans = [train_feature_extraction(x, c_tag, blc, util) for x in _data_]
    ans = [[float(k) for k in x] for x in __ans]
    #print ans
    train_data = [np.array(sf.softmax(x)) for x in ans]
    #train_data = ans
    print 'dataprep done'
    assert_len(train_data)
    brotrain = sc.broadcast(train_data)
    clusters = KMeans.train(sc.parallelize(brotrain.value),
                            200,
                            maxIterations=10,
                            initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = map(lambda point: error(point), train_data)
    WSSSE = reduce(lambda x, y: x + y, WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    clustered = collections.defaultdict(list)
    print len(_data_)
    cnt = 0
    for row in _data_:
        clustered[clusters.predict(train_data[cnt])].append(
            [row[0], row[2], row[3], row[4]])
        cnt += 1
    print len(clustered)
    for k in clustered.keys():
        print len(clustered[k])

    clusters.save(sc, "MovieModel")
    sameModel = KMeansModel.load(sc, "MovieModel")

    ref = collections.defaultdict(list)
    for point in train_data:
        ref[sameModel.predict(point)].append(point)

    for x, y in zip(ref.keys(), clustered.keys()):
        assert len(ref[x]) == len(clustered[y])

    return clustered
Example #16
def kmeansInitialClusters(dataset):
    model = KMeansModel(CENTER_VECTORS)
    vectorsRdd = dataset.rdd.map(lambda data: Vectors.parse(Vectors.stringify(data['features'])))
    trainedModel = KMeans.train(vectorsRdd, 4, maxIterations=1000, initialModel=model)
    result=[]
    for d in dataset.collect():
        entry = {}
        entry["features"] = d["features"]
        entry["prediction"] = trainedModel.predict(Vectors.parse(Vectors.stringify(d['features'])))
        entry["label"] = d['label']
        result.append(entry)

    plotDiversitySizeClustering(result, CENTERS, "Size", "Diversity", "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(result, CENTERS)
Example #17
def main(sc):
    data = [[1.0, 1.0], [1.0, 0.8], [-1.0, 1.0], [-1.0, -1.0]]
    parsedData = sc.parallelize(data)
    kmeansModel = KMeans.train(parsedData,
                               2,
                               maxIterations=10,
                               runs=10,
                               initializationMode="random")
    print(kmeansModel.predict([1.0, 1.0]))
    print(kmeansModel.predict([1.0, -2.0]))
    # Save and load model
    kmeansModel.save(sc, "KMeansModel")
    model = KMeansModel.load(sc, "KMeansModel")
    print(model.predict([1.0, 1.0]))
    print(model.predict([1.0, -2.0]))
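
As a hedged follow-up to the example above, predict also accepts a whole RDD, so every parsed point can be labelled in one pass:

labels = model.predict(parsedData).collect()
print(labels)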
Example #18
    def print_model(self,model_name):
        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        for c in model.clusterCenters:
            print(c)
        for c in model.clusterCenters:
            l = []
            for i in c:
                i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
                l.append(float(i))
            print(l)
Example #19
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()

    # Load Data
    dataset = sc.textFile(INPUT_DATA, cpu_count)
    dataset = dataset.map(
        lambda line: array([float(x) for x in line.split(';')]))

    # Load Model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)

    # Predict cluster labels per row
    labels = sameModel.predict(dataset).collect()

    # Save labels in json file
    with open(OUTPUT_LABEL, 'w') as out_f:
        json.dump(labels, out_f)
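
A minimal usage sketch; INPUT_DATA, INPUT_MODEL and OUTPUT_LABEL are module-level constants that are not shown in the snippet:

from pyspark import SparkContext

sc = SparkContext(appName="kmeans_label_export")
run_kmeans(sc)
sc.stop()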
Example #20
def predict():
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.linalg import Vectors

    data = sqlContext.read.format("com.databricks.spark.csv").option(
        "header", "true").load("w2v_vector.csv")
    data = data.map(lambda x: [float(a) for a in x])
    df = data.toDF()
    columns = df.columns
    vectors = df.select(columns[1:71])
    vectors = vectors.map(lambda x: Vectors.dense(x))

    for n_clusters in _range:
        model = KMeansModel.load(
            sc, "hdfs:///user/rmusters/w2v_model_kmeans_" + str(n_clusters))

        predicted = model.predict(vectors)
        result = predicted.map(lambda x: (x, )).toDF()
        result.save("clusters_" + str(n_clusters))
def kmeans_lda_predict():
    appName = 'kmeans_lda_predict'
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
    sc = SparkContext(appName=appName)
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet(
        "hdfs:///user/rmusters/lda_doc_topic")  #lda_data_jan
    # data = df.rdd
    model = KMeansModel.load(sc, "hdfs:///user/rmusters/kmeans_lda_jan")
    data = data.map(lambda (id, vectors):
                    (id, vectors, model.predict(vectors)))
    df = data.toDF(["id", "vectors", "cluster"])
    df = df.sort(df.cluster.asc())
    # df.write.parquet("hdfs:///user/rmusters/lda_data_cluster", mode= "overwrite")
    df.write.parquet("hdfs:///user/rmusters/lda_data_cluster",
                     mode="overwrite")

    logger.info(appName)
Example #22
    def predict(self, model_name, data):

        '''
        predict unknown data
        :param model_name: the trained model saving on hdfs
        :param data: unknown data
        :return: (cluster_index, cluster)
        '''

        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        # get the predict : means which cluster it belongs to
        index = model.predict(data)
        print('Data:%s belongs to cluster:%s. The index is %s' % (data, model.clusterCenters[index], index))
        return index, model.clusterCenters[index]
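
A hedged usage sketch; the enclosing class and its constructor are not shown above, so the wrapper name and arguments below are hypothetical:

# clusterer = KMeansHdfsWrapper(sc, base='hdfs:///models/')  # hypothetical class
# index, center = clusterer.predict('my_kmeans_model', [0.1, 0.2, 0.3])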
Example #23
def train_rotations(sc, split_vecs, M, Cs):
    """
    Compute rotations for each split of the data using the given coarse quantizers.
    """

    Rs = []
    mus = []
    counts = []
    for split in xrange(2):

        print 'Starting rotation fitting for split %d' % split

        # Get the data for this split
        data = split_vecs.map(lambda x: x[split])

        # Get kmeans model
        model = KMeansModel(Cs[split])

        R, mu, count = compute_local_rotations(sc, data, model, M / 2)
        Rs.append(R)
        mus.append(mu)
        counts.append(count)

    return Rs, mus, counts
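
The snippet relies on the fact that a KMeansModel can be constructed directly from precomputed centers; a minimal self-contained sketch:

import numpy as np
from pyspark.mllib.clustering import KMeansModel

centers = [np.array([0.0, 0.0]), np.array([1.0, 1.0])]  # illustrative coarse quantizers
model = KMeansModel(centers)
print(model.predict([0.9, 1.1]))  # assigns the point to the nearest centre, index 1 here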
Example #24
def parseOwnership(line):
    fields = line.split(',')
    #print len(fields)
    owner = fields[0]
    taxes = int(fields[1])
    lat = float(fields[2])
    lon = float(fields[3])
    cluster = int(fields[4])
    
    return (owner, taxes, lat, lon, cluster)

conf = SparkConf().setMaster("local").setAppName("tugasbigdata")
sc = SparkContext(conf = conf)

clusters = KMeansModel.load(sc,"C:/SparkCourse/FP_model")

lines = sc.textFile("file:///SparkCourse/data_center.csv")
parsedLines = lines.map(parseLine)
# data clean-up
reserveddata= parsedLines.filter(lambda x : x is not None)
reserveddata1= reserveddata.filter(lambda x : x[1] is not None)
reserveddata2=reserveddata1.filter(lambda x : x[2] is not None)
temp=reserveddata2.filter(lambda x : "San Francisco" in x[0])
tempdata=temp.map(lambda x: (x[1],x[2]))
data=tempdata.map(lambda x: (float(x[0]),float(x[1])))

data_local = data.collect()

ownershipData = lines.map(parseOwner)
# more clean-up
Example #25
def main():
    data_rdd = load_data(project["data_file"])
    print (data_rdd.count())

    listed_data_rdd = data_rdd.map(data_extractor)

    # Filtering unwanted data rows
    elect_filtered_rdd = listed_data_rdd.filter(electric_vehicles_filter)
    filtered_rdd = elect_filtered_rdd.filter(empty_cost_filter)

    # Mapping related data to convenient format for clustering
    cost_tx_rdd = filtered_rdd.map(cost_transform_mapper)
    feature_mapped_rdd = cost_tx_rdd.map(feature_mapper)

    estimated_clusters = KMeansModel.load(spark_context, "identified_clusters")

    optimum_cluster = 4
    optimum_points_rdd = feature_mapped_rdd.filter(
        lambda filtered_data_feature_vector: estimated_clusters.predict(filtered_data_feature_vector[-1])
        == optimum_cluster
    )

    print (optimum_points_rdd.count())

    sample_data = optimum_points_rdd.take(100)

    for data in sample_data:
        feature_vector = data[-1]
        pyplot.scatter(feature_vector[0], feature_vector[1])

    optimum_cluster_manufactures_rad = optimum_points_rdd.map(manufactures_mapper)

    optimum_points_rdd.persist()  # To hold the previously calculated data set in memory

    individual_manufactures_count_rdd = optimum_cluster_manufactures_rad.reduceByKey(operator.add)

    sorted_manufactures_count_rdd = individual_manufactures_count_rdd.sortBy(
        lambda manufactures_set: manufactures_set[1], ascending=False
    )

    top_ten = 10
    vehicle_count = []
    manufactures_name = []
    for manufacture in sorted_manufactures_count_rdd.take(top_ten):
        vehicle_count.append(manufacture[1])
        manufactures_name.append(manufacture[0])
        print (manufacture)

    pyplot.title("Best Vehicle Cluster")
    pyplot.xlabel("Europe Rating")
    pyplot.ylabel("Feature Normalized")

    pyplot.show()

    number_of_manufactures = top_ten  # sorted_manufactures_count_rdd.count()
    index = np.arange(number_of_manufactures)

    bar_width = 0.5

    opacity = 0.4
    error_config = {"ecolor": "0.3"}

    chart = pyplot.bar(
        index, vehicle_count, bar_width, alpha=opacity, color="b", error_kw=error_config, label="manufactures"
    )
    pyplot.xticks(index + bar_width, manufactures_name)

    pyplot.title("Top preforming vehicles")
    pyplot.xlabel("manufactures")
    pyplot.ylabel("Vehicles count")

    pyplot.show()
#print np.shape(parsedData)
# Build the model (cluster the data)

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
	center = clusters.centers[clusters.predict(point)]
	#print center
	return sqrt(sum([x**2 for x in (point - center)]))
#WSSE = parsedData.map(lambda point:error(point)).reduce(lambda x,y:x+y)
WSSE = np.zeros(7)
import time
for i in range(2,7):
	t = time.time()
	clusters = KMeans.train(parsedData, i, maxIterations=100,
        	runs=100, initializationMode="random")
	WSSE[i] = (parsedData.map(lambda point:error(point)).reduce(lambda x,y :x+y))
	print str(WSSE[i])+"   "+str(i)+"   WITH TIME ="+str(time.time()-t)

#WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
#print("Within Set Sum of Squared Error = " + str(WSSSE))



# Save and load model
clusters.save(sc, "./mymodel1")
sameModel = KMeansModel.load(sc, "./mymodel1")

#print clusters

from numpy import array
from math import sqrt

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import KMeans, KMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    # $example off$

    sc.stop()
Example #28
import sys

from collections import OrderedDict
from numpy import array
from math import sqrt
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeansModel

if __name__ == "__main__":
    if (len(sys.argv) != 2):
        print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
          "predict.py kddcup.data.file"
        sys.exit(1)

    data_file = sys.argv[1]
    conf = SparkConf().setAppName("KDDCup99")
    # .set("spark.executor.memory", "2g")

    sc = SparkContext(conf=conf)

    model = KMeansModel.load(sc, "best_model")

    clusters = model.clusterCenters

    with open(data_file) as file:
        for line in file:
            line_split = line.split(",")
            clean_line_split = [line_split[0]] + line_split[4:]
            clusterIndex = model.predict(
                array([float(x) for x in clean_line_split]))
            print clusterIndex
    print

print "DONE!"
Example #29
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
conf = SparkConf().setMaster("local").setAppName("Test")
sc = SparkContext(conf=conf)
data = sc.textFile("data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=20,
                        runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors


def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
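
A hedged follow-up using the reloaded model from this example to label every parsed point at once:

labels = sameModel.predict(parsedData).collect()
print(labels[:10])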
Example #30
def process(id, content):
	kmeansDoc = DocumentKmeans(id, '')
	print(type(content))
	kmeansDoc.content = content
	kmeansDoc.doc2vec()
	kmeansDoc.printVec()
	kmeansDoc.kmeansVec()
	kmeansDoc.cluster_id = kmeansPredict.predict(kmeansDoc)
	return kmeansDoc
    
utils = Utils(45000)

sc = SparkContext(appName="PythonKafkaConsumerKmeans")

kmeansModel = KMeansModel.load(sc, '../KmeansModel')

kmeansPredict = KmeansPredict(kmeansModel)

consumer = KafkaConsumer('test', group_id='kafka_consumer_group', bootstrap_servers=['localhost:9092'])

for message in consumer:
	value = message.value
	spl = value.split(':')
	id = spl[0]
	content = utils.normalizeString(spl[1])
	docu = process(id, content)
	print(docu.cluster_id)
    
KafkaConsumer(auto_offset_reset='earliest', enable_auto_commit=False)
KafkaConsumer(value_deserializer=lambda m: json.loads(unicode(m, "utf8")))
Example #31
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations. 
clusters = KMeans.train(parsedData, 2, 10)

# Evaluate the clustering
def error(point):
  center = clusters.centers[clusters.predict(point)]
  return sqrt(sum([x**2 for x in (point - center)]))
  
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error = " + str(WSSSE))


# Saving and loading the model
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")
sameModel
sameModel.k
sameModel.clusterCenters

  


Example #32
    def kmeans_model_load(self, sc, path):
        return KMeansModel.load(sc, path)
Example #33
    def __init__(self, spark_context, model_path):
        self.model = KMeansModel.load(spark_context, model_path)
input_living_index = sys.argv[1]

# Read the parquet data and convert to RDD
parquet_living_index = sqlContext.read.parquet(input_living_index)
parquet_living_index.registerTempTable("living_index_table")
living_index_table = sqlContext.sql("SELECT * FROM living_index_table")
living_index_rdd = living_index_table.map(lambda colName: (str(colName.Community_Code) + "," + str(colName.Crime_Frequency)
                                                              + "," + str(colName.Housing_Crowded) + "," + str(colName.Household_BPL)
                                                              + "," + str(colName.Unemployed) + "," + str(colName.Without_Diploma)
                                                              + "," + str(colName.Age_Bar) + "," + str(colName.Per_Capita_Income)
                                                              + "," + str(colName.Hardship_Index)))

# K-means does multiple runs to find the optimal cluster center, so cache the input to K-means
cluster_input = living_index_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5,
        runs=5, initializationMode="random")

# Compute the squared error of each point against its cluster centre
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster model
clusters.save(sc, "myModel/living-index")
sameModel = KMeansModel.load(sc, "myModel/living-index")
Example #35
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

from pyspark import SparkContext

sc = SparkContext(appName="AA_kmeans")

# Load and parse the data
data = sc.textFile("hdfs://namenode/kmeans_data.txt")
#print(data.take(5))
#print data.map(lambda line: array([x for x in line.split(' ')])).collect()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
        runs=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "hdfs://namenode/myModelPath")
sameModel = KMeansModel.load(sc, "hdfs://namenode/myModelPath")
Example #36
from numpy import array
from math import sqrt


from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from pyspark_cassandra import streaming

def predict(line):
    spl = line.split(':')
    doc = DocumentKmeans(spl[0], spl[1], spl[2])
    doc.kmeansVec()
    doc.cluster_id = kmeansPredict.predict(doc)
    return doc


sc = SparkContext()
sameModel = KMeansModel.load(sc, "../KmeansModel")
kmeansPredict = KmeansPredict(sameModel)

parsedData = sc.textFile('hdfs://localhost:8020/user/manh/vector')\
	.filter(lambda x: len(x) > 2000)\
	.map(lambda x: predict(x))\
	.map(lambda x: {
		'id' : x.id,
		'cluster_id' : int(x.cluster_id),
		'timestamps' : long(x.timestamps),
		'vector' : x.vector
	}).saveToCassandra('reishi', 'dockmeans')
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
data = sc.textFile("/home/grijesh/sampleData/k-means-data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
                        runs=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
Example #38
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("k_means_data.txt")
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData,
                            2,
                            maxIterations=10,
                            initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KMeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    # $example off$

    sc.stop()
        print arrays.collect()
        indx = 0
        while indx < count:
            vec = Vectors.dense(arrays.collect()[indx])
            indx += 1
            clusternum = model.predict(vec)
            print "Cluster -> ", clusternum, vec
    return


# Create a local StreamingContext with two working thread and batch interval of 1 second
conf = SparkConf().setAppName("Fraud Detector")
conf = conf.setMaster("local[2]")

sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 8999)
# Split each line into words

model = KMeansModel.load(sc, "kmeansmodel01")
print model.clusterCenters
print "************************** Loaded the model *********************"

words = lines.flatMap(lambda line: line.split(" "))

lines.foreachRDD(detect)
ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
Example #40
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
sc = SparkContext()

data = sc.textFile("/user/hduser/venkat/iris.txt")
print data.first()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=10,
                        initializationMode="random")
prediction = clusters.predict(parsedData)
print clusters.centers


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
myModelPath = "/user/hduser/output/kmeans_output"
clusters.save(sc, myModelPath)
sameModel = KMeansModel.load(sc, myModelPath)
    # check input db
    input_db = os.path.join(args.input_root_dir, "dbs", "db", "out.parquet")
    if not os.path.isdir(input_db):
        raise Exception("missing db parquet directory")

    # check output dir
    # logger.debug("Create new codebook dir...")
    output_dir = os.path.join(args.input_root_dir, 'features', 'feature')
    if os.path.isdir(output_dir):
        new_name = output_dir + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
        logger.info("backup old output-dir in  %s" % new_name)
        os.rename(output_dir, os.path.join(args.input_root_dir, new_name))
    os.makedirs(output_dir)

    model = KMeansModel.load(sc, input_codebook)

    model = sc.broadcast(model)

    pooling="max"
    feature_name = "SIFT"
    df = sqc.read.parquet(input_db)

    print df.count()

    features_bow = df.map(functools.partial(compute_global_feature,
                                                feature_name="SURF",
                                                model=model,
                                                pooling=pooling))

    print features_bow.first()
Example #42

# Load and parse the data
# conf = SparkConf()
sc = SparkContext()
data = sc.textFile("./business_gps.csv")
parsedData = data.map(parse_line)

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        10,
                        maxIterations=1000,
                        runs=10,
                        initializationMode="k-means||")


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "kmeans_model")
sameModel = KMeansModel.load(sc, "kmeans_model")

print("Cluster centers", sameModel.clusterCenters)
Example #43
import math
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

conf = SparkConf().setAppName('KMeans').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data
data = sc.textFile('../data/kmeans_data.txt')
parseData = data.map(
    lambda line: np.array([float(x) for x in line.split(' ')]))

# build the model
clusters = KMeans.train(parseData,
                        2,
                        maxIterations=10,
                        runs=10,
                        initializationMode='random')


#evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parseData.map(lambda p: error(p)).reduce(lambda x, y: x + y)
print('Within Set Sum of Squared Error :' + str(WSSSE))

# save and load model
clusters.save(sc, '../model/KMeansModel')
sameModel = KMeansModel.load(sc, '../model/KMeansModel')

sc.stop()
Example #44
        "hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv"
    )

    average_per_year = average_year(lines)  # 2014 and 2015
    average_per_month = average_month(average_per_year)
    data = parseDataset(dataset)
    k = int(sys.argv[1])
    initial_centroids = generate_initial_centroids(average_per_month.collect(),
                                                   k)

    # KMeans
    start = time()
    kmeans_model = KMeans.train(data,
                                k,
                                maxIterations=100,
                                initialModel=KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]

    # Predicting
    points = parseDataset(predict_data)
    count_lines = float(len(points.collect()))
    probabilities = generate_probabilities(points, k, kmeans_model,
                                           count_lines)
    if ascontext.isComputeDataModelOnly():
        ascontext.setSparkOutputSchema(output_schema)
        sys.exit(0)
    else:
        modelpath = ascontext.getModelContentToPath("model")
        model_metadata = json.loads(ascontext.getModelContentToString("model.metadata"))

# create a DataModelTools to handle data model and data conversions
datamodel = model_metadata["datamodel"]
dmt = DataModelTools(datamodel)

predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel,predictors,df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this
dv = dmt.extractDenseVector(df,predictors,setToFlag=1.0)

def rowToList(row):
        result = []
        for idx in range(0, len(row)):
            result.append(row[idx])
        return result


mapFn = lambda (x,y):rowToList(x)+[y]

rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])
Example #46
def main():
    k_input_model = sys.argv[1] #read kmean model from this location
    w_input_model = sys.argv[2] #read word2vec model from this location
    input_file = sys.argv[3] #read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmean and Word2Vec model'''
    kmean_model = KMeansModel.load(sc,k_input_model)
    word2vec_model = Word2VecModel.load(sc,w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
       
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model+"/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()

    '''sbaronia - here we create one vector per review, where vector
    contains the number of times a cluster is assigned to a word in
    a review. We make a SparseVector compatible format'''
    features = []

    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                if histogram[clust] > 0:
                    histogram[clust] = histogram[clust] + 1
                else:
                    histogram[clust] = 1
        features.append((2000,range(2000),histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                             .map(lambda line: nor.transform(SparseVector.parse(line))) \
                             .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()
    
    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                            .select('rating','feature') \
                            .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                            .coalesce(1) \
                            .cache()
    
    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                           .select('rating','feature') \
                           .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                           .coalesce(1) \
                           .cache()

    '''sbaronia - find best step using validation and run LinearRegressionWithSGD 
    with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd,test_rdd,step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + "  Best Step size = " + str(step_best_norm))
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

def parse_line(ln):
	split_ln = ln.split(',')
	ln_coord = [float(split_ln[1]), float(split_ln[2])]
	new_line = ln + ',' + cluster_labels[ KMeans_model.predict(ln_coord)]
	return new_line

# load and parse the data
# conf = SparkConf()
sc = SparkContext()

# load previously generated k-means model
KMeans_model = KMeansModel.load(sc, "kmeans_model")

# define cluster label array
cluster_labels = ["Pheonix-AZ", "Edinburgh-UK", "Charlotte-NC", "Madison-WI", "Montreal-Canada", "Waterloo-Canada", "Las Vegas-NV", "Urbana-Champaign-IL", "Pittsburgh-PA", "Karlsruhe-Germany"]

# read the file which has business_ids, latitude, longitude
data = sc.textFile("./business_gps.csv")

# get labelled rows
parsedData = data.map(parse_line)

# save labelled businesses in the output folder
parsedData.saveAsTextFile("./output")
Example #48
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel


if __name__ == "__main__":
    sc = SparkContext(appName="KMeansApp")  # SparkContext

   
    # Load and parse the data
    data = sc.textFile("s3://irm238FinalProject/input/citibike*")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10,
                            runs=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KmeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    

    sc.stop()
Example #49
import sys
import json
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel


def mapper(line):
    # Format the line
    line = line.replace("(", "").replace(")", "").replace("[", "").replace("]", "")
    elements = line.split(",")
    stock_name = elements.pop(0)
    percent_changes = map(lambda x: float(x), elements)

    return stock_name, percent_changes


if __name__ == "__main__":
    sc = SparkContext(appName="ComputeResults")

    model = KMeansModel.load(sc, sys.argv[2])

    mapred_results = sc.textFile(sys.argv[1])
    clusters = mapred_results.map(mapper)\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda stock: (model.predict(array(stock[1])), [stock[0]]))\
        .reduceByKey(lambda a, b: a + b)\
        .collectAsMap()

    with open('result.json', 'w') as fp:
        json.dump(clusters, fp)
sqlContext = SQLContext(sc)

# Read the input parquet
input_crime = sys.argv[1]

# Read the parquet data and convert to RDD
parquet_crime = sqlContext.read.parquet(input_crime)
parquet_crime.registerTempTable("crime_table")
crime_table = sqlContext.sql("SELECT * FROM crime_table")
crime_rdd = crime_table.map(lambda line: str(line.Year) + "," + str(line.Latitude) + ","
                                       + str(line.Longitude) + "," + str(line.Crime_Frequency))

# K-means does multiple runs to find the optimal cluster center, so cache the input to K-means
cluster_input = crime_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5,
        runs=5, initializationMode="random")

# Compute the squared error of each point against its cluster centre
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save and reload the cluster model
clusters.save(sc, "myModel_crime")
sameModel = KMeansModel.load(sc, "myModel_crime")
Example #51
if __name__ == "__main__":

    b = open("name", 'wb')
    sc = SparkContext("local[*]", "kmeans")

    print("data being loaded.....")
    data = sc.textFile(
        sys.argv[1]).map(lambda row: map(lambda x: float(x), row.split(',')))
    #file:///dev/desc_hdfs
    print("data loaded!")
    D = 128
    print("loading and counting")
    data_size = data.count()
    print("count done")
    print("model being loaded.....")
    model = KMeansModel.load(sc, sys.argv[2])
    print("model loaded!")

    centers = model.clusterCenters
    # ################SAMPLING##################################################
    #total_sampled_points = int(sys.argv[3])
    cluster = {}
    samples = {}
    print("data being stored in array....")
    #da = data.collect()
    print("data stored")

    n_clusters = model.k

    for j in range(n_clusters):
        cluster[j] = []
Example #52
    currTime = strftime("%Y-%m-%d-%H-%M-%S")
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
    dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
    predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")

    average_per_year = average_year(lines) # 2014 and 2015
    average_per_month = average_month(average_per_year)
    data = parseDataset(dataset)
    k = int(sys.argv[1])
    initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

    # KMeans
    start = time()
    kmeans_model = KMeans.train(data, k, maxIterations = 100, initialModel = KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]

    # Predicting
    points = parseDataset(predict_data)
    count_lines = float(len(points.collect()))
    probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
    print("Prob: ", probabilities)
Example #53
# -*- coding: utf-8 -*-
from konlpy.tag import Twitter
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt


sc = SparkContext()
sqlContext = SQLContext(sc)

normData = sc.pickleFile('idf_normalized')
clusters = KMeansModel.load(sc, 'KMeasModel')
text = normData.map(lambda x : (x.no,x.eval_content))
data = normData.map(lambda x : (x.no,clusters.predict(x.idf_norm)) )
result = text.join(data).map(lambda (k, (left, right)): (right, left.encode('utf-8')))
for i in range(10):
	result.filter(lambda (x,y): x == i).map( lambda (x,y): y).saveAsTextFile("KMeansOutput/cluster_"+str(i))