Example #1
def logo_feature_cluster(train_feature_list, train_name_list, clusternum):
    '''Train the model'''
    model = KMeans.train(sc.parallelize(train_feature_list),
                         clusternum,
                         maxIterations=10,
                         initializationMode="random",
                         seed=50,
                         initializationSteps=5,
                         epsilon=1e-4)
    model_path = tempfile.mkdtemp()
    model.save(sc, model_path)
    model = KMeansModel.load(sc, model_path)
    '''Predict'''
    predict = model.predict(sc.parallelize(train_feature_list))
    # print(predict.collect())
    try:
        rmtree(model_path)
    except OSError:
        pass

    logo_result_path = os.path.join(
        result_path, "logo_image_result" + str(clusternum) + ".txt")
    writeResultTofile(logo_result_path, train_name_list, predict.collect())
    '''Calinski-Harabasz clustering evaluation metric'''
    # evaluationCH = metrics.calinski_harabaz_score(train_feature_list, predict.collect())
    # ch = str(round(evaluationCH, 2))
    # print("Calinski-Harabasz clustering score: " + ch)
    # with open(result_path + "Calinski-Harabasz.txt", 'a') as a:
    #     a.write(str(clusternum)+":" + ch + "\n")
    '''Silhouette Coefficient clustering evaluation metric'''
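    # A hedged sketch of the missing evaluation block, mirroring the commented-out
    # Calinski-Harabasz code above and assuming sklearn's `metrics` module is in scope:
    # evaluationSC = metrics.silhouette_score(train_feature_list, predict.collect())
    # sil = str(round(evaluationSC, 2))
    # print("Silhouette Coefficient score: " + sil)
    # with open(result_path + "Silhouette-Coefficient.txt", 'a') as a:
    #     a.write(str(clusternum) + ":" + sil + "\n")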
def loadModel():
	clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
	classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)

	if pv.outputDebugMsg:
		Utils.logMessage("\nLoad cluster & classification model finished")
	return clusterModel, classificationModel
def kmeans_w2v_predict():
    # appName='kmeans_w2v_predict'
    # sc = SparkContext(appName=appName)
    # from pyspark.sql import SQLContext
    from pyspark.mllib.clustering import KMeans, KMeansModel
    # sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet(
        "hdfs:///user/rmusters/lambert_w2v_data_jan")
    # df = data.toDF("text", "filtered_text", "split_text", "vectors", "id")
    df = data.toDF("tokens", "vectors", "id")
    df = df.where(df.vectors.isNotNull())
    data = df.rdd
    model = KMeansModel.load(sc,
                             "hdfs:///user/rmusters/lambert_kmeans_w2v_jan")

    # data = data.map(lambda (text, filtered_text, split_text, vectors, id): (text, filtered_text, split_text, vectors, model.predict(vectors), id))
    # df = data.toDF(["text", "filtered_text", "split_text", "vectors", "cluster", "id"])
    data = data.map(lambda (tokens, vectors, id):
                    (tokens, vectors, model.predict(vectors), id))
    df = data.toDF(["tokens", "vectors", "cluster", "id"])
    df = df.select("cluster", "id")
    df = df.sort(df.cluster.asc())
    df.write.format("com.databricks.spark.csv").mode("overwrite").save(
        "lambert_w2v_data_cluster.csv")
    # df.save("hdfs:///user/rmusters/lambert_w2v_data_cluster.csv", "com.databricks.spark.csv")
    df.write.parquet("hdfs:///user/rmusters/lambert_w2v_data_cluster",
                     mode="overwrite")
Example #4
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    if not fs.exists(
            sc._jvm.org.apache.hadoop.fs.Path(HDFS_PATH + str(g_cache.user) +
                                              '/model/' + params['path'])):
        raise Exception("Invalid file path, path not exists!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    else:
        raise Exception("Invalid model type!")
    return True, model
Example #5
def kmeans():
    """
    使用mllib对Spark安装包mllib的测试数据集做K-means聚类,由于train方法:
        Training points as an `RDD` of `Vector` or convertible
    所以需对数据集格式化:
        初始数据集 --> ['0.0 0.0 0.0', '0.1 0.1 0.1', '0.2 0.2 0.2']
        格式化后数据集 --> [array([0., 0., 0.]), array([0.1, 0.1, 0.1]), array([0.2, 0.2, 0.2])]
    :return:
    """
    data_rdd = sc.textFile('{}/mllib/kmeans_data.txt'.format(current_dir))
    parsed_data_rdd = data_rdd.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the clustering model
    clusters = KMeans.train(parsed_data_rdd, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = parsed_data_rdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save the trained model
    model_path = "{}/kmeans_model".format(current_dir)
    if not os.path.exists(model_path):
        clusters.save(sc, model_path)

    trained_model = KMeansModel.load(
        sc, "{}/kmeans_model".format(current_dir)
    )
    return trained_model
Example #6
def getCluster(price, crime, male, female, white, black, asian, hispanic,
               young, mid_age, senior):
    KModel = KMeansModel.load(sc, "project/data/output/KMeansModel")
    cluster = KModel.predict([
        price, crime, male, female, white, black, asian, hispanic, young,
        mid_age, senior
    ])
    return cluster
Example #7
 def KMeans_Processing(self, columns):
     data_point = np.array(self.df_PD[columns])
     model = KMeansModel.load(
         self.sc, self.baseDir + '/fraudModel/Model/' + 'KMeans')
     result = np.array(
         model.predict(self.sc.parallelize(data_point)).collect())
     self.df_PD.insert(len(list(self.df_PD.columns)), 'KMeans_feature',
                       result)
Example #8
def main():
    sc = SparkContext(appName="tileMapper")
    print("I do all the input output jazz")

    ###########################################################################
    big_image = sc.binaryFiles("Reference/108103_sm.jpg")
    tile_avgs = big_image.flatMap(extract_opencv_tiles())
    #buckets = tile_avgs.collect()
    #print("Bucket",buckets)
    tileMap = tile_avgs.map(
        lambda l: [item for sublist in l for item in sublist])
    tileList = tileMap.collect()
    print("Tile Map", tileMap)
    print("Tile Map", tileMap.collect())
    print("Tile List", tileList)
    print("Tile LIst", type(tileList))
    ############################################################################

    clusterIndex = getIndex()
    kmModel = KMeansModel.load(sc, "myModelPath")
    readyToCombine = []
    currentRow = None
    noOfRow = 0
    noOfCol = 0
    firstTile = tileList[0]
    tileSize = firstTile[1]
    #Randomly Get small images using kmeans match
    for tile in tileList:
        if tile[0] == currentRow:
            smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                      tileSize, clusterIndex)
            readyToCombine.append(smallImg)
            noOfCol = noOfCol + 1
        else:
            currentRow = tile[0]
            noOfCol = 1
            noOfRow = noOfRow + 1
            currentRow = tile[0]
            smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                      tileSize, clusterIndex)
            readyToCombine.append(smallImg)
    #Put small images into the big image canvas

    canvas = np.zeros((noOfRow * tileSize, noOfCol * tileSize, 3), np.uint8)

    #Print Image
    print("No. of Col", noOfCol)
    print("No. of Row", noOfRow)
    #print("Before Print, Check Once again",readyToCombine)
    mosaicImage = printImage(readyToCombine, canvas, noOfCol, noOfRow,
                             tileSize)

    print("Finished processing of image")
    cv2.imwrite('mosaicImageYeah.jpg', mosaicImage)
def main():
    modelname = sys.argv[1]
    tiffname = sys.argv[2]
    outputname = sys.argv[3]
    sc = SparkContext()
    model = KMeansModel.load(sc, modelname)
    dataset = gdal.Open(tiffname, GA_ReadOnly)
    x, y, data = train.tiff_to_array(dataset, train.weights)
    driver = dataset.GetDriver().ShortName
    clusterdata = sc.parallelize(data)
    result = np.array(clusterdata.map(lambda point: model.predict(point)).collect())
    write_to_tif(outputname, x, y, result, driver)
Example #10
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()

    # Load Model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)

    centers = sameModel.clusterCenters
    print("Cluster Centers: ")
    for n, center in enumerate(centers):
        out_f = OUTPUT_DATA + str(n) + "Cluster.csv"
        numpy.savetxt(out_f, center, newline=";")
        print(center)
Example #11
def kmeans_classification(sc, c_tag, util):
    print 'data retrieving'
    _data_ = data_retriever(c_tag)
    print len(_data_)
    #rids = [x[0] for x in _data_]

    blc = nlpb.nlpblockbase()

    __ans = [train_feature_extraction(x, c_tag, blc, util) for x in _data_]
    ans = [[float(k) for k in x] for x in __ans]
    #print ans
    train_data = [np.array(sf.softmax(x)) for x in ans]
    #train_data = ans
    print 'dataprep done'
    assert_len(train_data)
    brotrain = sc.broadcast(train_data)
    clusters = KMeans.train(sc.parallelize(brotrain.value),
                            200,
                            maxIterations=10,
                            initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = map(lambda point: error(point), train_data)
    WSSSE = reduce(lambda x, y: x + y, WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    clustered = collections.defaultdict(list)
    print len(_data_)
    cnt = 0
    for row in _data_:
        clustered[clusters.predict(train_data[cnt])].append(
            [row[0], row[2], row[3], row[4]])
        cnt += 1
    print len(clustered)
    for k in clustered.keys():
        print len(clustered[k])

    clusters.save(sc, "MovieModel")
    sameModel = KMeansModel.load(sc, "MovieModel")

    ref = collections.defaultdict(list)
    for point in train_data:
        ref[sameModel.predict(point)].append(point)

    for x, y in zip(ref.keys(), clustered.keys()):
        assert len(ref[x]) == len(clustered[y])

    return clustered
Example #12
def main(sc):
    data = [[1.0, 1.0], [1.0, 0.8], [-1.0, 1.0], [-1.0, -1.0]]
    parsedData = sc.parallelize(data)
    kmeansModel = KMeans.train(parsedData,
                               2,
                               maxIterations=10,
                               runs=10,
                               initializationMode="random")
    print(kmeansModel.predict([1.0, 1.0]))
    print(kmeansModel.predict([1.0, -2.0]))
    # Save and load model
    kmeansModel.save(sc, "KMeansModel")
    model = KMeansModel.load(sc, "KMeansModel")
    print(model.predict([1.0, 1.0]))
    print(model.predict([1.0, -2.0]))
Example #13
    def print_model(self,model_name):
        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        for c in model.clusterCenters:
            print(c)
        for c in model.clusterCenters:
            l = []
            for i in c:
                i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
                l.append(float(i))
            print(l)
Example #14
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()

    # Load Data
    dataset = sc.textFile(INPUT_DATA, cpu_count)
    dataset = dataset.map(
        lambda line: array([float(x) for x in line.split(';')]))

    # Load Model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)

    # Predict cluster labels per row
    labels = sameModel.predict(dataset).collect()

    # Save labels in json file
    with open(OUTPUT_LABEL, 'w') as out_f:
        json.dump(labels, out_f)
Example #15
def predict():
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.linalg import Vectors

    data = sqlContext.read.format("com.databricks.spark.csv").option(
        "header", "true").load("w2v_vector.csv")
    data = data.map(lambda x: [float(a) for a in x])
    df = data.toDF()
    columns = df.columns
    vectors = df.select(columns[1:71])
    vectors = vectors.map(lambda x: Vectors.dense(x))

    for n_clusters in _range:
        model = KMeansModel.load(
            sc, "hdfs:///user/rmusters/w2v_model_kmeans_" + str(n_clusters))

        predicted = model.predict(vectors)
        result = predicted.map(lambda x: (x, )).toDF()
        result.save("clusters_" + str(n_clusters))
def kmeans_lda_predict():
    appName = 'kmeans_lda_predict'
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
    sc = SparkContext(appName=appName)
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet(
        "hdfs:///user/rmusters/lda_doc_topic")  #lda_data_jan
    # data = df.rdd
    model = KMeansModel.load(sc, "hdfs:///user/rmusters/kmeans_lda_jan")
    data = data.map(lambda (id, vectors):
                    (id, vectors, model.predict(vectors)))
    df = data.toDF(["id", "vectors", "cluster"])
    df = df.sort(df.cluster.asc())
    # df.write.parquet("hdfs:///user/rmusters/lda_data_cluster", mode= "overwrite")
    df.write.parquet("hdfs:///user/rmusters/lda_data_cluster",
                     mode="overwrite")

    logger.info(appName)
Example #17
    def predict(self, model_name, data):

        '''
        predict unknown data
        :param model_name: the trained model saving on hdfs
        :param data: unknown data
        :return: (cluster_index, cluster)
        '''

        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        # get the predict : means which cluster it belongs to
        index = model.predict(data)
        print('Data:%s belongs to cluster:%s. The index is %s' % (data, model.clusterCenters[index], index))
        return index, model.clusterCenters[index]
Example #18

# Load and parse the data
# conf = SparkConf()
sc = SparkContext()
data = sc.textFile("./business_gps.csv")
parsedData = data.map(parse_line)

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        10,
                        maxIterations=1000,
                        runs=10,
                        initializationMode="k-means||")


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "kmeans_model")
sameModel = KMeansModel.load(sc, "kmeans_model")

print("Cluster centers", sameModel.clusterCenters)
Example #19
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
sc = SparkContext()

data = sc.textFile("/user/hduser/venkat/iris.txt")
print data.first()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=10,
                        initializationMode="random")
prediction = clusters.predict(parsedData)
print clusters.centers


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
myModelPath = "/user/hduser/output/kmeans_output"
clusters.save(sc, myModelPath)
sameModel = KMeansModel.load(sc, myModelPath)
Example #20
        
def parseOwnership(line):
    fields = line.split(',')
    #print len(fields)
    owner = fields[0]
    taxes = int(fields[1])
    lat = float(fields[2])
    lon = float(fields[3])
    cluster = int(fields[4])
    
    return (owner, taxes, lat, lon, cluster)

conf = SparkConf().setMaster("local").setAppName("tugasbigdata")
sc = SparkContext(conf = conf)

clusters = KMeansModel.load(sc,"C:/SparkCourse/FP_model")

lines = sc.textFile("file:///SparkCourse/data_center.csv")
parsedLines = lines.map(parseLine)
# data cleanup
reserveddata= parsedLines.filter(lambda x : x is not None)
reserveddata1= reserveddata.filter(lambda x : x[1] is not None)
reserveddata2=reserveddata1.filter(lambda x : x[2] is not None)
temp=reserveddata2.filter(lambda x : "San Francisco" in x[0])
tempdata=temp.map(lambda x: (x[1],x[2]))
data=tempdata.map(lambda x: (float(x[0]),float(x[1])))

data_local = data.collect()

ownershipData = lines.map(parseOwnership)
# more cleanup
Example #21
if __name__ == "__main__":

    b = open("name", 'wb')
    sc = SparkContext("local[*]", "kmeans")

    print("data being loaded.....")
    data = sc.textFile(
        sys.argv[1]).map(lambda row: map(lambda x: float(x), row.split(',')))
    #file:///dev/desc_hdfs
    print("data loaded!")
    D = 128
    print("loading and counting")
    data_size = data.count()
    print("count done")
    print("model being loaded.....")
    model = KMeansModel.load(sc, sys.argv[2])
    print("model loaded!")

    centers = model.clusterCenters
    # ################SAMPLING##################################################
    #total_sampled_points = int(sys.argv[3])
    cluster = {}
    samples = {}
    print("data being stored in array....")
    #da = data.collect()
    print("data stored")

    n_clusters = model.k

    for j in range(n_clusters):
        cluster[j] = []
    # check input db
    input_db = os.path.join(args.input_root_dir, "dbs", "db", "out.parquet")
    if not os.path.isdir(input_db):
        raise Exception("missing db parquet directory")

    # check output dir
    # logger.debug("Create new codebook dir...")
    output_dir = os.path.join(args.input_root_dir, 'features', 'feature')
    if os.path.isdir(output_dir):
        new_name = output_dir + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
        logger.info("backup old output-dir in  %s" % new_name)
        os.rename(output_dir, os.path.join(args.input_root_dir, new_name))
    os.makedirs(output_dir)

    model = KMeansModel.load(sc, input_codebook)

    model = sc.broadcast(model)

    pooling="max"
    feature_name = "SIFT"
    df = sqc.read.parquet(input_db)

    print df.count()

    features_bow = df.map(functools.partial(compute_global_feature,
                                                feature_name="SURF",
                                                model=model,
                                                pooling=pooling))

    print features_bow.first()
Example #23
def process(id, content):
	kmeansDoc = DocumentKmeans(id, '')
	print(type(content))
	kmeansDoc.content = content
	kmeansDoc.doc2vec()
	kmeansDoc.printVec()
	kmeansDoc.kmeansVec()
	kmeansDoc.cluster_id = kmeansPredict.predict(kmeansDoc)
	return kmeansDoc
    
utils = Utils(45000)

sc = SparkContext(appName="PythonKafkaConsumerKmeans")

kmeansModel = KMeansModel.load(sc, '../KmeansModel')

kmeansPredict = KmeansPredict(kmeansModel)

consumer = KafkaConsumer('test', group_id='kafka_consumer_group', bootstrap_servers=['localhost:9092'])

for message in consumer:
	value = message.value
	spl = value.split(':')
	id = spl[0]
	content = utils.normalizeString(spl[1])
	docu = process(id, content)
	print(docu.cluster_id)
    
KafkaConsumer(auto_offset_reset='earliest', enable_auto_commit=False)
KafkaConsumer(value_deserializer=lambda m: json.loads(unicode(m, "utf8")))
sc = SparkContext()
for route, directories, files in os.walk('/media/deepak/data_words.csv'):
    for file in files:
        f_name = os.path.join(route, file).split('/')
        # Load and Parse the data
        data = sc.textFile(os.path.join(route, file))
        dataParsed = data.map(
            lambda line: array([float(x) for x in line.split(',')]))
        # Build the model (cluster the data)
        clusters = KMeans.train(dataParsed,
                                4,
                                maxIterations=10,
                                runs=10,
                                initializationMode="random")

        # Evaluate clustering by computing Within Set Sum of Squared Errors
        def error(point):
            center = clusters.centers[clusters.predict(point)]
            return sqrt(sum([x**2 for x in (point - center)]))

        WSSSE = dataParsed.map(lambda point: error(point)).reduce(
            lambda x, y: x + y)
        print(f_name[-1])
        print('\n')
        print("Within Set Sum of Squared Error is " + str(WSSSE))
        print('\n')
        print(clusters.clusterCenters)
        # Save and load model
        clusters.save(sc, "/media/deepak/Kmean_output/" + f_name[-1])
        sameModel = KMeansModel.load(
            sc, "/media/deepak/Kmean_output/" + f_name[-1])
from numpy import array
from math import sqrt

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import KMeans, KMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    # $example off$

    sc.stop()
Example #26
import sys

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel
import numpy as np

if __name__ == "__main__":
    sparkConf = SparkConf()
    sparkContext = SparkContext(conf=sparkConf)

    model = KMeansModel.load(sparkContext, path=sys.argv[1])

    print("Model loaded")

    point = [1.1, 3.2]
    print("Point 1: " + str(point))
    print("Predict: " + str(model.predict(point)))

    point = [5.1, 1.4]
    print("Point 2: " + str(point))
    print("Predict: " + str(model.predict(point)))

    point = np.array([5.2, 2.0])
    print("Point 3: " + str(point))
    print("Predict: " + str(model.predict(point)))

    point = np.array([1.0, 4.0])
    print("Point 4: " + str(point))
    print("Predict: " + str(model.predict(point)))

    point = [3.4, 2.0]
Example #27
 def __init__(self, spark_context, model_path):
     self.model = KMeansModel.load(spark_context, model_path)
Example #28
def main():
    data_rdd = load_data(project["data_file"])
    print (data_rdd.count())

    listed_data_rdd = data_rdd.map(data_extractor)

    # Filtering unwanted data rows
    elect_filtered_rdd = listed_data_rdd.filter(electric_vehicles_filter)
    filtered_rdd = elect_filtered_rdd.filter(empty_cost_filter)

    # Mapping related data to convenient format for clustering
    cost_tx_rdd = filtered_rdd.map(cost_transform_mapper)
    feature_mapped_rdd = cost_tx_rdd.map(feature_mapper)

    estimated_clusters = KMeansModel.load(spark_context, "identified_clusters")

    optimum_cluster = 4
    optimum_points_rdd = feature_mapped_rdd.filter(
        lambda filtered_data_feature_vector: estimated_clusters.predict(filtered_data_feature_vector[-1])
        == optimum_cluster
    )

    print (optimum_points_rdd.count())

    sample_data = optimum_points_rdd.take(100)

    for data in sample_data:
        feature_vector = data[-1]
        pyplot.scatter(feature_vector[0], feature_vector[1])

    optimum_cluster_manufactures_rad = optimum_points_rdd.map(manufactures_mapper)

    optimum_points_rdd.persist()  # To hold the previously calculated data set in memory

    individual_manufactures_count_rdd = optimum_cluster_manufactures_rad.reduceByKey(operator.add)

    sorted_manufactures_count_rdd = individual_manufactures_count_rdd.sortBy(
        lambda manufactures_set: manufactures_set[1], ascending=False
    )

    top_ten = 10
    vehicle_count = []
    manufactures_name = []
    for manufacture in sorted_manufactures_count_rdd.take(top_ten):
        vehicle_count.append(manufacture[1])
        manufactures_name.append(manufacture[0])
        print (manufacture)

    pyplot.title("Best Vehicle Cluster")
    pyplot.xlabel("Europe Rating")
    pyplot.ylabel("Feature Normalized")

    pyplot.show()

    number_of_manufactures = top_ten  # sorted_manufactures_count_rdd.count()
    index = np.arange(number_of_manufactures)

    bar_width = 0.5

    opacity = 0.4
    error_config = {"ecolor": "0.3"}

    chart = pyplot.bar(
        index, vehicle_count, bar_width, alpha=opacity, color="b", error_kw=error_config, label="manufactures"
    )
    pyplot.xticks(index + bar_width, manufactures_name)

    pyplot.title("Top preforming vehicles")
    pyplot.xlabel("manufactures")
    pyplot.ylabel("Vehicles count")

    pyplot.show()
Example #29
def main():
    k_input_model = sys.argv[1] #read kmean model from this location
    w_input_model = sys.argv[2] #read word2vec model from this location
    input_file = sys.argv[3] #read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmean and Word2Vec model'''
    kmean_model = KMeansModel.load(sc,k_input_model)
    word2vec_model = Word2VecModel.load(sc,w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
       
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model+"/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()

    '''sbaronia - here we create one vector per review, where vector
    contains the number of times a cluster is assigned to a word in
    a review. We make a SparseVector compatible format'''
    features = []

    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                if histogram[clust] > 0:
                    histogram[clust] = histogram[clust] + 1
                else:
                    histogram[clust] = 1
        features.append((2000,range(2000),histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                             .map(lambda line: nor.transform(SparseVector.parse(line))) \
                             .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()
    
    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                            .select('rating','feature') \
                            .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                            .coalesce(1) \
                            .cache()
    
    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                           .select('rating','feature') \
                           .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                           .coalesce(1) \
                           .cache()

    '''sbaronia - find best step using validation and run LinearRegressionWithSGD 
    with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd,test_rdd,step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + "  Best Step size = " + str(step_best_norm))
Example #30
import json
import sys
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel


def mapper(line):
    # Format the line
    line = line.replace("(", "").replace(")", "").replace("[", "").replace("]", "")
    elements = line.split(",")
    stock_name = elements.pop(0)
    percent_changes = map(lambda x: float(x), elements)

    return stock_name, percent_changes


if __name__ == "__main__":
    sc = SparkContext(appName="ComputeResults")

    model = KMeansModel.load(sc, sys.argv[2])

    mapred_results = sc.textFile(sys.argv[1])
    clusters = mapred_results.map(mapper)\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda stock: (model.predict(array(stock[1])), [stock[0]]))\
        .reduceByKey(lambda a, b: a + b)\
        .collectAsMap()

    with open('result.json', 'w') as fp:
        json.dump(clusters, fp)
Example #31
import sys

from collections import OrderedDict
from numpy import array
from math import sqrt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeansModel

if __name__ == "__main__":
    if (len(sys.argv) != 2):
        print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
          "predict.py kddcup.data.file"
        sys.exit(1)

    data_file = sys.argv[1]
    conf = SparkConf().setAppName("KDDCup99") \
      #.set("spark.executor.memory", "2g")

    sc = SparkContext(conf=conf)

    model = KMeansModel.load(sc, "best_model")

    clusters = model.clusterCenters

    with open(data_file) as file:
        for line in file:
            line_split = line.split(",")
            clean_line_split = [line_split[0]] + line_split[4:]
            clusterIndex = model.predict(
                array([float(x) for x in clean_line_split]))
            print clusterIndex
    print

print "DONE!"
# Clustering
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations.
clusters = KMeans.train(parsedData, 2, 10)


# Evaluate the clustering
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error = " + str(WSSSE))

# Saving and loading the model
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")
sameModel
sameModel.k
sameModel.clusterCenters
sqlContext = SQLContext(sc)

# Read the input parquet
input_crime = sys.argv[1]

# Read the parquet data and convert to RDD
parquet_crime = sqlContext.read.parquet(input_crime)
parquet_crime.registerTempTable("crime_table")
crime_table = sqlContext.sql("SELECT * FROM crime_table")
crime_rdd = crime_table.map(lambda line: str(line.Year) + "," + str(line.Latitude) + ","
                                       + str(line.Longitude) + "," + str(line.Crime_Frequency))

# K-means does multiple runs to find the optimal cluster center, so cache the input to K-means
cluster_input = crime_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5,
        runs=5, initializationMode="random")

# Compute the squared error of each point from its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster output into parquet files
clusters.save(sc, "myModel_crime")
sameModel = KMeansModel.load(sc, "myModel_crime")
        SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
        SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
        SparkContext.setSystemProperty('spark.cores.max', args.core_max)

        sc = SparkContext(args.sp_master, 'single_predict:'+str(args.row_id))
        flag_model = ml_opts['learning_algorithm']        
        save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str

        if flag_model == "linear_svm_with_sgd":
            mllib_model = SVMModel.load(sc, save_dir)
            col_num = len(mllib_model.weights)
        elif flag_model == "logistic_regression_with_lbfgs" or flag_model == "logistic_regression_with_sgd":
            mllib_model = LogisticRegressionModel.load(sc, save_dir)
            col_num = mllib_model.numFeatures # len(mllib_model.weights) return 3x value
        elif flag_model == "kmeans":
            mllib_model = KMeansModel.load(sc, save_dir)
            col_num =len(mllib_model.clusterCenters[0])
        else:
            print "ERROR: Training model selection error: no valid ML model selected!"
            return
        # get the model dimension
        #col_num = len(mllib_model.weights)
        print "INFO: total feature # in mllib model=",col_num

        # calculate hypothesis value ================
        model_weight=None
        if learning_algorithm not in ("kmeans") :
            model_weight=mllib_model.weights
            intercept=mllib_model.intercept 

        coef_arr=None
Example #35
from numpy import array
from math import sqrt


from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from pyspark_cassandra import streaming

def predict(line):
    spl = line.split(':')
    doc = DocumentKmeans(spl[0], spl[1], spl[2])
    doc.kmeansVec()
    doc.cluster_id = kmeansPredict.predict(doc)
    return doc


sc = SparkContext()
sameModel = KMeansModel.load(sc, "../KmeansModel")
kmeansPredict = KmeansPredict(sameModel)

parsedData = sc.textFile('hdfs://localhost:8020/user/manh/vector')\
	.filter(lambda x: len(x) > 2000)\
	.map(lambda x: predict(x))\
	.map(lambda x: {
		'id' : x.id,
		'cluster_id' : int(x.cluster_id),
		'timestamps' : long(x.timestamps),
		'vector' : x.vector
	}).saveToCassandra('reishi', 'dockmeans')
        print arrays.collect()
        indx = 0
        while indx < count:
            vec = Vectors.dense(arrays.collect()[indx])
            indx += 1
            clusternum = model.predict(vec)
            print "Cluster -> ", clusternum, vec
    return


# Create a local StreamingContext with two working thread and batch interval of 1 second
conf = SparkConf().setAppName("Fraud Detector")
conf = conf.setMaster("local[2]")

sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 8999)
# Split each line into words

model = KMeansModel.load(sc, "kmeansmodel01")
print model.clusterCenters
print "************************** Loaded the model *********************"

words = lines.flatMap(lambda line: line.split(" "))

lines.foreachRDD(detect)
ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
Example #37
 def kmeans_model_load(self, sc,path):
     return KMeansModel.load(sc, path)
Example #38
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel


if __name__ == "__main__":
    sc = SparkContext(appName="KMeansApp")  # SparkContext

   
    # Load and parse the data
    data = sc.textFile("s3://irm238FinalProject/input/citibike*")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10,
                            runs=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KmeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    

    sc.stop()
#print np.shape(parsedData)
# Build the model (cluster the data)

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
	center = clusters.centers[clusters.predict(point)]
	#print center
	return sqrt(sum([x**2 for x in (point - center)]))
#WSSE = parsedData.map(lambda point:error(point)).reduce(lambda x,y:x+y)
WSSE = np.zeros(7)
import time
for i in range(2,7):
	t = time.time()
	clusters = KMeans.train(parsedData, i, maxIterations=100,
        	runs=100, initializationMode="random")
	WSSE[i] = (parsedData.map(lambda point:error(point)).reduce(lambda x,y :x+y))
	print str(WSSE[i])+"   "+str(i)+"   WITH TIME ="+str(time.time()-t)

#WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
#print("Within Set Sum of Squared Error = " + str(WSSSE))



# Save and load model
clusters.save(sc, "./mymodel1")
sameModel = KMeansModel.load(sc, "./mymodel1")

#print clusters

from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

def parse_line(ln):
	split_ln = ln.split(',')
	ln_coord = [float(split_ln[1]), float(split_ln[2])]
	new_line = ln + ',' + cluster_labels[ KMeans_model.predict(ln_coord)]
	return new_line

# load and parse the data
# conf = SparkConf()
sc = SparkContext()

# load previously generated k-means model
KMeans_model = KMeansModel.load(sc, "kmeans_model")

# define cluster label array
cluster_labels = ["Pheonix-AZ", "Edinburgh-UK", "Charlotte-NC", "Madison-WI", "Montreal-Canada", "Waterloo-Canada", "Las Vegas-NV", "Urbana-Champaign-IL", "Pittsburgh-PA", "Karlsruhe-Germany"]

# read the file which has business_ids, latitude, longitude
data = sc.textFile("./business_gps.csv")

# get labelled rows
parsedData = data.map(parse_line)

# save labelled businesses in the output folder
parsedData.saveAsTextFile("./output")
Example #41
print('\n ====== Predicting ' + sys.argv[1] + ' clusters ====== \n')

spark = SparkSession.builder.appName('spark_benchmark').getOrCreate()
sc = spark.sparkContext
sc.setLogLevel(
    'ERROR'
)  # By default spark displays a lot of information, we limit it to errors

# Load and parse the data
data = sc.textFile(
    "data/household_power_consumption_no_head_no_date_prediction.csv")
parsedData = data.map(lambda line: array([float(x) for x in line.split(';')]))

finalData = sc.parallelize(parsedData.take(int(sys.argv[1]))).repartition(360)

model = KMeansModel.load(sc, "models/" + sys.argv[2])

start = timer()

# Build the model (cluster the data)
predictions = model.predict(finalData)

end = timer()

WSSSE = model.computeCost(finalData)
print("Within Set Sum of Squared Error = " + str(WSSSE))

out = open('out/learning.csv', 'a')
out.write(repr(end - start) + ' ' + repr(WSSSE) + ' ')

out.flush()
Example #42
if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("k_means_data.txt")
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData,
                            2,
                            maxIterations=10,
                            initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KMeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    # $example off$

    sc.stop()
input_living_index = sys.argv[1]

# Read the parquet data and convert to RDD
parquet_living_index = sqlContext.read.parquet(input_living_index)
parquet_living_index.registerTempTable("living_index_table")
living_index_table = sqlContext.sql("SELECT * FROM living_index_table")
living_index_rdd = living_index_table.map(lambda colName: (str(colName.Community_Code) + "," + str(colName.Crime_Frequency)
                                                              + "," + str(colName.Housing_Crowded) + "," + str(colName.Household_BPL)
                                                              + "," + str(colName.Unemployed) + "," + str(colName.Without_Diploma)
                                                              + "," + str(colName.Age_Bar) + "," + str(colName.Per_Capita_Income)
                                                              + "," + str(colName.Hardship_Index)))

# K-means does multiple runs to find the optimal cluster center, so cache the input to K-means
cluster_input = living_index_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5,
        runs=5, initializationMode="random")

# Compute the squared error of each point from its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster model
clusters.save(sc, "myModel/living-index")
sameModel = KMeansModel.load(sc, "myModel/living-index")
Example #44
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt
# Load and parse the data
conf = SparkConf().setMaster("local").setAppName("Test")
sc = SparkContext(conf=conf)
data = sc.textFile("data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=20,
                        runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors


def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
Example #45
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations. 
clusters = KMeans.train(parsedData, 2, 10)

# Evaluate the clustering
def error(point):
  center = clusters.centers[clusters.predict(point)]
  return sqrt(sum([x**2 for x in (point - center)]))
  
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error = " + str(WSSSE))


# Saving and loading the model
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")
sameModel
sameModel.k
sameModel.clusterCenters

  


Example #46
parquet_crime.registerTempTable("crime_table")
crime_table = sqlContext.sql("SELECT * FROM crime_table")
crime_rdd = crime_table.map(
    lambda line: str(line.Year) + "," + str(line.Latitude) + "," + str(
        line.Longitude) + "," + str(line.Crime_Frequency))

# K-means does multiple runs to find the optimal cluster center, so cache the input to K-means
cluster_input = crime_rdd.map(
    lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input,
                        20,
                        maxIterations=5,
                        runs=5,
                        initializationMode="random")


# Compute the squared error of each point from its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


error = cluster_input.map(lambda point: squared_error(point)).reduce(
    lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

clusters.save(sc, "myModel_crime/crime")
sameModel = KMeansModel.load(sc, "myModel_crime/crime")
    if ascontext.isComputeDataModelOnly():
        ascontext.setSparkOutputSchema(output_schema)
        sys.exit(0)
    else:
        modelpath = ascontext.getModelContentToPath("model")
        model_metadata = json.loads(ascontext.getModelContentToString("model.metadata"))

# create a DataModelTools to handle data model and data conversions
datamodel = model_metadata["datamodel"]
dmt = DataModelTools(datamodel)

predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel,predictors,df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this
dv = dmt.extractDenseVector(df,predictors,setToFlag=1.0)

def rowToList(row):
        result = []
        for idx in range(0, len(row)):
            result.append(row[idx])
        return result


mapFn = lambda (x,y):rowToList(x)+[y]

rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])
Example #48
                        maxIterations=10,
                        initializationMode="random")

# Evaluate clustering by computing the Within Set Sum of Squared Errors

# Compute the Within Set Sum of Squared Errors (WSSSE)
# Steps to find the WSSSE:
# 1. Square the distance of one point from its respective centroid
# 2. Sum the squared distances of the remaining points from that centroid
# 3. Do the same for the next cluster and its centroid, and so on
# 4. Sum the totals of each cluster's squared errors
# 5. The result is the WSSSE; note that a lower number tends to mean
#    the centroids are more tightly linked with their respective points,
#    though that alone does not necessarily make the model better,
#    depending on context.


#function to calculate squared error of one point from its respective centroid
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


#add together all the squared errors and print the result
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "hdfs///user/lev/KMeansModel_#1")
loadModel = KMeansModel.load(sc, "hdfs///user/lev/KMeansModel_#1")
Example #49
    # for each cluster center (w) in words, add 1 to the bow array
    for w in words:
        bow[w] += 1
    return Row(fileName=image_name, bow=bow.tolist())

if __name__ == "__main__":
    sc = SparkContext(appName="kmeans_assign")
    sqlContext = SQLContext(sc)

    try:
        feature_parquet_path = sys.argv[1]
        kmeans_model_path = sys.argv[2]
        bow_parquet_path = sys.argv[3]
    except:
        print("not all parameters chosen")

    # read features and the kmeans model, get cluster centers from the model
    # and send them to the nodes as a broadcast variable (centers)
    features = sqlContext.read.parquet(feature_parquet_path)
    vocabulary = KMeansModel.load(sc, kmeans_model_path)
    centers = vocabulary.clusterCenters
    centers = sc.broadcast(centers)

    # map function for quantizing
    bag_of_words = features.rdd.map(functools.partial(quantizing, centers=centers))

    featuresSchema = sqlContext.createDataFrame(bag_of_words)
    featuresSchema.registerTempTable("images")
    featuresSchema.write.parquet(bow_parquet_path)
    sc.stop()
Example #50
import math
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

conf = SparkConf().setAppName('KMeans').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data
data = sc.textFile('../data/kmeans_data.txt')
parseData = data.map(
    lambda line: np.array([float(x) for x in line.split(' ')]))

# build the model
clusters = KMeans.train(parseData,
                        2,
                        maxIterations=10,
                        runs=10,
                        initializationMode='random')


#evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parseData.map(lambda p: error(p)).reduce(lambda x, y: x + y)
print('Within Set Sum of Squared Error :' + str(WSSSE))

# save and load model
clusters.save(sc, '../model/KMeansModel')
sameModel = KMeansModel.load(sc, '../model/KMeansModel')

sc.stop()
Example #51
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

from pyspark import SparkContext

sc = SparkContext(appName="AA_kmeans")

# Load and parse the data
data = sc.textFile("hdfs://namenode/kmeans_data.txt")
#print(data.take(5))
#print data.map(lambda line: array([x for x in line.split(' ')])).collect()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
        runs=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "hdfs://namenode/myModelPath")
sameModel = KMeansModel.load(sc, "hdfs://namenode/myModelPath")
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
data = sc.textFile("/home/grijesh/sampleData/k-means-data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
                        runs=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
Example #53
# -*- coding: utf-8 -*-
from konlpy.tag import Twitter
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt


sc = SparkContext()
sqlContext = SQLContext(sc)

normData = sc.pickleFile('idf_normalized')
clusters = KMeansModel.load(sc, 'KMeasModel')
text = normData.map(lambda x : (x.no,x.eval_content))
data = normData.map(lambda x : (x.no,clusters.predict(x.idf_norm)) )
result = text.join(data).map(lambda (k, (left,right)) : (right,left.encode('utf-8')) )
for i in range(10):
	result.filter(lambda (x,y): x == i).map( lambda (x,y): y).saveAsTextFile("KMeansOutput/cluster_"+str(i))