Example #1
File: yispark.py  Project: eason001/imBot
def kmeans(df):
	kmeans = KMeans(k=2,seed=1)
	model = kmeans.fit(df)
	centers = model.clusterCenters()
	print(len(centers))
	kmFeatures = model.transform(df).select("features", "prediction")
	dfwrite(kmFeatures,'kmFeatures')	
Example #2
File: project.py  Project: sam46/Yelper
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state','categories','stars',input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(df).transform(df).cache()
    return df
Example #3
 def test_kmeans_cosine_distance(self):
     data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),),
             (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),),
             (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=3, seed=1, distanceMeasure="cosine")
     model = kmeans.fit(df)
     result = model.transform(df).collect()
     self.assertTrue(result[0].prediction == result[1].prediction)
     self.assertTrue(result[2].prediction == result[3].prediction)
     self.assertTrue(result[4].prediction == result[5].prediction)
Example #4
def elbow(elbowset, clusters):
	wsseList = []	
	for k in clusters:
		print("Training for cluster size {} ".format(k))
		kmeans = KM(k = k, seed = 1)
		model = kmeans.fit(elbowset)
		transformed = model.transform(elbowset)
		featuresAndPrediction = transformed.select("features", "prediction")

		W = computeCost(featuresAndPrediction, model)
		print("......................WSSE = {} ".format(W))

		wsseList.append(W)
	return wsseList
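A minimal usage sketch for the helper above, assuming `elbowset` is a DataFrame with a `features` vector column, `KM` is an alias for `pyspark.ml.clustering.KMeans`, and `computeCost` is the project's own helper returning the WSSE of the fitted model (none of these are shown in this excerpt):

# Hypothetical call: sweep k from 2 to 10 and look for where the WSSE curve flattens.
clusters = list(range(2, 11))
wsseList = elbow(elbowset, clusters)
for k, wsse in zip(clusters, wsseList):
    print("k={}: WSSE={}".format(k, wsse))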
Example #5
File: tests.py  Project: Bella-Lin/spark
 def test_kmeans_param(self):
     algo = KMeans()
     self.assertEqual(algo.getInitMode(), "k-means||")
     algo.setK(10)
     self.assertEqual(algo.getK(), 10)
     algo.setInitSteps(10)
     self.assertEqual(algo.getInitSteps(), 10)
Example #6
 def test_kmeans_summary(self):
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 1)
Example #7
 def test_kmean_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     path = tempfile.mkdtemp()
     km_path = path + "/km-pmml"
     model.write().format("pmml").save(km_path)
     pmml_text_list = self.sc.textFile(km_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #8
File: views.py  Project: eason001/imPro
def kmeans(inputdir, df, alg, k):
    from pyspark.ml.clustering import KMeans
    from numpy import array
    from math import sqrt
    kmeans = KMeans(k=int(k), seed=1, initSteps=5, tol=1e-4, maxIter=20, initMode="k-means||", featuresCol="features")
    model = kmeans.fit(df)
    kmFeatures = model.transform(df).select("labels", "prediction")
    erFeatures = model.transform(df).select("features", "prediction")
    ### Evaluation
    rows = erFeatures.collect()
    WSSSE = 0
    for i in rows:
        WSSSE += sqrt(sum([x**2 for x in (model.clusterCenters()[i[1]] - i[0])]))
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    output_data = writeOutClu(inputdir, kmFeatures, alg, k, WSSSE)
    return output_data
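As a side note, newer PySpark releases expose this quantity directly on the fitted model, so the manual loop above can be replaced; a minimal sketch, assuming a `model` fitted as above (Spark 2.4+ for `summary.trainingCost`):

# Hedged alternative: the within-set sum of squared errors reported by the model summary.
print("Within Set Sum of Squared Error = " + str(model.summary.trainingCost))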
Example #9
def doKMeans(data):
    # groupedData = data.groupBy("station_id","stationIndex").agg(avg("max_temp"),avg("med_temp"),avg("min_temp"),avg("max_pressure"),avg("min_pressure"),avg("precip"),avg("insolation"))
    # selectedData = groupedData.select("station_id","stationIndex","avg(max_temp)","avg(med_temp)","avg(min_temp)","avg(max_pressure)","avg(min_pressure)","avg(precip)","avg(insolation)")

    # columnsToPredict = ["stationIndex","avg(max_temp)","avg(med_temp)","avg(min_temp)","avg(max_pressure)","avg(min_pressure)","avg(precip)","avg(insolation)"]
    # assembler = VectorAssembler(inputCols=columnsToPredict,outputCol="features")
    # assembledData = assembler.transform(selectedData)

    columnsToPredict = ["stationIndex", "max_temp", "med_temp", "min_temp", "max_pressure", "min_pressure", "precip",
                        "insolation"]
    assembler = VectorAssembler(inputCols=columnsToPredict, outputCol="features")
    assembledData = assembler.transform(data)

    sampledData = assembledData

    feature_data = sampledData.withColumn("label", sampledData.stationIndex).withColumn("features",
                                                                                        sampledData.features)

    print("Sampling...")
    test_data = feature_data.sample(False, 0.1)
    train_data = feature_data.sample(False, 0.9)
    print("Test data: " + str(test_data.count()) + " , Train data: " + str(train_data.count()))

    # Trains a k-means model.
    # kmeans = KMeans().setK(stations.count()).setSeed(1)
    kmeans = KMeans().setK(stations.count()).setSeed(1)
    model = kmeans.fit(feature_data)

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(feature_data)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    print("Predicting...")
    predictions = model.transform(test_data)
    # predictions.show()
    predictions.select("station_id", "stationIndex", "label", "prediction").show()
Example #10
    def _fit(self, dataset):
        k = self.getK()
        vocabulary = self.getVocabulary()

        # Trains a k-means model on word-vectors
        kmeans = KMeans(featuresCol="vector", predictionCol="cluster", initMode="random") \
                .setK(k).setSeed(1)
        model_kmeans = kmeans.fit(vocabulary)
        data_vocabulary = model_kmeans.transform(vocabulary)

        # Create and broadcast the dictionary <word, cluster>
        rdd_dictionary = data_vocabulary.select('word', 'cluster') \
            .rdd.map(lambda row: (row['word'], str(row['cluster'])))
        dictionary = dict(rdd_dictionary.collect())

        return (WordClusterModel().setInputCol(
            self.getInputCol()).setPredictionCol(
                self.getPredictionCol()).setDictionary(dictionary).setK(k))
Example #11
def train(df, hiperparameter):
    '''
    KMeans training, returning KMeans model.
    input: - Dataframe
           - config (hyperparameter configuration dict)
    
    return: kmeans model
    '''
    k_means = KMeans(featuresCol=hiperparameter['featuresCol'],
                     predictionCol=hiperparameter['predictionCol'],
                     k=hiperparameter['k'],
                     initMode=hiperparameter['initMode'],
                     initSteps=hiperparameter['initSteps'],
                     tol=hiperparameter['tol'],
                     maxIter=hiperparameter['maxIter'],
                     seed=hiperparameter['seed'])
    model = k_means.fit(df)
    return model
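A minimal usage sketch, with an illustrative configuration dict whose keys mirror the constructor arguments above (the values are assumptions, not taken from the original project):

# Hypothetical hyperparameter configuration.
hiperparameter = {
    'featuresCol': 'features',
    'predictionCol': 'prediction',
    'k': 3,
    'initMode': 'k-means||',
    'initSteps': 2,
    'tol': 1e-4,
    'maxIter': 20,
    'seed': 1,
}
model = train(df, hiperparameter)   # df must already contain a 'features' vector column
print(model.clusterCenters())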
Example #12
 def test_kmeans(self):
     kmeans = KMeans(k=2, seed=1)
     path = tempfile.mkdtemp()
     km_path = path + "/km"
     kmeans.save(km_path)
     kmeans2 = KMeans.load(km_path)
     self.assertEqual(kmeans.uid, kmeans2.uid)
     self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
     self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                      "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                      % (kmeans2.uid, kmeans2.k.parent))
     self.assertEqual(kmeans._defaultParamMap[kmeans.k], kmeans2._defaultParamMap[kmeans2.k],
                      "Loaded KMeans instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Example #13
 def test_kmeans(self):
     kmeans = KMeans(k=2, seed=1)
     path = tempfile.mkdtemp()
     km_path = path + "/km"
     kmeans.save(km_path)
     kmeans2 = KMeans.load(km_path)
     self.assertEqual(kmeans.uid, kmeans2.uid)
     self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
     self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                      "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                      % (kmeans2.uid, kmeans2.k.parent))
     self.assertEqual(kmeans._defaultParamMap[kmeans.k], kmeans2._defaultParamMap[kmeans2.k],
                      "Loaded KMeans instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Example #14
def train(model="kmean", scaledData=None, test_id=4):
    """
    train models based on the model type
    args:
         model: str, the model type
         scaledData: the scaled data from the output of norm_data function
    """
    for i in range(5):
        kmeans = KMeans().setK(2).setSeed(1)
        model = kmeans.fit(scaledData)
        predictions = model.transform(scaledData)
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        centers = model.clusterCenters()
    for center in centers:
        print(center)
    predictions.rdd.saveAsPickleFile("k_mean_prediction_worker2_testid_" +
                                     str(test_id) + ".pkl")
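A minimal usage sketch, assuming the `norm_data` function mentioned in the docstring (not shown in this excerpt) has already produced a DataFrame `scaledData` with a `features` vector column, which is what the default `featuresCol` of `KMeans` and `ClusteringEvaluator` expects:

# Hypothetical call: cluster the scaled data and persist predictions for test id 4.
train(model="kmean", scaledData=scaledData, test_id=4)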
Example #15
def _compute_cluster_analysis(spark_df, clusters=5):
    numeric_columns = list(map(lambda col_dtype: col_dtype[0], spark_df.dtypes))
    if (len(numeric_columns) == 0):
        raise ValueError("The provided spark dataframe does not contain any numeric columns. "
                         "Cannot compute cluster analysis with k-means on categorical columns. "
                         "The numeric datatypes are: {}" \
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    if (len(numeric_columns) == 1):
        raise ValueError("The provided spark dataframe contains only one numeric column. "
                         "Cluster analysis will filter out non-numeric columns and then "
                         "use pca to reduce dataset dimension to 2 dimensions and "
                         "then apply KMeans, which is not possible when the input data has only one numeric column. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
            constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    vecAssembler = VectorAssembler(inputCols=numeric_columns,
                                   outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN)
    spark_df_1 = vecAssembler.transform(spark_df)
    kmeans = KMeans(k=clusters, seed=1, maxIter=20,
                    featuresCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                    predictionCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN)
    model = kmeans.fit(spark_df_1.select(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN))
    spark_df_2 = model.transform(spark_df_1)
    spark_df_3 = spark_df_2.select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                                    constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    count = spark_df_3.count()
    if count < constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE:
        spark_df_4 = spark_df_3
    else:
        spark_df_4 = spark_df_3.sample(True,
                                       float(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE) / float(count))

    pca = PCA(k=2,
              inputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
              outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN)
    model = pca.fit(spark_df_4)
    spark_df_5 = model.transform(spark_df_4).select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
                                                     constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    spark_df_6 = spark_df_5.withColumnRenamed(
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_FEATURES_COLUMN)
    spark_df_7 = spark_df_6.withColumnRenamed(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN, "clusters")
    return json.loads(spark_df_7.toPandas().to_json())
Example #16
def create_clusters(sqlContext):
    specific_data = sqlContext.sql(
        'SELECT Description, Date, Latitude, Longitude FROM crime_data')
    #where `Primary Type` == "HOMOCIDE" or `Primary Type` == "ASSAULT"
    specific_data = specific_data.filter(specific_data.Description != "SIMPLE")

    specific_data = specific_data.rdd.filter(lambda x: datetime.strptime(
        x[1], "%m/%d/%Y %I:%M:%S %p").weekday() in [5, 6]).toDF()

    specific_data = specific_data.sample(False, .0001)
    #create different k-means outputs depending on the date since that is what we are testing

    vecAssembler = VectorAssembler(inputCols=['Latitude', 'Longitude'],
                                   outputCol="features")
    df_kmeans = vecAssembler.transform(specific_data).select(
        'Description', 'features', 'Latitude', 'Longitude')
    # df_kmeans.show()

    # cost = [0] * 40
    # for k in range(41,50):
    #     kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    #     model = kmeans.fit(df_kmeans.sample(False,0.1, seed=42))
    #     cost[k-41] = model.computeCost(df_kmeans) # requires Spark 2.0 or later
    # print (cost)

    # 48 for k since this is where it starts to have diminishing returns
    k = 48
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_kmeans)
    centers = model.clusterCenters()

    # print("Cluster Centers: ")
    # for center in centers:
    #     print(center)

    #TODO: arrange based on season, time of day, etc
    transformed = model.transform(df_kmeans).select('Description',
                                                    'prediction', 'Latitude',
                                                    'Longitude')
    transformed.show()

    #return_rdd = transformed.rdd.map(lambda x: (x.prediction, (x.Description, (x.Latitude, x.Longitude))))

    create_visuals(transformed)
Example #17
def clustering(poses):
    print(len(poses))
    dff = map(lambda x: (0,(Vectors.dense(x[0:2]))), poses)
    mydf = spark.createDataFrame(dff,schema=["None","features"])
    k = 2
    cost = 99999999999
    while(cost >= 20):
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(mydf)
        cost = model.computeCost(mydf)/len(poses)
        k+=1
        print("K: ", k, " Cost: ", cost)
    centers = model.clusterCenters()
    centroids = []
    for center in centers:
        centroids.append(center)
    prediction = model.transform(mydf).select('prediction').collect()
    labels = [p.prediction for p in prediction ]
    return centroids, labels
Example #18
def getSilhouette(df, model='KMeans'):
    silhouette_ls = []
    if model == 'KMeans':
        for i in range(1, 10):
            kmeans = KMeans().setK(i + 1).setSeed(123)
            model_k = kmeans.fit(df)
            # Make predictions
            predictions = model_k.transform(df)
            # Evaluate clustering by computing Silhouette score
            evaluator = ClusteringEvaluator()
            silhouette_ls.append(round(evaluator.evaluate(predictions), 2))
        best_k = silhouette_ls.index(np.max(silhouette_ls)) + 2
        print(
            '****************************SILHOUETTE*************************************************'
        )
        print(
            f'The best K is: {best_k} associated with a silhoutte of: {np.max(silhouette_ls)}'
        )
    return silhouette_ls, best_k
Example #19
 def test_kmeans_summary(self):
     data = [
         (Vectors.dense([0.0, 0.0]), ),
         (Vectors.dense([1.0, 1.0]), ),
         (Vectors.dense([9.0, 8.0]), ),
         (Vectors.dense([8.0, 9.0]), ),
     ]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 1)
Example #20
def main():
    cfg = Path(__file__).parent.joinpath("kmeans_config.yaml")
    cfg = load_config(str(cfg))

    spark = init_spark(cfg)
    kmeans = KMeans(k=3, seed=0)

    X = read(cfg, spark)
    fit(cfg, kmeans, X)
    print(timings)
Example #21
def create_kmeans_pipeline_from(data_frame,
                                feature_extractor,
                                total_clusters,
                                max_iterations):
    """
    Creates a pipeline for tokenizing the document to words,
    removing stop words, and add TF-IDF, and finally does
    clustering using k-means algorithm
    """
    tokenizer_transformer = RegexTokenizer(inputCol="doc_text",
                                           outputCol="words",
                                           pattern="\\W")

    stop_words_transformer = StopWordsRemover(inputCol="words",
                                              outputCol="filtered_words")

    pipeline_stages = [tokenizer_transformer, stop_words_transformer,]

    if feature_extractor == 'TFIDF':

        # create TF counter using CountVectorizer
        tf_estimator = CountVectorizer(inputCol="filtered_words", outputCol="TF")

        # create inverse-document-frequency counter
        idf_estimator = IDF(inputCol="TF", outputCol="features")

        # add them to the pipeline stages
        pipeline_stages.extend([tf_estimator, idf_estimator])

    elif feature_extractor == 'Word2Vec':

        # create word2vec feature extractor
        w2v_estimator = Word2Vec(inputCol="filtered_words", outputCol="features")
        
        # add this to pipeline stage
        pipeline_stages.append(w2v_estimator)
    else:
        raise ValueError('Unknown feature extractor: %s' % feature_extractor)

    # create KMeans clustering
    kmeans = KMeans(k=total_clusters,
                    featuresCol="features",
                    predictionCol="DocumentClass",
                    seed=1,
                    maxIter=max_iterations)

    # finally add Kmeans to the pipeline
    # which takes "features" output from the previous stage and 
    # does the prediction.
    # NOTE:
    # For document clustering cosine_similarity measure is the preferred one.
    # This pipeline uses SSE method
    #
    pipeline_stages.append(kmeans)
    return Pipeline(stages=pipeline_stages).fit(data_frame)
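The NOTE above says cosine similarity is usually preferred for document clustering, while this pipeline uses the default squared-Euclidean objective; a minimal sketch of the swap, using the `distanceMeasure` parameter that also appears in the test examples earlier on this page (Spark 2.4+):

# Hedged variant of the final stage: same columns, cosine distance instead of Euclidean.
kmeans_cosine = KMeans(k=total_clusters,
                       featuresCol="features",
                       predictionCol="DocumentClass",
                       seed=1,
                       maxIter=max_iterations,
                       distanceMeasure="cosine")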
Example #22
def kmeans(df, optimalK, seed=1):
    """
    Input:
    df: pyspark dataframe
    optimalK: int, k for cluster
    seed: int, random seed
    
    Output:
    new_df_kmeans_transformed: pyspark dataframe, with the cluster prediction
    model: k-means model
    """

    # transfer to "features" for pyspark ML
    scaled_column = [col for col in df.columns if "scaled" in col]
    df_vec = get_feature(df, scaled_column)

    kmeans = KMeans(k=optimalK, seed=seed)
    model = kmeans.fit(df_vec)
    new_df_kmeans_transformed = model.transform(df_vec)
    return new_df_kmeans_transformed, model
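A minimal usage sketch, assuming `df` already carries scaled feature columns (names containing "scaled") and that the project's `get_feature` helper assembles them into a `features` vector, as the comment above implies:

# Hypothetical call with k chosen elsewhere (e.g. by an elbow or silhouette sweep).
clustered_df, km_model = kmeans(df, optimalK=4, seed=1)
clustered_df.select("features", "prediction").show(5)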
Example #23
 def test_kmean_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     data = [
         (Vectors.dense([0.0, 0.0]), ),
         (Vectors.dense([1.0, 1.0]), ),
         (Vectors.dense([9.0, 8.0]), ),
         (Vectors.dense([8.0, 9.0]), ),
     ]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     path = tempfile.mkdtemp()
     km_path = path + "/km-pmml"
     model.write().format("pmml").save(km_path)
     pmml_text_list = self.sc.textFile(km_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #24
def kmeans():
    spark = SparkSession \
        .builder \
        .appName('my_first_app_name') \
        .getOrCreate()
    df = spark.read.csv('./seeds_dataset.csv', header=True, inferSchema=True)
    df.show(5)

    assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
    final_df = assembler.transform(df)
    final_df.show(3)
    final_df.take(1)

    # instantiate KMeans with 3 clusters
    kmeans = KMeans(k=3)
    # fitting the model
    model = kmeans.fit(final_df)
    centers = model.clusterCenters()
    print(centers)
    model.transform(final_df).select('prediction').show()
Example #25
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder\
                        .master("local")\
                        .appName("Word Count")\
                        .config("spark.some.config.option", "some-value")\
                        .getOrCreate()

    df = spark.createDataFrame([["0"], ["1"], ["2"], ["3"], ["4"]], ["id"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"],
                                   outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
Example #26
def analysing_emissions_data(spark, co2_emisssion_data):

    # creating feature vector for sending as input to ML models
    vecAssembler = VectorAssembler(inputCols=['change_in_emissions_scaled'],
                                   outputCol="features")

    # adding feature vector to our spark dataframe
    co2_emisssion_data = vecAssembler.setHandleInvalid("skip").transform(
        co2_emisssion_data)

    # creating Kmeans object (7 clusters)
    kmeans = KMeans(k=7)

    # clustering operation
    model = kmeans.fit(co2_emisssion_data.select('features'))

    # adding column of predicted clusters to our dataframe
    co2_emisssion_data = model.transform(co2_emisssion_data)

    return co2_emisssion_data.drop("features")
Example #27
def clustering(fp, cols):
	assembler = VectorAssembler(inputCols=cols, outputCol='features')
	assembled_data = assembler.transform(fp)
	scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
	print('dkks', scaler)
	#scaler.show()
	scaler_model = scaler.fit(assembled_data)
	scaled_data = scaler_model.transform(assembled_data)

	#scaled_data.printSchema()
	#scaled_data.show(4)
	#scaled_data.select('scaledFeatures').show()

	k_means_3 = KMeans(featuresCol='scaledFeatures', k=10) #clusters = KMeans(fp, 5, maxIterations=10, initializationMode="random")#clusters.show()
	model_k3 = k_means_3.fit(scaled_data)
	model_k3_data = model_k3.transform(scaled_data)
	#details = model_k3_data
	#model_k3_data.groupBy('prediction').count().show()
	#details.show(50)
	return model_k3_data
Example #28
 def __k_mean(cls, df, k_clusters, xnorm, label_name):
     from sklearn.cluster import KMeans
     kmeans_cat = label_name
     if xnorm.shape[1] > 0:
         #n_clusters = elbow point
         start_time = time.time()
         estimator = KMeans(n_clusters=k_clusters)
         estimator.fit(xnorm)
         y_pred = estimator.predict(xnorm)
         res=estimator.__dict__
         #print(res['cluster_centers_'])
         cluster_center = res['cluster_centers_']
         print("training time: ", time.time()-start_time, "(sec)")
     else:
         #no split
         y_pred = 0
         cluster_center = []
         print("no kmeans split")
     df[kmeans_cat] = y_pred + 1
     return df, kmeans_cat, cluster_center
Example #29
def buildAnomalyDetector(data, normalizeFunction):
    normalizedData = data.map(normalizeFunction)
    normalizedData.cache()
    model = KMeans.train(normalizedData, 150, maxIterations=10, epsilon=1.0e-6)
    normalizedData.unpersist()
    distances = normalizedData.map(lambda datum: distToCentroid(datum, model))
    threshold = distances.top(100).pop()

    def f(datum):
        return distToCentroid(normalizeFunction(datum), model) > threshold
    return f
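The `distToCentroid` helper referenced above is not shown in this excerpt; a minimal sketch of what it is assumed to compute (the distance from a point to the center of the cluster the mllib model assigns it to), for illustration only:

import numpy as np

def distToCentroid(datum, model):
    # Assign the point to its nearest cluster, then measure the Euclidean distance to that center.
    cluster = model.predict(datum)
    center = model.clusterCenters[cluster]
    return np.linalg.norm(np.array(datum) - np.array(center))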
Example #30
def executeKmeans(k, username, jobNum, input_file_name, model_file_name,
                  prediction_file_name, features_col):
    file_path_in_hdfs = username + '/' + jobNum + '/'
    ip_file_in_hdfs = file_path_in_hdfs + input_file_name
    df_kmeans = preprocessData(ip_file_in_hdfs, features_col)
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_kmeans)

    centers = model.clusterCenters()
    model_file_in_hdfs = file_path_in_hdfs + model_file_name
    saveModelinHDFS.apply_async(
        (file_path_in_hdfs, json.dumps(centers,
                                       cls=NumpyEncoder), model_file_in_hdfs))

    predictions = makePredictions(df_kmeans, model)
    prediction_file_in_hdfs = file_path_in_hdfs + prediction_file_name
    savePredictioninHDFS.apply_async(
        (file_path_in_hdfs,
         json.dumps(predictions.values.tolist(),
                    cls=NumpyEncoder), prediction_file_in_hdfs))
Example #31
def main(argv):

    # instantiate the Spark context.
    sc = SparkContext(appName="KMeans-Clustering-dhoyoso-dsernae")
    # start the Spark session.
    spark = SparkSession(sc)
    # store the language whose stop words will be removed.
    language = argv[4]  #"spanish"
    # store the output path for the clusters.
    pathout = argv[3]
    # store the path the input files will be read from.
    path = argv[2]  #"hdfs:///user/dhoyoso/datasets/dataset/"
    # store the desired number of clusters.
    k = int(argv[1])  #4
    # read the files to process from the path.
    files = sc.wholeTextFiles(path)
    # define the dataframe schema: 2 columns, one for the path and one for the text.
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    # build the dataframe from the schema and the files.
    df = spark.createDataFrame(files, schema)
    # tokenize the text using the ML Tokenizer class.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    # tell the stop-words remover which language we are dealing with.
    StopWordsRemover.loadDefaultStopWords(language)
    # remove the stop words from the tokens.
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens")
    # apply hashing TF to the remaining tokens.
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2000)
    # apply IDF to the output of the hashingTF stage.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    # initialize k-means with the desired k.
    kmeans = KMeans(k=k)
    # build the transformation pipeline.
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    # feed the dataframe in as the start of the transformations.
    model = pipeline.fit(df)
    # run the mapped transformations and keep the result.
    results = model.transform(df)
    results.cache()
    # split the path to keep only the file name and its cluster (prediction).
    split_col = split(results['path'], '/')
    results = results.withColumn('docname', split_col.getItem(7))
    df = results.select("docname", "prediction")

    # group documents of the same cluster into cluster_docs_list and save them to the output path as JSON.
    grouped = df.groupBy(['prediction']).agg(
        collect_list("docname").alias('cluster_docs_list'))
    grouped.coalesce(1).write.json(path=pathout, mode="overwrite")
Example #32
def cluster(dataframe, k, maxIter=50):
    clf = KMeans(k=k, maxIter=maxIter)
    model = clf.fit(dataframe)
    print('=================================')
    print('Cost:', model.summary.trainingCost)
    # print('Cluster:', model.summary.cluster)
    # print('Cluster centers:', model.clusterCenters())
    print('Cluster size:', model.summary.clusterSizes)
    print('Iter:', model.summary.numIter)
    # print('Predictions:', model.summary.predictions)
    print('k:', model.summary.k)
    print('featuresCol:', model.summary.featuresCol)
    print('predictionCol:', model.summary.predictionCol)
    # print(dir(model.summary))
    # print('=================================')
    # print('Saving centers...')
    # np.save('tmp/centers' + str(options.k), model.clusterCenters())
    # print('Saving predictions...')
    # model.summary.predictions.write.format('parquet').saveAsTable('predictions' + str(options.k))
    return model.clusterCenters()
Example #33
    def kmeans_pipeline(self, inputCols, cluster_count=None, max_cluster=None):
        if max_cluster != None:
            self._max_cluster = max_cluster

        assembler = VectorAssembler(inputCols=inputCols, outputCol="features")
        assembled = assembler.transform(self._data_frame)
        mmScaler = StandardScaler(inputCol="features",
                                  outputCol="featuresCol",
                                  withStd=True,
                                  withMean=False)
        scale_model = mmScaler.fit(assembled)
        vectorized_data = scale_model.transform(assembled)

        if cluster_count == None:
            cluster_count_array = list(range(2, self._max_cluster))
            wssse_output = []
            for n_cluster in cluster_count_array:
                kmeans = KMeans().setK(n_cluster).setSeed(1)
                kmeans_model = kmeans.fit(vectorized_data)
                wssse = kmeans_model.computeCost(vectorized_data)
                wssse_output.append(wssse)
            wssse_dict = dict(list(zip(cluster_count_array, wssse_output)))

            cluster_count = min(wssse_dict, key=wssse_dict.get)
            kmeans = KMeans().setK(cluster_count).setSeed(1)
            kmeans_model = kmeans.fit(vectorized_data)
            wssse = kmeans_model.computeCost(vectorized_data)
            centers = kmeans_model.clusterCenters()
            cluster_prediction = kmeans_model.transform(vectorized_data)
        else:
            wssse_dict = {}
            kmeans = KMeans().setK(cluster_count).setSeed(1)
            kmeans_model = kmeans.fit(vectorized_data)
            wssse = kmeans_model.computeCost(vectorized_data)
            centers = kmeans_model.clusterCenters()
            cluster_prediction = kmeans_model.transform(vectorized_data)

        self._kmeans_result["cluster_count"] = cluster_count
        self._kmeans_result["wssse"] = wssse
        self._kmeans_result["wssse_dict"] = wssse_dict
        self._kmeans_result["centers"] = centers
        self._kmeans_result["cluster_count"] = cluster_count
        self._kmeans_result["inputCols"] = inputCols
        # cluster_prediction = cluster_prediction.withColumn("prediction", cluster_prediction["prediction"].cast(StringType()))
        # print cluster_prediction.printSchema()
        self._predictedData = cluster_prediction
Example #34
def newTrain():
    fileSave = "/home/hadoop/data_school/sparkMlib/KMeans"
    # Sex encoding: male = 1, female = 2
    df = spark.read.format('csv').option('header', 'true').load(fileSave).fillna('0')
    df = df.where(df.TotalFee != '0').where(df.DiseaseCode == '13104')
    df = df.withColumn("Sex", df.Sex.cast(IntegerType())) \
        .withColumn("Age", df.Age.cast(IntegerType())) \
        .withColumn("TotalFee", df.TotalFee.cast(FloatType()))

    # vecAss = VectorAssembler(inputCols=df.columns[2:], outputCol='feature')
    # data = vecAss.transform(df).select("feature")
    # data.show()
    data = df.drop("Sex", "DiseaseCode")
    data.show()

    # Assemble the data into a feature vector
    featureCreator = VectorAssembler(inputCols=data.columns, outputCol='features')
    data = featureCreator.transform(data)

    distance = []
    for k in range(2, 10):
        # estimator
        kmeans = KMeans(featuresCol='features').setK(k)
        # fit the model
        model = kmeans.fit(data)
        # cluster assignments
        test = model.transform(data).select('features', 'prediction')

        evaluator = ClusteringEvaluator()

        evaResult = evaluator.evaluate(test)
        print("the distance = " + str(evaResult))
        distance.append(evaResult)

    import matplotlib.pyplot as plt
    plt.figure()
    x = [i for i in range(2, 10)]
    plt.plot(x, distance)
    plt.xlabel("K")
    plt.ylabel("Distance")
    plt.show()
Example #35
def topic_generator(subreddit_input):
    subreddit_filter = requests.get(
        url + 'reddit_post.json?orderBy="subreddit"&equalTo="' +
        str(subreddit_input) + '"')
    subreddits = json.loads(subreddit_filter.text)

    results = []
    for x in subreddits:
        try:
            results.append(subreddits[x])
        except KeyError:
            continue
    data = pd.DataFrame.from_dict(results, orient='columns')
    data1 = spark.createDataFrame(pd.DataFrame(data["title"]))

    #text clean
    clean_data_udf = udf(clean_data, StringType())
    data1 = data1.withColumn("new_title", clean_data_udf("title"))
    #text tokenizer
    tokenizer = Tokenizer(inputCol="new_title", outputCol="words")
    data1 = tokenizer.transform(data1)
    #stopwords removal
    remover = StopWordsRemover(inputCol="words", outputCol="rm_words")
    data1 = remover.transform(data1)
    #TFIDF vectorization
    hashingTF = HashingTF(inputCol="rm_words",
                          outputCol="rawFeatures",
                          numFeatures=2000)
    data1 = hashingTF.transform(data1)
    #Document frequency
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(data1)
    data1 = idfModel.transform(data1)
    #Kmeans topic clustering
    kmeans = KMeans(k=2, featuresCol="features").setSeed(1)
    kmeans_model = kmeans.fit(data1)
    data1 = kmeans_model.transform(data1)

    data["prediction"] = data1.select("prediction").toPandas()

    return data
Example #36
def kmeans_info(i, df, inputCols):
    vecAssembler = VectorAssembler(inputCols=inputCols, outputCol='features')
    df = vecAssembler.transform(df)
    info = '---> With ' + str(i) + ' clusters \n'
    kmeans = KMeans(k=i, seed=1)
    model = kmeans.fit(df.select('features'))
    # Make predictions
    predictions = model.transform(df)
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    info += "Silhouette with squared euclidean distance = " + str(
        silhouette) + '\n'
    info += "Cluster Centers: " + '\n'
    ctr = []
    centers = model.clusterCenters()
    for center in centers:
        ctr.append(center)
        info += str(center) + '\n'
    info += '--------------------\n'
    return info
Example #37
    def __process_dataset(self):
        """Making model from dataset
        """
        data = []
        for i in range(len(self.model)):
            logger.info("Training consumers batch {}".format(i))

            logger.info("Assembling vector!")
            assembler = VectorAssembler(inputCols=["_c12"],
                                        outputCol='features')

            data.append(assembler.transform(self.model[i]))

            logger.info("Training Model!")
            kmeans = KMeans().setK(10).setSeed(1)
            self.model[i] = kmeans.fit(data[i])

            logger.info("Making Preditction!")
            self.predictions = self.model[i].transform(data[i])

            logger.info("Clustering model built!")
Example #38
def visualizationInR(rawData):
    def preprocessing(line):
        values = line.split(",")
        del values[1:4]
        values.pop()
        return Vectors.dense([float(x) for x in values])

    data = rawData.map(preprocessing).cache()
    model = KMeans.train(data, 100, maxIterations=10, epsilon=1.0e-6)

    sample = data.map(lambda datum: str(model.predict(datum)) + "," + ",".join(str(x) for x in datum)).sample(False, fraction=0.05, seed=None)
    sample.saveAsTextFile("file:///user/ds/sample")
Example #39
def clustering(player_profile):
    vecAssembler = VectorAssembler(
        inputCols=["fouls", "goals", "owngoals", "pass_acc", "shots", "matches"], outputCol="features")
    print(player_profile.printSchema())
    player_profile = player_profile.drop("name")
    new_df = vecAssembler.transform(player_profile)
    kmeans = KMeans(k=5)
    model = kmeans.fit(new_df.select("features"))
    # Make predictions
    predictions = model.transform(new_df)
    predictions.show()
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    return predictions, centers
Example #40
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder\
                        .master("local")\
                        .appName("Word Count")\
                        .config("spark.some.config.option", "some-value")\
                        .getOrCreate()

    df = spark.createDataFrame([["0"],
                                ["1"],
                                ["2"],
                                ["3"],
                                ["4"]],
                               ["id"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
Example #41
trainingData = VectorAssembler(inputCols=["duration", "tempo", "loudness"], outputCol="features").transform(
    table("songsTable")
)

# COMMAND ----------

# MAGIC %md We can now pass this new DataFrame to the `KMeans` model and ask it to categorize different rows in our data to two different classes (`setK(2)`). We place the model in a variable named `model`.
# MAGIC
# MAGIC **Note:** This command launches multiple Spark jobs (one job per iteration of the KMeans algorithm). You will see the progress bar starting over and over again.

# COMMAND ----------

from pyspark.ml.clustering import KMeans

model = KMeans().setK(2).fit(trainingData)

# COMMAND ----------

# MAGIC %md To see the result of our clustering, we produce a scatter plot matrix that shows interaction between input variables and learned clusters. To get that we apply the model on the original data and pick four columns: `prediction` and the original features (`duration`, `tempo`, and `loudness`).

# COMMAND ----------

transformed = model.transform(trainingData).select("duration", "tempo", "loudness", "prediction")

# COMMAND ----------

# MAGIC %md To comfortably visualize the data we produce a random sample.
# MAGIC Remember the `display()` function? We can use it to produce a nicely rendered table of transformed DataFrame.
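The sampling/display cell itself is not part of this excerpt; a minimal sketch, assuming the Databricks `display` helper and the `transformed` DataFrame from the previous cell:

display(transformed.sample(False, 0.05))  # roughly a 5% random sample, enough for a scatter plot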

# COMMAND ----------
Example #42

fittedPipeline = transformationPipeline.fit(trainDataFrame)


# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)


# COMMAND ----------

from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
  .setK(20)\
  .setSeed(1)


# COMMAND ----------

kmModel = kmeans.fit(transformedTraining)


# COMMAND ----------

transformedTest = fittedPipeline.transform(testDataFrame)


# COMMAND ----------
Example #43

from pyspark.mllib.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="test")
sqlContext = SQLContext(sc)

data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),(Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
df = sqlContext.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)

centers = model.clusterCenters()
model.transform(df).select("features", "prediction").collect()

Example #44
# For now, analysis is still required. We cache the output because we are going to perform
# multiple runs on the dataset.
df0 = tfs.analyze(df).cache()


mllib_df.count()
df0.count()

np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0


ta_0 = time.time()
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode(
        "random").setMaxIter(num_iters)
mod = kmeans.fit(mllib_df)
ta_1 = time.time()

tb_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=False)
tb_1 = time.time()

tc_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=True)
tc_1 = time.time()

mllib_dt = ta_1 - ta_0
tf_dt = tb_1 - tb_0
tf2_dt = tc_1 - tc_0
Example #45
sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()


# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)


# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Example #46
def assign_cluster(data):
    """Train kmeans on rescaled data and then label the rescaled data."""
    kmeans = KMeans(k=2, seed=1, featuresCol="features_scaled", predictionCol="label")
    model = kmeans.fit(data)
    label_df = model.transform(data)
    return label_df
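A minimal sketch of preparing the `features_scaled` column this function expects; the assembler/scaler column names and the `raw_df` input are assumptions for illustration, not part of the original example:

from pyspark.ml.feature import VectorAssembler, StandardScaler

assembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")   # hypothetical numeric columns
scaler = StandardScaler(inputCol="features", outputCol="features_scaled")
assembled = assembler.transform(raw_df)                                    # raw_df: any DataFrame with x, y
scaled = scaler.fit(assembled).transform(assembled)
labeled = assign_cluster(scaled)
labeled.select("features_scaled", "label").show(5)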
Example #47
# COMMAND ----------

display(transformed)

# COMMAND ----------

# MAGIC %md
# MAGIC #### K-Means Visualized

# COMMAND ----------

modelCenters = []
iterations = [0, 2, 4, 7, 10, 20]
for i in iterations:
    kmeans = KMeans(k=3, seed=5, maxIter=i, initSteps=1)
    model = kmeans.fit(irisTwoFeatures)
    modelCenters.append(model.clusterCenters())

# COMMAND ----------

print('modelCenters:')
for centroids in modelCenters:
    print(centroids)

# COMMAND ----------

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
Example #48
twDF = tw.map(lambda p: Row(text=p)).toDF()	
t0 = time.time()
word2Vec = Word2Vec(vectorSize=100, minCount=5, stepSize=0.025, inputCol="text", outputCol="result")
modelW2V = word2Vec.fit(twDF)
wordVectorsDF = modelW2V.getVectors()
timeW2V = time.time() - t0

## Train K-means on top of the Word2Vec matrix:
t0 = time.time()
vocabSize = wordVectorsDF.count()
K = int(math.floor(math.sqrt(float(vocabSize)/2)))
         # K ~ sqrt(n/2) this is a rule of thumb for choosing K,
         # where n is the number of words in the model
         # feel free to choose K with a fancier algorithm         
dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector','features')
kmeans = KMeans(k=K, seed=1)
modelK = kmeans.fit(dfW2V)
labelsDF = modelK.transform(dfW2V).select('prediction').withColumnRenamed('prediction','labels')
vocabSize = wordVectorsDF.count()
timeKmeans = time.time() - t0

sc.stop()


## Print Some Results
printResults = 1 # set t 
if (printResults):
    ## Read Tweets

    print("=" * 80)
    print("Read Tweets...")
Example #49
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["Take-out","GoodFor_lunch", "GoodFor_dinner", "GoodFor_breakfast"])

dfhot.show(5)

# Taining set
assembler = VectorAssembler(inputCols = list(set(dfhot.columns) | set(['stars','review_count'])), outputCol="features")
train = assembler.transform(dfhot)

# Kmeans set for 5 clusters
knum = 5
kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=knum, seed=0)
model = kmeans.fit(train)
print("Model Created!")

# See cluster centers:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
    
# Apply the clustering model to our data:
prediction = model.transform(train)
prediction.groupBy("cluster").count().orderBy("cluster").show()

# Look at the features of each cluster
customerCluster = {}
Example #50
# $example off$

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example #51
# The `vecAssembler` object created above (of type `pyspark.ml.feature.VectorAssembler`) has a `transform` method that builds a vector column `features` from the columns `x` and `y`; given the `DataFrame` passed to it, it produces a new `DataFrame` with a `features` column:

# In[61]:

ca1mlFeaturizedDF = vecAssembler.transform(ca1MlDF)
ca1mlFeaturizedDF.show(5)


# We will create a `pyspark.ml.clustering.KMeans` object (recall that in example 1 we used `pyspark.mllib.clustering.KMeans`) and use it to train a `pyspark.ml.clustering.KMeansModel`. The data is now supplied as a `DataFrame` object rather than an `RDD`.

# In[62]:

from pyspark.ml.clustering import KMeans as MlKMeans

firstMlKMeans = MlKMeans(
    featuresCol="features", predictionCol="prediction", k=2, 
    initMode="k-means||", maxIter=20)
type(firstMlKMeans)


# Model classes in the `pyspark.ml` package have an `explainParams` method, which prints explanations of the model's parameters.

# In[63]:

print(firstMlKMeans.explainParams())


# Let's train the model.

# In[64]:
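The training cell itself (In[64]) is cut off in this excerpt; a minimal sketch, assuming the featurized DataFrame produced in In[61] above (`firstMlModel` is a hypothetical name for the fitted model):

firstMlModel = firstMlKMeans.fit(ca1mlFeaturizedDF)   # hypothetical variable for the fitted KMeansModel
type(firstMlModel)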
Example #52
 def test_kmeans_param(self):
     algo = KMeans()
     self.assertEqual(algo.getInitMode(), "k-means||")
     algo.setK(10)
     self.assertEqual(algo.getK(), 10)
     algo.setInitSteps(10)
     self.assertEqual(algo.getInitSteps(), 10)
     self.assertEqual(algo.getDistanceMeasure(), "euclidean")
     algo.setDistanceMeasure("cosine")
     self.assertEqual(algo.getDistanceMeasure(), "cosine")
Example #53
print(colStdDev)

#Place the means and std.dev values in a broadcast variable
bcMeans = sc.broadcast(colMeans)
bcStdDev = sc.broadcast(colStdDev)
csAuto = autoVector.map(centerAndScale)
#csAuto.collect()
#csAuto.foreach(println)
print(csAuto)

#Create Spark Data Frame
autoRows = csAuto.map(lambda f:Row(features=f))
autoDf = sqlContext.createDataFrame(autoRows)  # assumes an existing SQLContext instance named sqlContext
autoDf.select("features").show(10)

kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(autoDf)
predictions = model.transform(autoDf)
for row in predictions.collect():
    print(row)

#Plot the results in a scatter plot
unstripped = predictions.rdd.map(unstripData)
predList=unstripped.collect()
predPd = pd.DataFrame(predList)

# preparing to save the clustered data
list_current_gni_final_maped = current_gni_final_maped.collect()
list_current_gni_rdd = current_gni_rdd.collect()
list_predictions_pandas = predictions.toPandas()
list_predictions_temp = list_predictions_pandas.to_numpy()