def kmeans(df):
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    centers = model.clusterCenters()
    print(len(centers))
    kmFeatures = model.transform(df).select("features", "prediction")
    dfwrite(kmFeatures, 'kmFeatures')
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state', 'categories', 'stars', input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(df).transform(df).cache()
    return df
def test_kmeans_cosine_distance(self):
    data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),),
            (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),),
            (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=3, seed=1, distanceMeasure="cosine")
    model = kmeans.fit(df)
    result = model.transform(df).collect()
    self.assertTrue(result[0].prediction == result[1].prediction)
    self.assertTrue(result[2].prediction == result[3].prediction)
    self.assertTrue(result[4].prediction == result[5].prediction)
def elbow(elbowset, clusters):
    wsseList = []
    for k in clusters:
        print("Training for cluster size {} ".format(k))
        kmeans = KM(k=k, seed=1)
        model = kmeans.fit(elbowset)
        transformed = model.transform(elbowset)
        featuresAndPrediction = transformed.select("features", "prediction")
        W = computeCost(featuresAndPrediction, model)
        print("......................WSSE = {} ".format(W))
        wsseList.append(W)
    return wsseList
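# A minimal usage sketch of the elbow() helper above, assuming matplotlib is available and
# `elbowset` is a DataFrame with a "features" column; the k range here is illustrative only.
import matplotlib.pyplot as plt

clusters = list(range(2, 11))
wsseList = elbow(elbowset, clusters)
plt.plot(clusters, wsseList, marker='o')
plt.xlabel("k")
plt.ylabel("WSSE")
plt.show()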
def test_kmeans_param(self):
    algo = KMeans()
    self.assertEqual(algo.getInitMode(), "k-means||")
    algo.setK(10)
    self.assertEqual(algo.getK(), 10)
    algo.setInitSteps(10)
    self.assertEqual(algo.getInitSteps(), 10)
def test_kmeans_summary(self):
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 1)
def test_kmean_pmml_basic(self):
    # Most of the validation is done in the Scala side, here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    path = tempfile.mkdtemp()
    km_path = path + "/km-pmml"
    model.write().format("pmml").save(km_path)
    pmml_text_list = self.sc.textFile(km_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def kmeans(inputdir, df, alg, k):
    from pyspark.ml.clustering import KMeans
    from numpy import array
    from math import sqrt
    kmeans = KMeans(k=int(k), seed=1, initSteps=5, tol=1e-4, maxIter=20,
                    initMode="k-means||", featuresCol="features")
    model = kmeans.fit(df)
    kmFeatures = model.transform(df).select("labels", "prediction")
    erFeatures = model.transform(df).select("features", "prediction")
    # Evaluation
    rows = erFeatures.collect()
    WSSSE = 0
    for i in rows:
        WSSSE += sqrt(sum([x**2 for x in (model.clusterCenters()[i[1]] - i[0])]))
    print("Within Set Sum of Squared Error = " + str(WSSSE))
    output_data = writeOutClu(inputdir, kmFeatures, alg, k, WSSSE)
    return output_data
def doKMeans(data):
    # groupedData = data.groupBy("station_id", "stationIndex").agg(avg("max_temp"), avg("med_temp"), avg("min_temp"), avg("max_pressure"), avg("min_pressure"), avg("precip"), avg("insolation"))
    # selectedData = groupedData.select("station_id", "stationIndex", "avg(max_temp)", "avg(med_temp)", "avg(min_temp)", "avg(max_pressure)", "avg(min_pressure)", "avg(precip)", "avg(insolation)")
    # columnsToPredict = ["stationIndex", "avg(max_temp)", "avg(med_temp)", "avg(min_temp)", "avg(max_pressure)", "avg(min_pressure)", "avg(precip)", "avg(insolation)"]
    # assembler = VectorAssembler(inputCols=columnsToPredict, outputCol="features")
    # assembledData = assembler.transform(selectedData)
    columnsToPredict = ["stationIndex", "max_temp", "med_temp", "min_temp",
                        "max_pressure", "min_pressure", "precip", "insolation"]
    assembler = VectorAssembler(inputCols=columnsToPredict, outputCol="features")
    assembledData = assembler.transform(data)
    sampledData = assembledData
    feature_data = sampledData.withColumn("label", sampledData.stationIndex).withColumn("features", sampledData.features)
    print("Sampling...")
    test_data = feature_data.sample(False, 0.1)
    train_data = feature_data.sample(False, 0.9)
    print("Test data: " + str(test_data.count()) + " , Train data: " + str(train_data.count()))
    # Trains a k-means model.
    kmeans = KMeans().setK(stations.count()).setSeed(1)
    model = kmeans.fit(feature_data)
    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(feature_data)
    print("Within Set Sum of Squared Errors = " + str(wssse))
    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    print("Predicting...")
    predictions = model.transform(test_data)
    # predictions.show()
    predictions.select("station_id", "stationIndex", "label", "prediction").show()
def _fit(self, dataset):
    k = self.getK()
    vocabulary = self.getVocabulary()
    # Trains a k-means model on word-vectors
    kmeans = KMeans(featuresCol="vector", predictionCol="cluster", initMode="random") \
        .setK(k).setSeed(1)
    model_kmeans = kmeans.fit(vocabulary)
    data_vocabulary = model_kmeans.transform(vocabulary)
    # Create and broadcast the dictionary <word, cluster>
    # (map over the underlying RDD; DataFrames no longer expose .map directly)
    rdd_dictionary = data_vocabulary.select('word', 'cluster').rdd \
        .map(lambda row: (row['word'], str(row['cluster'])))
    dictionary = dict(rdd_dictionary.collect())
    return (WordClusterModel().setInputCol(
        self.getInputCol()).setPredictionCol(
        self.getPredictionCol()).setDictionary(dictionary).setK(k))
def train(df, hiperparameter):
    '''
    KMeans training, returning a KMeans model.
    input: - DataFrame
           - config (hyperparameter configuration)
    return: KMeans model
    '''
    k_means = KMeans(featuresCol=hiperparameter['featuresCol'],
                     predictionCol=hiperparameter['predictionCol'],
                     k=hiperparameter['k'],
                     initMode=hiperparameter['initMode'],
                     initSteps=hiperparameter['initSteps'],
                     tol=hiperparameter['tol'],
                     maxIter=hiperparameter['maxIter'],
                     seed=hiperparameter['seed'])
    model = k_means.fit(df)
    return model
def test_kmeans(self):
    kmeans = KMeans(k=2, seed=1)
    path = tempfile.mkdtemp()
    km_path = path + "/km"
    kmeans.save(km_path)
    kmeans2 = KMeans.load(km_path)
    self.assertEqual(kmeans.uid, kmeans2.uid)
    self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
    self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                     "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                     % (kmeans2.uid, kmeans2.k.parent))
    self.assertEqual(kmeans._defaultParamMap[kmeans.k],
                     kmeans2._defaultParamMap[kmeans2.k],
                     "Loaded KMeans instance default params did not match " +
                     "original defaults")
    try:
        rmtree(path)
    except OSError:
        pass
def train(model="kmean", scaledData=None, test_id=4):
    """
    train models based on the model type
    args:
        model: str, the model type
        scaledData: the scaled data from the output of norm_data function
    """
    for i in range(5):
        kmeans = KMeans().setK(2).setSeed(1)
        model = kmeans.fit(scaledData)
        predictions = model.transform(scaledData)
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        centers = model.clusterCenters()
        for center in centers:
            print(center)
    predictions.rdd.saveAsPickleFile("k_mean_prediction_worker2_testid_" + str(test_id) + ".pkl")
def _compute_cluster_analysis(spark_df, clusters=5):
    # Keep only the numeric columns (assumes dtype names are listed in SPARK_NUMERIC_TYPES).
    numeric_columns = list(map(lambda col_dtype: col_dtype[0],
                               filter(lambda col_dtype: col_dtype[1] in constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES,
                                      spark_df.dtypes)))
    if (len(numeric_columns) == 0):
        raise ValueError("The provided spark dataframe does not contain any numeric columns. "
                         "Cannot compute cluster analysis with k-means on categorical columns. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
                             constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    if (len(numeric_columns) == 1):
        raise ValueError("The provided spark dataframe contains only one numeric column. "
                         "Cluster analysis will filter out the non-numeric columns, then "
                         "use PCA to reduce the dataset to 2 dimensions and "
                         "then apply KMeans; this is not possible when the input data has only one numeric column. "
                         "The numeric datatypes are: {}"
                         " and the number of numeric datatypes in the dataframe is: {} ({})".format(
                             constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    vecAssembler = VectorAssembler(inputCols=numeric_columns,
                                   outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN)
    spark_df_1 = vecAssembler.transform(spark_df)
    kmeans = KMeans(k=clusters, seed=1, maxIter=20,
                    featuresCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                    predictionCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN)
    model = kmeans.fit(spark_df_1.select(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN))
    spark_df_2 = model.transform(spark_df_1)
    spark_df_3 = spark_df_2.select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
                                    constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    count = spark_df_3.count()
    if count < constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE:
        spark_df_4 = spark_df_3
    else:
        spark_df_4 = spark_df_3.sample(True,
                                       float(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_SAMPLE_SIZE) / float(count))
    pca = PCA(k=2,
              inputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_INPUT_COLUMN,
              outputCol=constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN)
    model = pca.fit(spark_df_4)
    spark_df_5 = model.transform(spark_df_4).select([constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
                                                     constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN])
    spark_df_6 = spark_df_5.withColumnRenamed(
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_PCA_COLUMN,
        constants.FEATURE_STORE.CLUSTERING_ANALYSIS_FEATURES_COLUMN)
    spark_df_7 = spark_df_6.withColumnRenamed(constants.FEATURE_STORE.CLUSTERING_ANALYSIS_OUTPUT_COLUMN, "clusters")
    return json.loads(spark_df_7.toPandas().to_json())
def create_clusters(sqlContext):
    specific_data = sqlContext.sql(
        'SELECT Description, Date, Latitude, Longitude FROM crime_data')
    # where `Primary Type` == "HOMICIDE" or `Primary Type` == "ASSAULT"
    specific_data = specific_data.filter(specific_data.Description != "SIMPLE")
    specific_data = specific_data.rdd.filter(lambda x: datetime.strptime(
        x[1], "%m/%d/%Y %I:%M:%S %p").weekday() in [5, 6]).toDF()
    specific_data = specific_data.sample(False, .0001)
    # create different k-means outputs depending on the date since that is what we are testing
    vecAssembler = VectorAssembler(inputCols=['Latitude', 'Longitude'], outputCol="features")
    df_kmeans = vecAssembler.transform(specific_data).select(
        'Description', 'features', 'Latitude', 'Longitude')
    # df_kmeans.show()
    # cost = [0] * 40
    # for k in range(41, 50):
    #     kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    #     model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
    #     cost[k-41] = model.computeCost(df_kmeans)  # requires Spark 2.0 or later
    # print(cost)
    # 48 for k since this is where it starts to have diminishing returns
    k = 48
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_kmeans)
    centers = model.clusterCenters()
    # print("Cluster Centers: ")
    # for center in centers:
    #     print(center)
    # TODO: arrange based on season, time of day, etc.
    transformed = model.transform(df_kmeans).select('Description', 'prediction', 'Latitude', 'Longitude')
    transformed.show()
    # return_rdd = transformed.rdd.map(lambda x: (x.prediction, (x.Description, (x.Latitude, x.Longitude))))
    create_visuals(transformed)
def clustering(poses):
    print(len(poses))
    # Materialize the rows as a list so createDataFrame accepts them (Python 3's map is lazy).
    dff = [(0, Vectors.dense(x[0:2])) for x in poses]
    mydf = spark.createDataFrame(dff, schema=["None", "features"])
    k = 2
    cost = 99999999999
    while cost >= 20:
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(mydf)
        cost = model.computeCost(mydf) / len(poses)
        k += 1
        print("K: ", k, " Cost: ", cost)
    centers = model.clusterCenters()
    centroids = []
    for center in centers:
        centroids.append(center)
    prediction = model.transform(mydf).select('prediction').collect()
    labels = [p.prediction for p in prediction]
    return centroids, labels
def getSilhouette(df, model='KMeans'):
    silhouette_ls = []
    if model == 'KMeans':
        for i in range(1, 10):
            kmeans = KMeans().setK(i + 1).setSeed(123)
            model_k = kmeans.fit(df)
            # Make predictions
            predictions = model_k.transform(df)
            # Evaluate clustering by computing Silhouette score
            evaluator = ClusteringEvaluator()
            silhouette_ls.append(round(evaluator.evaluate(predictions), 2))
        best_k = silhouette_ls.index(np.max(silhouette_ls)) + 2
        print('****************************SILHOUETTE*************************************************')
        print(f'The best K is: {best_k} associated with a silhouette of: {np.max(silhouette_ls)}')
        return silhouette_ls, best_k
def test_kmeans_summary(self):
    data = [
        (Vectors.dense([0.0, 0.0]), ),
        (Vectors.dense([1.0, 1.0]), ),
        (Vectors.dense([9.0, 8.0]), ),
        (Vectors.dense([8.0, 9.0]), ),
    ]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 1)
def main():
    cfg = Path(__file__).parent.joinpath("kmeans_config.yaml")
    cfg = load_config(str(cfg))
    spark = init_spark(cfg)
    kmeans = KMeans(k=3, seed=0)
    X = read(cfg, spark)
    fit(cfg, kmeans, X)
    print(timings)
def create_kmeans_pipeline_from(data_frame, feature_extractor, total_clusters, max_iterations):
    """
    Creates a pipeline that tokenizes the document into words, removes stop words,
    adds TF-IDF (or Word2Vec) features, and finally clusters using the k-means algorithm.
    """
    tokenizer_transformer = RegexTokenizer(inputCol="doc_text", outputCol="words", pattern="\\W")
    stop_words_transformer = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    pipeline_stages = [tokenizer_transformer, stop_words_transformer]
    if feature_extractor == 'TFIDF':
        # create TF counter using CountVectorizer
        tf_estimator = CountVectorizer(inputCol="filtered_words", outputCol="TF")
        # create inverse-document-frequency counter
        idf_estimator = IDF(inputCol="TF", outputCol="features")
        # add them to the pipeline stages
        pipeline_stages.extend([tf_estimator, idf_estimator])
    elif feature_extractor == 'Word2Vec':
        # create word2vec feature extractor
        w2v_estimator = Word2Vec(inputCol="filtered_words", outputCol="features")
        # add this to the pipeline stages
        pipeline_stages.append(w2v_estimator)
    else:
        raise ValueError('Unknown feature extractor: %s' % feature_extractor)
    # create KMeans clustering
    kmeans = KMeans(k=total_clusters, featuresCol="features", predictionCol="DocumentClass",
                    seed=1, maxIter=max_iterations)
    # finally add KMeans to the pipeline,
    # which takes the "features" output from the previous stage and
    # does the prediction.
    # NOTE:
    # For document clustering the cosine similarity measure is usually preferred.
    # This pipeline uses the default squared-Euclidean (SSE) measure.
    pipeline_stages.append(kmeans)
    return Pipeline(stages=pipeline_stages).fit(data_frame)
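# The note above prefers cosine similarity for document clustering; a minimal sketch of that
# variant, assuming Spark 2.4+ where KMeans accepts a distanceMeasure parameter (as exercised
# in the cosine-distance test earlier in this section). total_clusters and max_iterations are
# the same hypothetical arguments as in create_kmeans_pipeline_from.
from pyspark.ml.clustering import KMeans

cosine_kmeans = KMeans(k=total_clusters, featuresCol="features",
                       predictionCol="DocumentClass", seed=1,
                       maxIter=max_iterations, distanceMeasure="cosine")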
def kmeans(df, optimalK, seed=1):
    """
    Input:
        df: pyspark dataframe
        optimalK: int, k for cluster
        seed: int, random seed
    Output:
        new_df_kmeans_transformed: pyspark dataframe, with the cluster prediction
        model: k-means model
    """
    # transfer to "features" for pyspark ML
    scaled_column = [col for col in df.columns if "scaled" in col]
    df_vec = get_feature(df, scaled_column)
    kmeans = KMeans(k=optimalK, seed=seed)
    model = kmeans.fit(df_vec)
    new_df_kmeans_transformed = model.transform(df_vec)
    return new_df_kmeans_transformed, model
def test_kmean_pmml_basic(self):
    # Most of the validation is done in the Scala side, here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    data = [
        (Vectors.dense([0.0, 0.0]), ),
        (Vectors.dense([1.0, 1.0]), ),
        (Vectors.dense([9.0, 8.0]), ),
        (Vectors.dense([8.0, 9.0]), ),
    ]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    path = tempfile.mkdtemp()
    km_path = path + "/km-pmml"
    model.write().format("pmml").save(km_path)
    pmml_text_list = self.sc.textFile(km_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def kmeans():
    spark = SparkSession \
        .builder \
        .appName('my_first_app_name') \
        .getOrCreate()
    df = spark.read.csv('./seeds_dataset.csv', header=True, inferSchema=True)
    df.show(5)
    assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
    final_df = assembler.transform(df)
    final_df.show(3)
    final_df.take(1)
    # instantiate kmeans with 3 clusters
    kmeans = KMeans(k=3)
    # fit the model
    model = kmeans.fit(final_df)
    centers = model.clusterCenters()
    print(centers)
    model.transform(final_df).select('prediction').show()
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder\
        .master("local")\
        .appName("Word Count")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()

    # Example data: the assembler below expects numeric "feat1" and "feat2" columns,
    # so the toy DataFrame carries them alongside the "id" column.
    df = spark.createDataFrame([["0", 0.0, 0.0], ["1", 1.0, 1.0], ["2", 2.0, 2.0],
                                ["3", 3.0, 3.0], ["4", 4.0, 4.0]],
                               ["id", "feat1", "feat2"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
def analysing_emissions_data(spark, co2_emisssion_data):
    # creating feature vector for sending as input to ML models
    vecAssembler = VectorAssembler(inputCols=['change_in_emissions_scaled'], outputCol="features")
    # adding feature vector to our spark dataframe
    co2_emisssion_data = vecAssembler.setHandleInvalid("skip").transform(co2_emisssion_data)
    # creating KMeans object (7 clusters)
    kmeans = KMeans(k=7)
    # clustering operation
    model = kmeans.fit(co2_emisssion_data.select('features'))
    # adding column of predicted clusters to our dataframe
    co2_emisssion_data = model.transform(co2_emisssion_data)
    return co2_emisssion_data.drop("features")
def clustering(fp, cols):
    assembler = VectorAssembler(inputCols=cols, outputCol='features')
    assembled_data = assembler.transform(fp)
    scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
    print(scaler)
    scaler_model = scaler.fit(assembled_data)
    scaled_data = scaler_model.transform(assembled_data)
    # scaled_data.printSchema()
    # scaled_data.show(4)
    # scaled_data.select('scaledFeatures').show()
    k_means_3 = KMeans(featuresCol='scaledFeatures', k=10)
    # clusters = KMeans(fp, 5, maxIterations=10, initializationMode="random")
    model_k3 = k_means_3.fit(scaled_data)
    model_k3_data = model_k3.transform(scaled_data)
    # details = model_k3_data
    # model_k3_data.groupBy('prediction').count().show()
    # details.show(50)
    return model_k3_data
def __k_mean(cls, df, k_clusters, xnorm, label_name):
    from sklearn.cluster import KMeans
    kmeans_cat = label_name
    if xnorm.shape[1] > 0:
        # n_clusters = elbow point
        start_time = time.time()
        estimator = KMeans(n_clusters=k_clusters)
        estimator.fit(xnorm)
        y_pred = estimator.predict(xnorm)
        res = estimator.__dict__
        # print(res['cluster_centers_'])
        cluster_center = res['cluster_centers_']
        print("training time: ", time.time() - start_time, "(sec)")
    else:
        # no split
        y_pred = 0
        cluster_center = []
        print("no kmeans split")
    df[kmeans_cat] = y_pred + 1
    return df, kmeans_cat, cluster_center
def buildAnomalyDetector(data, normalizeFunction):
    normalizedData = data.map(normalizeFunction)
    normalizedData.cache()
    model = KMeans.train(normalizedData, 150, maxIterations=10, epsilon=1.0e-6)
    normalizedData.unpersist()
    distances = normalizedData.map(lambda datum: distToCentroid(datum, model))
    threshold = distances.top(100).pop()

    def f(datum):
        return distToCentroid(normalizeFunction(datum), model) > threshold
    return f
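# A short usage sketch of the anomaly detector above, assuming `rawData` is an RDD of feature
# vectors and `normalizeFunction` is the same helper passed in; the names are illustrative only.
isAnomalous = buildAnomalyDetector(rawData, normalizeFunction)
anomalies = rawData.filter(isAnomalous)
print(anomalies.take(10))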
def executeKmeans(k, username, jobNum, input_file_name, model_file_name, prediction_file_name, features_col):
    file_path_in_hdfs = username + '/' + jobNum + '/'
    ip_file_in_hdfs = file_path_in_hdfs + input_file_name
    df_kmeans = preprocessData(ip_file_in_hdfs, features_col)
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_kmeans)
    centers = model.clusterCenters()
    model_file_in_hdfs = file_path_in_hdfs + model_file_name
    saveModelinHDFS.apply_async(
        (file_path_in_hdfs, json.dumps(centers, cls=NumpyEncoder), model_file_in_hdfs))
    predictions = makePredictions(df_kmeans, model)
    prediction_file_in_hdfs = file_path_in_hdfs + prediction_file_name
    savePredictioninHDFS.apply_async(
        (file_path_in_hdfs, json.dumps(predictions.values.tolist(), cls=NumpyEncoder), prediction_file_in_hdfs))
def main(argv):
    # Instantiate the Spark context.
    sc = SparkContext(appName="KMeans-Clustering-dhoyoso-dsernae")
    # Start the Spark session.
    spark = SparkSession(sc)
    # Language whose stop words will be removed.
    language = argv[4]  # "spanish"
    # Output path for the clusters.
    pathout = argv[3]
    # Path from which the files will be read.
    path = argv[2]  # "hdfs:///user/dhoyoso/datasets/dataset/"
    # Number of clusters to build.
    k = int(argv[1])  # 4
    # Collect the files to process from the path.
    files = sc.wholeTextFiles(path)
    # DataFrame structure: two columns, one for the path and one for the text.
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    # Build the DataFrame from the structure and the files.
    df = spark.createDataFrame(files, schema)
    # Tokenize the text with the ML Tokenizer class.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    # Tell the stop words remover which language we are dealing with.
    StopWordsRemover.loadDefaultStopWords(language)
    # Remove the stop words from the tokens.
    stopWords = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
    # Hash the remaining tokens into term frequencies.
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2000)
    # Compute the IDF of the hashingTF output.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    # Initialize KMeans with the IDF features and the desired k.
    kmeans = KMeans(k=k)
    # Build the transformation pipeline.
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    # Fit the pipeline on the DataFrame.
    model = pipeline.fit(df)
    # Run the mapped transformations and keep the result.
    results = model.transform(df)
    results.cache()
    # Trim the path to keep only the document name and its cluster (prediction).
    split_col = split(results['path'], '/')
    results = results.withColumn('docname', split_col.getItem(7))
    df = results.select("docname", "prediction")
    # Group the documents of each cluster into cluster_docs_list and write them to the output path as JSON.
    grouped = df.groupBy(['prediction']).agg(
        collect_list("docname").alias('cluster_docs_list'))
    grouped.coalesce(1).write.json(path=pathout, mode="overwrite")
def cluster(dataframe, k, maxIter=50):
    clf = KMeans(k=k, maxIter=maxIter)
    model = clf.fit(dataframe)
    print('=================================')
    print('Cost:', model.summary.trainingCost)
    # print('Cluster:', model.summary.cluster)
    # print('Cluster centers:', model.clusterCenters())
    print('Cluster size:', model.summary.clusterSizes)
    print('Iter:', model.summary.numIter)
    # print('Predictions:', model.summary.predictions)
    print('k:', model.summary.k)
    print('featuresCol:', model.summary.featuresCol)
    print('predictionCol:', model.summary.predictionCol)
    # print(dir(model.summary))
    # print('=================================')
    # print('Saving centers...')
    # np.save('tmp/centers' + str(options.k), model.clusterCenters())
    # print('Saving predictions...')
    # model.summary.predictions.write.format('parquet').saveAsTable('predictions' + str(options.k))
    return model.clusterCenters()
def kmeans_pipeline(self, inputCols, cluster_count=None, max_cluster=None):
    if max_cluster is not None:
        self._max_cluster = max_cluster
    assembler = VectorAssembler(inputCols=inputCols, outputCol="features")
    assembled = assembler.transform(self._data_frame)
    mmScaler = StandardScaler(inputCol="features", outputCol="featuresCol", withStd=True, withMean=False)
    scale_model = mmScaler.fit(assembled)
    vectorized_data = scale_model.transform(assembled)

    if cluster_count is None:
        cluster_count_array = list(range(2, self._max_cluster))
        wssse_output = []
        for n_cluster in cluster_count_array:
            kmeans = KMeans().setK(n_cluster).setSeed(1)
            kmeans_model = kmeans.fit(vectorized_data)
            wssse = kmeans_model.computeCost(vectorized_data)
            wssse_output.append(wssse)
        wssse_dict = dict(list(zip(cluster_count_array, wssse_output)))
        cluster_count = min(wssse_dict, key=wssse_dict.get)
        kmeans = KMeans().setK(cluster_count).setSeed(1)
        kmeans_model = kmeans.fit(vectorized_data)
        wssse = kmeans_model.computeCost(vectorized_data)
        centers = kmeans_model.clusterCenters()
        cluster_prediction = kmeans_model.transform(vectorized_data)
    else:
        wssse_dict = {}
        kmeans = KMeans().setK(cluster_count).setSeed(1)
        kmeans_model = kmeans.fit(vectorized_data)
        wssse = kmeans_model.computeCost(vectorized_data)
        centers = kmeans_model.clusterCenters()
        cluster_prediction = kmeans_model.transform(vectorized_data)

    self._kmeans_result["cluster_count"] = cluster_count
    self._kmeans_result["wssse"] = wssse
    self._kmeans_result["wssse_dict"] = wssse_dict
    self._kmeans_result["centers"] = centers
    self._kmeans_result["inputCols"] = inputCols
    # cluster_prediction = cluster_prediction.withColumn("prediction", cluster_prediction["prediction"].cast(StringType()))
    # print cluster_prediction.printSchema()
    self._predictedData = cluster_prediction
def newTrain():
    fileSave = "/home/hadoop/data_school/sparkMlib/KMeans"
    # Sex: male = 1, female = 2
    df = spark.read.format('csv').option('header', 'true').load(fileSave).fillna('0')
    df = df.where(df.TotalFee != '0').where(df.DiseaseCode == '13104')
    df = df.withColumn("Sex", df.Sex.cast(IntegerType())) \
        .withColumn("Age", df.Age.cast(IntegerType())) \
        .withColumn("TotalFee", df.TotalFee.cast(FloatType()))
    # vecAss = VectorAssembler(inputCols=df.columns[2:], outputCol='feature')
    # data = vecAss.transform(df).select("feature")
    # data.show()
    data = df.drop("Sex", "DiseaseCode")
    data.show()
    # Assemble the feature vector
    featureCreator = VectorAssembler(inputCols=data.columns, outputCol='features')
    data = featureCreator.transform(data)

    distance = []
    for k in range(2, 10):
        # Estimator
        kmeans = KMeans(featuresCol='features').setK(k)
        # Fit the model
        model = kmeans.fit(data)
        # Cluster assignments
        test = model.transform(data).select('features', 'prediction')
        evaluator = ClusteringEvaluator()
        evaResult = evaluator.evaluate(test)
        print("the distance = " + str(evaResult))
        distance.append(evaResult)

    import matplotlib.pyplot as plt
    plt.figure()
    x = [i for i in range(2, 10)]
    plt.plot(x, distance)
    plt.xlabel("K")
    plt.ylabel("Distance")
    plt.show()
def topic_generator(subreddit_input):
    subreddit_filter = requests.get(
        url + 'reddit_post.json?orderBy="subreddit"&equalTo="' + str(subreddit_input) + '"')
    subreddits = json.loads(subreddit_filter.text)
    results = []
    for x in subreddits:
        try:
            results.append(subreddits[x])
        except KeyError:
            continue
    data = pd.DataFrame.from_dict(results, orient='columns')
    data1 = spark.createDataFrame(pd.DataFrame(data["title"]))
    # text clean
    clean_data_udf = udf(clean_data, StringType())
    data1 = data1.withColumn("new_title", clean_data_udf("title"))
    # text tokenizer
    tokenizer = Tokenizer(inputCol="new_title", outputCol="words")
    data1 = tokenizer.transform(data1)
    # stopwords removal
    remover = StopWordsRemover(inputCol="words", outputCol="rm_words")
    data1 = remover.transform(data1)
    # TFIDF vectorization
    hashingTF = HashingTF(inputCol="rm_words", outputCol="rawFeatures", numFeatures=2000)
    data1 = hashingTF.transform(data1)
    # Document frequency
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(data1)
    data1 = idfModel.transform(data1)
    # Kmeans topic clustering
    kmeans = KMeans(k=2, featuresCol="features").setSeed(1)
    kmeans_model = kmeans.fit(data1)
    data1 = kmeans_model.transform(data1)
    data["prediction"] = data1.select("prediction").toPandas()
    return data
def kmeans_info(i, df, inputCols):
    vecAssembler = VectorAssembler(inputCols=inputCols, outputCol='features')
    df = vecAssembler.transform(df)
    info = '---> With ' + str(i) + ' clusters \n'
    kmeans = KMeans(k=i, seed=1)
    model = kmeans.fit(df.select('features'))
    # Make predictions
    predictions = model.transform(df)
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    info += "Silhouette with squared euclidean distance = " + str(silhouette) + '\n'
    info += "Cluster Centers: " + '\n'
    ctr = []
    centers = model.clusterCenters()
    for center in centers:
        ctr.append(center)
        info += str(center) + '\n'
    info += '--------------------\n'
    return info
def __process_dataset(self):
    """Build models from the dataset."""
    data = []
    for i in range(len(self.model)):
        logger.info("Training consumers batch {}".format(i))
        logger.info("Assembling vector!")
        assembler = VectorAssembler(inputCols=["_c12"], outputCol='features')
        data.append(assembler.transform(self.model[i]))
        logger.info("Training Model!")
        kmeans = KMeans().setK(10).setSeed(1)
        self.model[i] = kmeans.fit(data[i])
        logger.info("Making Prediction!")
        self.predictions = self.model[i].transform(data[i])
    logger.info("Clustering model built!")
def visualizationInR(rawData):
    def preprocessing(line):
        values = line.split(",")
        del values[1:4]
        values.pop()
        return Vectors.dense([float(x) for x in values])

    data = rawData.map(preprocessing).cache()
    model = KMeans.train(data, 100, maxIterations=10, epsilon=1.0e-6)
    # Prefix each sampled point with its predicted cluster; cast to str so the pieces concatenate.
    sample = data.map(lambda datum: str(model.predict(datum)) + "," +
                      ",".join(map(str, datum))).sample(False, fraction=0.05, seed=None)
    sample.saveAsTextFile("file:///user/ds/sample")
def clustering(player_profile):
    vecAssembler = VectorAssembler(
        inputCols=["fouls", "goals", "owngoals", "pass_acc", "shots", "matches"],
        outputCol="features")
    print(player_profile.printSchema())
    player_profile = player_profile.drop("name")
    new_df = vecAssembler.transform(player_profile)
    kmeans = KMeans(k=5)
    model = kmeans.fit(new_df.select("features"))
    # Make predictions
    predictions = model.transform(new_df)
    predictions.show()
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    return predictions, centers
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder\
        .master("local")\
        .appName("Word Count")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()

    # Example data: the assembler below expects numeric "feat1" and "feat2" columns,
    # so the toy DataFrame carries them alongside the "id" column.
    df = spark.createDataFrame([["0", 0.0, 0.0], ["1", 1.0, 1.0], ["2", 2.0, 2.0],
                                ["3", 3.0, 3.0], ["4", 4.0, 4.0]],
                               ["id", "feat1", "feat2"])
    df.show()

    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
trainingData = VectorAssembler(inputCols=["duration", "tempo", "loudness"], outputCol="features").transform(
    table("songsTable")
)

# COMMAND ----------

# MAGIC %md We can now pass this new DataFrame to the `KMeans` model and ask it to categorize different rows in our data into two different classes (`setK(2)`). We place the model in a variable named `model`.
# MAGIC
# MAGIC **Note:** This command runs multiple Spark jobs (one job per iteration in the KMeans algorithm). You will see the progress bar starting over and over again.

# COMMAND ----------

from pyspark.ml.clustering import KMeans

model = KMeans().setK(2).fit(trainingData)

# COMMAND ----------

# MAGIC %md To see the result of our clustering, we produce a scatter plot matrix that shows the interaction between input variables and learned clusters. To get that we apply the model on the original data and pick four columns: `prediction` and the original features (`duration`, `tempo`, and `loudness`).

# COMMAND ----------

transformed = model.transform(trainingData).select("duration", "tempo", "loudness", "prediction")

# COMMAND ----------

# MAGIC %md To comfortably visualize the data we produce a random sample.
# MAGIC Remember the `display()` function? We can use it to produce a nicely rendered table of the transformed DataFrame.

# COMMAND ----------
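# MAGIC %md A minimal sketch of the sampling cell described above (the original cell is not
# MAGIC included here); the 5% fraction is an illustrative assumption.

# COMMAND ----------

# Hypothetical cell: sample the transformed DataFrame and render it with display().
display(transformed.sample(False, 0.05))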
# COMMAND ----------

fittedPipeline = transformationPipeline.fit(trainDataFrame)

# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)

# COMMAND ----------

from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
    .setK(20)\
    .setSeed(1)  # the 1L long-integer literal is not valid Python 3; a plain int works

# COMMAND ----------

kmModel = kmeans.fit(transformedTraining)

# COMMAND ----------

transformedTest = fittedPipeline.transform(testDataFrame)

# COMMAND ----------
from pyspark.ml.linalg import Vectors  # pyspark.ml estimators expect pyspark.ml (not mllib) vectors
from pyspark.ml.clustering import KMeans
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Assumes sc and sqlContext already exist (e.g. in a pyspark shell); otherwise uncomment:
# sc = SparkContext(appName="test")
# sqlContext = SQLContext(sc)

data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
        (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
df = sqlContext.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)
centers = model.clusterCenters()
model.transform(df).select("features", "prediction").collect()
# For now, analysis is still required. We cache the output because we are going to perform
# multiple runs on the dataset.
df0 = tfs.analyze(df).cache()
mllib_df.count()
df0.count()

np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0

ta_0 = time.time()
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode(
    "random").setMaxIter(num_iters)
mod = kmeans.fit(mllib_df)
ta_1 = time.time()

tb_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=False)
tb_1 = time.time()

tc_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=True)
tc_1 = time.time()

mllib_dt = ta_1 - ta_0
tf_dt = tb_1 - tb_0
tf2_dt = tc_1 - tc_0
sales = va.transform(spark.read.format("csv")
                     .option("header", "true")
                     .option("inferSchema", "true")
                     .load("/data/retail-data/by-day/*.csv")
                     .limit(50)
                     .coalesce(1)
                     .where("Description IS NOT NULL"))
sales.cache()

# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)

# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
def assign_cluster(data):
    """Train kmeans on rescaled data and then label the rescaled data."""
    kmeans = KMeans(k=2, seed=1, featuresCol="features_scaled", predictionCol="label")
    model = kmeans.fit(data)
    label_df = model.transform(data)
    return label_df
# COMMAND ----------

display(transformed)

# COMMAND ----------

# MAGIC %md
# MAGIC #### K-Means Visualized

# COMMAND ----------

modelCenters = []
iterations = [0, 2, 4, 7, 10, 20]
for i in iterations:
    kmeans = KMeans(k=3, seed=5, maxIter=i, initSteps=1)
    model = kmeans.fit(irisTwoFeatures)
    modelCenters.append(model.clusterCenters())

# COMMAND ----------

print('modelCenters:')
for centroids in modelCenters:
    print(centroids)

# COMMAND ----------

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
twDF = tw.map(lambda p: Row(text=p)).toDF()

t0 = time.time()
word2Vec = Word2Vec(vectorSize=100, minCount=5, stepSize=0.025, inputCol="text", outputCol="result")
modelW2V = word2Vec.fit(twDF)
wordVectorsDF = modelW2V.getVectors()
timeW2V = time.time() - t0

## Train K-means on top of the Word2Vec matrix:
t0 = time.time()
vocabSize = wordVectorsDF.count()
K = int(math.floor(math.sqrt(float(vocabSize) / 2)))
# K ~ sqrt(n/2) this is a rule of thumb for choosing K,
# where n is the number of words in the model
# feel free to choose K with a fancier algorithm
dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector', 'features')
kmeans = KMeans(k=K, seed=1)
modelK = kmeans.fit(dfW2V)
labelsDF = modelK.transform(dfW2V).select('prediction').withColumnRenamed('prediction', 'labels')
vocabSize = wordVectorsDF.count()
timeKmeans = time.time() - t0

sc.stop()

## Print Some Results
printResults = 1  # set t
if (printResults):
    ## Read Tweets
    print("=" * 80)
    print("Read Tweets...")
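# A quick check of the K ~ sqrt(n/2) rule of thumb used above, with a purely illustrative
# vocabulary size (not taken from the data in this snippet).
import math

example_vocab_size = 5000
example_K = int(math.floor(math.sqrt(float(example_vocab_size) / 2)))
assert example_K == 50  # floor(sqrt(2500)) == 50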
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c + "-onehot", dropLast=False)
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["Take-out", "GoodFor_lunch", "GoodFor_dinner", "GoodFor_breakfast"])
dfhot.show(5)

# Training set
assembler = VectorAssembler(inputCols=list(set(dfhot.columns) | set(['stars', 'review_count'])),
                            outputCol="features")
train = assembler.transform(dfhot)

# KMeans set for 5 clusters
knum = 5
kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=knum, seed=0)
model = kmeans.fit(train)
print("Model Created!")

# See cluster centers:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# Apply the clustering model to our data:
prediction = model.transform(train)
prediction.groupBy("cluster").count().orderBy("cluster").show()

# Look at the features of each cluster
customerCluster = {}
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
# The `vecAssembler` object created earlier (of type `pyspark.ml.feature.VectorAssembler`) has a `transform` method which builds a vector column `features` from the columns `x` and `y`, producing a new `DataFrame` with the `features` column from the `DataFrame` passed to it:

# In[61]:

ca1mlFeaturizedDF = vecAssembler.transform(ca1MlDF)
ca1mlFeaturizedDF.show(5)

# We will create a `pyspark.ml.clustering.KMeans` object (recall that in example 1 we used `pyspark.mllib.clustering.KMeans`) and use it to train a `pyspark.ml.clustering.KMeansModel`. This time the data is supplied as a `DataFrame` object rather than an `RDD`.

# In[62]:

from pyspark.ml.clustering import KMeans as MlKMeans

firstMlKMeans = MlKMeans(
    featuresCol="features", predictionCol="prediction",
    k=2, initMode="k-means||", maxIter=20)
type(firstMlKMeans)

# The model classes in the `pyspark.ml` package have an `explainParams` method, which prints explanations of the model's parameters.

# In[63]:

print(firstMlKMeans.explainParams())

# Let's train the model.

# In[64]:
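# The In[64] training cell itself is not included above; a minimal sketch of what it presumably
# contains, fitting the estimator on the featurized DataFrame defined earlier (the name of the
# fitted-model variable is an assumption).

firstMlKMeansModel = firstMlKMeans.fit(ca1mlFeaturizedDF)
type(firstMlKMeansModel)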
def test_kmeans_param(self):
    algo = KMeans()
    self.assertEqual(algo.getInitMode(), "k-means||")
    algo.setK(10)
    self.assertEqual(algo.getK(), 10)
    algo.setInitSteps(10)
    self.assertEqual(algo.getInitSteps(), 10)
    self.assertEqual(algo.getDistanceMeasure(), "euclidean")
    algo.setDistanceMeasure("cosine")
    self.assertEqual(algo.getDistanceMeasure(), "cosine")
print(colStdDev)
# Place the means and std.dev values in a broadcast variable
bcMeans = sc.broadcast(colMeans)
bcStdDev = sc.broadcast(colStdDev)

csAuto = autoVector.map(centerAndScale)
# csAuto.collect()
# csAuto.foreach(print)
print(csAuto)

# Create Spark Data Frame (createDataFrame must be called on an SQLContext instance, not the class)
autoRows = csAuto.map(lambda f: Row(features=f))
autoDf = sqlContext.createDataFrame(autoRows)
autoDf.select("features").show(10)

kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(autoDf)
predictions = model.transform(autoDf)
predictions.collect()
predictions.foreach(print)  # println is not defined in Python; print each Row instead

# Plot the results in a scatter plot (map over the underlying RDD; DataFrames have no .map)
unstripped = predictions.rdd.map(unstripData)
predList = unstripped.collect()
predPd = pd.DataFrame(predList)

# preparing to save the clustered data
list_current_gni_final_maped = current_gni_final_maped.collect()
list_current_gni_rdd = current_gni_rdd.collect()
list_predictions_pandas = predictions.toPandas()
list_predictions_temp = list_predictions_pandas.as_matrix()