#spark-submit --master local[*] --packages com.databricks:spark-csv_2.10:1.2.0 cluster.py
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer
from pyspark.ml.clustering import KMeans

sc = SparkContext()
sqlContext = SQLContext(sc)

text = sc.textFile('file:/Users/wangmengyuan/Desktop/rr/listings.txt').map(lambda l: l.split('\t'))\
    .map(lambda l: (l[0], l[1]))
df = sqlContext.createDataFrame(text, ["houseid", "description"])

tokenizer = Tokenizer(inputCol="description", outputCol="tokens")
tokenized = tokenizer.transform(df).cache()

remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
stopWordsRemoved_df = remover.transform(tokenized).cache()

hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=200)
tfVectors = hashingTF.transform(stopWordsRemoved_df).cache()

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors).cache()

normalizer = Normalizer(inputCol="features", outputCol="normFeatures")
l2NormData = normalizer.transform(tfIdfVectors)

# cluster on the L2-normalised TF-IDF vectors
kmeans = KMeans().setK(10).setMaxIter(20).setFeaturesCol("normFeatures")
km_model = kmeans.fit(l2NormData)
clustersTable = km_model.transform(l2NormData)

# save to hdfs
df1 = clustersTable[['houseid', 'prediction']]
#df1.select('houseid', 'prediction').write.format('com.databricks.spark.csv').save('cluster.csv')
df1.select('houseid', 'prediction').show(20)
sc.stop()
@author: kach
"""
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Loads data.
dataset = spark.read.format("libsvm").load("data/sample_kmeans_data.txt")

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
# I converted csv to a parquet file to save space and time
def lis(x):
    return [float(i) for i in x[1:-1].split(',')]

spark.read.load("train_fet.csv", format="csv", inferSchema="true", header="true").rdd \
    .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
    .toDF(["index", "file", "features"]) \
    .write.parquet("train_fet.parquet")

# Now I create the Bag of Visual Words representation using K-means
schema = spark.read.parquet("train_fet.parquet").persist(StorageLevel(True, True, False, False, 1))

start = time.clock()
kmeans = KMeans(k=K, initMode='random')
print(time.clock() - start)

start = time.clock()
model = kmeans.fit(schema)
print(time.clock() - start)

start = time.clock()
centers = model.clusterCenters()
print(time.clock() - start)

model.save('KmeansModel')

# Next I create the Hamming Embedding Matrix
G = np.random.randn(db, d)
P, _ = np.linalg.qr(G)
np.save('P.npy', P)
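# The snippet above only fits and saves the K-means model. A minimal sketch of the
# bag-of-visual-words step it describes (assigning each descriptor to its visual word and
# counting words per image); grouping by the "file" column and the histogram layout are
# assumptions, not part of the original code.
from pyspark.sql import functions as F

assigned = model.transform(schema)  # adds a "prediction" column: the visual word id
bovw = (assigned.groupBy("file", "prediction").count()
        .groupBy("file")
        .agg(F.collect_list(F.struct("prediction", "count")).alias("word_counts")))
bovw.show(5, truncate=False)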
df2.printSchema()
df2.show(50)

# Perform unsupervised learning on df2 with k-means
# You can use whole df2 as both training and testing data,
# Evaluate the clustering result using Accuracy.
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

df3 = df2
kmeans = KMeans(k=2, seed=1)  # 2 clusters here
model = kmeans.fit(df3.select('features'))
transformed = model.transform(df3)
transformed.show(50)

# Generate a scatter plot using the first two PCA components to investigate the data distribution.
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df3 = spark.createDataFrame(data, ["features"])
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans

if __name__ == "__main__":
    spark = SparkSession.builder.appName("Q1").config(
        "spark.some.config.option", "some-value").getOrCreate()
    sc = spark.sparkContext

    rates = sc.textFile("itemusermat").map(lambda x: x.split(" ")).map(lambda x: [int(i) for i in x])\
        .collect()

    # make use of the kmeans model to fit the matrix
    rates = [(Vectors.dense(x),) for x in rates]
    df = spark.createDataFrame(rates, ["features"])
    kmeans = KMeans(k=10, seed=1)
    model = kmeans.fit(df)

    # group all movies by cluster
    transformed = model.transform(df).select("prediction", "features").rdd.map(
        lambda x: (int(x.prediction), int(list(x.features)[0]))). \
        groupByKey().mapValues(lambda x: list(x)[:5])

    # create pair in format of (movie_id, cluster_id)
    # generate pairs like (movie_id, cluster_id) from x which is in the form of (cluster_id, [id1, id2, id3, ...])
    fiveincluster = transformed.flatMap(lambda x: [(a, x[0]) for a in x[1]])

    # read movie info file
    moviedetails = sc.textFile("movies.dat").map(lambda x: x.split("::")).map(
        lambda x: (int(x[0]), (x[1], x[2])))

    # join the (movie_id, cluster_id) and movie info
    result = fiveincluster.join(moviedetails).map(lambda x: (x[1][0], x[0], x[
" ") typedData = csvData for colName in columnsToKeep: typedData = csvData.withColumn( colName, typedData[colName].cast(IntegerType()).alias(colName)) typedData = typedData.na.drop() print(typedData.schema) assembler = VectorAssembler().setInputCols(columnsToKeep).setOutputCol( "features") dataWithFeatures = assembler.transform(typedData) dataWithFeatures.show() normalizer = Normalizer().setInputCol("features").setOutputCol("normFeatures") normData = normalizer.transform(dataWithFeatures) kmeans = KMeans().setK(5).setFeaturesCol("normFeatures") model = kmeans.fit(normData) predictions = model.transform(normData) predictions.select("features", "prediction").show() evaluator = ClusteringEvaluator() silhouette = evaluator.evaluate(predictions) print("Silhouette with squared euclidean distance = " + str(silhouette)) spark.stop()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.getOrCreate()

# Reading points
pointsData = spark.read.csv("/user/s2279444/Cluster3/*")
pointsData = pointsData.selectExpr("_c0 as Latitudes", "_c1 as Longitudes")
pointsData = pointsData.withColumn("Latitudes", pointsData["Latitudes"].cast("double"))
pointsData = pointsData.withColumn("Longitudes", pointsData["Longitudes"].cast("double"))

# Pre-processing the dataset
columns = pointsData.columns
assembler = VectorAssembler(inputCols=columns, outputCol="features")
dataset = assembler.transform(pointsData)

kValue = 4
kmeans = KMeans().setK(kValue).setSeed(1)
model = kmeans.fit(dataset)
predictions = model.transform(dataset)
finalData = predictions.select("Latitudes", "Longitudes", "prediction")

for i in range(0, kValue):
    data = finalData.filter(finalData["prediction"] == i)
    data = data.select(data["Latitudes"], data["Longitudes"])
    dirName = "Cluster33_" + str(i)
    print("ASHISH: Running for cluster... dirName ->", dirName)
    data.write.csv(dirName)
# materialize as a list so encoded column names can be appended below
inputColumns = list(map(lambda x: x.name, numerical))
for name in map(lambda x: x.name, nonnumerical):
    if df.select([name]).distinct().count() > 1:
        model = StringIndexer(inputCol=name, outputCol=name + " Index").fit(df)
        indexed = model.transform(df)
        encoder = OneHotEncoder(inputCol=name + " Index", outputCol=name + " Vec")
        df = encoder.transform(indexed)
        inputColumns.append(name + " Vec")
"""
STEP3
Clustering
"""
assembler = VectorAssembler().setInputCols(inputColumns).setOutputCol("features")
kmeans = KMeans().setK(k).setFeaturesCol("features").setPredictionCol("prediction")
df = assembler.transform(df)
clusters = kmeans.fit(df).transform(df)
"""
STEP4
Write file to S3
"""
clusters.write.json(outputFile)
transformed_data = assembler.transform(dataset)

# In[10]:

transformed_data.toPandas().head()

# ### Define the clustering model
# Use K-means clustering
# * <b>k: </b>Defines the number of clusters
# * <b>seed: </b>This value is used to set the cluster centers. A different value of seed for the same k will result in clusters being defined differently. In order to reproduce similar clusters when re-running the clustering algorithm, use the same values of k and seed

# In[11]:

from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=5, seed=3)
model = kmeans.fit(transformed_data)

# #### Create the clusters using the model

# In[12]:

clusterdData = model.transform(transformed_data)

# #### Use ClusteringEvaluator to evaluate the clusters
# <b>From Wikipedia: </b>The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.

# In[13]:

from pyspark.ml.evaluation import ClusteringEvaluator
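# The notebook cell above stops at the import. A minimal sketch of how the silhouette
# evaluation described in the markdown could be completed (default distance measure is
# squared Euclidean; variable names follow the cells above):
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(clusterdData)
print("Silhouette score = " + str(silhouette))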
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors, VectorUDT

sc = SparkContext('local')
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load('/home/cloudera/Desktop/datatest.csv')

feature = StringIndexer(inputCol="Hotttness", outputCol="target")
target = feature.fit(data).transform(data)


def transData(row):
    return Row(label=row["target"],
               features=Vectors.dense([
                   row["Duration"], row["KeySignature"], row["KeySignatureConfidence"],
                   row["Tempo"], row["TimeSignature"], row["TimeSignatureConfidence"]
               ]))


transformed = target.rdd.map(transData).toDF()

kmeans = KMeans(k=4)
model = kmeans.fit(transformed)
predict_data = model.transform(transformed)

train_err = predict_data.filter(
    predict_data['label'] != predict_data['prediction']).count()
total = predict_data.count()

print(23333333333333333333333333333333333333333333333333333333333333333333333333333333333333)
print(train_err, total, float(train_err) / total)
print(23333333333333333333333333333333333333333333333333333333333333333333333333333333333333)
cols = [
    'Session_Connection_Time', 'Bytes_Transferred', 'Kali_Trace_Used',
    'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
]

# Assembling the features
assembler = VectorAssembler(inputCols=cols, outputCol='features')

# Creating the new DataFrame with the features assembled
assembled_data = assembler.transform(data)

# Scaling the features
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)

# Creating the model
k_means = KMeans(featuresCol='scaledFeatures', k=n)

# Training the model
model = k_means.fit(scaled_data)

# Prediction
model_data = model.transform(scaled_data)

# Grouping and displaying by cluster
model_data.groupBy('prediction').count().show()
dataframe_mysql.show()

# read demographic data
demographic_df = spark.read.csv("data/demographic.csv", inferSchema="true", header="true")

# assemble features vector
vecAssembler = VectorAssembler(inputCols=[
    columns_names.gender, columns_names.education, columns_names.age,
    columns_names.longitude, columns_names.latitude
], outputCol="features")
demographic_df = vecAssembler.transform(demographic_df)

# Trains a k-means model.
kmeans = KMeans().setK(5).setInitMode("k-means||").setSeed(1).setFeaturesCol("features")
#print(kmeans.explainParams())
model = kmeans.fit(demographic_df)

# Make predictions
predictions = model.transform(demographic_df)
predictions.show()
predictions = predictions.drop(predictions["features"])
predictions.show()

#predictions = predictions.toPandas()
predictions.coalesce(1).write.option("header", "true").option(
    "inferSchema", "true").csv("data/5_clusters.csv")
from pyspark.sql import Row
from pyspark.ml.clustering import KMeans
# pyspark.ml estimators expect pyspark.ml.linalg vectors, not pyspark.mllib.linalg
from pyspark.ml.linalg import Vectors


def transData(row):
    return Row(label=row["player_name"],
               features=Vectors.dense([
                   row["SHOT_DIST"], row["CLOSE_DEF_DIST"], row["SHOT_CLOCK"],
                   row["SHOT_DIST2"]
               ]))


# Convert to DataFrame format
transformed = target.map(transData).toDF()

kmeans = KMeans(k=3)
model = kmeans.fit(transformed)
predict_data = model.transform(transformed)

train_err = predict_data.filter(
    predict_data['label'] != predict_data['prediction']).count()
total = predict_data.count()
print(float(train_err), total, float(train_err) / total)

# # Load data
# data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
# df = spark.createDataFrame(data, ["features"])
# Load in all the mongo data from the database and collection defined
df = spark.read.format("mongo").load()

# Parse game data and generate numeric values so we can feed it into kmeans
parsedData = df.rdd.map(transformChessData)

# Convert PipelinedRDD to dataframe
sqlContext = SQLContext(sc)
schemaFeatures = sqlContext.createDataFrame(parsedData)

# Normalize all the columns
normalizedDF = normalizeData(
    schemaFeatures, ["w_attack", "w_defend", "b_attack", "b_defend", "evals"])
# normalizedDF.show()

# Combine all normalized columns into one "features" column
assembler = VectorAssembler(inputCols=[
    "w_attack_norm", "w_defend_norm", "b_attack_norm", "b_defend_norm", "evals_norm"
], outputCol="features")
training = assembler.transform(normalizedDF)

# Build the model (cluster the data)
kmeans = KMeans(k=2, maxIter=100)
model = kmeans.fit(training)
model.save("KMeansModel_final_both_norm")

for center in model.clusterCenters():
    print(center)
    def __train_with_clustering(self, df):
        kmeans = KMeans().setK(2).setSeed(1)
        return kmeans.fit(df)
       hashingTF.getOutputCol()).setOutputCol('idf'))
normalizer = (Normalizer().setInputCol(
    idf.getOutputCol()).setOutputCol('features'))

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline`, which will give us back a `PipelineModel`. This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = (KMeans().setFeaturesCol('features').setPredictionCol(
    'prediction').setK(5).setSeed(0))
pipeline = Pipeline().setStages(
    [tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(parsed)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's take a look at a sample of the data to see if we can see a pattern between predicted clusters and titles. We'll use a stratified sample to over-weight the less frequent predictions for inspection purposes.

# COMMAND ----------

predictions = model.transform(parsed)
stratifiedMap = {0: .03, 1: .04, 2: .06, 3: .40, 4: .005}
sampleDF = predictions.sampleBy('prediction', stratifiedMap, 0)
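# Because the fitted KMeans model is wrapped inside the PipelineModel above, here is a small
# sketch of how the clustering stage itself could be inspected (assuming the five-stage
# pipeline defined above, so KMeans is the last stage):
fittedKMeans = model.stages[-1]  # the KMeansModel is the final pipeline stage
for center in fittedKMeans.clusterCenters():
    print(center)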
data = StringIndexer(inputCol="Embarked", outputCol="EmbarkIndex").fit(data).transform(data) data = OneHotEncoder(inputCol="EmbarkIndex", outputCol="EmbarkVec").transform(data) final_data = VectorAssembler( inputCols=["Survived", "Pclass", "SexVec", "Age", "Fare", "EmbarkVec"], outputCol="features").transform(data) # Split data into train and test sets # Nor necessary for Clustering # Model training kmeans = KMeans(k=5) model = kmeans.fit(final_data) # Transform the test data using the model to get predictions clustered_data = model.transform(final_data) # Prediction and model status clustered_data_sorted = clustered_data.orderBy("prediction") clustered_data_sorted.show(10000) clustered_data.groupBy("prediction").agg(avg("Survived"), avg("Pclass"), avg("Age"), avg("Fare"), avg("SexIndex"), avg("EmbarkIndex"),
.option("mode", "DROPMALFORMED") \ .csv("file:///Users/beginspark/Temp/data3.csv") d1.printSchema() d2 = d1.toDF("number", "name", "SI", "GOO", "DONG", "x", "y", "b_code", "h_code", "utmk_x", "utmk_y", "wtm_x", "wtm_y") d3 = d2.select(d2.GOO.alias("loc"), d2.x, d2.y) d3.show(5, False) indexer = StringIndexer(inputCol="loc", outputCol="loccode") assembler = VectorAssembler(inputCols=["loccode", "x", "y"], outputCol="features") kmeans = KMeans(k=5, seed=1, featuresCol="features") pipeline = Pipeline(stages=[indexer, assembler, kmeans]) model = pipeline.fit(d3) d4 = model.transform(d3) d4.groupBy("prediction") \ .agg(functions.collect_set("loc").alias("loc")) \ .orderBy("prediction").show(100, False) WSSSE = model.stages[2].computeCost(d4) print("Within Set Sum of Squared Errors = %d" % WSSSE) print("Cluster Centers: ")
# use the model that has min RMSE
num_iter, param = 200, 0.2
als = ALS(maxIter=num_iter, regParam=param, userCol="user_id",
          itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings)
user_feature = model.userFactors
book_feature = model.itemFactors

k = 50
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
model = kmeans.fit(user_feature)
transformed = model.transform(user_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)
df.write.jdbc(url='jdbc:%s' % url + 'yelp', table='book_user_feature200_50',
              mode='overwrite', properties=properties)

k = 50
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
model = kmeans.fit(book_feature)
transformed = model.transform(book_feature).select('id', 'prediction')
rows = transformed.collect()
format="csv", sep=",", inferSchema="true", header="true") columnaInicial = int(os.environ.get('COLUMNA_INICIAL')) columnaFinal = int(os.environ.get('COLUMNA_FINAL')) array = data.columns start_time = time() # Comienzo de contar tiempo data = VectorAssembler(inputCols=array[columnaInicial:columnaFinal], outputCol="features").transform(data) # Trains a k-means model. kmeans = KMeans().setK(4) model = kmeans.fit(data) elapsed_time = time() - start_time elapsed_time = format(elapsed_time, '.6f') salida = 'Tiempo ejecución:' + str(elapsed_time) + ' segundos' # Make predictions predictions = model.transform(data) # Evaluate clustering by computing Silhouette score evaluator = ClusteringEvaluator() silhouette = evaluator.evaluate(predictions) print("Silhouette with squared euclidean distance = " + str(silhouette))
# In[11]:

# Applying K-Means(12) on Pickup Trimmed Coordinates to get Zones/Neighbourhoods
vecAssembler = VectorAssembler(inputCols=['Pickup Trimmed Long', 'Pickup Trimmed Lat'],
                               outputCol="features")
vector_df = vecAssembler.transform(sample_data)  # Vectorizing the features

k = 12
# for k in range(10, 20):
#     kmeans = KMeans().setK(k).setSeed(1)
#     model = kmeans.fit(vector_df)
#     cost = model.computeCost(vector_df)
#     print(k, "Within Set Sum of Squared Errors = " + str(cost))

kmeans = KMeans().setK(k).setSeed(1)
model = kmeans.fit(vector_df)
transformed_data = model.transform(vector_df)

# In[12]:

# Extracting the Street Addresses and Zipcodes of Area Zones I got from K-Means
centers = model.clusterCenters()
street = list()
zipcode = list()
for center in centers:
    long, lat = round(center[0], 3), round(center[1], 3)
    data = requests.get('https://nominatim.openstreetmap.org/reverse?format=json&lat={}&lon={}&zoom=18&addressdetails=1'.format(lat, long))
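    # The loop above cuts off at the HTTP call. A hedged sketch of how each Nominatim
    # reverse-geocoding response might be unpacked into the street / zipcode lists declared
    # above; the "address", "road" and "postcode" keys follow Nominatim's JSON response and
    # can be missing for some locations, hence the .get defaults.
    address = data.json().get('address', {})
    street.append(address.get('road', 'unknown'))
    zipcode.append(address.get('postcode', 'unknown'))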
""".format(args.agg_table_name)) agg_table = spark.table(args.agg_table_name) existing_tables = [table.name for table in spark.catalog.listTables()] # K-means on artist features if args.feature_kmeans_table_name not in existing_tables: # normalize features va = VectorAssembler(inputCols=[column for column in agg_table.columns if column != "a_id"], outputCol="raw_features") feature_table = va.transform(agg_table) standard_scaler = StandardScaler(inputCol="raw_features", outputCol="features") feature_table = standard_scaler.fit(feature_table).transform(feature_table).select("a_id", "raw_features", "features") feature_table.show() # k-means kmeans = KMeans(k=100) model = kmeans.fit(feature_table) clustered = model.transform(feature_table).select("a_id", "prediction") #clustered.show() clustered.write.saveAsTable(args.feature_kmeans_table_name, format="orc", mode="error") if args.smoothed_kmeans_table_name not in existing_tables: # Compute artist collaboration graph as edge list with self-loop collaboration = spark.sql("select a.artist_id node, b.artist_id neighbor from track_artists a, track_artists b where a.track_id = b.track_id") # and a.artist_id != b.artist_id collaboration.registerTempTable("collaboration") # Smooth the features of artists by averaging over their neighbors. For artist with no collaborator, its features should remain unchanged. artist_features = spark.sql("""select node, avg(am.a_track_number) track_number, avg(am.a_mode) modality, avg(am.a_acousticness) acousticness, avg(am.a_danceability) danceability, avg(am.a_energy) energy, avg(am.a_loudness) loudness, avg(am.a_speechiness) speechiness, avg(am.a_instrumentalness) instrumentalness, avg(am.a_liveness) liveness, avg(am.a_valence) valence, avg(am.a_tempo) tempo from collaboration, {0} am where am.a_id = neighbor group by node """.format(args.agg_table_name))
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

csv_kt_file_name = "/user/ubuntu/kim/merge_ratio.csv"

# Loads data.
#dataset = spark.read.format("libsvm").load('a.txt')
kt = spark.read.csv(csv_kt_file_name, header=True, inferSchema=True)
# kt still needs to be converted into a txt (libsvm-style) file before clustering

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

####################
import pandas as pd
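# As the translated comment above notes, kt still has to be turned into a feature vector
# before it can replace `dataset`. A minimal sketch, assuming every column of
# merge_ratio.csv is numeric and should be clustered (column names come from the CSV header):
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=kt.columns, outputCol="features")
kt_features = assembler.transform(kt)
kt_model = KMeans().setK(2).setSeed(1).fit(kt_features)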
palette="muted", data=df_show.toPandas()) plt.show() df_show.toPandas().hist( bins=[1000, 2000, 4000, 6000, 8000, 10000, 15000, 20000, 25000]) plt.show() # As K means can be applied only on numeric data. at this moment we will remove them # Data df_num = data.drop("protocol_type", "service", "flag").cache() col = df_num.columns # FeatureVector assembler = VectorAssembler(inputCols=col[:-1], outputCol='featureVector') # model kmeans = KMeans(predictionCol="cluster", k=2, featuresCol='featureVector') # pipeline to process it pipeline = Pipeline(stages=[assembler, kmeans]) pipModel = pipeline.fit(df_num) prediction = pipModel.transform(df_num) prediction.select("cluster", "label").groupBy( "cluster", "label").count().orderBy("cluster", "label", ascending=True).show(25) # ## Coice of k cost = np.zeros(6) i = 0 for k in range(20, 140, 20): kmea = KMeans().setK(k).setSeed(1).setFeaturesCol("featureVector") model = kmea.fit(prediction.sample(False, 0.1, seed=42))
genre_and_sentences_after_flatmap.persist()

# TFIDF
tfidf_dataFrame = genre_and_sentences_after_flatmap.toDF(["genre", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tfidf_words_data = tokenizer.transform(tfidf_dataFrame)
hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=512)
tfidf_featurized_data = hashing_tf.transform(tfidf_words_data)
idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(tfidf_featurized_data)
tfidf_rescaled_data = idf_model.transform(tfidf_featurized_data)
tfidf_genre_features = tfidf_rescaled_data.select("genre", "features")

# Confusion matrix for TFIDF
tfidf_kmeansmodel = KMeans().setK(5).setFeaturesCol('features').setPredictionCol('prediction').fit(tfidf_genre_features)
tfidf_predictions = tfidf_kmeansmodel.transform(tfidf_genre_features).select("prediction", "genre")
tfidf_res = tfidf_predictions.groupBy(['prediction', 'genre']).count().collect()
print("Confusion matrix for TFIDF:")
toPrint(tfidf_res)
print()

#######################################################################
##                 Vocabulary Exploration - Part B                   ##
#######################################################################
# pretrained
pretrained_genre_features = genre_and_sentences_after_flatmap.mapPartitions(emb)
pretrained_dataFrame = pretrained_genre_features.map(toList).toDF(["genre", "features"])
new_schema = ArrayType(DoubleType(), containsNull=False)
result = model.transform(testData)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1 Score = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Trains a k-means model.
model = KMeans().setK(20).setSeed(1).fit(df_)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(df_)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg
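# Note that computeCost above is deprecated as of Spark 3.0. On a newer cluster, roughly the
# same evaluation could be done with ClusteringEvaluator (silhouette instead of WSSSE) or via
# the training summary (Spark 2.4+). A hedged sketch, reusing the fitted model and df_:
from pyspark.ml.evaluation import ClusteringEvaluator

clustered = model.transform(df_)
print("Silhouette = " + str(ClusteringEvaluator().evaluate(clustered)))
print("Training cost (WSSSE) = " + str(model.summary.trainingCost))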
#.drop("Player") # see github for scraping python pipeline (after abstract) # join player's cluster back to all_clean.bsv to ES for visualization and grouping cols_features = list(set(feature_data.columns) - {'Player'}) function = lambda df, column: df.withColumnRenamed(column, column+"_").withColumn(column, col(column+"_").cast("double")).drop(column+"_") feature_data_2 = reduce(function, cols_features, feature_data) VectorAss = VectorAssembler(inputCols = cols_features, outputCol = "features") vdf = VectorAss.transform(feature_data_2) kmeans = KMeans(k=numCluster, seed=1) kmm = kmeans.fit(vdf.select("features")) transformed = kmm.transform(vdf) #print kmm.clusterCenters() #print (type(kmm)) if os.path.exists(output_file): shutil.rmtree(output_file) transformed.drop("features").write.option("header", "true").csv(output_file) if os.path.exists(output_file_2): shutil.rmtree(output_file_2)
display(kmeans_df)

# Need to infer the schema correctly. Data are doubles, not strings
kmeans_df = sqlContext.read.format("com.databricks.spark.csv") \
    .option("header", "false").option("delimiter", " ").option("inferschema", "true") \
    .load("/FileStore/tables/1x1xr57q1502297004187/kmeans_data.txt")

# Prepare data for training (see later the explanation about ML Pipelines)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(inputCols=["C0", "C1", "C2"], outputCol="features")
assembler.transform(kmeans_df)

# Create the KMeans model
kmeans_estimator = KMeans().setFeaturesCol("features").setPredictionCol("prediction")

# Pipeline stages definition
pipeline = Pipeline(stages=[assembler, kmeans_estimator])

# Pipeline training
model = pipeline.fit(kmeans_df)

# Get the results:
results = model.transform(kmeans_df)

# Check results:
display(results)
tokenizer = Tokenizer(inputCol="MsgLine", outputCol="MsgLine2") wordsData = tokenizer.transform(df) remover = StopWordsRemover(inputCol='MsgLine2', outputCol='MsgLine3') wordsData = remover.transform(wordsData) TF = HashingTF(inputCol="MsgLine3", outputCol="Itf-MSG") tfidfDf = TF.transform(wordsData) #tfidfDf.show(truncate=False) idf = IDF(inputCol="Itf-MSG", outputCol="Itf-MSG2") idfd = idf.fit(tfidfDf) tfidf = idfd.transform(tfidfDf) kmeans = KMeans().setK(11).setSeed(1).setFeaturesCol('Itf-MSG2') model = kmeans.fit(tfidf) print(model.summary.trainingCost) transformed = model.transform(tfidf) transformed.show(truncate=False) transformed.printSchema() writetoFile = transformed.select("Thread", "Serial", "MsgLine", "prediction") writetoFile.repartition(1).write.csv('/home/vishnu/Desktop/out_cluster.csv') sc.stop() #wordsData.withColumn("words_clean", concat_ws(" - ",col("words_clean"))) #df_words.printSchema() #wordsData.show(truncate=False) # Calculate cost and plot
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    # If data doesn't have headers, create column names c0-cn
    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()

    # Set both train and test data to the entire dataset
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    # Create column names for vector assembler. Handle exclude columns for vector assembler
    exclude_cols = []  # No columns to exclude - kmeans of all columns
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    # Set hyper parameter search parameters
    k_range = pm_options.KRange.split(',')
    db_index_max = np.finfo(np.float64).max
    k_max = k_range[0]
    db_index_array = np.zeros(len(k_range))

    for index_hs in range(0, len(k_range)):
        vector_assembler = VectorAssembler(
            inputCols=input_col_names,
            outputCol="features")
        kmeans_pipe = KMeans(
            k=int(k_range[index_hs]),
            initMode="k-means||",
            initSteps=5,
            tol=1e-4,
            maxIter=100,
            featuresCol="features")
        full_pipe = [vector_assembler, kmeans_pipe]
        model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

        # Test validation and statistics collection
        ############################################################
        predicted_df = model_kmeans.transform(input_test)

        print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

        sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
        print("Sum of Errors for Kmeans = " + str(sum_errors))

        kmeans_centers = model_kmeans.stages[1].clusterCenters()
        print("Kmeans Centers: ")
        for center in kmeans_centers:
            print(center)

        # calculating stats
        ############################################################

        # Calculating inter-cluster distance
        inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                inter_cluster_distance[centerIndex1, centerIndex2] = \
                    eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])
        print("inter_cluster_distance = ", inter_cluster_distance)

        # Calculating intra-cluster distances and the bars for the cluster distribution
        intra_cluster_distance = np.zeros(len(kmeans_centers))
        cluster_dist = np.zeros(len(kmeans_centers))
        for centerIndex1 in range(0, len(kmeans_centers)):
            filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
            cluster_dist[centerIndex1] = filtered_df.count()
            if cluster_dist[centerIndex1] == 0:
                intra_cluster_distance[centerIndex1] = 0
            else:
                filtered_df = \
                    filtered_df.withColumn('distance',
                                           udf(eq_dist, FloatType())(
                                               col("features"),
                                               array([lit(v) for v in kmeans_centers[centerIndex1]])))
                intra_cluster_distance[centerIndex1] = \
                    filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

        # calculating Davies-Bouldin Index
        ############################################################
        # R[i,j] = (S[i] + S[j]) / M[i,j]
        # D[i] = max(R[i,j]) for i != j
        # DB = (1/K) * sum(D[i])
        r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                r_index[centerIndex1, centerIndex2] = 0
                if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                    r_index[centerIndex1, centerIndex2] = \
                        (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                        / inter_cluster_distance[centerIndex1, centerIndex2]
        d_index = np.max(r_index, axis=0)
        db_index = np.sum(d_index, axis=0) / len(kmeans_centers)
        db_index_array[index_hs] = db_index

        # Check hyper parameter search max
        if db_index < db_index_max:
            db_index_max = db_index
            k_max = k_range[index_hs]
            model_kmeans_max = model_kmeans
            sum_errors_max = sum_errors
            kmeans_centers_max = kmeans_centers
            inter_cluster_distance_max = inter_cluster_distance
            intra_cluster_distance_max = intra_cluster_distance
            cluster_dist_max = cluster_dist

    # PM stats
    ############################################################
    print("Optimal K = " + str(k_max))
    pm.set_stat("Optimal number of clusters", k_max, st.TIME_SERIES)

    print("Sum of Errors for Kmeans = " + str(sum_errors_max))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors_max, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index_max))
    pm.set_stat("Davies-Bouldin index", db_index_max, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(k_range)):
        tbl_col_name.append(str(k_range[j]))
    tbl = Table().name("Davies-Bouldin index for hyper parameter search").cols(tbl_col_name)
    tbl.add_row("Davies-Bouldin index:", ["%.2f" % x for x in db_index_array])
    pm.set_stat(tbl)

    tbl_col_name = []
    for j in range(0, len(kmeans_centers_max)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers_max)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance_max[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance_max])
    pm.set_stat(tbl)

    if (len(kmeans_centers_max) < 6) & (len(kmeans_centers_max[0]) < 12):
        tbl_col_name1 = []
        for j in range(0, len(kmeans_centers_max[0])):
            tbl_col_name1.append(str(j))
        tbl = Table().name("Centers (for K<6, Attr<12)").cols(tbl_col_name1)
        for j in range(0, len(kmeans_centers_max)):
            tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers_max[j]])
        pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Distribution").cols(tbl_col_name).data(cluster_dist_max.tolist())
    pm.set_stat(bar)

    return model_kmeans_max