Example #1
#spark-submit --master local[*] --packages com.databricks:spark-csv_2.10:1.2.0 cluster.py

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer
from pyspark.ml.clustering import KMeans

sc = SparkContext()
sqlContext = SQLContext(sc)
text = sc.textFile('file:/Users/wangmengyuan/Desktop/rr/listings.txt').map(lambda l:l.split('\t'))\
 .map(lambda l: (l[0],l[1]))
df = sqlContext.createDataFrame(text, ["houseid", "description"])
tokenizer = Tokenizer(inputCol="description", outputCol="tokens")
tokenized = tokenizer.transform(df).cache()
remover = StopWordsRemover(inputCol="tokens",
                           outputCol="stopWordsRemovedTokens")
stopWordsRemoved_df = remover.transform(tokenized).cache()
hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                      outputCol="rawFeatures",
                      numFeatures=200)
tfVectors = hashingTF.transform(stopWordsRemoved_df).cache()
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors).cache()
normalizer = Normalizer(inputCol="features", outputCol="normFeatures")
l2NormData = normalizer.transform(tfIdfVectors)
kmeans = KMeans().setK(10).setMaxIter(20)
km_model = kmeans.fit(l2NormData)
clustersTable = km_model.transform(l2NormData)

#save to hdfs
df1 = clustersTable[['houseid', 'prediction']]
#df1.select('houseid', 'prediction').write.format('com.databricks.spark.csv').save('cluster.csv')
df1.select('houseid', 'prediction').show(20)
sc.stop()
Example #2
"""
@author: kach
"""

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Loads data.
dataset = spark.read.format("libsvm").load("data/sample_kmeans_data.txt")

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
Example #3
			
# I converted the CSV to a Parquet file to save space and time

import time

import numpy as np
from pyspark import StorageLevel
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import DenseVector


def lis(x):
    # Parse a string like "[0.1,0.2,...]" into a list of floats
    return [float(i) for i in x[1:-1].split(',')]

spark.read.load("train_fet.csv", format="csv", inferSchema="true", header="true").rdd \
          .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
          .toDF(["index", "file", "features"]) \
          .write.parquet("train_fet.parquet")

# Now I create the Bag of Visual Words representation using K-means

schema = spark.read.parquet("train_fet.parquet").persist(StorageLevel(True, True, False, False, 1))
start = time.clock()
kmeans = KMeans(k=K, initMode='random')
print(time.clock()-start)
start = time.clock()
model = kmeans.fit(schema)
print(time.clock()-start)
start = time.clock()
centers = model.clusterCenters()
print(time.clock()-start)
model.save('KmeansModel')

# Next I create the Hamming Embedding Matrix

G = np.random.randn(db, d)
P, _ = np.linalg.qr(G)
np.save('P.npy', P)
Example #4
df2.printSchema()

df2.show(50)

# Perform unsupervised learning on df2 with k-means.
# You can use the whole of df2 as both training and testing data,
# and evaluate the clustering result using accuracy (see the sketch below).
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

df3 = df2

kmeans = KMeans(k=2, seed=1)  # 2 clusters here
model = kmeans.fit(df3.select('features'))

transformed = model.transform(df3)
transformed.show(50)
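# The instructions above ask for an accuracy-style evaluation; a minimal sketch,
# assuming df2 carries a numeric "label" column (not shown in this snippet) and that
# the cluster ids happen to line up with that label encoding:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
preds_for_eval = (transformed
                  .withColumn("prediction", transformed["prediction"].cast("double"))
                  .withColumn("label", transformed["label"].cast("double")))
accuracy = acc_evaluator.evaluate(preds_for_eval)
print("Accuracy = " + str(accuracy))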

#Generate a scatter plot using the first two PCA components to investigate the data distribution.
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df3 = spark.createDataFrame(data, ["features"])

pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
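# The snippet is cut off after defining the PCA stage; a minimal sketch of the usual
# follow-up (assuming matplotlib is available): fit the PCA model, project onto the
# two components, and draw the scatter plot the comment above asks for.
import matplotlib.pyplot as plt

pca_model = pca.fit(df3)
pca_rows = pca_model.transform(df3).select("pcaFeatures").collect()
xs = [row.pcaFeatures[0] for row in pca_rows]
ys = [row.pcaFeatures[1] for row in pca_rows]
plt.scatter(xs, ys)
plt.xlabel("PCA component 1")
plt.ylabel("PCA component 2")
plt.show()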
Example #5
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans

if __name__ == "__main__":
    spark = SparkSession.builder.appName("Q1").config(
        "spark.some.config.option", "some-value").getOrCreate()
    sc = spark.sparkContext
    rates = sc.textFile("itemusermat").map(lambda x: x.split(" ")).map(lambda x: [int(i) for i in x])\
        .collect()

    # Fit a k-means model to the rating matrix
    rates = [(Vectors.dense(x), ) for x in rates]
    df = spark.createDataFrame(rates, ["features"])
    kmeans = KMeans(k=10, seed=1)
    model = kmeans.fit(df)

    # group all movies by cluster
    transformed = model.transform(df).select("prediction", "features").rdd.map(lambda x: (int(x.prediction), int(list(x.features)[0]))). \
        groupByKey().mapValues(lambda x: list(x)[:5])

    # Create (movie_id, cluster_id) pairs from x, which has the form (cluster_id, [id1, id2, id3, ...])
    fiveincluster = transformed.flatMap(lambda x: [(a, x[0]) for a in x[1]])

    # read movie info file
    moviedetails = sc.textFile("movies.dat").map(lambda x: x.split("::")).map(
        lambda x: (int(x[0]), (x[1], x[2])))

    # join the (movie_id, cluster_id) and movie info
    result = fiveincluster.join(moviedetails).map(lambda x: (x[1][0], x[0], x[
Example #6
    " ")
typedData = csvData

for colName in columnsToKeep:
    typedData = typedData.withColumn(
        colName, typedData[colName].cast(IntegerType()).alias(colName))

typedData = typedData.na.drop()
print(typedData.schema)

assembler = VectorAssembler().setInputCols(columnsToKeep).setOutputCol(
    "features")
dataWithFeatures = assembler.transform(typedData)
dataWithFeatures.show()

normalizer = Normalizer().setInputCol("features").setOutputCol("normFeatures")
normData = normalizer.transform(dataWithFeatures)

kmeans = KMeans().setK(5).setFeaturesCol("normFeatures")
model = kmeans.fit(normData)

predictions = model.transform(normData)
predictions.select("features", "prediction").show()

evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

spark.stop()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.getOrCreate()

#Reading points
pointsData    = spark.read.csv("/user/s2279444/Cluster3/*")
pointsData    = pointsData.selectExpr("_c0 as Latitudes", "_c1 as Longitudes")
pointsData    = pointsData.withColumn("Latitudes", pointsData["Latitudes"].cast("double"))
pointsData    = pointsData.withColumn("Longitudes", pointsData["Longitudes"].cast("double"))

#Pre Processing the dataset
columns       = pointsData.columns
assembler     = VectorAssembler(inputCols=columns,outputCol="features")
dataset       = assembler.transform(pointsData)

kValue        = 4
kmeans        = KMeans().setK(kValue).setSeed(1)
model         = kmeans.fit(dataset)
predictions   = model.transform(dataset)

finalData     = predictions.select("Latitudes","Longitudes", "prediction")

for i in range(0,kValue):
	data = finalData.filter(finalData["prediction"] == i)
	data = data.select(data["Latitudes"], data["Longitudes"])
	dirName = "Cluster33_"+str(i)
	print("ASHISH: Running for cluster... dirName ->",dirName)
	data.write.csv(dirName)
Example #8
    inputColumns = [x.name for x in numerical]
    for name in map(lambda x: x.name, nonnumerical):
        if (df.select([name]).distinct().count() > 1):
            model = StringIndexer(inputCol=name,
                                  outputCol=name + " Index").fit(df)
            indexed = model.transform(df)

            encoder = OneHotEncoder(inputCol=name + " Index",
                                    outputCol=name + " Vec")
            df = encoder.transform(indexed)

            inputColumns.append(name + " Vec")
    """
    STEP3
    Clustering
    """

    assembler = VectorAssembler().setInputCols(inputColumns).setOutputCol(
        "features")
    kmeans = KMeans().setK(k).setFeaturesCol("features").setPredictionCol(
        "prediction")

    df = assembler.transform(df)
    clusters = kmeans.fit(df).transform(df)
    """
    STEP4
    Write file to S3
    """

    clusters.write.json(outputFile)
transformed_data = assembler.transform(dataset)

# In[10]:

transformed_data.toPandas().head()

# ### Define the clustering model
# Use K-means clustering
# * <b>k: </b>Defines the number of clusters
# * <b>seed: </b>This value is used to set the cluster centers. A different value of seed for the same k will result in clusters being defined differently. In order to reproduce similar clusters when re-running the clustering algorithm use the same values of k and seed

# In[11]:

from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=5, seed=3)
model = kmeans.fit(transformed_data)

# #### Create the clusters using the model

# In[12]:

clusterdData = model.transform(transformed_data)

# #### Use ClusteringEvaluator to evaluate the clusters
# <b>From Wikipedia: </b>The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.

# In[13]:

from pyspark.ml.evaluation import ClusteringEvaluator
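# The cell is cut off here; a minimal sketch of the usual next step, scoring the
# clusters produced above with the silhouette metric (mirroring the other examples
# in this collection):
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(clusterdData)
print("Silhouette with squared euclidean distance = " + str(silhouette))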
Example #10
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors, VectorUDT

sc = SparkContext('local')
spark = SparkSession(sc)

sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true',
    inferschema='true').load('/home/cloudera/Desktop/datatest.csv')
feature = StringIndexer(inputCol="Hotttness", outputCol="target")
target = feature.fit(data).transform(data)


def transData(row):
    return Row(label=row["target"],
               features=Vectors.dense([
                   row["Duration"], row["KeySignature"],
                   row["KeySignatureConfidence"], row["Tempo"],
                   row["TimeSignature"], row["TimeSignatureConfidence"]
               ]))


transformed = target.rdd.map(transData).toDF()
kmeans = KMeans(k=4)
model = kmeans.fit(transformed)
predict_data = model.transform(transformed)
train_err = predict_data.filter(
    predict_data['label'] != predict_data['prediction']).count()
total = predict_data.count()
print("2333333333333333333333333333333333333333333333333333333333333333333333333333333333333333")
print(train_err, total, float(train_err) / total)
print("2333333333333333333333333333333333333333333333333333333333333333333333333333333333333333")
Example #11
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans

# `data` (the input DataFrame) and `n` (the number of clusters) are assumed to be
# defined earlier in the original script.
cols = [
    'Session_Connection_Time', 'Bytes_Transferred', 'Kali_Trace_Used',
    'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
]

#Assembling The Features
assembler = VectorAssembler(inputCols=cols, outputCol='features')

#Creating the new Dataframe with Features
assembled_data = assembler.transform(data)

#Scaling the Features
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

scaler_model = scaler.fit(assembled_data)

scaled_data = scaler_model.transform(assembled_data)

#Creating the Model
k_means = KMeans(featuresCol='scaledFeatures', k=n)

#Training The Model
model = k_means.fit(scaled_data)

#Prediction
model_data = model.transform(scaled_data)

#Grouping and Displaying By Cluster
model_data.groupBy('prediction').count().show()
dataframe_mysql.show()
# read demographic data
demographic_df = spark.read.csv("data/demographic.csv",
                                inferSchema="true",
                                header="true")

# assemble the features vector
vecAssembler = VectorAssembler(inputCols=[
    columns_names.gender, columns_names.education, columns_names.age,
    columns_names.longitude, columns_names.latitude
],
                               outputCol="features")
demographic_df = vecAssembler.transform(demographic_df)

# Trains a k-means model.
kmeans = KMeans().setK(5).setInitMode("k-means||").setSeed(1).setFeaturesCol(
    "features")

#print(kmeans.explainParams())

model = kmeans.fit(demographic_df)

# Make predictions
predictions = model.transform(demographic_df)
predictions.show()

predictions = predictions.drop(predictions["features"])
predictions.show()
#predictions = predictions.toPandas()

# inferSchema is a read option, so it is dropped on write
predictions.coalesce(1).write.option("header", "true").csv("data/5_clusters.csv")
Example #13
from pyspark.sql import Row
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors  # ml (not mllib) vectors are required by pyspark.ml KMeans


def transData(row):
    return Row(label=row["player_name"],
               features=Vectors.dense([
                   row["SHOT_DIST"], row["CLOSE_DEF_DIST"], row["SHOT_CLOCK"],
                   row["SHOT_DIST2"]
               ]))


# Convert to DataFrame format
transformed = target.map(transData).toDF()
kmeans = KMeans(k=3)
model = kmeans.fit(transformed)

predict_data = model.transform(transformed)

train_err = predict_data.filter(
    predict_data['label'] != predict_data['prediction']).count()

total = predict_data.count()

print(float(train_err), total, float(train_err) / total)

#
# # Load data
# data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),(Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
# df = spark.createDataFrame(data, ["features"])
Example #14
# Load in all the mongo data from the database and collection defined
df = spark.read.format("mongo").load()

# Parse game data and generate numeric values so we can feed it into kmeans
parsedData = df.rdd.map(transformChessData)

# Convert PipelinedRDD to dataframe
sqlContext = SQLContext(sc)
schemaFeatures = sqlContext.createDataFrame(parsedData)

# Normalize all the columns
normalizedDF = normalizeData(
    schemaFeatures, ["w_attack", "w_defend", "b_attack", "b_defend", "evals"])
# normalizedDF.show()

# Combine all normalized columns into one "features" column
assembler = VectorAssembler(inputCols=[
    "w_attack_norm", "w_defend_norm", "b_attack_norm", "b_defend_norm",
    "evals_norm"
],
                            outputCol="features")

training = assembler.transform(normalizedDF)

# Build the model (cluster the data)
kmeans = KMeans(k=2, maxIter=100)
model = kmeans.fit(training)

model.save("KMeansModel_final_both_norm")
for center in model.clusterCenters():
    print(center)
def __train_with_clustering(self, df):
    kmeans = KMeans().setK(2).setSeed(1)
    return kmeans.fit(df)

idf = (IDF().setInputCol(
    hashingTF.getOutputCol()).setOutputCol('idf'))

normalizer = (Normalizer().setInputCol(
    idf.getOutputCol()).setOutputCol('features'))

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages.  We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`.  This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = (KMeans().setFeaturesCol('features').setPredictionCol(
    'prediction').setK(5).setSeed(0))

pipeline = Pipeline().setStages(
    [tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(parsed)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's take a look at a sample of the data to see if we can see a pattern between predicted clusters and titles.  We'll use a stratified sample to over-weight the less frequent predictions for inspection purposes.

# COMMAND ----------

predictions = model.transform(parsed)
stratifiedMap = {0: .03, 1: .04, 2: .06, 3: .40, 4: .005}
sampleDF = predictions.sampleBy('prediction', stratifiedMap, 0)
Example #17
    data = StringIndexer(inputCol="Embarked",
                         outputCol="EmbarkIndex").fit(data).transform(data)

    data = OneHotEncoder(inputCol="EmbarkIndex",
                         outputCol="EmbarkVec").transform(data)

    final_data = VectorAssembler(
        inputCols=["Survived", "Pclass", "SexVec", "Age", "Fare", "EmbarkVec"],
        outputCol="features").transform(data)

    # Split data into train and test sets
    # Not necessary for clustering

    # Model training
    kmeans = KMeans(k=5)
    model = kmeans.fit(final_data)

    # Transform the test data using the model to get predictions
    clustered_data = model.transform(final_data)

    # Prediction and model status
    clustered_data_sorted = clustered_data.orderBy("prediction")
    clustered_data_sorted.show(10000)

    clustered_data.groupBy("prediction").agg(avg("Survived"),
                                             avg("Pclass"),
                                             avg("Age"),
                                             avg("Fare"),
                                             avg("SexIndex"),
                                             avg("EmbarkIndex"),
Example #18
    .option("mode", "DROPMALFORMED") \
    .csv("file:///Users/beginspark/Temp/data3.csv")

d1.printSchema()

d2 = d1.toDF("number", "name", "SI", "GOO", "DONG", "x", "y", "b_code", "h_code", "utmk_x", "utmk_y", "wtm_x", "wtm_y")

d3 = d2.select(d2.GOO.alias("loc"), d2.x, d2.y)

d3.show(5, False)

indexer = StringIndexer(inputCol="loc", outputCol="loccode")

assembler = VectorAssembler(inputCols=["loccode", "x", "y"], outputCol="features")

kmeans = KMeans(k=5, seed=1, featuresCol="features")

pipeline = Pipeline(stages=[indexer, assembler, kmeans])

model = pipeline.fit(d3)

d4 = model.transform(d3)

d4.groupBy("prediction") \
    .agg(functions.collect_set("loc").alias("loc")) \
    .orderBy("prediction").show(100, False)

WSSSE = model.stages[2].computeCost(d4)
print("Within Set Sum of Squared Errors = %d" % WSSSE)

print("Cluster Centers: ")
Example #19
# use the model that has min RMSE
num_iter, param = 200, 0.2
als = ALS(maxIter=num_iter,
          regParam=param,
          userCol="user_id",
          itemCol="book_id",
          ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(ratings)

user_feature = model.userFactors
book_feature = model.itemFactors

k = 50
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
model = kmeans.fit(user_feature)
transformed = model.transform(user_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)

df.write.jdbc(url='jdbc:%s' % url + 'yelp',
              table='book_user_feature200_50',
              mode='overwrite',
              properties=properties)

k = 50
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
model = kmeans.fit(book_feature)
transformed = model.transform(book_feature).select('id', 'prediction')
rows = transformed.collect()
                           format="csv",
                           sep=",",
                           inferSchema="true",
                           header="true")

    columnaInicial = int(os.environ.get('COLUMNA_INICIAL'))
    columnaFinal = int(os.environ.get('COLUMNA_FINAL'))
    array = data.columns

    start_time = time()  # Start timing

    data = VectorAssembler(inputCols=array[columnaInicial:columnaFinal],
                           outputCol="features").transform(data)

    # Trains a k-means model.
    kmeans = KMeans().setK(4)
    model = kmeans.fit(data)

    elapsed_time = time() - start_time
    elapsed_time = format(elapsed_time, '.6f')
    salida = 'Execution time: ' + str(elapsed_time) + ' seconds'

    # Make predictions
    predictions = model.transform(data)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
Example #21
# In[11]:


# Applying K-Means(12) on Pickup Trimmed Coordinates to get Zones/Neighbourhood 
vecAssembler = VectorAssembler(inputCols=['Pickup Trimmed Long', 'Pickup Trimmed Lat'], outputCol="features")
vector_df = vecAssembler.transform(sample_data)  # Vectorizing the features

k = 12
# for k in range(10,20):
#     kmeans = KMeans().setK(k).setSeed(1)
#     model = kmeans.fit(vector_df)
#     cost = model.computeCost(vector_df)
#     print(k, "Within Set Sum of Squared Errors = " + str(cost))

kmeans = KMeans().setK(k).setSeed(1)
model = kmeans.fit(vector_df)
transformed_data = model.transform(vector_df)


# In[12]:


# Extracting the Street Addresses and Zipcodes of Area Zones I got from K-Means
centers = model.clusterCenters()

street = list()
zipcode = list()
for center in centers:
    long, lat = round(center[0],3), round(center[1], 3)
    data = requests.get('https://nominatim.openstreetmap.org/reverse?format=json&lat={}&lon={}&zoom=18&addressdetails=1'.format(lat, long))
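    # The snippet is cut off here; a sketch of the usual follow-up (an assumption
    # about the Nominatim response layout): pull the street and zipcode out of the
    # reverse-geocoding JSON.
    address = data.json().get('address', {})
    street.append(address.get('road'))
    zipcode.append(address.get('postcode'))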
Example #22
    """.format(args.agg_table_name))

    agg_table = spark.table(args.agg_table_name)

    existing_tables = [table.name for table in spark.catalog.listTables()]
    # K-means on artist features
    if args.feature_kmeans_table_name not in existing_tables:
        # normalize features
        va = VectorAssembler(inputCols=[column for column in agg_table.columns if column != "a_id"], outputCol="raw_features")
        feature_table = va.transform(agg_table)
        standard_scaler = StandardScaler(inputCol="raw_features", outputCol="features")
        feature_table = standard_scaler.fit(feature_table).transform(feature_table).select("a_id", "raw_features", "features")
        feature_table.show()

        # k-means
        kmeans = KMeans(k=100)
        model = kmeans.fit(feature_table)
        clustered = model.transform(feature_table).select("a_id", "prediction")
        #clustered.show()
        clustered.write.saveAsTable(args.feature_kmeans_table_name, format="orc", mode="error")

    if args.smoothed_kmeans_table_name not in existing_tables:
        # Compute artist collaboration graph as edge list with self-loop
        collaboration = spark.sql("select a.artist_id node, b.artist_id neighbor from track_artists a, track_artists b where a.track_id = b.track_id") # and a.artist_id != b.artist_id
        collaboration.registerTempTable("collaboration")
        # Smooth the features of artists by averaging over their neighbors. For artist with no collaborator, its features should remain unchanged.
        artist_features = spark.sql("""select node, avg(am.a_track_number) track_number, avg(am.a_mode) modality, avg(am.a_acousticness) acousticness, avg(am.a_danceability) danceability, avg(am.a_energy) energy,
            avg(am.a_loudness) loudness, avg(am.a_speechiness) speechiness, avg(am.a_instrumentalness) instrumentalness, avg(am.a_liveness) liveness, avg(am.a_valence) valence, avg(am.a_tempo) tempo
            from collaboration, {0} am where am.a_id = neighbor
            group by node
        """.format(args.agg_table_name))
Example #23
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

csv_kt_file_name = "/user/ubuntu/kim/merge_ratio.csv"
# Loads data.
#dataset = spark.read.format("libsvm").load('a.txt')
kt = spark.read.csv(csv_kt_file_name,header=True,inferSchema=True)
# A step is still needed to convert kt into the txt/libsvm format
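# `dataset` below is only defined by the commented-out libsvm line above; a minimal
# sketch (assuming every column of merge_ratio.csv is numeric) of building it from
# the CSV `kt` instead:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=kt.columns, outputCol="features")
dataset = assembler.transform(kt)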


# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

####################
import pandas as pd
Example #24
            palette="muted",
            data=df_show.toPandas())
plt.show()
df_show.toPandas().hist(
    bins=[1000, 2000, 4000, 6000, 8000, 10000, 15000, 20000, 25000])
plt.show()

# K-means can only be applied to numeric data, so the non-numeric columns are removed at this point
# Data
df_num = data.drop("protocol_type", "service", "flag").cache()
col = df_num.columns
# FeatureVector
assembler = VectorAssembler(inputCols=col[:-1], outputCol='featureVector')

# model
kmeans = KMeans(predictionCol="cluster", k=2, featuresCol='featureVector')

# pipeline to process it
pipeline = Pipeline(stages=[assembler, kmeans])
pipModel = pipeline.fit(df_num)
prediction = pipModel.transform(df_num)
prediction.select("cluster", "label").groupBy(
    "cluster", "label").count().orderBy("cluster", "label",
                                        ascending=True).show(25)

# ## Choice of k
cost = np.zeros(6)
i = 0
for k in range(20, 140, 20):
    kmea = KMeans().setK(k).setSeed(1).setFeaturesCol("featureVector")
    model = kmea.fit(prediction.sample(False, 0.1, seed=42))
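    # The original snippet is cut off here; presumably (an assumption) the loop goes
    # on to record each model's training cost for an elbow-style plot, e.g.:
    cost[i] = model.summary.trainingCost
    i += 1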
Example #25
genre_and_sentences_after_flatmap.persist()

# TFIDF
tfidf_dataFrame = genre_and_sentences_after_flatmap.toDF(["genre","sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tfidf_words_data = tokenizer.transform(tfidf_dataFrame)

hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=512)
tfidf_featurized_data = hashing_tf.transform(tfidf_words_data)

idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(tfidf_featurized_data)
tfidf_rescaled_data = idf_model.transform(tfidf_featurized_data)
tfidf_genre_features = tfidf_rescaled_data.select("genre", "features")

# Confusion matrix for TFIDF
tfidf_kmeansmodel = KMeans().setK(5).setFeaturesCol('features').setPredictionCol('prediction').fit(tfidf_genre_features)
tfidf_predictions = tfidf_kmeansmodel.transform(tfidf_genre_features).select("prediction", "genre")
tfidf_res = tfidf_predictions.groupBy(['prediction', 'genre']).count().collect()
print("Confusion matrix for TFIDF:")
toPrint(tfidf_res)
print()

#######################################################################
## Vocabulary Exploration - Part B                                   ##
#######################################################################

# pretrained
pretrained_genre_features = genre_and_sentences_after_flatmap.mapPartitions(emb)
pretrained_dataFrame = pretrained_genre_features.map(toList).toDF(["genre","features"])

new_schema = ArrayType(DoubleType(), containsNull=False)
result = model.transform(testData)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1 Score = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Trains a k-means model.
model = KMeans().setK(20).setSeed(1).fit(df_)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(df_)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg
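# The notebook cell is cut off here; a minimal sketch (an assumption) of the usual
# follow-up, reusing the "features" column fed to the k-means fit above: project
# df_ onto two principal components for plotting.
pca = PCAml(k=2, inputCol="features", outputCol="pca")
pca_model = pca.fit(df_)
pca_result = pca_model.transform(df_).select("pca")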
Example #27
    #.drop("Player")

# see github for scraping python pipeline (after abstract)

# join player's cluster back to all_clean.bsv to ES for visualization and grouping

cols_features = list(set(feature_data.columns) - {'Player'})

function = lambda df, column: df.withColumnRenamed(column, column+"_").withColumn(column, col(column+"_").cast("double")).drop(column+"_")

feature_data_2 = reduce(function, cols_features, feature_data)

VectorAss = VectorAssembler(inputCols = cols_features, outputCol = "features")
vdf = VectorAss.transform(feature_data_2)

kmeans = KMeans(k=numCluster, seed=1)

kmm = kmeans.fit(vdf.select("features"))

transformed = kmm.transform(vdf)

#print kmm.clusterCenters()

#print (type(kmm))

if os.path.exists(output_file):
  shutil.rmtree(output_file)
transformed.drop("features").write.option("header", "true").csv(output_file)

if os.path.exists(output_file_2):
  shutil.rmtree(output_file_2)
Example #28
display(kmeans_df) 

# The schema needs to be inferred correctly: the data are doubles, not strings
kmeans_df = sqlContext.read.format("com.databricks.spark.csv") \
  .option("header", "false").option("delimiter"," ").option("inferschema", "true") \
  .load("/FileStore/tables/1x1xr57q1502297004187/kmeans_data.txt")
  
# Prepare data for training (see later the explanation about ML Pipelines)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(inputCols=["C0","C1","C2"], outputCol="features") 
assembler.transform(kmeans_df)

# Create the KMeans model
kmeans_estimator = KMeans().setFeaturesCol("features").setPredictionCol("prediction")
    
# Pipeline stages definition
pipeline = Pipeline(stages=[assembler, kmeans_estimator])

# Pipeline training
model = pipeline.fit(kmeans_df)

# Get the results: 
results = model.transform(kmeans_df)

# Check results:
display(results) 


Example #29
tokenizer = Tokenizer(inputCol="MsgLine", outputCol="MsgLine2")
wordsData = tokenizer.transform(df)

remover = StopWordsRemover(inputCol='MsgLine2', outputCol='MsgLine3')
wordsData = remover.transform(wordsData)

TF = HashingTF(inputCol="MsgLine3", outputCol="Itf-MSG")
tfidfDf = TF.transform(wordsData)
#tfidfDf.show(truncate=False)

idf = IDF(inputCol="Itf-MSG", outputCol="Itf-MSG2")
idfd = idf.fit(tfidfDf)
tfidf = idfd.transform(tfidfDf)

kmeans = KMeans().setK(11).setSeed(1).setFeaturesCol('Itf-MSG2')
model = kmeans.fit(tfidf)
print(model.summary.trainingCost)
transformed = model.transform(tfidf)
transformed.show(truncate=False)
transformed.printSchema()
writetoFile = transformed.select("Thread", "Serial", "MsgLine", "prediction")
writetoFile.repartition(1).write.csv('/home/vishnu/Desktop/out_cluster.csv')

sc.stop()

#wordsData.withColumn("words_clean", concat_ws(" - ",col("words_clean")))

#df_words.printSchema()
#wordsData.show(truncate=False)
# Calculate cost and plot
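# A sketch for the "Calculate cost and plot" note above (an assumption; it would
# have to run before the sc.stop() call further up): sweep k and plot the training cost.
import matplotlib.pyplot as plt

costs = []
ks = range(2, 16)
for k_val in ks:
    km = KMeans().setK(k_val).setSeed(1).setFeaturesCol('Itf-MSG2')
    costs.append(km.fit(tfidf).summary.trainingCost)
plt.plot(list(ks), costs)
plt.xlabel('k')
plt.ylabel('training cost (WSSSE)')
plt.show()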
Example #30
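# Imports this snippet appears to rely on (an assumption based on the names used
# below), plus a minimal sketch of the `eq_dist` helper it references but never
# defines here: a plain Euclidean distance between two equal-length vectors.
# (`pm`, `st`, `Table`, and `BarGraph` come from the original reporting library and
# are left as-is.)
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf, col, array, lit, sum
from pyspark.sql.types import FloatType


def eq_dist(vec1, vec2):
    # L2 distance; vec1 may be a Spark ML vector, vec2 a list/array of floats
    return float(np.sqrt(np.sum((np.array(vec1) - np.array(vec2)) ** 2)))
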
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    # If the data has no headers, create column names c0..cn
    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()

    # Set both train and test data to the entire dataset
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    # Build the input column list for the vector assembler, honoring any excluded columns
    exclude_cols = [] # No columns to exclude - kmeans of all columns
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    # Set the hyperparameter search range for k
    k_range = pm_options.KRange.split(',')
    db_index_max = np.finfo(np.float64).max
    k_max = k_range[0]
    db_index_array = np.zeros(len(k_range))

    for index_hs in range (0,len(k_range)):
        vector_assembler = VectorAssembler(
                inputCols=input_col_names,
                outputCol="features")
        kmeans_pipe = KMeans(
            k=int(k_range[index_hs]),
            initMode="k-means||",
            initSteps=5,
            tol=1e-4,
            maxIter=100,
            featuresCol="features")
        full_pipe = [vector_assembler, kmeans_pipe]
        model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

        # Test validation and statistics collection
        ############################################################
        predicted_df = model_kmeans.transform(input_test)

        print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

        sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
        print("Sum of Errors for Kmeans = " + str(sum_errors))

        kmeans_centers = model_kmeans.stages[1].clusterCenters()
        print("Kmeans Centers: ")
        for center in kmeans_centers:
            print(center)

        # calculating stats
        ############################################################

        # Calculating Inter cluster distance
        inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))

        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                inter_cluster_distance[centerIndex1, centerIndex2] = \
                    eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])

        print("inter_cluster_distance = ", inter_cluster_distance)
        
        # Calculating Intra cluster distances and the bars for the cluster distribution
        intra_cluster_distance = np.zeros(len(kmeans_centers))
        cluster_dist = np.zeros(len(kmeans_centers))

        for centerIndex1 in range(0, len(kmeans_centers)):
            filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
            cluster_dist[centerIndex1] = filtered_df.count()
            if cluster_dist[centerIndex1] == 0:
                intra_cluster_distance[centerIndex1] = 0
            else:
                filtered_df = \
                    filtered_df.withColumn('distance',
                                           udf(eq_dist, FloatType())(col("features"),
                                                                     array([lit(v) for v in kmeans_centers[centerIndex1]])))
                intra_cluster_distance[centerIndex1] = \
                    filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

        # calculating Davies-Boulding Index
        ############################################################
        # R[i,j] = (S[i] + S[j])/M[i,j]
        # D[i] = max(R[i,j]) for i !=j
        # DB = (1/K) * sum(D[i])
        r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                r_index[centerIndex1, centerIndex2] = 0
                if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                    r_index[centerIndex1, centerIndex2] = \
                        (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                        / inter_cluster_distance[centerIndex1, centerIndex2]
        d_index = np.max(r_index, axis=0)
        db_index = np.sum(d_index, axis=0) / len(kmeans_centers)
        db_index_array[index_hs] = db_index

        # Keep the model with the lowest Davies-Bouldin index found so far
        if (db_index < db_index_max):
            db_index_max = db_index
            k_max = k_range[index_hs]
            model_kmeans_max = model_kmeans
            sum_errors_max = sum_errors
            kmeans_centers_max = kmeans_centers
            inter_cluster_distance_max = inter_cluster_distance
            intra_cluster_distance_max = intra_cluster_distance
            cluster_dist_max = cluster_dist




    # PM stats
    ############################################################
    print("Optimal K = " + str(k_max))
    pm.set_stat("Optimal number of clusters", k_max, st.TIME_SERIES)

    print("Sum of Errors for Kmeans = " + str(sum_errors_max))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors_max, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index_max))
    pm.set_stat("Davies-Bouldin index", db_index_max, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(k_range)):
        tbl_col_name.append(str(k_range[j]))
    tbl = Table().name("Davies-Bouldin index for hyper parameter Search").cols(tbl_col_name)
    tbl.add_row("Davies-Bouldin index:", ["%.2f" % x for x in db_index_array])
    pm.set_stat(tbl)

    tbl_col_name = []
    for j in range(0, len(kmeans_centers_max)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers_max)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance_max[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance_max])
    pm.set_stat(tbl)

    if (len(kmeans_centers_max) < 6) & (len(kmeans_centers_max[0]) < 12):
        tbl_col_name1 = []
        for j in range(0, len(kmeans_centers_max[0])):
            tbl_col_name1.append(str(j))
        tbl = Table().name("Centers (for K<6, Attr<12)").cols(tbl_col_name1)
        for j in range(0, len(kmeans_centers_max)):
            tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers_max[j]])
        pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Destribution").cols(tbl_col_name).data(cluster_dist_max.tolist())
    pm.set_stat(bar)


    return model_kmeans_max