Example #1
File: engine.py Project: jleaniz/bdsa
    def initializeModels(self):
        # Assumes module-level imports: Row (pyspark.sql), Word2Vec
        # (pyspark.ml.feature), and KMeans (pyspark.ml.clustering).
        # Skip the expensive pipeline if the clustered DataFrame already exists.
        try:
            if self.kmeansDF is not None:
                logger.info("Already loaded this DataFrame")
                return
        except AttributeError:
            self.kmeansDF = None

        # Tokenize each command string so Word2Vec can treat it as a word sequence.
        # (On Spark 2.x+ use self.bashDF.rdd.map, since DataFrame.map was removed.)
        commandsDF = self.bashDF.map(lambda row: Row(date=row.date,
                                                     source=row.source,
                                                     username=row.username,
                                                     exec_as=row.exec_as,
                                                     srcip=row.srcip,
                                                     command=row.command.split(" "))).toDF()
        commandsDF.cache()

        # Embed each command as the average of its token word vectors.
        word2Vec = Word2Vec(vectorSize=100, minCount=1, inputCol="command", outputCol="features")
        w2model = word2Vec.fit(commandsDF)
        resultDF = w2model.transform(commandsDF)
        resultDF.cache()

        # Cluster the command embeddings; rare clusters are flagged as outliers below.
        kmeans = KMeans(k=650, seed=42, featuresCol="features", predictionCol="prediction", maxIter=10, initSteps=3)
        kmodel = kmeans.fit(resultDF)

        # Persist the clustered DataFrame on the instance so the guard above works.
        self.kmeansDF = kmodel.transform(resultDF)
        self.kmeansDF.cache()
        self.kmeansDF.coalesce(1).write.parquet('/user/jleaniz/ml/kmeans', mode='append')

        # Treat sparsely populated clusters (fewer than 10 members) as outliers.
        outliers = self.kmeansDF.groupBy("prediction").count().filter('count < 10').withColumnRenamed("prediction", "cluster")

        self.outlierCmds = outliers.join(self.kmeansDF, self.kmeansDF.prediction == outliers.cluster)
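
A quick way to sanity-check the result is to pull a few rows out of outlierCmds once the models are built. A minimal sketch, assuming a hypothetical engine instance named engine and the column names constructed above:

# Hypothetical usage: show commands that fell into rare (<10 member) clusters.
engine.initializeModels()
engine.outlierCmds.select('cluster', 'username', 'command').show(10, truncate=False)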
Example #2
# Fit the scaler on the assembled feature vectors, then apply the learned
# transformation. (scaler is assumed to be defined in an earlier cell, e.g. a
# pyspark.ml.feature StandardScaler.)
scaler_model = scaler.fit(final_df)
final_df = scaler_model.transform(final_df)
final_df.show(6)

# In[28]:

import numpy as np
import matplotlib.pyplot as plt
from time import time

from pyspark.ml.clustering import KMeans  # assumed import; needed for the loop below

# Elbow method: fit k-means for k = 2..19 on a 10% sample and record the cost.
cost = np.zeros(20)
for k in range(2, 20):
    start = time()
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(final_df.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(final_df)  # Spark 2.x only; removed in 3.0
    end = time()
    print("K-means from Spark took {:.4f} seconds (k = {})".format(
        end - start, k))

# In[8]:

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')
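
Note that computeCost was deprecated in Spark 2.4 and removed in 3.0. On Spark 3.0+, a rough equivalent of the loop above scores each k with ClusteringEvaluator (silhouette metric) instead of WSSSE; a minimal sketch, assuming the same final_df:

from pyspark.ml.evaluation import ClusteringEvaluator

# Score each k by silhouette (higher is better, unlike cost/WSSSE).
evaluator = ClusteringEvaluator(featuresCol='features', metricName='silhouette')
for k in range(2, 20):
    model = KMeans().setK(k).setSeed(1).setFeaturesCol("features").fit(final_df)
    score = evaluator.evaluate(model.transform(final_df))
    print("k = {}: silhouette = {:.4f}".format(k, score))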

# In[39]:

best_k = 15  # chosen from the elbow curve above
# vec is assumed to be a VectorAssembler defined in an earlier cell, and data
# the source DataFrame.
df_clus = vec.transform(data).select('Player', 'features')
df_clus.show()
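
With best_k fixed, the natural next step (not shown in the original) is to fit a final model and attach cluster labels; a minimal sketch:

# Fit the final model at the chosen k and label each player with a cluster id.
kmeans_final = KMeans().setK(best_k).setSeed(1).setFeaturesCol("features")
model_final = kmeans_final.fit(df_clus)
model_final.transform(df_clus).select('Player', 'prediction').show()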

# In[20]:

# Assemble bowling statistics (runs, wickets, average, economy, strike rate)
# into a feature vector. (VectorAssembler from pyspark.ml.feature is assumed
# to be imported earlier.)
features_ball = ['Runs', 'Wkts', 'Ave', 'Econ', 'SR']
vec_ball = VectorAssembler(inputCols=features_ball, outputCol="features_ball")
df_clus_ball = vec_ball.transform(dataBowl).select('Player', 'features_ball')
df_clus_ball.show()

# In[21]:

# Elbow curve for the batting features.
error = np.zeros(15)
for k in range(2, 15):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_clus.sample(False, 0.25, seed=1))
    error[k] = model.computeCost(df_clus)

# In[22]:

# Elbow curve for the bowling features.
errorBowl = np.zeros(15)
for k in range(2, 15):
    kmeans_ball = KMeans().setK(k).setSeed(1).setFeaturesCol("features_ball")
    model_ball = kmeans_ball.fit(df_clus_ball.sample(False, 0.25, seed=1))
    errorBowl[k] = model_ball.computeCost(df_clus_ball)

# In[23]:

get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')  # renamed 'seaborn-v0_8-whitegrid' in matplotlib 3.6+
bowling.show()  # bowling is assumed to be a DataFrame loaded in an earlier cell

# FEATURES_COL and FEATURES_COL_BOWL are assumed to be lists of numeric column
# names defined in an earlier cell.
vecAssembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_kmeans = vecAssembler.transform(batting).select('player_name', 'features')
df_kmeans.show()

vecAssembler_bowl = VectorAssembler(inputCols=FEATURES_COL_BOWL,
                                    outputCol="features_bowl")
df_kmeans_bowl = vecAssembler_bowl.transform(bowling).select(
    'player_name', 'features_bowl')
df_kmeans_bowl.show()

cost = np.zeros(20)
for k in range(2, 20):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(df_kmeans)  # Spark 2.x only; removed in 3.0

cost_bowl = np.zeros(20)
for k in range(2, 20):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features_bowl")
    model = kmeans.fit(df_kmeans_bowl.sample(False, 0.1, seed=42))
    cost_bowl[k] = model.computeCost(df_kmeans_bowl)  # Spark 2.x only; removed in 3.0

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
# The original snippet is cut off here; by symmetry with the batting plot
# above, it presumably continues:
ax.plot(range(2, 20), cost_bowl[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')
Example #5
from numpy import array
from math import sqrt

# pyspark.mllib.clustering also exports KMeans/KMeansModel (the RDD-based API),
# but this example uses the DataFrame API, so only the pyspark.ml import is
# kept to avoid shadowing.
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2, seed=1)


def mapper(line):
    # Keep the first 14 fields so they line up with the 14 column aliases in
    # the selectExpr below. (The original returned only line[0]..line[12],
    # one field short of the "_14" alias.)
    return tuple(line[:14])


weather_features = latlongagain.map(mapper)

weather_features_df = weather_features.toDF()
weather_df = weather_features_df.selectExpr(
    "_1 as datetime1", "_2 as day", "_3 as month", "_4 as lat", "_5 as lng",
    "_6 as base", "_7 as humidity", "_8 as wind", "_9 as temp", "_10 as desc",
    "_11 as rain", "_12 as latlng", "_13 as borough", "_14 as features")

# udf_foo is assumed to be a UDF defined elsewhere that converts the raw
# features field into an ML Vector column.
test1 = weather_df.withColumn("features", udf_foo("features"))
test1.printSchema()

model = kmeans.fit(test1.select('features'))
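
After fitting, the centers learned by the DataFrame-API model can be inspected directly; a minimal sketch:

# clusterCenters() returns one NumPy array per cluster (k=2 here).
for i, center in enumerate(model.clusterCenters()):
    print("Cluster {}: {}".format(i, center))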
Example #6
# In[2]:

# Assumed imports for this example (scikit-learn, not Spark):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# I. Generate a two-dimensional dataset with three cluster centroids.
np.random.seed(0)
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)


# In[3]:

# Cluster with KMeans (k-means++ initialization, 10 restarts).
K_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
K_means.fit(X)

# Retrieve the labels, cluster centers, and inertia (within-cluster SSE).
K_means_labels = K_means.labels_
K_means_cluster_centers = K_means.cluster_centers_
K_means_inertia = K_means.inertia_
# Find the unique cluster labels.
K_means_labels_unique = np.unique(K_means_labels)


# In[4]:

# Plot result
fig = plt.figure(figsize=(7, 7))
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
ax = fig.add_subplot(111)
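
The example cuts off after the subplot is created; a plotting loop in the style of the scikit-learn k-means demos would presumably follow. A minimal sketch using only the variables defined above:

# Draw each cluster's points in its color and mark the fitted centroid.
for k, col in zip(range(n_clusters), colors):
    members = (K_means_labels == k)
    center = K_means_cluster_centers[k]
    ax.plot(X[members, 0], X[members, 1], 'w', markerfacecolor=col, marker='.')
    ax.plot(center[0], center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=8)
ax.set_title('KMeans (inertia: %.2f)' % K_means_inertia)
plt.show()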