Example #1
def k_means_transform(book_at, k=100, load_model=True):
    '''
    input: attribute feature matrix of all books
    output: transformed matrix including cluster assignment
    This function clusters all books up front so the later kNN step has less work to do
    '''

    if not load_model:

        ###k-means clustering###
        #Since the data is too big to do knn, first cluster them
        from pyspark.ml.clustering import KMeans
        kmeans = KMeans(
            k=k, seed=42
    )  #partition the books into k clusters (k=100 by default) to cut down the kNN computation
        model = kmeans.fit(book_at.select('features'))
        #model.save('k-means_model_001_10')
    else:
        from pyspark.ml.clustering import KMeansModel
        model = KMeansModel.load('hdfs:/user/yw2115/k-means_model_001')

    #add the cluster col to original attribute matrix
    transformed = model.transform(book_at)
    transformed = transformed.withColumnRenamed("prediction", "cluster")
    #transformed.show(3)
    return transformed
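A minimal usage sketch for the function above, assuming a hypothetical books_df whose attribute columns still have to be assembled into the features vector column the function expects (the column names below are placeholders, not from the original project):

from pyspark.ml.feature import VectorAssembler

# Assemble the raw attribute columns into the 'features' vector column
assembler = VectorAssembler(
    inputCols=["num_pages", "avg_rating", "pub_year"],  # placeholder attribute columns
    outputCol="features",
)
book_at = assembler.transform(books_df)

# Fit a fresh model instead of loading the one saved on HDFS
clustered = k_means_transform(book_at, k=100, load_model=False)
clustered.select("features", "cluster").show(3)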
Example #2
def useModel(path):
    model_path = '/home/hadoop/PycharmProjects/SparkMlib/model/kmeans'
    df = createDataframeKMeans(path)
    df = df.where(df.TotalFee != '0').where(df.DiseaseCode == '24495')
    df = df.withColumn("Age", df.Age.cast(IntegerType())) \
        .withColumn("TotalFee", df.TotalFee.cast(FloatType()))

    data = df.drop("DiseaseCode")
    data.show()

    # Transform the data into a feature vector
    featureCreator = VectorAssembler(inputCols=data.columns[1:], outputCol='feature')
    data = featureCreator.transform(data)

    model = KMeansModel.load(model_path)

    # Cluster the records with the loaded model
    test = model.transform(data)
    test.show()
    points = []
    for i in test.select("Age", "TotalFee", "prediction", "HosRegisterCode").collect():
        temp = [float(i['Age']), float(i['TotalFee']), int(i['prediction']), i['HosRegisterCode']]
        points.append(temp)

    centers = model.clusterCenters()

    centerPoints = []
    for i in centers:
        temp = [float(i[0]), float(i[1])]
        centerPoints.append(temp)
    getDistance(points, centerPoints)
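The getDistance helper called at the end is not part of the snippet; a minimal sketch, assuming it measures how far each record lies from the centre of its assigned cluster on the Age/TotalFee axes (the same two coordinates the code extracts from each centre):

import math

def getDistance(points, centerPoints):
    # points: [age, fee, cluster, register_code]; centerPoints: [x, y] per cluster
    results = []
    for age, fee, cluster, register_code in points:
        cx, cy = centerPoints[cluster]
        results.append((register_code, cluster, math.sqrt((age - cx) ** 2 + (fee - cy) ** 2)))
    return results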
Example #3
def predict(bucket_name, feature_path, feature_name, output_path, plot_path):
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)

    # load existing model
    model_path = output_path + "k-means.model"
    model = KMeansModel.load(model_path)

    # read from s3 csv and store to local
    path = feature_path + feature_name  # used both locally and remotely: features/pca.csv
    s3 = boto3.resource('s3')
    s3.Object(bucket_name, path).download_file(path)
    df_spark = sqlCtx.read.csv(path, header=True, inferSchema=True)

    # Dataframe to rdd
    vecAssembler = VectorAssembler(inputCols=df_spark.columns,
                                   outputCol="features")
    df_spark = vecAssembler.transform(df_spark)
    rdd = df_spark.rdd.map(lambda x: array(x["features"]))
    print(rdd.take(10))

    # From here: use the K-means model for prediction

    data = model.transform(df_spark).toPandas()
    print(output_path + "pred-" + feature_name)
    data.to_csv(path_or_buf=(output_path + "pred-" + feature_name))
Example #4
def loadModel(path):
    '''
    Loading model from path.
    Input: path
    Output: loaded model
    '''
    model = KMeansModel.load(path)
    return model
Example #5
def add_high_low_flag(_data, original_features=True):

    # Add features
    _data = create_KMeans_features(_data, original=original_features)
    # add high-low predictions
    high_low_classifier = KMeansModel.load('KMeans_model')
    _data = high_low_classifier.transform(_data)

    return _data
Example #6
def cluster(player_profile):
    df = player_profile
    #columns used for clustering -> features
    FEATURES_COL = [
        'fouls', 'goals', 'own_goals', 'pass1', 'pass2', 'pass3', 'st1', 'st2',
        'st3'
    ]
    for col in df.columns:
        if col in FEATURES_COL:  #converts all feature_cols to float datatype
            df = df.withColumn(col, df[col].cast('float'))
    df = df.na.drop()

    #combines features columns to make a single feature vector
    vecAssembler = VectorAssembler(inputCols=FEATURES_COL,
                                   outputCol="features")
    df_kmeans = vecAssembler.transform(df).select('Id', 'features')

    k = 5  #number of clusters
    kmeans = KMeans().setK(k).setMaxIter(20).setSeed(1).setFeaturesCol(
        "features")

    #if a saved model exists, load it; otherwise fit a new one in the except branch and save it
    model_path = "/home/revanth/Desktop/SEM5/BD/Big_Data_SEM5/PROJECT_FPL_ANALYTICS/" + "kmeans_model"
    try:
        model = KMeansModel.load(model_path)
        #print("loading saved model")
    except:
        model = kmeans.fit(df_kmeans)  #this is the time consuming part
        model.save(model_path)

    centers = model.clusterCenters()  #centroid of each cluster
    transformed = model.transform(df_kmeans).select(
        'Id', 'prediction'
    )  #applies our model to the given data, giving the cluster number for all id's
    rows = transformed.collect()
    df_pred = sqlContext.createDataFrame(rows)
    df_pred = df_pred.join(df, 'Id')

    #gives a mean rating for each cluster of players
    ratings = df_pred.groupby('prediction').agg({
        'player_rating': 'mean'
    }).withColumnRenamed('avg(player_rating)',
                         'avg_player_rating').select("prediction",
                                                     "avg_player_rating")

    for i in ratings.collect():  #convert dataframe to dictionary
        ratings_di[i.__getitem__('prediction')] = i.__getitem__(
            'avg_player_rating')

    df_pred = df_pred.withColumn(
        "player_rating",
        when(df_pred.no_of_matches < 5,
             udf_func(df_pred.prediction)).otherwise(df_pred.player_rating))

    #return the dataframe with cluster assignments and adjusted ratings
    #print(df_pred.show())
    return df_pred
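The snippet relies on several module-level names that are defined elsewhere in the project (sqlContext, when, ratings_di, udf_func). A minimal sketch of what the rating lookup UDF might look like, assuming ratings_di is populated before the UDF column is evaluated, as it is in the function above:

from pyspark.sql.functions import udf, when
from pyspark.sql.types import FloatType

# cluster id -> mean player rating; filled inside cluster() before udf_func is applied
ratings_di = {}

# fall back to the cluster average for players with fewer than 5 matches
udf_func = udf(lambda cluster: float(ratings_di.get(cluster, 0.0)), FloatType())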
Example #7
def load_kmeans_model(_model_dir):
    """Load the specified model.
    """
    if os.path.exists(_model_dir):
        print("Loading model from {} direcory...".format(_model_dir))
        model = KMeansModel.load(_model_dir)
    else:
        print('Model {} not found.'.format(_model_dir))
        sys.exit(1)

    return model
Example #8
def find_anomalies(points):
    global cur_model
    if cur_model is None:
        model_path = os.getcwd() + "/kmean_model"
        cur_model = KMeansModel.load(model_path)

    labels = cur_model.transform(points).select('prediction')
    points_array = np.asarray(points.collect())
    labels_array = np.asarray(labels.collect())
    results = list()
    for item, label in zip(points_array, labels_array):
        temp = list()
        temp.append(item[0][0])
        temp.append(item[0][1])
        temp.append(item[0][2])
        temp.append(label[0])
        results.append(temp)
    return results
Example #9
def k_means():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    from pyspark.ml.linalg import Vectors
    data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
            (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
    df = spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    centers = model.clusterCenters()
    len(centers)
    # 2
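    # Note: computeCost was deprecated in Spark 2.4 and removed in Spark 3.0;
    # on Spark 3.x use ClusteringEvaluator instead (see the sketch after this example).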
    model.computeCost(df)
    # 2.000...
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction
    # True
    rows[2].prediction == rows[3].prediction
    # True
    model.hasSummary
    # True
    summary = model.summary
    summary.k
    # 2
    summary.clusterSizes
    # [2, 2]
    temp_path = "./"
    kmeans_path = temp_path + "/kmeans"
    kmeans.save(kmeans_path)
    kmeans2 = KMeans.load(kmeans_path)
    kmeans2.getK()
    # 2
    model_path = temp_path + "/kmeans_model"
    model.save(model_path)
    model2 = KMeansModel.load(model_path)
    model2.hasSummary
    # False
    model.clusterCenters()[0] == model2.clusterCenters()[0]
    # array([ True,  True], dtype=bool)
    model.clusterCenters()[1] == model2.clusterCenters()[1]
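On Spark 3.x, where computeCost is no longer available, the same sanity check can be done with ClusteringEvaluator; a short sketch reusing df and model from the example above:

from pyspark.ml.evaluation import ClusteringEvaluator

predictions = model.transform(df)
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction")
print(evaluator.evaluate(predictions))  # silhouette score, the default metric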
Example #10
        #load test data
        test_data = sc.textFile(golden_file)
        parsed_test_data = test_data.map(
            kup.parse_as_binaryTuple).filter(lambda x: x[0] != -1.0)
        parsed_test_data_df = spark.createDataFrame(parsed_test_data,
                                                    ["label", "features"])

        # load the scaler model, perform feature scaling on test data
        scalerModel = StandardScalerModel.load(scaler_model_path)
        test_df_tmp = scalerModel.transform(parsed_test_data_df)
        test_df = test_df_tmp.drop("features").withColumnRenamed(
            "scaledFeatures", "features")

        #load the kmeans model
        best_model = KMeansModel.load(model_path)
        start = time()

        # assign clusters to test data
        predict_df = best_model.transform(test_df).select(
            col("label").alias("actualLabel"), "prediction")

        # assign the label to test data according to the assigned clusters
        labelPredictedLabel = predict_df.join(
            cluster_label,
            cluster_label.prediction == predict_df.prediction).select(
                predict_df.actualLabel, cluster_label.label)
        labelPredictedLabel.show(3)

        testTime = time() - start
        print("Test time: {} ".format(round(testTime, 3)))
Example #11
pathVD = args["visualDictionaryPath"]
descriptorName = args["descriptor"]
output = args["output"]

#estimating VLAD descriptors for the whole dataset
print("estimating VLAD descriptors using " + descriptorName +
      " for dataset: /" + path + " and visual dictionary: /" + pathVD)

# with open(pathVD, 'rb') as f:
#     visualDictionary=pickle.load(f)

# xianjie parallel
spark = SparkSession\
        .builder\
        .appName("PythonKMeans")\
        .getOrCreate()
visualDictionary = KMeansModel.load(pathVD)
##

#computing the VLAD descriptors
dict = {"SURF": describeSURF, "SIFT": describeSIFT, "ORB": describeORB}
V, idImages = getVLADDescriptors(path, dict[descriptorName], visualDictionary)

#output
file = output + ".pickle"

with open(file, 'wb') as f:
    pickle.dump([idImages, V, path], f)

print("The VLAD descriptors are  saved in " + file)
Example #12

# 	plt.plot(centroid1[0], centroid1[1], 'ro')
# 	plt.annotate("Centroid 1", (centroid1[0], centroid1[1]))
# 	plt.plot(centroid2[0], centroid2[1], 'ro')
# 	plt.annotate("Centroid 2", (centroid2[0], centroid2[1]))
#
# 	plt.savefig("{}VS{}".format(xVar, yVar))

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession \
    .builder \
    .appName("ChessKMeans") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/chess_data_testing.games?readPreference=primaryPreferred") \
    .getOrCreate()

model = KMeansModel.load("KMeansModel_final_both_norm")

df = spark.read.parquet("sample.parquet")

# Combine all normalized columns into one "features" column
assembler = VectorAssembler(inputCols=[
    "w_attack_norm", "w_defend_norm", "b_attack_norm", "b_defend_norm",
    "evals_norm"
],
                            outputCol="features")

testing = assembler.transform(df)

transformed = model.transform(testing).select('w_attack_norm', 'w_defend_norm',
                                              'b_attack_norm', 'b_defend_norm',
                                              'evals_norm', 'prediction')
Example #13
    dataDF = assembler.transform(dataDF)
    #dataDF.show()

    if mode == "training":

        # splitting datasets for training and testing
        (training, testdata) = dataDF.randomSplit([0.7, 0.3], seed=5043)
        kmeans = KMeans().setK(k)
        model = kmeans.fit(training)

        # Predicting the cluster that each id will belong
        transformed = model.transform(testdata).withColumnRenamed(
            "prediction", "cluster_id")

        #archives old model
        model_old = KMeansModel.load(modelpath)
        model_old.write().overwrite().save(modelpath_archives)
        logger.info(
            'Old Daily Clustering Bikes by location Model has been archived on the {}'
            .format(datetime.now()))
        ##### Save model
        model.write().overwrite().save(modelpath)
        logger.info(
            'New Daily Clustering Bikes by location Model has been trained on the {}'
            .format(datetime.now()))

    if mode == "predicting":
        model = KMeansModel.load(modelpath)
        logger.info(
            'Daily Clustering Bikes by location Started on the {} '.format(
                datetime.now()))
Example #14
from pyspark.ml.linalg import DenseVector

spark.read.load("test_fet.csv", format="csv", inferSchema="true", header="true").rdd \
          .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
          .toDF(["index", "file", "features"])
          .write.parquet("test_fet.parquet")

# Now I get the Bag of Visual Words representation using K-means model built on training data
from pyspark import StorageLevel
schema = spark.read.parquet("test_fet.parquet").persist(StorageLevel(True, True, False, False, 1))

import numpy as np
from pyspark.ml.clustering import KMeansModel

model = KMeansModel.load('KmeansModel')
P = np.load('P.npy')

from pyspark.ml.linalg import DenseVector
predictions = model.transform(schema)
df = predictions.rdd \
    .map(lambda x: (x[2], x[0], DenseVector(np.matmul(np.array(x[0]), P.T)), x[1], x[3])) \
    .toDF(["Index", "Features", "Projections", "File", "VisualWords"])
	
# Then, I generate binary signatures for all test images
tau = np.load('tau.npy')

from pyspark.ml.linalg import DenseVector
def binsig(z, c, tau):
    return DenseVector((z > tau[c,:]))
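A hedged sketch of how binsig might be applied to every test image, assuming the VisualWords column holds the assigned cluster id and Projections the projected descriptor, as in the toDF call above:

# Hypothetical application of binsig to each projected test descriptor
signatures = df.rdd \
    .map(lambda row: (row["File"],
                      binsig(row["Projections"].toArray(), int(row["VisualWords"]), tau))) \
    .toDF(["File", "Signature"])
signatures.show(3)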
	
Example #15
def calc_error(rdd):
    now = datetime.now()
    data = rdd.toDF()

    output = transform_model.transform(data)
    predictions = model.transform(output)

    get_logger().info("Processing logfiles")
    wssse = predictions.select(['endpoint','method','response_code','features','prediction'])\
      .rdd\
      .map(lambda line: (error(line.features,clusterCenters[line.prediction]), line.response_code, line.endpoint, line.method))\
      .filter(lambda x: x[0] > 100.0)
    if wssse.count() > 0:
        for line in wssse.collect():
            get_logger().warning(line)
    return wssse

model = KMeansModel.load(MODEL_LOCATION)
transform_model = PipelineModel.load(TRANSFORM_MODEL_LOCATION)
clusterCenters = model.clusterCenters()

access_logs = ssc.socketTextStream(SOCKET_HOST, SOCKET_PORT)
struc_logs = access_logs.flatMap(lambda line: parse_apache_log_line(line))
struc_logs.pprint()
rc_dstream = struc_logs.map(lambda parsed_line: (parsed_line.response_code, 1)) 
rc_count = rc_dstream.reduceByKey(lambda x,y: x+y)
rc_count.pprint(num = 30)
struc_logs.foreachRDD(calc_error)
ssc.start()
ssc.awaitTermination()
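The error helper used inside calc_error is defined elsewhere; a minimal sketch, assuming it is the squared Euclidean distance between a feature vector and its assigned cluster centre (consistent with the wssse naming and the 100.0 threshold above):

import numpy as np

def error(features, center):
    # squared Euclidean distance from a point to its cluster centre;
    # features is an ml Vector, center a numpy array from clusterCenters()
    diff = features.toArray() - np.asarray(center)
    return float(np.dot(diff, diff))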
Example #16

from pyspark.ml.clustering import KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":

    sparkSession = SparkSession\
        .builder\
        .appName("Spark ML KMeans")\
        .getOrCreate()

    model = KMeansModel.load("KMEANSMODELDF")
    print("Model loaded")

    # Prepare test data
    test = sparkSession.createDataFrame([
        (1, Vectors.dense([1.1, 3.2])),
        (2, Vectors.dense([5.1, 1.4])),
        (3, Vectors.dense([5.2, 2.0])),
        (4, Vectors.dense([1.0, 4.0]))],
        ["id", "features"])\
        .cache()

    for row in test.collect():
        print(row)

    prediction = model.transform(test)
    prediction.printSchema()
    prediction.show()
Example #17

 def load_model(self, path_to_model):
     """Load K-Means model from path_to_model."""
     from pyspark.ml.clustering import KMeansModel
     model = KMeansModel.load(path_to_model)
     return (model)
Example #18
# Template for the API

from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

# createDataFrame requires a SparkSession, not a bare SparkContext
spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([
    (1353, 1347),
], ['user', 'summa'])

va = VectorAssembler(inputCols=['user', 'summa'], outputCol="features")

modelka = KMeansModel.load('./models/clusters.model')

result = modelka.transform(
    va.transform(df1)).select('prediction').take(1)[0][0]

# Check whether this is a new user.
# If it's an existing user, pull their data from Mongo;
# otherwise spin up a mini instance.
Example #19

from pyspark.ml.clustering import KMeansModel
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":

    sparkSession = SparkSession\
        .builder\
        .appName("Spark ML KMeans")\
        .getOrCreate()

    model = KMeansModel.load("KMEANSMODELML")
    print("Model loaded")

    # Prepare test data
    test = sparkSession.createDataFrame([
        (1, Vectors.dense([1.1, 3.2])),
        (2, Vectors.dense([5.1, 1.4])),
        (3, Vectors.dense([5.2, 2.0])),
        (4, Vectors.dense([1.0, 4.0]))],
        ["id", "features"])\
        .cache()

    for row in test.collect():
        print(row)

    prediction = model.transform(test)
    prediction.printSchema()
    prediction.show()

    selected = prediction.select("id", "prediction")
Example #20
tfIdf = PipelineModel.load(tfIdf_model_path)
dataset = tfIdf.transform(dataset)

# VectorAssembler

vector_assembler_output_path = "{}/data/vectorAssemblerModel.bin".format(base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

print('# Preprocessing models loaded')
print('> Loading KMeans')

# Classification

model_path = "{}/data/distanceKmeansRfModel.bin".format(base_path)
model = KMeansModel.load(model_path)
predictions = model.transform(dataset)

centers = model.clusterCenters()

vectorCent = F.udf(lambda k: centroid(k,centers), ArrayType(DoubleType()))
euclDistance = F.udf(lambda data,centroid: distToCentroid(data,centroid),FloatType())
detectAnom = F.udf(lambda prediction, distance: anomalia(prediction, distance, threshold, limit), BooleanType())

predictions = predictions.withColumn('centroid', vectorCent(F.col('prediction')))
predictions = predictions.withColumn('distance', euclDistance(F.col('features'),F.col('centroid')))
predictions = predictions.withColumn('anomalia', detectAnom(F.col('prediction'),F.col('distance')))

print('# KMeans loaded')

only_predictions = predictions.select('version','timestamp','id','type','event','signal','freq','mod','payload','time','anomalia','distance','prediction')
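The centroid, distToCentroid and anomalia helpers wrapped in the UDFs above come from elsewhere in the project; one plausible reading, assuming a record counts as anomalous when it lies farther from its centroid than the threshold:

import numpy as np

def centroid(cluster_id, centers):
    # centre of the assigned cluster as a plain list of floats (matches ArrayType(DoubleType()))
    return [float(v) for v in centers[cluster_id]]

def distToCentroid(data, center):
    # Euclidean distance between the feature vector and its cluster centre
    return float(np.linalg.norm(data.toArray() - np.asarray(center)))

def anomalia(prediction, distance, threshold, limit):
    # hypothetical rule: flag the record when its distance exceeds the threshold
    return bool(distance > threshold)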
Example #21

def kmeans_inference(
        original_data,
        msg_col,
        id_col,
        w2v_model_path,
        tks_vec,
        ft_col,
        kmeans_mode,
        kmeans_model_path,
        pred_mode="static",
        new_cluster_thresh=None,
        k_list=[12, 16, 20],  # update_model_path=None,
        distance="cosine",
        opt_initSteps=10,
        opt_tol=0.0001,
        opt_maxIter=30,
        log_path=None,
        n_cores=5,
        # K_optim
        tr_initSteps=200,
        tr_tol=0.000001,
        tr_maxIter=100,  # train_kmeans
):
    """Perform inference on new error messages (Note: only K-Means can be re-trained/updated).

    -- params:
    original_data (pyspark.sql.dataframe.DataFrame): data frame with at least error string and id columns
    msg_col (string): name of the error string column
    id_col (string): name of the message id column
    w2v_model_path (string): path from which to load the pre-trained word2vec model
    tks_vec (string): name of the word2vec representations column
    ft_col (string): name of the features column
    kmeans_mode (\"load\" or \"train\"): kmeans mode: \"load\" uses pre-trained model, while \"train\" performs online training
    kmeans_model_path (string): path to pre-trained model (Specify None for re-training)
    pred_mode (\"static\" or \"update\"): prediction mode: \"static\" does not allow for creating new clusters
    new_cluster_thresh (float): distance threshold: if closest centroid is more distant than new_cluster_thresh 
                                then a new cluster is created for the new observation
    k_list (list): grid of K values to try
    distance (\"euclidean\" or \"cosine\"): distance measure for the kmeans algorithm
    opt_initSteps (int): number of different random initializations for the kmeans algorithm in the optimization phase
    opt_tol (float): tolerance for kmeans algorithm convergence in the optimization phase
    opt_maxIter (int): maximum number of iterations for the kmeans algorithm in the optimization phase
    n_cores (int): number of cores to use
    log_path (string): where to save optimization stats. Default None (no saving)
    tr_initSteps (int): number of different random initializations for the kmeans algorithm in the training phase
    tr_tol (float): tolerance for kmeans algorithm convergence in the training phase
    tr_maxIter (int): maximum number of iterations for the kmeans algorithm in the training phase

    Returns:
    original_data (pyspark.sql.dataframe.DataFrame): the input data frame with an extra \"prediction\" column
    """
    from language_models import w2v_preproc
    from pyspark.ml.clustering import KMeansModel
    import time
    import datetime
    from pyspark.ml.evaluation import ClusteringEvaluator
    from pathlib import Path

    if kmeans_mode not in ["load", "train"]:
        print(
            """WARNING: invalid param \"kmeans_mode\". Specify either \"load\" to train load a pre-trained model 
              or \"train\" to train it online.""")
        return (None)

    original_data = w2v_preproc(original_data, msg_col, id_col, w2v_model_path)

    if kmeans_mode == "load":
        original_data = kmeans_preproc(original_data, tks_vec)
        # wrap the loaded model in a dict so the kmeans_predict call below can
        # access it as kmeans_model["model"], matching what train_kmeans returns
        kmeans_model = {"model": KMeansModel.load(kmeans_model_path)}
    else:
        # K_optim()
        # initialize a grid of K (number of clusters) values
        #         k_list = [12, 16, 20]

        # train for different Ks
        res = K_optim(k_list,
                      dataset=original_data,
                      tks_vec=tks_vec,
                      ft_col=ft_col,
                      distance=distance,
                      initSteps=opt_initSteps,
                      tol=opt_tol,
                      maxIter=opt_maxIter,
                      n_cores=n_cores,
                      log_path=log_path)

        k_sil = get_k_best(res, "silhouette")

        if pred_mode == "update":
            save_mode = "overwrite"
            kmeans_model_path = "temp_ciccio"
    #         elif kmeans_mode=="load":
    #             kmeans_model_path = None
    #             save_mode = "new"
    #         else:
    #             save_mode = "new"
        else:
            kmeans_model_path = None
            save_mode = "new"

        best_k_log_path = Path(log_path).parent / "best_K={}.txt".format(k_sil)
        original_data = kmeans_preproc(original_data, tks_vec)
        kmeans_model = train_kmeans(original_data,
                                    ft_col=ft_col,
                                    k=k_sil,
                                    distance=distance,
                                    initSteps=tr_initSteps,
                                    tol=tr_tol,
                                    maxIter=tr_maxIter,
                                    save_path=kmeans_model_path,
                                    mode=save_mode,
                                    log_path=best_k_log_path)

    original_data = kmeans_predict(original_data,
                                   kmeans_model["model"],
                                   pred_mode=pred_mode,
                                   new_cluster_thresh=None,
                                   update_model_path=kmeans_model_path)

    return (original_data)
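A hypothetical call to kmeans_inference in "load" mode; the DataFrame, column names and model paths below are placeholders rather than values from the original project:

# errors_df is assumed to hold one error message per row plus a message id
scored = kmeans_inference(
    original_data=errors_df,
    msg_col="error_message",
    id_col="msg_id",
    w2v_model_path="models/w2v_model",
    tks_vec="message_vector",
    ft_col="features",
    kmeans_mode="load",
    kmeans_model_path="models/kmeans_model",
)
scored.select("msg_id", "prediction").show(5)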
Example #22
 def _init_model(self):
     model_path = self._model_path
     if os.path.exists(model_path):
         self._kmeans_model = KMeansModel.load(model_path)
Example #23
 def __load_from_hdfs(self):
     sameModel = KMeansModel.load(self.hdfs_uri)
     print("k-means() - model loaded from uri {}".format(self.hdfs_uri))
     return sameModel