def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)
    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    new_df = df.withColumn('features', udf(df.features))
    return new_df
def transform(self, X_rdd, y_rdd=None):
    '''
    given X RDD (and optionally y RDD), output dataframe with
    term frequency feature vector and labels
    '''
    # check input type
    if type(X_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd and type(y_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # convert X to URL paths
    X = X_rdd.map(lambda x: 'https://s3.amazonaws.com/eds-uga-csci8360/data/project2/binaries/' + x + '.bytes')
    X = X.map(self._term_frequency)

    # check if labels exist
    if y_rdd:
        # combine X and y into single dataframe
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        data = X.join(y).map(lambda r: r[1])
        data = data.toDF(['features', 'label'])
    else:
        X = X.map(lambda row: [row])
        schema = StructType([StructField("features", VectorUDT(), True)])
        data = X.toDF(schema)

    return data
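# A minimal usage sketch for the transformer above. The wrapper class name
# `TFVectorizer`, the SparkContext `sc`, and the document hashes and labels are
# illustrative assumptions, not taken from the original code.
tf = TFVectorizer()  # assumed class exposing the transform() shown above
X_rdd = sc.parallelize(["0A32eTdBKayjCWhZqDOQ", "0ACDbR5M3ZhBJajygTuf"])
y_rdd = sc.parallelize([2.0, 8.0])

train_df = tf.transform(X_rdd, y_rdd)   # DataFrame with 'features' and 'label'
test_df = tf.transform(X_rdd)           # DataFrame with 'features' only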
class VectorUDTTests(PySparkTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        sqlCtx = SQLContext(self.sc)
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        srdd = sqlCtx.inferSchema(rdd)
        schema = srdd.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = srdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise ValueError("expecting a vector but got %r of type %r" % (v, type(v)))
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))

    def test_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        row_matrix = RowMatrix(df)
        self.assertEqual(row_matrix.numRows(), 1)
        self.assertEqual(row_matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        matrix = IndexedRowMatrix(df)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(df.drop("_1"))

    def test_row_matrix_invalid_type(self):
        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        invalid_type = ""
        matrix = RowMatrix(rows)
        self.assertRaises(TypeError, matrix.multiply, invalid_type)

        irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]), IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        self.assertRaises(TypeError, imatrix.multiply, invalid_type)
def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)
    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    # https://spark.apache.org/docs/latest/ml-migration-guides.html
    new_df = MLUtils.convertVectorColumnsToML(
        df.withColumn('features', udf(df.features)))
    return new_df
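# The migration-guide comment above refers to converting pyspark.mllib.linalg
# vector columns to the newer pyspark.ml.linalg representation. A minimal sketch
# of the round trip, assuming an active SparkSession named `spark`; the sample
# data is illustrative only.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils

df = spark.createDataFrame([(0, Vectors.dense([1.0, 2.0]))], ["id", "features"])
ml_df = MLUtils.convertVectorColumnsToML(df, "features")        # mllib -> ml vectors
mllib_df = MLUtils.convertVectorColumnsFromML(ml_df, "features")  # and back again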
def _sax_transform(self, df):
    # normalize series
    normalize_udf = F.udf(
        lambda x: Vectors.dense((x.toArray() - np.mean(x.toArray())) / np.std(x.toArray())),
        returnType=VectorUDT())
    df = df.withColumn("normalized_serie", normalize_udf(df[self._featuresCol]))

    # piecewise aggregate approXimation (PAA)
    to_paa_udf = F.udf(lambda x: Vectors.dense(self._to_paa(x.toArray())), returnType=VectorUDT())
    df = df.withColumn("paa_serie", to_paa_udf(df["normalized_serie"]))

    # discretization
    discretize_udf = F.udf(lambda x: Vectors.dense(self._discretize(x.toArray())), returnType=VectorUDT())
    df = df.withColumn("discretized_serie", discretize_udf(df["paa_serie"]))

    return df
def get_features(df):
    """
    Proj Denoise and feature extraction on X, Y and Z
    from the data frame we have after tasks_to_intervals
    """
    schema = StructType([
        StructField("proj_ver", VectorUDT(), False),
        StructField("proj_hor", VectorUDT(), False)
    ])
    proj_func = udf(proj_for_spark.project_gravity_xyz, schema)
    df = df['X', 'Y', 'Z', 'key'].withColumn('proj', proj_func("X", "Y", "Z"))
    df = df.select('key', 'proj.proj_ver', 'proj.proj_hor')

    df = df['proj_ver', 'proj_hor', 'key'] \
        .withColumn('denoised_ver', utils_function_spark.denoise_func("proj_ver")) \
        .withColumn('denoised_hor', utils_function_spark.denoise_func("proj_hor"))
    df = df.select('key', "denoised_ver", "denoised_hor")

    df = df["denoised_ver", "denoised_hor", 'key'] \
        .withColumn('rel_features_ver', utils_function_spark.toDWT_relative_udf("denoised_ver")) \
        .withColumn('cont_features_ver', utils_function_spark.toDWT_cont_udf("denoised_ver"))
    df = df["rel_features_ver", "cont_features_ver", "denoised_hor", 'key'] \
        .withColumn('rel_features_hor', utils_function_spark.toDWT_relative_udf("denoised_hor")) \
        .withColumn('cont_features_hor', utils_function_spark.toDWT_cont_udf("denoised_hor"))
    df = df.select('key', 'rel_features_ver', 'cont_features_ver', 'rel_features_hor', 'cont_features_hor')
    return df
def data_frame_from_file(sqlContext, file_name, fraction):
    lines = sc.textFile(file_name).sample(False, fraction)
    parts = lines.map(lambda l: map(lambda s: int(s), l.split(",")))
    samples = parts.map(lambda p: (float(p[0]),
                                   DenseVector(map(lambda el: el / 255.0, p[1:]))))
    fields = [
        StructField("label", DoubleType(), True),
        StructField("features", VectorUDT(), True)
    ]
    schema = StructType(fields)
    data = sqlContext.createDataFrame(samples, schema)
    return data
def cat2Num(self, df, indices):
    """
    Write your code!
    """
    # function to select one feature from a list of features
    def select_feature(raw_feature, index):
        return raw_feature[index]

    # function to remove selected features from a list of features
    def delete_feature(raw_feature, indices):
        feature = [i for j, i in enumerate(raw_feature) if j not in indices]
        return Vectors.dense(feature)

    # Get categorical features and perform One-Hot Encoding
    df_prev = df
    for index in indices:
        # bind the current index as a default argument to avoid late binding of the loop variable
        select_feature_udf = udf(lambda x, index=index: select_feature(x, index), StringType())
        df_encoded = df_prev.withColumn("cat_" + str(index), select_feature_udf("rawFeatures"))

        # string index
        stringIndexer = StringIndexer(inputCol="cat_" + str(index),
                                      outputCol="cat_index_" + str(index))
        model_stringIndexer = stringIndexer.fit(df_encoded)
        indexed = model_stringIndexer.transform(df_encoded)

        # one-hot encode
        encoder = OneHotEncoder(inputCol="cat_index_" + str(index),
                                outputCol="cat_vector_" + str(index),
                                dropLast=False)
        encoded = encoder.transform(indexed)
        df_prev = encoded

    # Get continuous features by removing categorical indices from rawFeatures
    delete_feature_udf = udf(lambda x: delete_feature(x, indices), VectorUDT())
    df_cont = df_prev.withColumn("cont", delete_feature_udf("rawFeatures"))

    # Combine one-hot encoded categorical and continuous features
    feature = []
    for index in indices:
        feature.append("cat_vector_" + str(index))
    feature.append("cont")
    assembler = VectorAssembler(inputCols=feature, outputCol="features")
    df_transformed = assembler.transform(df_cont) \
        .select("id", "rawFeatures", "features")

    return df_transformed
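# A hypothetical usage sketch for cat2Num above: `detector` stands in for the
# enclosing class instance and the rows are illustrative; only the column names
# ("id", "rawFeatures") follow the method itself.
raw = [(0, ["tcp", "http", "0.2", "0.4"]),
       (1, ["udp", "dns", "0.1", "0.9"])]
df = spark.createDataFrame(raw, ["id", "rawFeatures"])

# One-hot encode the categorical entries at positions 0 and 1, keep the rest as continuous
df_num = detector.cat2Num(df, indices=[0, 1])
df_num.show(truncate=False)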
def pipe_scale_cols(df, with_mean=True, with_std=True, use_dense_vector=True):
    newdf = df
    if use_dense_vector:
        # SparseVector has no toDense attribute in PySpark; convert via toArray()
        to_dense_udf = udf(lambda v: Vectors.dense(v.toArray()), VectorUDT())
        dense_df = newdf.withColumn("features-dense", to_dense_udf(newdf["features"]))
        newdf = dense_df.drop("features").withColumnRenamed("features-dense", "features")

    # honor the with_std argument instead of hard-coding withStd=False
    scaler = StandardScaler(withMean=with_mean, withStd=with_std,
                            inputCol="features", outputCol="features-scaled")
    model = scaler.fit(newdf)
    newdf = model.transform(newdf)
    newdf = newdf.drop("features")
    newdf = newdf.withColumnRenamed("features-scaled", "features")
    return newdf
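# A short usage sketch for the helper above, assuming the pyspark.ml.linalg vector
# types (swap in pyspark.mllib.linalg if that is what the surrounding code imports);
# the two sparse rows are illustrative only.
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame(
    [(Vectors.sparse(3, [0], [1.0]),), (Vectors.sparse(3, [1], [4.0]),)],
    ["features"])
scaled = pipe_scale_cols(df, with_mean=True, with_std=True)
scaled.show(truncate=False)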
def transform(self, df):
    # dataframe columns
    cols = df.columns

    # make SAX transformation
    df = self._sax_transform(df)

    # calculate distance to centroids
    distance_to_centroids_udf = F.udf(
        lambda x: Vectors.dense(self._distance_to_centroids(x.toArray().astype(int))),
        returnType=VectorUDT())
    df = df.withColumn("dist_centroids", distance_to_centroids_udf(df["discretized_serie"]))

    # assignation
    min_distance_udf = F.udf(lambda x: int(np.argmin(x.toArray())), returnType=IntegerType())
    df = df.withColumn("assignation", min_distance_udf(df["dist_centroids"]))

    # return prediction dataframe
    return df.select(cols + ["assignation"])
def _fit(self, df):
    self._centroid_init_function(df)

    # fit kmeans algorithm
    cost_values = [np.inf]
    for it in range(self._maxIter):
        # calculate distance to centroids
        distance_to_centroids_udf = F.udf(
            lambda x: Vectors.dense(self._distance_to_centroids(x.toArray().astype(int))),
            returnType=VectorUDT())
        df = df.withColumn("dist_centroids", distance_to_centroids_udf(df["discretized_serie"]))

        # assignation
        min_distance_udf = F.udf(lambda x: int(np.argmin(x.toArray())), returnType=IntegerType())
        df = df.withColumn("assignation", min_distance_udf(df["dist_centroids"]))

        # recalculate centroids
        df_centroids = df.select(["assignation", "paa_serie", "dist_centroids"])
        centroids_samples = df_centroids.map(lambda x: (x[0], (x[1].toArray(), 1, x[2].toArray()[x[0]])))
        centroids_sum = centroids_samples.reduceByKey(
            lambda x, y: (np.add(x[0], y[0]), x[1] + y[1], x[2] + y[2]))
        centroids_mean = centroids_sum.map(lambda x: (x[0], x[1][0] / float(x[1][1]))).collect()
        centroids_mean = sorted(centroids_mean)
        for i, centroid in centroids_mean:
            self.centroids_[i] = self._discretize(centroid)

        # calculate cost
        cost_sum = centroids_sum.map(lambda x: x[1][2]).reduce(add)

        # check for convergence
        if abs(cost_values[-1] - cost_sum) <= self._tol:
            # convergence reached
            cost_values.append(cost_sum)
            break
        cost_values.append(cost_sum)

    self.cost_ = cost_values[1:]
    return df
def _kpp_init(self, df):
    self.centroids_ = []
    new_centroid = df.select("discretized_serie").sample(False, 0.5).first()[0]
    self.centroids_.append(new_centroid.toArray())
    sw = True
    while (sw):
        df_aux = df.select("discretized_serie")

        # calculate distance to centroids
        distance_to_centroids_udf = F.udf(
            lambda x: Vectors.dense(self._distance_to_centroids(x.toArray().astype(int))),
            returnType=VectorUDT())
        df_aux = df_aux.withColumn("dist_centroids", distance_to_centroids_udf(df_aux["discretized_serie"]))

        # calculate assignation
        min_distance_udf = F.udf(lambda x: int(np.argmin(x.toArray())), returnType=IntegerType())
        df_aux = df_aux.withColumn("assignation", min_distance_udf(df_aux["dist_centroids"]))

        # distance to nearest centroid
        nearest_centroid_udf = F.udf(lambda x: float(np.amin(x.toArray())), returnType=FloatType())
        df_aux = df_aux.withColumn("dist_nearest_centroid", nearest_centroid_udf(df_aux["dist_centroids"]))

        # order centroids by distance
        df_aux = df_aux.withColumn("dist_nearest_centroid_reversed", (-1) * df_aux["dist_nearest_centroid"])
        window = Window.partitionBy("assignation").orderBy("dist_nearest_centroid_reversed")
        df_aux = df_aux.select(df_aux["discretized_serie"],
                               df_aux["dist_nearest_centroid_reversed"],
                               F.row_number().over(window).alias("ordering"))
        df_aux = df_aux.where(df_aux["ordering"] == 4)

        # get new centroids
        new_centroids = df_aux.select("discretized_serie").collect()
        for new_centroid in new_centroids:
            self.centroids_.append(new_centroid["discretized_serie"].toArray())
            if len(self.centroids_) >= self._k:
                sw = False
                break

    self.centroids_ = [centroid.astype(int) for centroid in self.centroids_]
def vectorizeBi(row, dico):
    vector_dict = {}
    for w in row.bigrams:
        if w in dico:
            vector_dict[dico[w]] = 1
    return (row.label, SparseVector(len(dico), vector_dict))


# In[321]:

from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([
    StructField('label', DoubleType(), True),
    StructField('bigramVectors', VectorUDT(), True)
])


# In[322]:

from functools import partial

print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features = dfTrain.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
features.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))


# In[323]:
def vectorize(row, dico):
    vector_dict = {}
    for w in row.words:
        if w in dico:
            vector_dict[dico[w]] = 1
    return (row.label, SparseVector(len(dico), vector_dict))


from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([
    StructField('label', DoubleType(), True),
    StructField('Vectors', VectorUDT(), True)
])

features = dfTrainTok.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
print "Features created"

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print "labels indexed"
def vectorize(row, dicoUni, dicoTri):
    vector_dict = {}
    length = len(dicoUni)
    for w in row.words:
        if w in dicoUni:
            vector_dict[dicoUni[w]] = 1
    for tri in row.wordTrigrams:
        if tri in dicoTri:
            vector_dict[dicoTri[tri] + length] = 1
    return (row.label, SparseVector(length + len(dicoTri), vector_dict))


# In[15]:

from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType, ArrayType, StringType

t = ArrayType(StringType())
schema = StructType([StructField('label', DoubleType(), True),
                     StructField('featureVectors', VectorUDT(), True)])


# In[16]:

print "Creating feature vectors"
t0 = time()
dfTrainVec = dfTrain.map(partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)).toDF(schema)
dfTestVec = dfTest.map(partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)).toDF(schema)
tt = time() - t0
print "Dataframe created in {} second".format(round(tt, 3))


# In[19]:

print "Indexing labels"
def project(comp):
    return udf(lambda s: Vectors.dense(np.dot(s, comp)), VectorUDT())
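# A usage sketch for the projection helper above. The DataFrame, column name and
# the 3x2 projection matrix are illustrative assumptions (e.g. components from a
# PCA fit); the import should match whichever linalg module the snippet uses.
import numpy as np
from pyspark.mllib.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([1.0, 0.0, 2.0]),),
                            (Vectors.dense([0.5, 1.5, 0.0]),)], ["features"])
comp = np.array([[0.7, 0.1],
                 [0.1, 0.7],
                 [0.2, 0.2]])

projected = df.withColumn("projected", project(comp)("features"))
projected.show(truncate=False)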
df_flat = df_test3.rdd.map(lambda raw: ((raw[0], raw[1], raw[2], raw[3]),
                                        list(zip(raw[4], raw[5], raw[6])))) \
                      .flatMapValues(lambda raw: raw)
df_flat = df_flat.map(lambda raw: (raw[0], raw[1][0], raw[1][1], raw[1][2])) \
                 .toDF(['key', 'X', 'Y', 'Z'])

########################################################################################

schema = StructType([
    StructField("proj_ver", VectorUDT(), False),
    StructField("proj_hor", VectorUDT(), False)
])

#proj_new = partial(project_gravity_core, rel = True)
proj_func = udf(project_gravity_xyz, schema)
df_proj = df_flat['X', 'Y', 'Z', 'key'].withColumn('proj', proj_func("X", "Y", "Z"))
df_proj = df_proj.select('key', 'proj.proj_ver', 'proj.proj_hor')
df_proj.show(2)

########################################################################################

from scipy.signal import butter, filtfilt
from future.utils import lmap
    return _convert_to_vector(array)


if __name__ == "__main__":
    FEATURES_COL = "features"

    if len(sys.argv) != 3:
        print("Usage: kmeans_example.py <file> <k>", file=sys.stderr)
        exit(-1)

    path = sys.argv[1]
    k = sys.argv[2]

    spark = SparkSession.builder.appName("PythonKMeansExample").getOrCreate()

    lines = spark.read.text(path).rdd
    data = lines.map(parseVector)
    row_rdd = data.map(lambda x: Row(x))
    schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
    df = spark.createDataFrame(row_rdd, schema)

    kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol(FEATURES_COL)
    model = kmeans.fit(df)
    centers = model.clusterCenters()

    print("Cluster Centers: ")
    for center in centers:
        print(center)

    spark.stop()
dict_broad = sc.broadcast(dictionaryBigrams)

from pyspark.mllib.linalg import SparseVector


def vectorizeBi(row, dico):
    vector_dict = {}
    for w in row.bigrams:
        if w in dico:
            vector_dict[dico[w]] = 1
    return (row.label, SparseVector(len(dico), vector_dict))


from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([StructField('label', DoubleType(), True),
                     StructField('bigramVectors', VectorUDT(), True)])

features = dfBigram.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
print "Features from bigrams created"

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print "labels indexed"
from test_helper import Test

Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3),
                  [0, 0, 0], 'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`. We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`. To do that we'll need to create a `udf` and apply it to our dataset. Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
from pyspark.mllib.linalg import Vectors, VectorUDT

# Take the first two values from a SparseVector and convert them to a DenseVector
firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())

irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label').cache()
display(irisTwoFeatures)

# COMMAND ----------

# TEST
Test.assertEquals(str(irisTwoFeatures.first()),
                  'Row(features=DenseVector([-0.5556, 0.25]), label=0.0)',
                  'incorrect definition of firstTwoFeatures')
def _transform(self, dataset):
    # dataset format -> peer_paper_id | paper_id | user_id | citeulike_paper_id

    def diff(v1, v2):
        """
        Calculate the difference between two arrays.

        :return: array of their difference
        """
        array1 = numpy.array(v1)
        array2 = numpy.array(v2)
        result = numpy.subtract(array1, array2)
        return Vectors.dense(result)

    def split_papers(papers_id_list):
        """
        Shuffle the input list of paper ids and divide it into two lists. The ratio is 50/50.

        :param: papers_id_list initial list of paper ids that will be split
        :return: two arrays with paper ids. The first one contains the "positive paper ids" or
        those which difference will be added with label 1. The second - "the negative paper ids" -
        added with label 0.
        """
        shuffle(papers_id_list)
        ratio = int(0.5 * len(papers_id_list))
        positive_class_set = papers_id_list[:ratio]
        negative_class_set = papers_id_list[ratio:]
        return [positive_class_set, negative_class_set]

    vector_diff_udf = F.udf(diff, VectorUDT())
    split_papers_udf = F.udf(split_papers, ArrayType(ArrayType(StringType())))

    if (self.pairs_generation == "edp"):  # self.Pairs_Generation.EQUALLY_DISTRIBUTED_PAIRS
        # 50% of the paper pairs with label 1, 50% with label 0
        peers_per_paper = None
        if (self.model_training == "gm"):
            # get a list of peer paper ids per paper
            dataset = dataset.select(self.paperId_col, self.peer_paperId_col).dropDuplicates()
            peers_per_paper = dataset.groupBy(self.paperId_col).agg(
                F.collect_list(self.peer_paperId_col).alias("peers_per_paper"))
        else:
            peers_per_paper = dataset.groupBy(self.userId_col, self.paperId_col).agg(
                F.collect_list(self.peer_paperId_col).alias("peers_per_paper"))

        # generate 50/50 distribution to positive/negative class
        peers_per_paper = peers_per_paper.withColumn(
            "equally_distributed_papers", split_papers_udf("peers_per_paper"))

        # positive label 1
        # user_id | paper_id | peers_per_paper | equally_distributed_papers | positive_class_papers |
        positive_class_per_paper = peers_per_paper.withColumn(
            "positive_class_papers", F.col("equally_distributed_papers")[0])
        # user_id | paper_id | peer_paper_id
        if (self.model_training == "gm"):
            positive_class_per_paper = positive_class_per_paper.select(
                self.paperId_col,
                F.explode("positive_class_papers").alias(self.peer_paperId_col))
        else:
            positive_class_per_paper = positive_class_per_paper.select(
                self.userId_col, self.paperId_col,
                F.explode("positive_class_papers").alias(self.peer_paperId_col))

        # add lda paper representation to each paper based on its paper_id
        positive_class_dataset = self.vectorizer_model.transform(positive_class_per_paper)
        # get in which columns the result of the transform is stored
        former_paper_output_column = self.vectorizer_model.output_col
        former_papeId_column = self.vectorizer_model.paperId_col
        # add lda ids paper representation for peer papers
        self.vectorizer_model.setPaperIdCol("peer_paper_id")
        self.vectorizer_model.setOutputCol("peer_paper_lda_vector")
        # schema -> peer_paper_id | paper_id | user_id | citeulike_paper_id | lda_vector | peer_paper_lda_vector
        positive_class_dataset = self.vectorizer_model.transform(positive_class_dataset)
        # restore the default columns of the paper profiles model, the model is ready for the training
        # of the next SVM model
        self.vectorizer_model.setPaperIdCol(former_papeId_column)
        self.vectorizer_model.setOutputCol(former_paper_output_column)

        # add the difference (paper_vector - peer_paper_vector) with label 1
        positive_class_dataset = positive_class_dataset.withColumn(
            self.output_col,
            vector_diff_udf(former_paper_output_column, "peer_paper_lda_vector"))
        # add label 1
        positive_class_dataset = positive_class_dataset.withColumn(self.label_col, F.lit(1))

        # negative label 0
        negative_class_per_paper = peers_per_paper.withColumn(
            "negative_class_papers", F.col("equally_distributed_papers")[1])
        if (self.model_training == "gm"):
            negative_class_per_paper = negative_class_per_paper.select(
                self.paperId_col,
                F.explode("negative_class_papers").alias(self.peer_paperId_col))
        else:
            negative_class_per_paper = negative_class_per_paper.select(
                self.userId_col, self.paperId_col,
                F.explode("negative_class_papers").alias(self.peer_paperId_col))

        # add lda paper representation to each paper based on its paper_id
        negative_class_dataset = self.vectorizer_model.transform(negative_class_per_paper)
        # get in which columns the result of the transform is stored
        former_paper_output_column = self.vectorizer_model.output_col
        former_papeId_column = self.vectorizer_model.paperId_col
        # add lda ids paper representation for peer papers
        self.vectorizer_model.setPaperIdCol("peer_paper_id")
        self.vectorizer_model.setOutputCol("peer_paper_lda_vector")
        # schema -> peer_paper_id | paper_id | user_id | citeulike_paper_id | lda_vector | peer_paper_lda_vector
        negative_class_dataset = self.vectorizer_model.transform(negative_class_dataset)
        # restore the default columns of the paper profiles model, the model is ready for the training
        # of the next SVM model
        self.vectorizer_model.setPaperIdCol(former_papeId_column)
        self.vectorizer_model.setOutputCol(former_paper_output_column)

        # add the difference (peer_paper_vector - paper_vector) with label 0
        negative_class_dataset = negative_class_dataset.withColumn(
            self.output_col,
            vector_diff_udf("peer_paper_lda_vector", former_paper_output_column))
        # add label 0
        negative_class_dataset = negative_class_dataset.withColumn(self.label_col, F.lit(0))

        result = positive_class_dataset.union(negative_class_dataset)
    elif (self.pairs_generation == "dp"):  # self.Pairs_Generation.DUPLICATED_PAIRS
        # add lda paper representation to each paper based on its paper_id
        dataset = self.vectorizer_model.transform(dataset)
        # get in which columns the result of the transform is stored
        former_paper_output_column = self.vectorizer_model.output_col
        former_papeId_column = self.vectorizer_model.paperId_col
        # add lda ids paper representation for peer papers
        self.vectorizer_model.setPaperIdCol("peer_paper_id")
        self.vectorizer_model.setOutputCol("peer_paper_lda_vector")
        # schema -> peer_paper_id | paper_id | user_id ? | citeulike_paper_id | lda_vector | peer_paper_lda_vector
        dataset = self.vectorizer_model.transform(dataset)
        # restore the default columns of the paper profiles model, the model is ready for the training
        # of the next SVM model
        self.vectorizer_model.setPaperIdCol(former_papeId_column)
        self.vectorizer_model.setOutputCol(former_paper_output_column)

        # add the difference (paper_vector - peer_paper_vector) with label 1
        positive_class_dataset = dataset.withColumn(
            self.output_col,
            vector_diff_udf(former_paper_output_column, "peer_paper_lda_vector"))
        # add label 1
        positive_class_dataset = positive_class_dataset.withColumn(self.label_col, F.lit(1))

        # add the difference (peer_paper_vector - paper_vector) with label 0
        negative_class_dataset = dataset.withColumn(
            self.output_col,
            vector_diff_udf("peer_paper_lda_vector", former_paper_output_column))
        # add label 0
        negative_class_dataset = negative_class_dataset.withColumn(self.label_col, F.lit(0))

        result = positive_class_dataset.union(negative_class_dataset)
    elif (self.pairs_generation == "ocp"):  # self.Pairs_Generation.ONE_CLASS_PAIRS
        # add lda paper representation to each paper based on its paper_id
        dataset = self.vectorizer_model.transform(dataset)
        # get in which columns the result of the transform is stored
        former_paper_output_column = self.vectorizer_model.output_col
        former_papeId_column = self.vectorizer_model.paperId_col
        # add lda ids paper representation for peer papers
        self.vectorizer_model.setPaperIdCol("peer_paper_id")
        self.vectorizer_model.setOutputCol("peer_paper_lda_vector")
        # schema -> peer_paper_id | paper_id | user_id ? | citeulike_paper_id | lda_vector | peer_paper_lda_vector
        dataset = self.vectorizer_model.transform(dataset)
        # restore the default columns of the paper profiles model, the model is ready for the training
        # of the next SVM model
        self.vectorizer_model.setPaperIdCol(former_papeId_column)
        self.vectorizer_model.setOutputCol(former_paper_output_column)

        # add the difference (paper_vector - peer_paper_vector) with label 1
        result = dataset.withColumn(
            self.output_col,
            vector_diff_udf(former_paper_output_column, "peer_paper_lda_vector"))
        # add label 1
        result = result.withColumn(self.label_col, F.lit(1))
    else:
        # throw an error - unsupported option
        raise ValueError('The option ' + self.pairs_generation + ' is not supported.')

    # drop lda vectors - not needed anymore
    result = result.drop("peer_paper_lda_vector", former_paper_output_column)
    return result
    return SparseVector(len(dictionaryBigrams), vector_dict)


# In[52]:

# Here things get messy: applying a function to an entire DataFrame column takes some work.
# Unlike pandas there is no "apply" function; you have to use UserDefinedFunctions, and keep in
# mind that the SparseVector type is not recognized by the DataFrame, which is only compatible
# with a restricted set of types.
# EDIT: I had not realized it at the time, but I had already done this manipulation when
# overriding the tokenizer and postagger... the last five lines at the end, with udf and all.

from pyspark.sql.functions import UserDefinedFunction
from pyspark.mllib.linalg import VectorUDT

udfVectorizeUni = UserDefinedFunction(lambda x: vectorizeUni(x), VectorUDT())

# A DataFrame is an immutable object, so there is no point trying to modify a column in place;
# instead we create a second DataFrame to which we add the column we want.
dfVect = dfBigram.withColumn("words", udfVectorizeUni("words"))

# The words have now been replaced by the sparse vectors
print "DataFrame(1-gram): the words have now been replaced by the sparse vectors"
dfVect.show()

udfVectorizeBi = UserDefinedFunction(lambda x: vectorizeBi(x), VectorUDT())

dfVect2 = dfVect.withColumn("bigrams", udfVectorizeBi("bigrams"))
print "DataFrame(bi-gram): the bigrams have now been replaced by the sparse vectors"
dfVect2.show()

# For natural language processing it is customary to normalize (L2) the feature vectors:
# that is apparently what works best.
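# The closing comment above calls for L2 normalization. A minimal sketch, assuming the
# pyspark.ml Normalizer is acceptable here and that the "bigrams" vector column of dfVect2
# is the one to normalize (the output column name is an illustrative choice).
from pyspark.ml.feature import Normalizer

normalizer = Normalizer(inputCol="bigrams", outputCol="bigramsNorm", p=2.0)  # p=2.0 -> L2 norm
dfNorm = normalizer.transform(dfVect2)
dfNorm.show()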
def f4():
    spark_builder = SparkSession.builder.appName('PythonStreamingReceiverKafkaWordCount')
    spark_builder.config('spark.jars.packages', ','.join([
        'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.0',
        'org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0',
        'mysql:mysql-connector-java:5.1.38'
    ]))
    spark_builder.config('spark.master', 'local[*]')

    url = "jdbc:mysql://oxumare.ctweb.inweb.org.br:33060/festival"
    properties = {"user": "******", "password": "******"}

    ss = spark_builder.getOrCreate()

    kafka_server = "oxumare:9092"
    topic_name = "tweets_ctb"

    stream = ss.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_server) \
        .option("subscribe", topic_name) \
        .load()

    json_schema = StructType([
        StructField("_id", LongType()),
        StructField("created_at", TimestampType()),
        StructField("latitude", FloatType()),
        StructField("longitude", FloatType()),
        StructField("cell", IntegerType()),
        StructField("text", StringType()),
        StructField("user", StringType())
    ])

    tweets = stream.select(
        functions.from_json(stream.value.cast(StringType()), json_schema).alias('json')).select(
            functions.col('json.user').alias('user'),
            functions.col('json.text').alias('text'),
            functions.col('json.cell').alias('cell'),
            functions.col('json.latitude').alias('latitude'),
            functions.col('json.longitude').alias('longitude'),
            functions.col('json.created_at').alias('date'))

    tweets = remove_accents_punctuation(tweets)
    tweets = tokenize(tweets)
    tweets = remove_stop_words(ss, tweets)
    tweets = generate_n_grams(tweets)
    tweets = words_to_vector(tweets)
    tweets = tweets.drop('text_cleaned', 'words', 'words_stops_removed', 'n_grams')
    tweets, labels = feature_index(tweets)
    # tweets = tweets.select(['latitude', 'longitude', 'cell'])
    tweets.printSchema()

    actions = [
        apply_decision_tree_classifier,
        apply_logistic_regression_classifier,
        apply_naive_bayes_classifier
    ]
    for i, action in enumerate(actions):
        tweets = action(tweets) \
            .withColumnRenamed('probability', 'probability{}'.format(i)) \
            .drop('rawPrediction', 'prediction')
    tweets.printSchema()

    tweets = tweets.withColumn(
        'final_probability',
        functions.udf(average_probabilities, VectorUDT())(
            tweets['probability0'], tweets['probability1'], tweets['probability2'])
    ).withColumn(
        'result',
        functions.udf(lambda x: labels[np.argmax(x)])('final_probability'))

    cols = [
        functions.udf(get_at_pos(i), FloatType())(tweets['final_probability']).alias(labels[i])
        for i in range(3)
    ]
    tweets = tweets.select(['date', 'text', 'cell', 'latitude', 'longitude', 'result'] + cols)

    tweets = tweets.groupBy(
        functions.window(functions.col('date'), "60 minutes", "30 minutes"),
        functions.col('cell')
    ).agg(functions.avg('positive').alias('score'),
          functions.count('cell').alias('count')) \
        .orderBy('window')

    tweets = tweets.select(
        functions.to_json(functions.struct([tweets[x] for x in tweets.columns])).alias("value"))

    # query = tweets.writeStream \
    #     .outputMode('complete') \
    #     .option("truncate", False) \
    #     .format('console') \
    #     .start()

    # query = tweets.writeStream \
    #     .outputMode('complete') \
    #     .option("truncate", False) \
    #     .format('console') \
    #     .trigger(processingTime='60 seconds') \
    #     .start()

    query = tweets.writeStream \
        .format("kafka").option("kafka.bootstrap.servers", kafka_server) \
        .outputMode('complete') \
        .option("topic", topic_name + "_result") \
        .option("checkpointLocation", "/data/checkpoint/1").start()

    query.awaitTermination()
def get_euclidean_mfcc(vec1, vec2):
    mean1 = np.empty([13, 1])
    mean1 = vec1[0:13]
    cov1 = np.empty([13, 13])
    cov1 = vec1[13:].reshape(13, 13)
    mean2 = np.empty([13, 1])
    mean2 = vec2[0:13]
    cov2 = np.empty([13, 13])
    cov2 = vec2[13:].reshape(13, 13)
    iu1 = np.triu_indices(13)
    # You need to pass the arrays as an iterable (a tuple or list),
    # thus the correct syntax is np.concatenate((,), axis=None)
    div = distance.euclidean(np.concatenate((mean1, cov1[iu1]), axis=None),
                             np.concatenate((mean2, cov2[iu1]), axis=None))
    return div


tic1 = int(round(time.time() * 1000))

list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

#########################################################
#   Pre-process RH and RP for Euclidean
#

rp = sc.textFile("features[0-9]*/out[0-9]*.rp")
rp = rp.map(lambda x: x.split(","))
kv_rp = rp.map(lambda x: (x[0].replace(";", "").replace(".", "").replace(",", "").replace(" ", ""), list(x[1:])))
rp_df = sqlContext.createDataFrame(kv_rp, ["id", "rp"])
rp_df = rp_df.select(rp_df["id"], list_to_vector_udf(rp_df["rp"]).alias("rp"))

rh = sc.textFile("features[0-9]*/out[0-9]*.rh")
rh = rh.map(lambda x: x.split(","))
kv_rh = rh.map(lambda x: (x[0].replace(";", "").replace(".", "").replace(",", "").replace(" ", ""), list(x[1:])))
rh_df = sqlContext.createDataFrame(kv_rh, ["id", "rh"])
rh_df = rh_df.select(rh_df["id"], list_to_vector_udf(rh_df["rh"]).alias("rh"))
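# A sketch of how the MFCC distance helper above might be applied, assuming a
# DataFrame `mfcc_df` with columns "id" and "mfcc" (a vector of 13 means followed
# by a flattened 13x13 covariance matrix); the DataFrame name and column names are
# illustrative, not from the original.
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf, col

get_euclidean_mfcc_udf = udf(
    lambda v1, v2: float(get_euclidean_mfcc(v1.toArray(), v2.toArray())), DoubleType())

# Pairwise distances between all tracks via a self cross-join
pairs = mfcc_df.alias("a").crossJoin(mfcc_df.alias("b")) \
    .select(col("a.id").alias("id_a"), col("b.id").alias("id_b"),
            get_euclidean_mfcc_udf(col("a.mfcc"), col("b.mfcc")).alias("distance"))
pairs.orderBy("distance").show(10)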
def main(argv=None):
    plot = ""
    if argv is None:
        inputs = sys.argv[1]
        if (len(sys.argv) > 2):
            plot = sys.argv[2]  # "plot" to show test RMSE plot

    conf = SparkConf().setAppName('matrix-factorization-recommend')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    # read train text file and prepare rating data (userID, movieID, rating)
    text = sqlCt.read.text(inputs + "/MovieLens100K_train.txt")
    train = text.map(lambda row: row.value.split("\t")) \
        .map(lambda l: (int(l[0]), int(l[1]), float(l[2]))) \
        .toDF(["userID", "movieID", "rating"])
    train.cache()

    # read test text file and prepare rating data (userID, movieID, rating)
    text = sqlCt.read.text(inputs + "/MovieLens100K_test.txt")
    test = text.map(lambda row: row.value.split("\t")) \
        .map(lambda l: (int(l[0]), int(l[1]), float(l[2]))) \
        .toDF(["userID", "movieID", "rating"])
    test.cache()

    # read movie names
    text = sqlCt.read.text(inputs + "/u.item")
    movie_names = text.map(lambda row: row.value.split("|")) \
        .map(lambda l: (int(l[0]), l[1])) \
        .toDF(["id", "movieName"])
    movie_names.cache()

    # Build the recommendation model using explicit ALS
    als = ALS(maxIter=20, userCol="userID", itemCol="movieID", ratingCol="rating")

    # Lists to store results:
    model_result = []
    cluster_result = []

    # Parameter grid for cross validation
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    ranks = [2, 4, 8, 16, 32, 64, 128, 256]
    for rank in ranks:
        paramGrid = ParamGridBuilder() \
            .addGrid(als.rank, [rank]) \
            .build()

        # 5-fold cross validation
        crossval = CrossValidator(estimator=als,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=5)

        # Run cross-validation.
        model = crossval.fit(train)

        # RMSE on test data - filtering out new users who would not have any prediction
        prediction_test = model.transform(test).filter("prediction <> 'NaN'")
        rmse_test = evaluator.evaluate(prediction_test)
        model_result.append((rank, rmse_test))

        # K-means clustering for items based on 50 factors
        item_factors = model.bestModel.itemFactors \
            .withColumn("features_vector", udf(lambda x: Vectors.dense(x), VectorUDT())("features")) \
            .cache()
        kmeans = KMeans(featuresCol="features_vector", predictionCol="cluster",
                        initMode="random", k=50, seed=1)
        model_kmeans = kmeans.fit(item_factors)
        item_clusters = model_kmeans.transform(item_factors)
        item_factors.unpersist()

        # Number of items small enough to collect
        two_clusters = item_clusters.filter("cluster < 2") \
            .join(movie_names, on="id") \
            .select("cluster", "movieName") \
            .map(lambda row: (row[0], row[1])).collect()
        cluster1 = list(map(lambda x: x[1].encode("utf-8"),
                            filter(lambda x: x[0] == 0, two_clusters)))
        cluster2 = list(map(lambda x: x[1].encode("utf-8"),
                            filter(lambda x: x[0] == 1, two_clusters)))
        cluster_result.append((rank, (cluster1, cluster2)))

    # Show plot if run locally
    if (plot == "plot"):
        plotRMSE(model_result)

    # Print results
    print("MATRIX FACTORIZATION COLLABORATIVE FILTERING: ")
    for i in model_result:
        print("- Rank = %i: Test RMSE = %s" % (i[0], i[1]))
    print("\nTwo Clusters: ")
    for i in cluster_result:
        print("- Rank = %i:\n   Cluster-1: %s\n   Cluster-2: %s"
              % (i[0], i[1][0], i[1][1]))
baseDir = '/mnt/ml-class/'
irisFourFeatures = sqlContext.read.parquet(baseDir + 'irisFourFeatures.parquet')
print '\n'.join(map(repr, irisFourFeatures.take(2)))

# COMMAND ----------

# MAGIC %md
# MAGIC Convert the data from `SparseVector` to `DenseVector` types.

# COMMAND ----------

from pyspark.sql.functions import udf
from pyspark.mllib.linalg import Vectors, VectorUDT, DenseVector

sparseToDense = udf(lambda sv: Vectors.dense(sv.toArray()), VectorUDT())
irisDense = irisFourFeatures.select(sparseToDense('features').alias('features'), 'label')

print '\n'.join(map(repr, irisDense.take(2)))

# COMMAND ----------

# MAGIC %md
# MAGIC Save the new format for use in another notebook.

# COMMAND ----------

#irisDense.write.mode('overwrite').parquet('/tmp/irisDense.parquet')

# COMMAND ----------
def test_json_schema(self):
    self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)
        idxes.append(temporary[_][0])
    return Vectors.dense(idxes)


def get_top_1_topics_idx(topicDistribution, k, num_topics):
    array_dict = {}
    for i in range(0, num_topics):
        array_dict[i] = topicDistribution[i]
    temporary = sorted(array_dict.items(), key=operator.itemgetter(1), reverse=True)[0:k]
    #print(temporary)
    idxes = []
    for _ in range(0, k):
        idxes.append(temporary[_][0])
    return Vectors.dense(idxes)[0]


k = 3
sqlContext.registerFunction("get_top_k_topics_idx",
                            udf(lambda x: get_top_k_topics_idx(x, k, num_topics), VectorUDT()))
sqlContext.registerFunction("get_top_1_topics_idx",
                            udf(lambda x: float(get_top_1_topics_idx(x, 1, num_topics)), FloatType()))


def extractTopics(transformed):
    transformed.createOrReplaceTempView("transformed")
    estrai_topic = sqlContext.sql("select *, get_top_k_topics_idx(topicDistribution) as topTopics, get_top_1_topics_idx(topicDistribution) as firstTopic from transformed")
    #, get_top_k_topics_1(topicDistribution) as topTopics
    display(estrai_topic)
    estrai_topic.createOrReplaceTempView("extracted_transformed")
    return estrai_topic


# COMMAND ----------

transformed = pipeline_model.transform(freqItemsets)
new_transformed = extractTopics(transformed)
display(new_transformed)
display(fig)

# COMMAND ----------

# MAGIC %md
# MAGIC Prepare the data so that we have the sepal width as our target and a dense vector containing sepal length as our features.

# COMMAND ----------

from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType
from pyspark.mllib.linalg import VectorUDT, Vectors

getElement = udf(lambda v, i: float(v[i]), DoubleType())
getElementAsVector = udf(lambda v, i: Vectors.dense([v[i]]), VectorUDT())

irisSepal = irisDense.select(getElement('features', lit(1)).alias('sepalWidth'),
                             getElementAsVector('features', lit(0)).alias('features'))
irisSepal.cache()

display(irisSepal)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Build a linear regression model

# COMMAND ----------
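# The notebook cell following the heading above is not included here. A minimal sketch of
# what it might contain, assuming the `sepalWidth` label and single-element `features`
# column produced above and a Spark release with the `coefficients` property (1.6+); the
# hyperparameters are illustrative only.
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features', labelCol='sepalWidth', maxIter=100)
lrModel = lr.fit(irisSepal)

print 'intercept: {0}, coefficient: {1}'.format(lrModel.intercept, lrModel.coefficients[0])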